diff --git a/.bazelrc b/.bazelrc
index 5bac396c287384..657f6b86246aae 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -632,8 +632,12 @@ try-import %workspace%/.bazelrc.user
 # Build TensorFlow v2.
 test:release_base --test_size_filters=small,medium
 
+# Ensure release_base is set on linux
+build:release_linux_base --config=release_base
+
 # Target the AVX instruction set
 build:release_linux_base --config=avx_linux
+
 # Enable support for all targets
 build:release_base --config=cpu_cross
 
@@ -719,12 +723,14 @@ build:unsupported_gpu_linux --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gc
 build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain
 
 build:release_cpu_macos --config=avx_linux
-test:release_cpu_macos --config=release_base
 
 # Base build configs for macOS
 build:release_macos_base --action_env  DEVELOPER_DIR=/Applications/Xcode.app/Contents/Developer
 build:release_macos_base --define=no_nccl_support=true --output_filter=^$
 
+# Ensure release_base is set on mac
+build:release_macos_base --config=release_base
+
 # Build configs for macOS x86
 build:release_macos_x86 --config=release_macos_base
 # Build with the AVX instruction set when on macOS x86
@@ -754,10 +760,12 @@ test:release_macos_x86 --config=release_macos_base
 # Test configs for macOS Arm64
 test:release_macos_arm64 --config=release_macos_base
 
+# Ensure release_base is set on windows
+build:release_cpu_windows --config=release_base
+
 # TODO(kanglan): Update windows configs after b/289091160 is fixed
 build:release_cpu_windows --config=avx_win
 build:release_cpu_windows --define=no_tensorflow_py_deps=true
-test:release_cpu_windows --config=release_base
 
 # Exclude TFRT integration for anything but Linux.
 build:android --config=no_tfrt
@@ -962,3 +970,6 @@ build:rbe_cross_compile_macos_x86 --jobs=100
 test:rbe_cross_compile_macos_x86 --jobs=100
 # END MACOS CROSS-COMPILE CONFIGS
 # END CROSS-COMPILE CONFIGS
+
+# Try to load the XLA warnings config if available
+try-import %workspace%/warnings.bazelrc
diff --git a/.github/workflows/update-rbe.yml b/.github/workflows/update-rbe.yml
index bdce23b94d02f1..d670cd6040401d 100644
--- a/.github/workflows/update-rbe.yml
+++ b/.github/workflows/update-rbe.yml
@@ -123,7 +123,7 @@ jobs:
         title: Update the RBE images to the latest container versions
         committer: TensorFlow Release Automation <jenkins@tensorflow.org>
         token: ${{ secrets.JENKINS_TOKEN }}
-        reviewers: angerson,mihaimaruseac,learning-to-play,nitins17
+        reviewers: mihaimaruseac,learning-to-play,nitins17
         body: |
           This PR was created by a GitHub Actions workflow to update all the SIG Build-based RBE containers to the most recent containers. See:
 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2b9b9f9304d142..89c61463462745 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -71,7 +71,7 @@ Before sending your pull requests, make sure you do the following:
 
 In a graphical form, the entire lifetime of a PR looks like
 
-![image](https://user-images.githubusercontent.com/323199/229561784-0a2f5509-b731-493f-ad88-bad487688c8d.png)
+![image](https://github.com/tensorflow/tensorflow/assets/52792999/3eea4ca5-daa0-4570-b0b5-2a2b03a724a3)
 
 ### Contributor License Agreements
 
diff --git a/RELEASE.md b/RELEASE.md
index 8089cf75521191..4ab21903418868 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -29,6 +29,8 @@
 * GPU
     * Support for NVIDIA GPUs with compute capability 8.9 (e.g. L4 & L40) has
       been added to TF binary distributions (Python wheels).
+* Replace `DebuggerOptions` of TensorFlow Quantizer, and migrate to
+  `DebuggerConfig` of StableHLO Quantizer.
 
 ## Keras
 
@@ -70,6 +72,12 @@
       schema globally in the converter and inference engine. The new behaviour
       can be disabled via experimental
       flag `converter._experimental_disable_per_channel_quantization_for_dense_layers = True`.
+    * C API:
+        * The experimental `TfLiteRegistrationExternal` type has been renamed as
+          `TfLiteOperator`, and likewise for the corresponding API functions.
+    * The Python TF Lite Interpreter bindings now have an option
+      `experimental_default_delegate_latest_features` to enable all default
+      delegate features.
 
 ## Thanks to our Contributors
 
diff --git a/WORKSPACE b/WORKSPACE
index a697405110e206..675a9481283514 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1,3 +1,5 @@
+# buildifier: disable=load-on-top
+
 workspace(name = "org_tensorflow")
 
 # We must initialize hermetic python first.
@@ -23,7 +25,7 @@ load("@rules_python//python:repositories.bzl", "py_repositories")
 
 py_repositories()
 
-load("@rules_python//python:repositories.bzl", "python_register_toolchains")
+load("@rules_python//python:repositories.bzl", "python_register_toolchains")  # buildifier: disable=same-origin-load
 load(
     "//tensorflow/tools/toolchains/python:python_repo.bzl",
     "python_repository",
diff --git a/ci/official/envs/ci_default b/ci/official/envs/ci_default
index 96d87423392541..7db6569b3dc075 100644
--- a/ci/official/envs/ci_default
+++ b/ci/official/envs/ci_default
@@ -64,5 +64,6 @@ TFCI_PYTHON_VERSION=
 TFCI_WHL_AUDIT_ENABLE=
 TFCI_WHL_AUDIT_PLAT=
 TFCI_WHL_BAZEL_TEST_ENABLE=
+TFCI_WHL_IMPORT_TEST_ENABLE=1
 TFCI_WHL_SIZE_LIMIT=
 TFCI_WHL_SIZE_LIMIT_ENABLE=
diff --git a/ci/official/envs/linux_x86_tpu b/ci/official/envs/linux_x86_tpu
index 3c7d61b2ac3794..8fa88ad7c85902 100644
--- a/ci/official/envs/linux_x86_tpu
+++ b/ci/official/envs/linux_x86_tpu
@@ -18,5 +18,6 @@ TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_tpu
 TFCI_BUILD_PIP_PACKAGE_ARGS="--repo_env=WHEEL_NAME=tensorflow_tpu"
 TFCI_LIB_SUFFIX="-tpu-linux-x86_64"
 TFCI_WHL_BAZEL_TEST_ENABLE=0
+TFCI_WHL_IMPORT_TEST_ENABLE=0
 TFCI_WHL_SIZE_LIMIT=580M
 TFCI_PYTHON_VERIFY_PIP_INSTALL_ARGS="-f https://storage.googleapis.com/libtpu-tf-releases/index.html"
diff --git a/ci/official/requirements_updater/BUILD.bazel b/ci/official/requirements_updater/BUILD.bazel
index 8cdb70597f0a83..06a0898d9a2b78 100644
--- a/ci/official/requirements_updater/BUILD.bazel
+++ b/ci/official/requirements_updater/BUILD.bazel
@@ -13,10 +13,10 @@
 # limitations under the License.
 # ==============================================================================
 
-load("@python//3.9:defs.bzl", compile_pip_requirements_3_9 = "compile_pip_requirements")
 load("@python//3.10:defs.bzl", compile_pip_requirements_3_10 = "compile_pip_requirements")
 load("@python//3.11:defs.bzl", compile_pip_requirements_3_11 = "compile_pip_requirements")
 load("@python//3.12:defs.bzl", compile_pip_requirements_3_12 = "compile_pip_requirements")
+load("@python//3.9:defs.bzl", compile_pip_requirements_3_9 = "compile_pip_requirements")
 load("@updater_config_repository//:updater_config_repository.bzl", "REQUIREMENTS_FILE_NAME")
 
 compile_pip_requirements_3_9(
diff --git a/ci/official/requirements_updater/WORKSPACE b/ci/official/requirements_updater/WORKSPACE
index 9b56cc0422bf6d..f9a116a6a3153e 100644
--- a/ci/official/requirements_updater/WORKSPACE
+++ b/ci/official/requirements_updater/WORKSPACE
@@ -1,3 +1,5 @@
+# buildifier: disable=load-on-top
+
 workspace(name = "requirements_updater")
 
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
@@ -22,7 +24,7 @@ load("@rules_python//python:repositories.bzl", "py_repositories")
 
 py_repositories()
 
-load("@rules_python//python:repositories.bzl", "python_register_multi_toolchains")
+load("@rules_python//python:repositories.bzl", "python_register_multi_toolchains")  # buildifier: disable=same-origin-load
 load("@rules_python//python/pip_install:repositories.bzl", "pip_install_dependencies")
 
 default_python_version = "3.10"
diff --git a/ci/official/utilities/rename_and_verify_wheels.sh b/ci/official/utilities/rename_and_verify_wheels.sh
index 4f4ea6745d5cb9..a79ce2a8868a3e 100755
--- a/ci/official/utilities/rename_and_verify_wheels.sh
+++ b/ci/official/utilities/rename_and_verify_wheels.sh
@@ -58,8 +58,10 @@ venv=$(mktemp -d)
 "python${TFCI_PYTHON_VERSION}" -m venv "$venv"
 python="$venv/bin/python3"
 "$python" -m pip install *.whl $TFCI_PYTHON_VERIFY_PIP_INSTALL_ARGS
-"$python" -c 'import tensorflow as tf; t1=tf.constant([1,2,3,4]); t2=tf.constant([5,6,7,8]); print(tf.add(t1,t2).shape)'
-"$python" -c 'import sys; import tensorflow as tf; sys.exit(0 if "keras" in tf.keras.__name__ else 1)'
+if [[ "$TFCI_WHL_IMPORT_TEST_ENABLE" == "1" ]]; then
+  "$python" -c 'import tensorflow as tf; t1=tf.constant([1,2,3,4]); t2=tf.constant([5,6,7,8]); print(tf.add(t1,t2).shape)'
+  "$python" -c 'import sys; import tensorflow as tf; sys.exit(0 if "keras" in tf.keras.__name__ else 1)'
+fi
 # VERY basic check to ensure the [and-cuda] package variant is installable.
 # Checks TFCI_BAZEL_COMMON_ARGS for "gpu" or "cuda", implying that the test is
 # relevant. All of the GPU test machines have CUDA installed via other means,
diff --git a/ci/official/wheel_test/WORKSPACE b/ci/official/wheel_test/WORKSPACE
index cef9033d30120f..d52a3ed895173b 100644
--- a/ci/official/wheel_test/WORKSPACE
+++ b/ci/official/wheel_test/WORKSPACE
@@ -1,3 +1,5 @@
+# buildifier: disable=load-on-top
+
 workspace(name = "wheel_test")
 
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
@@ -38,7 +40,7 @@ python_repository(name = "python_version_repo")
 load("@python_version_repo//:py_version.bzl", "TF_PYTHON_VERSION")
 
 # Register multi toolchains
-load("@rules_python//python:repositories.bzl", "python_register_toolchains")
+load("@rules_python//python:repositories.bzl", "python_register_toolchains")  # buildifier: disable=same-origin-load
 
 python_register_toolchains(
     name = "python",
diff --git a/configure.py b/configure.py
index c1cb20162012f6..66427431b42c16 100644
--- a/configure.py
+++ b/configure.py
@@ -759,7 +759,7 @@ def get_ndk_api_level(environ_cp, android_ndk_home_path):
   android_ndk_api_level = prompt_loop_or_load_from_env(
       environ_cp,
       var_name='ANDROID_NDK_API_LEVEL',
-      var_default='26',  # 26 is required to support AHardwareBuffer.
+      var_default='21',  # 21 is required for ARM64 support.
       ask_for_var=(
           'Please specify the (min) Android NDK API level to use. '
           '[Available levels: %s]'
@@ -807,6 +807,18 @@ def choose_compiler(environ_cp):
   return var
 
 
+def choose_compiler_Win(environ_cp):
+  question = 'Do you want to use Clang to build TensorFlow?'
+  yes_reply = 'Add "--config=win_clang" to compile TensorFlow with CLANG.'
+  no_reply = 'MSVC will be used to compile TensorFlow.'
+  var = int(
+      get_var(
+          environ_cp, 'TF_NEED_CLANG', None, True, question, yes_reply, no_reply
+      )
+  )
+  return var
+
+
 def set_clang_compiler_path(environ_cp):
   """Set CLANG_COMPILER_PATH and environment variables.
 
@@ -848,6 +860,44 @@ def set_clang_compiler_path(environ_cp):
   return clang_compiler_path
 
 
+def set_clang_compiler_path_win(environ_cp):
+  """Set CLANG_COMPILER_PATH and environment variables.
+
+  Loop over user prompts for clang path until receiving a valid response.
+  Default is used if no input is given. Set CLANG_COMPILER_PATH and write
+  environment variables CC and BAZEL_COMPILER to .bazelrc.
+
+  Args:
+    environ_cp: (Dict) copy of the os.environ.
+
+  Returns:
+    string value for clang_compiler_path.
+  """
+  # Default path if clang-16 is installed by using apt-get install
+  default_clang_path = 'C:/Program Files/LLVM/bin/clang.exe'
+  if not os.path.exists(default_clang_path):
+    default_clang_path = which('clang') or ''
+
+  clang_compiler_path = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='CLANG_COMPILER_PATH',
+      var_default=default_clang_path,
+      ask_for_var='Please specify the path to clang executable.',
+      check_success=os.path.exists,
+      resolve_symlinks=True,
+      error_msg=(
+          'Invalid clang path. %s cannot be found. Note that Clang is now'
+          'preferred compiler. You may use MSVC by removing --config=win_clang'
+      ),
+  )
+
+  write_action_env_to_bazelrc('CLANG_COMPILER_PATH', clang_compiler_path)
+  write_to_bazelrc('build --repo_env=CC=%s' % clang_compiler_path)
+  write_to_bazelrc('build --repo_env=BAZEL_COMPILER=%s' % clang_compiler_path)
+
+  return clang_compiler_path
+
+
 def retrieve_clang_version(clang_executable):
   """Retrieve installed clang version.
 
@@ -1386,8 +1436,9 @@ def main():
     else:
       raise UserInputError(
           'Invalid CUDA setting were provided %d '
-          'times in a row. Assuming to be a scripting mistake.' %
-          _DEFAULT_PROMPT_ASK_ATTEMPTS)
+          'times in a row. Assuming to be a scripting mistake.'
+          % _DEFAULT_PROMPT_ASK_ATTEMPTS
+      )
 
     set_tf_cuda_compute_capabilities(environ_cp)
     if 'LD_LIBRARY_PATH' in environ_cp and environ_cp.get(
@@ -1415,6 +1466,12 @@ def main():
         clang_compiler_path = set_clang_compiler_path(environ_cp)
         clang_version = retrieve_clang_version(clang_compiler_path)
         disable_clang_offsetof_extension(clang_version)
+    if is_windows():
+      environ_cp['TF_NEED_CLANG'] = str(choose_compiler_Win(environ_cp))
+      if environ_cp.get('TF_NEED_CLANG') == '1':
+        clang_compiler_path = set_clang_compiler_path_win(environ_cp)
+        clang_version = retrieve_clang_version(clang_compiler_path)
+        disable_clang_offsetof_extension(clang_version)
 
   # ROCm / CUDA are mutually exclusive.
   # At most 1 GPU platform can be configured.
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index eff34ce6a3f320..47c751c54b15a5 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -2,6 +2,8 @@
 # C API for TensorFlow, for use by client language bindings.
 
 load("@bazel_skylib//lib:selects.bzl", "selects")
+load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt")
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "check_deps",
@@ -18,8 +20,6 @@ load(
     "//tensorflow/core/tpu:build_defs.bzl",
     "if_libtpu_tf_status",
 )
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
-load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/c/eager/parallel_device/BUILD b/tensorflow/c/eager/parallel_device/BUILD
index 647937fe98e47d..54afc6f757d740 100644
--- a/tensorflow/c/eager/parallel_device/BUILD
+++ b/tensorflow/c/eager/parallel_device/BUILD
@@ -1,9 +1,9 @@
-load("//tensorflow:tensorflow.default.bzl", "filegroup")
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
 )
+load("//tensorflow:tensorflow.default.bzl", "filegroup")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -56,9 +56,7 @@ cc_library(
     name = "parallel_device",
     srcs = [":device_sources"],
     hdrs = [":device_headers"],
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
     deps = [
         ":parallel_device_lib",
         "//tensorflow/c:c_api",
diff --git a/tensorflow/c/experimental/filesystem/BUILD b/tensorflow/c/experimental/filesystem/BUILD
index e55d95334cf6cf..d25e6e9314f088 100644
--- a/tensorflow/c/experimental/filesystem/BUILD
+++ b/tensorflow/c/experimental/filesystem/BUILD
@@ -1,9 +1,8 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-
 # Experimental filesystem C APIs for TensorFlow.
 # Will be moved in proper place once all filesystems are converted to the
 # modular framework.
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD
index bd2041b1d43957..7c23cb79143b01 100644
--- a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD
+++ b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD
@@ -1,7 +1,6 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-
 # Experimental gcs filesystem plugin.
 load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object", "tf_cc_test")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/c/experimental/filesystem/plugins/posix/BUILD b/tensorflow/c/experimental/filesystem/plugins/posix/BUILD
index 90acb2bf389370..a4406b46945193 100644
--- a/tensorflow/c/experimental/filesystem/plugins/posix/BUILD
+++ b/tensorflow/c/experimental/filesystem/plugins/posix/BUILD
@@ -1,7 +1,6 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-
 # Experimental posix filesystem plugin.
 load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -25,6 +24,7 @@ cc_library(
     hdrs = ["posix_filesystem.h"],
     deps = [
         ":posix_filesystem_helper",
+        "//tensorflow/c:tf_file_statistics",
         "//tensorflow/c:tf_status",
         "//tensorflow/c/experimental/filesystem:filesystem_interface",
     ],
@@ -40,6 +40,8 @@ cc_library(
         ":posix_filesystem_impl",
         "//tensorflow/c/experimental/filesystem:filesystem_interface",
         "//tensorflow/c/experimental/filesystem:modular_filesystem",
+        "//tensorflow/core/platform:status",
+        "@com_google_absl//absl/log",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/c/experimental/filesystem/plugins/posix/posix_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/posix/posix_filesystem.cc
index f1f3dda5e8ccc0..e3fbf03ea19440 100644
--- a/tensorflow/c/experimental/filesystem/plugins/posix/posix_filesystem.cc
+++ b/tensorflow/c/experimental/filesystem/plugins/posix/posix_filesystem.cc
@@ -26,7 +26,9 @@ limitations under the License.
 #include <sys/stat.h>
 #include <unistd.h>
 
+#include "tensorflow/c/experimental/filesystem/filesystem_interface.h"
 #include "tensorflow/c/experimental/filesystem/plugins/posix/posix_filesystem_helper.h"
+#include "tensorflow/c/tf_file_statistics.h"
 #include "tensorflow/c/tf_status.h"
 
 // Implementation of a filesystem for POSIX environments.
diff --git a/tensorflow/c/experimental/filesystem/plugins/posix/posix_filesystem_static.cc b/tensorflow/c/experimental/filesystem/plugins/posix/posix_filesystem_static.cc
index 6081722e699e86..60205858499aed 100644
--- a/tensorflow/c/experimental/filesystem/plugins/posix/posix_filesystem_static.cc
+++ b/tensorflow/c/experimental/filesystem/plugins/posix/posix_filesystem_static.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "absl/log/log.h"
 #include "tensorflow/c/experimental/filesystem/filesystem_interface.h"
 #include "tensorflow/c/experimental/filesystem/modular_filesystem_registration.h"
 #include "tensorflow/c/experimental/filesystem/plugins/posix/posix_filesystem.h"
+#include "tensorflow/core/platform/status.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/c/experimental/filesystem/plugins/windows/BUILD b/tensorflow/c/experimental/filesystem/plugins/windows/BUILD
index 2ac57f6a731344..159e36e485e6a6 100644
--- a/tensorflow/c/experimental/filesystem/plugins/windows/BUILD
+++ b/tensorflow/c/experimental/filesystem/plugins/windows/BUILD
@@ -1,7 +1,6 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-
 # Experimental windows filesystem plugin.
 load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/c/experimental/gradients/BUILD b/tensorflow/c/experimental/gradients/BUILD
index 65f580deee93c4..a3fa49fffa34b7 100644
--- a/tensorflow/c/experimental/gradients/BUILD
+++ b/tensorflow/c/experimental/gradients/BUILD
@@ -1,14 +1,14 @@
-load("//tensorflow:tensorflow.default.bzl", "filegroup")
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "if_libtpu",
     "tf_cuda_cc_test",
 )
+load("//tensorflow:tensorflow.default.bzl", "filegroup")
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "tf_cuda_tests_tags",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 # Library of gradient functions.
 package(
@@ -55,9 +55,7 @@ cc_library(
     hdrs = [
         "nn_grad.h",
     ],
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
     deps = [
         "//tensorflow/c/eager:abstract_tensor_handle",
         "//tensorflow/c/eager:gradients_internal",
@@ -148,9 +146,7 @@ cc_library(
     testonly = True,
     srcs = ["grad_test_helper.cc"],
     hdrs = ["grad_test_helper.h"],
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
     deps = [
         "//tensorflow/c/eager:gradient_checker",
         "//tensorflow/c/eager:gradients_internal",
diff --git a/tensorflow/c/experimental/grappler/BUILD b/tensorflow/c/experimental/grappler/BUILD
index fd26096fd5d871..d4892b1b9b9624 100644
--- a/tensorflow/c/experimental/grappler/BUILD
+++ b/tensorflow/c/experimental/grappler/BUILD
@@ -1,11 +1,11 @@
 # Description:
 # Graph C API.
 
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/c/experimental/next_pluggable_device/BUILD b/tensorflow/c/experimental/next_pluggable_device/BUILD
index 03c83a4e8f99e0..3d92b7ad3d2992 100644
--- a/tensorflow/c/experimental/next_pluggable_device/BUILD
+++ b/tensorflow/c/experimental/next_pluggable_device/BUILD
@@ -94,9 +94,9 @@ tf_cc_test(
         "@local_xla//xla:shape_util",
         "@local_xla//xla/pjrt:pjrt_api",
         "@local_xla//xla/pjrt:pjrt_c_api_client",
-        "@local_xla//xla/pjrt:tfrt_cpu_pjrt_client",
         "@local_xla//xla/pjrt/c:pjrt_c_api_cpu",
         "@local_xla//xla/pjrt/c:pjrt_c_api_hdrs",
         "@local_xla//xla/pjrt/c:pjrt_c_api_wrapper_impl",
+        "@local_xla//xla/pjrt/cpu:cpu_client",
     ],
 )
diff --git a/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util_test.cc b/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util_test.cc
index c72f0cfafa6ead..7f45fd91a1baea 100644
--- a/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util_test.cc
+++ b/tensorflow/c/experimental/next_pluggable_device/tensor_pjrt_buffer_util_test.cc
@@ -25,9 +25,9 @@ limitations under the License.
 #include "xla/pjrt/c/pjrt_c_api.h"
 #include "xla/pjrt/c/pjrt_c_api_cpu.h"
 #include "xla/pjrt/c/pjrt_c_api_wrapper_impl.h"
+#include "xla/pjrt/cpu/cpu_client.h"
 #include "xla/pjrt/pjrt_api.h"
 #include "xla/pjrt/pjrt_c_api_client.h"
-#include "xla/pjrt/tfrt_cpu_pjrt_client.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/BUILD b/tensorflow/c/experimental/ops/gen/cpp/renderers/BUILD
index 7589ea2d2f24a2..c13bc899f2d016 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/BUILD
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/BUILD
@@ -20,13 +20,9 @@ cc_library(
     deps = [
         "//tensorflow/c/experimental/ops/gen/common",
         "//tensorflow/c/experimental/ops/gen/cpp/views",
-        "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:op_gen_lib",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/platform:str_util",
         "@com_google_absl//absl/strings",
     ],
     alwayslink = 1,
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc
index 36c25c92760872..1fc16e093c011d 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc
@@ -14,7 +14,9 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h"
 
+#include "absl/strings/str_split.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace generator {
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_file_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_file_renderer.cc
index 44f23ae0fb6aed..71132cfc3bf8b2 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_file_renderer.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_file_renderer.cc
@@ -14,8 +14,9 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_file_renderer.h"
 
-#include "tensorflow/c/experimental/ops/gen/common/view_util.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/views/op_view.h"
 
 namespace tensorflow {
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc
index 8bfd5a334c565d..7a4275b532eda7 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc
@@ -15,7 +15,10 @@ limitations under the License.
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h"
 
 #include "tensorflow/c/experimental/ops/gen/common/case_format.h"
-#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
+#include "tensorflow/core/platform/path.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace generator {
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h b/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h
index cfe2a99acfddce..a45fe89a7a011c 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h
@@ -16,6 +16,7 @@ limitations under the License.
 #define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_GUARD_RENDERER_H_
 
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc
index 5242d6f1baf255..38f31209f6da24 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc
@@ -14,7 +14,10 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h"
 
-#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
+#include "tensorflow/core/platform/path.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace generator {
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h b/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h
index b98547079f3ac7..e43715a62e45b0 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h
@@ -16,6 +16,7 @@ limitations under the License.
 #define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_INCLUDE_RENDERER_H_
 
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc
index 5547ca22df7ab0..db28ab303ae5c6 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc
@@ -14,7 +14,9 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.h"
 
-#include "absl/strings/str_split.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace generator {
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.h b/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.h
index a54fc5878a0ad4..fd8ccf9531ef51 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.h
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.h
@@ -18,6 +18,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_comment_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_comment_renderer.cc
index e5afb7b6d63393..5d11bcada6e8c0 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_comment_renderer.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_comment_renderer.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/op_comment_renderer.h"
 
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/views/op_view.h"
 
 namespace tensorflow {
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_comment_renderer.h b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_comment_renderer.h
index 1d85c4c9fd7940..9131cc945349af 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_comment_renderer.h
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_comment_renderer.h
@@ -16,6 +16,7 @@ limitations under the License.
 #define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_OP_COMMENT_RENDERER_H_
 
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/views/op_view.h"
 
 namespace tensorflow {
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_implementation_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_implementation_renderer.cc
index e2184fcc7f834f..804e0585f88cca 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_implementation_renderer.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_implementation_renderer.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include "tensorflow/c/experimental/ops/gen/common/view_util.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/views/arg_view.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/views/attr_view.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/views/op_view.h"
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_implementation_renderer.h b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_implementation_renderer.h
index 9237eb9410bad7..98c3b0d75524aa 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_implementation_renderer.h
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_implementation_renderer.h
@@ -16,6 +16,7 @@ limitations under the License.
 #define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_OP_IMPLEMENTATION_RENDERER_H_
 
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/views/op_view.h"
 
 namespace tensorflow {
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc
index 41db2ced426b47..c58e67782dfc34 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc
@@ -16,7 +16,15 @@ limitations under the License.
 
 #include <iterator>
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/substitute.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/op_implementation_renderer.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/views/op_argument_view.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/views/op_view.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace generator {
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h
index c29fb35b5b6b7c..3360e14e672e3a 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h
@@ -17,7 +17,9 @@ limitations under the License.
 
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/op_comment_renderer.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/views/op_view.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace generator {
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc
index 0e6ee460512d2d..41d1dea64b3689 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc
@@ -14,9 +14,12 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
 
+#include "absl/strings/str_cat.h"
 #include "absl/strings/substitute.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stringpiece.h"
 
 namespace tensorflow {
 namespace generator {
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h
index b0a95baefa7676..b6168b196b35b2 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h
@@ -15,6 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_RENDERER_H_
 #define TENSORFLOW_C_EXPERIMENTAL_OPS_GEN_CPP_RENDERERS_RENDERER_H_
 
+#include "absl/strings/string_view.h"
 #include "absl/strings/substitute.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc
index 2674e5f156d9d5..eff654c5938160 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc
@@ -14,8 +14,12 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
 
+#include "tensorflow/c/experimental/ops/gen/common/path_config.h"
+#include "tensorflow/c/experimental/ops/gen/common/source_code.h"
+#include "tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace generator {
diff --git a/tensorflow/c/experimental/pluggable_profiler/BUILD b/tensorflow/c/experimental/pluggable_profiler/BUILD
index a34faa3146735b..49bb842e2e6258 100644
--- a/tensorflow/c/experimental/pluggable_profiler/BUILD
+++ b/tensorflow/c/experimental/pluggable_profiler/BUILD
@@ -1,8 +1,8 @@
 # Description:
 # Profiler C API
 
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.default.bzl", "filegroup")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/c/experimental/saved_model/core/BUILD b/tensorflow/c/experimental/saved_model/core/BUILD
index af37ab0cb19011..9ad56fcf6671b2 100644
--- a/tensorflow/c/experimental/saved_model/core/BUILD
+++ b/tensorflow/c/experimental/saved_model/core/BUILD
@@ -1,5 +1,3 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-
 # Experimental SavedModel C APIs for TensorFlow. See RFC
 # https://github.com/tensorflow/community/pull/207
 # Targets in this directory are pure C++ "Classes" underlying the C API types
@@ -9,6 +7,7 @@ load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/c/experimental/saved_model/core/ops/BUILD b/tensorflow/c/experimental/saved_model/core/ops/BUILD
index cce725db3fcba1..3e9d28ed8795d4 100644
--- a/tensorflow/c/experimental/saved_model/core/ops/BUILD
+++ b/tensorflow/c/experimental/saved_model/core/ops/BUILD
@@ -1,11 +1,10 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-
 # This package contains written convenience helpers for Eager Operations
 # used by SavedModel. Once we autogenerate C++ Eager Op wrappers, we can remove these.
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/c/experimental/saved_model/internal/BUILD b/tensorflow/c/experimental/saved_model/internal/BUILD
index 36e5cb52d2ec25..244bbc9e515f19 100644
--- a/tensorflow/c/experimental/saved_model/internal/BUILD
+++ b/tensorflow/c/experimental/saved_model/internal/BUILD
@@ -9,8 +9,6 @@
 # Note(bmzhao): The *.cc files in this directory form the direct implementation of the
 # C API functions exposed in tf/c/experimental/saved_model/public/.
 
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-
 # Note(bmzhao): All *type.h files in this directory are the internal definitions of
 # the opaque C types. These headers should only be visible to internal tensorflow
 # implementors.
@@ -19,6 +17,7 @@ load(
     "tf_cc_test",
     "tf_copts",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -162,9 +161,7 @@ cc_library(
     hdrs = [
         "saved_model_api_type.h",
     ],
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
     deps = [
         "//tensorflow/c:conversion_macros",
         "//tensorflow/c/experimental/saved_model/core:saved_model_api",
diff --git a/tensorflow/c/experimental/saved_model/internal/testdata/BUILD b/tensorflow/c/experimental/saved_model/internal/testdata/BUILD
index a10cfd03e3dc86..ec36b292a6518e 100644
--- a/tensorflow/c/experimental/saved_model/internal/testdata/BUILD
+++ b/tensorflow/c/experimental/saved_model/internal/testdata/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow:tensorflow.default.bzl", "filegroup")
 load("//tensorflow:strict.default.bzl", "py_strict_binary")
+load("//tensorflow:tensorflow.default.bzl", "filegroup")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/c/kernels/BUILD b/tensorflow/c/kernels/BUILD
index 8e38201c2a5960..7bcaa66060665b 100644
--- a/tensorflow/c/kernels/BUILD
+++ b/tensorflow/c/kernels/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow:tensorflow.default.bzl", "filegroup", "tf_gen_op_libs", "tf_kernel_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.default.bzl", "filegroup", "tf_gen_op_libs", "tf_kernel_library")
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index a989862e4f79fb..43f63b2ba0cb81 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -2,7 +2,6 @@
 # TensorFlow is a computational framework, primarily for use in machine
 # learning applications.
 
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "cc_library_with_android_deps",
@@ -11,6 +10,7 @@ load(
     "transitive_hdrs",
 )
 load("//tensorflow:tensorflow.default.bzl", "filegroup", "tf_gen_op_wrappers_cc")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/cc/experimental/base/tests/BUILD b/tensorflow/cc/experimental/base/tests/BUILD
index e749d2433bd696..70184355fe76aa 100644
--- a/tensorflow/cc/experimental/base/tests/BUILD
+++ b/tensorflow/cc/experimental/base/tests/BUILD
@@ -1,7 +1,6 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-
 # Tests for the C++ header-only base types.
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/cc/experimental/libexport/BUILD b/tensorflow/cc/experimental/libexport/BUILD
index 910ab930440f68..d206c115abea65 100644
--- a/tensorflow/cc/experimental/libexport/BUILD
+++ b/tensorflow/cc/experimental/libexport/BUILD
@@ -1,8 +1,8 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/cc/experimental/libtf/BUILD b/tensorflow/cc/experimental/libtf/BUILD
index 379f2e430aaacd..2c67800eacff63 100644
--- a/tensorflow/cc/experimental/libtf/BUILD
+++ b/tensorflow/cc/experimental/libtf/BUILD
@@ -1,16 +1,16 @@
 #include "third_party/absl/strings/str_cat.h"
 #TODO(aselle) : describe this package.
 
-load(
-    "//tensorflow/core/platform:rules_cc.bzl",
-    "cc_library",
-)
+load("//tensorflow:strict.default.bzl", "py_strict_binary")
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
 )
 load("//tensorflow:tensorflow.default.bzl", "filegroup")
-load("//tensorflow:strict.default.bzl", "py_strict_binary")
+load(
+    "//tensorflow/core/platform:rules_cc.bzl",
+    "cc_library",
+)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/cc/experimental/libtf/impl/BUILD b/tensorflow/cc/experimental/libtf/impl/BUILD
index 0eae5a1f05c133..4f5b7ccfd84940 100644
--- a/tensorflow/cc/experimental/libtf/impl/BUILD
+++ b/tensorflow/cc/experimental/libtf/impl/BUILD
@@ -1,13 +1,13 @@
 # libtf implementation details.
 
-load(
-    "//tensorflow/core/platform:rules_cc.bzl",
-    "cc_library",
-)
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
 )
+load(
+    "//tensorflow/core/platform:rules_cc.bzl",
+    "cc_library",
+)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/cc/framework/fuzzing/BUILD b/tensorflow/cc/framework/fuzzing/BUILD
index ec424fc0425630..74a946c283777d 100644
--- a/tensorflow/cc/framework/fuzzing/BUILD
+++ b/tensorflow/cc/framework/fuzzing/BUILD
@@ -1,11 +1,11 @@
 # TODO(unda): describe this package.
 
 load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
+load("//tensorflow:tensorflow.bzl", "tf_copts")
 load(
     "//tensorflow/cc/framework/fuzzing:op_fuzzing.bzl",
     "tf_gen_op_wrappers_fuzz",
 )
-load("//tensorflow:tensorflow.bzl", "tf_copts")
 
 # copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
 
diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD
index a7a0af29268459..6cc731e722d16b 100644
--- a/tensorflow/cc/saved_model/BUILD
+++ b/tensorflow/cc/saved_model/BUILD
@@ -501,6 +501,7 @@ cc_library(
         "//tensorflow/core/graph/regularization:util",
         "//tensorflow/core/util/tensor_bundle:naming",
         "@com_google_absl//absl/container:btree",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
diff --git a/tensorflow/cc/saved_model/fingerprinting.cc b/tensorflow/cc/saved_model/fingerprinting.cc
index a98980d3c2760a..cf2ae4721623fa 100644
--- a/tensorflow/cc/saved_model/fingerprinting.cc
+++ b/tensorflow/cc/saved_model/fingerprinting.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <string>
 
 #include "absl/container/btree_map.h"
+#include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
@@ -68,6 +69,7 @@ uint64_t HashCheckpointIndexFile(absl::string_view model_dir) {
   if (read_status.ok()) {
     return tensorflow::Fingerprint64(data);
   } else {
+    LOG(WARNING) << "Failed to read checkpoint file: " << read_status;
     return 0;
   }
 }
@@ -209,8 +211,7 @@ absl::StatusOr<FingerprintDef> ReadSavedModelFingerprint(
     absl::string_view export_dir) {
   const std::string fingerprint_pb_path =
       io::JoinPath(export_dir, kFingerprintFilenamePb);
-  absl::Status found_pb = Env::Default()->FileExists(fingerprint_pb_path);
-  if (!found_pb.ok()) return found_pb;
+  TF_RETURN_IF_ERROR(Env::Default()->FileExists(fingerprint_pb_path));
 
   FingerprintDef fingerprint_proto;
   absl::Status result =
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index ae63fdab2fa32c..18fd6655fd269d 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -18,9 +18,11 @@ limitations under the License.
 #include <memory>
 #include <string>
 #include <unordered_set>
+#include <utility>
 
 #include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/cc/saved_model/constants.h"
 #include "tensorflow/cc/saved_model/fingerprinting.h"
 #include "tensorflow/cc/saved_model/loader_util.h"
@@ -280,6 +282,16 @@ Status LoadMetagraphIntoSession(const SessionOptions& session_options,
   return (*session)->Create(meta_graph.graph_def());
 }
 
+Status LoadGraphDefIntoSession(const SessionOptions& session_options,
+                               GraphDef graph_def,
+                               std::unique_ptr<Session>* session) {
+  Session* session_p = nullptr;
+  TF_RETURN_IF_ERROR(NewSession(session_options, &session_p));
+  session->reset(session_p);
+  TF_RETURN_IF_ERROR(ValidateSavedTensors(graph_def));
+  return (*session)->Create(std::move(graph_def));
+}
+
 Status LoadSavedModelInternal(const SessionOptions& session_options,
                               const RunOptions& run_options,
                               const string& export_dir,
@@ -296,40 +308,6 @@ Status LoadSavedModelInternal(const SessionOptions& session_options,
   return absl::OkStatus();
 }
 
-Status LoadSavedModel(const SessionOptions& session_options,
-                      const RunOptions& run_options, const string& export_dir,
-                      const std::unordered_set<string>& tags,
-                      SavedModelBundle* const bundle) {
-  metrics::SavedModelReadApi(kCCLoadLabel).IncrementBy(1);
-  auto fingerprint_proto =
-      saved_model::fingerprinting::ReadSavedModelFingerprint(export_dir);
-  if (fingerprint_proto.ok()) {
-    // Set gauge cell with saved_model_checksum.
-    metrics::SavedModelReadFingerprint().Set(
-        std::to_string(fingerprint_proto->saved_model_checksum()));
-  }
-
-  // TODO(robson): Add tests for the counters.
-  const uint64 start_microseconds = Env::Default()->NowMicros();
-  const Status status = LoadSavedModelInternal(session_options, run_options,
-                                               export_dir, tags, bundle);
-  auto log_and_count = [&](const string& status_str) {
-    LOG(INFO) << "SavedModel load for tags { " << absl::StrJoin(tags, " ")
-              << " }; Status: " << status_str << ": " << status << ". Took "
-              << GetLatencyMicroseconds(start_microseconds) << " microseconds.";
-    load_attempt_count->GetCell(export_dir, status_str)->IncrementBy(1);
-  };
-  if (status.ok()) {
-    log_and_count(kLoadAttemptSuccess);
-    metrics::SavedModelReadPath().Set(export_dir);
-  } else {
-    log_and_count(kLoadAttemptFail);
-  }
-  load_latency->GetCell(export_dir)
-      ->IncrementBy(GetLatencyMicroseconds(start_microseconds));
-  return status;
-}
-
 namespace {
 // Session wrapper that prevents calls to Session::Create(), Session::Extend(),
 // and the deprecated partial-run methods.
@@ -441,6 +419,70 @@ class LiteSessionWrapper : public Session {
 };
 }  // namespace
 
+Status LoadSavedModelInternal(const SessionOptions& session_options,
+                              const RunOptions& run_options,
+                              const string& export_dir,
+                              const std::unordered_set<string>& tags,
+                              SavedModelBundleLite* const bundle) {
+  MetaGraphDef meta_graph_def;
+  TF_RETURN_IF_ERROR(
+      ReadMetaGraphDefFromSavedModel(export_dir, tags, &meta_graph_def));
+  std::unique_ptr<Session> session;
+  TF_RETURN_IF_ERROR(LoadGraphDefIntoSession(
+      session_options, std::move(*meta_graph_def.mutable_graph_def()),
+      &session));
+  TF_RETURN_IF_ERROR(
+      RestoreSession(run_options, meta_graph_def, export_dir, &session));
+  *bundle = SavedModelBundleLite(
+      std::make_unique<LiteSessionWrapper>(std::move(session)),
+      std::move(*meta_graph_def.mutable_signature_def()));
+  return absl::OkStatus();
+}
+
+template <typename BundleType>
+Status LoadSavedModelGeneric(const SessionOptions& session_options,
+                             const RunOptions& run_options,
+                             const string& export_dir,
+                             const std::unordered_set<string>& tags,
+                             BundleType* const bundle) {
+  metrics::SavedModelReadApi(kCCLoadLabel).IncrementBy(1);
+  auto fingerprint_proto =
+      saved_model::fingerprinting::ReadSavedModelFingerprint(export_dir);
+  if (fingerprint_proto.ok()) {
+    // Set gauge cell with saved_model_checksum.
+    metrics::SavedModelReadFingerprint().Set(
+        std::to_string(fingerprint_proto->saved_model_checksum()));
+  }
+
+  // TODO(robson): Add tests for the counters.
+  const uint64 start_microseconds = Env::Default()->NowMicros();
+  const Status status = LoadSavedModelInternal(session_options, run_options,
+                                               export_dir, tags, bundle);
+  auto log_and_count = [&](const string& status_str) {
+    LOG(INFO) << "SavedModel load for tags { " << absl::StrJoin(tags, " ")
+              << " }; Status: " << status_str << ": " << status << ". Took "
+              << GetLatencyMicroseconds(start_microseconds) << " microseconds.";
+    load_attempt_count->GetCell(export_dir, status_str)->IncrementBy(1);
+  };
+  if (status.ok()) {
+    log_and_count(kLoadAttemptSuccess);
+    metrics::SavedModelReadPath().Set(export_dir);
+  } else {
+    log_and_count(kLoadAttemptFail);
+  }
+  load_latency->GetCell(export_dir)
+      ->IncrementBy(GetLatencyMicroseconds(start_microseconds));
+  return status;
+}
+
+Status LoadSavedModel(const SessionOptions& session_options,
+                      const RunOptions& run_options, const string& export_dir,
+                      const std::unordered_set<string>& tags,
+                      SavedModelBundle* const bundle) {
+  return LoadSavedModelGeneric<SavedModelBundle>(session_options, run_options,
+                                                 export_dir, tags, bundle);
+}
+
 Status RestoreSession(const RunOptions& run_options,
                       const MetaGraphDef& meta_graph, const string& export_dir,
                       std::unique_ptr<Session>* session) {
@@ -476,7 +518,6 @@ Status LoadSavedModel(const SessionOptions& session_options,
                       const RunOptions& run_options, const string& export_dir,
                       const std::unordered_set<string>& tags,
                       SavedModelBundleLite* const bundle) {
-  SavedModelBundle legacy_bundle;
   SessionOptions rewritten_options(session_options);
   // We disallow calls to Session::Extend() on the returned session, so we can
   // reduce memory consumption by not storing the original GraphDef.
@@ -489,11 +530,8 @@ Status LoadSavedModel(const SessionOptions& session_options,
       ->set_disable_output_partition_graphs(true);
   // TODO(mrry): Consider specializing the session creation to reduce peak
   // RAM consumption by using `Session::Create(GraphDef&&)`.
-  TF_RETURN_IF_ERROR(LoadSavedModel(rewritten_options, run_options, export_dir,
-                                    tags, &legacy_bundle));
-  *bundle = SavedModelBundleLite(
-      std::make_unique<LiteSessionWrapper>(std::move(legacy_bundle.session)),
-      std::move(*legacy_bundle.meta_graph_def.mutable_signature_def()));
+  TF_RETURN_IF_ERROR(LoadSavedModelGeneric(rewritten_options, run_options,
+                                           export_dir, tags, bundle));
   return absl::OkStatus();
 }
 
diff --git a/tensorflow/cc/tools/BUILD b/tensorflow/cc/tools/BUILD
index bb5daa99742944..10601308ac7d0f 100644
--- a/tensorflow/cc/tools/BUILD
+++ b/tensorflow/cc/tools/BUILD
@@ -2,11 +2,11 @@
 #Description:
 # TensorFlow cc tools.
 
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD
index 92d62b34be8bf9..dfedc5a4f8c6c0 100644
--- a/tensorflow/compiler/aot/tests/BUILD
+++ b/tensorflow/compiler/aot/tests/BUILD
@@ -1,8 +1,8 @@
 load("//tensorflow:strict.default.bzl", "py_strict_binary")
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.default.bzl", "filegroup", "genrule")
 load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
+load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 8ebeae499bd177..76f3c147903748 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -1,6 +1,6 @@
+load("@local_xla//xla/stream_executor:build_defs.bzl", "if_cuda_or_rocm")
 load("//tensorflow:tensorflow.bzl", "if_libtpu", "if_with_tpu_support", "tf_cc_test", "tf_copts", "tf_cuda_cc_test", "tf_cuda_only_cc_test")
 load("//tensorflow:tensorflow.default.bzl", "cc_header_only_library", "filegroup", "tf_custom_op_py_strict_library")
-load("@local_xla//xla/stream_executor:build_defs.bzl", "if_cuda_or_rocm")
 load("//tensorflow/core/platform:build_config.bzl", "tf_additional_all_protos", "tf_proto_library")
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
@@ -520,10 +520,7 @@ cc_library(
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-        "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
-        "@local_xla//xla:status_macros",
         "@local_xla//xla/pjrt:pjrt_client",
     ],
 )
@@ -585,6 +582,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/common_runtime:dma_helper",
+        "//tensorflow/core/common_runtime/gpu:gpu_serving_device_selector",
         "//tensorflow/core/tfrt/common:async_value_tensor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/cleanup",
@@ -592,6 +590,7 @@ cc_library(
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/framework:device_id_utils",
+        "@local_tsl//tsl/framework:serving_device_selector_policies",
         "@local_xla//xla:shape_util",
         "@local_xla//xla:status_macros",
         "@local_xla//xla/client:local_client",
diff --git a/tensorflow/compiler/jit/ops/BUILD b/tensorflow/compiler/jit/ops/BUILD
index 66d9960ae0a62f..6372b2e5516cd3 100644
--- a/tensorflow/compiler/jit/ops/BUILD
+++ b/tensorflow/compiler/jit/ops/BUILD
@@ -1,6 +1,6 @@
 load("//tensorflow:strict.default.bzl", "py_strict_library")
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/jit/tests/BUILD b/tensorflow/compiler/jit/tests/BUILD
index e9880013bf2611..0c3c4986e44a88 100644
--- a/tensorflow/compiler/jit/tests/BUILD
+++ b/tensorflow/compiler/jit/tests/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 licenses(["notice"])
 
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 3f0a4847c54540..f9657509623cc1 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -43,6 +43,7 @@ limitations under the License.
 #include "xla/status_macros.h"
 #include "xla/stream_executor/platform_manager.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_serving_device_selector.h"
 #include "tensorflow/core/common_runtime/gpu_device_context.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/op.h"
@@ -58,6 +59,7 @@ limitations under the License.
 #include "tensorflow/core/tfrt/common/async_value_tensor.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 #include "tsl/framework/device_id_utils.h"
+#include "tsl/framework/serving_device_selector_policies.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
 
@@ -863,12 +865,35 @@ Status RunPjRtExecutable(
   TF_ASSIGN_OR_RETURN(xla::PjRtDevice * device,
                       pjrt_client->LookupAddressableDevice(pjrt_device_id));
 
+  gpu::GpuServingDeviceSelectorResource* device_selector_resource = nullptr;
+  if (device_type == DEVICE_GPU) {
+    auto rm = ctx->resource_manager();
+    TF_RETURN_IF_ERROR(rm->LookupOrCreate<
+                       gpu::GpuServingDeviceSelectorResource>(
+        rm->default_container(), gpu::kGpuServingDeviceSelectorResourceName,
+        &device_selector_resource,
+        [&](gpu::GpuServingDeviceSelectorResource** device_selector_resource) {
+          *device_selector_resource = new gpu::GpuServingDeviceSelectorResource(
+              pjrt_client->addressable_device_count(),
+              std::make_unique<tsl::RoundRobinPolicy>());
+          return absl::OkStatus();
+        }));
+    core::ScopedUnref device_selector_resource_ref(device_selector_resource);
+
+    TF_ASSIGN_OR_RETURN(absl::string_view fingerprint,
+                        executable->FingerprintExecutable());
+    device_selector_resource->selector()->Enqueue(pjrt_device_id, fingerprint);
+  }
   TF_ASSIGN_OR_RETURN(
       std::vector<std::unique_ptr<xla::PjRtBuffer>> execute_outputs,
       RunPjRtExecutable(num_missing_prefix_ctx_inputs, inputs,
                         variable_snapshots, updated_variables, device_type,
                         use_pjrt_tensor_buffer, compilation_result, device,
                         pjrt_client, executable));
+  if (device_selector_resource != nullptr) {
+    device_selector_resource->selector()->Completed(pjrt_device_id,
+                                                    /*had_error=*/false);
+  }
 
   TF_RETURN_IF_ERROR(PopulateCtxOutputsFromPjRtExecutableOutputs(
       num_missing_prefix_ctx_inputs, inputs, updated_variables,
diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD
index b30f08a1bfe1b4..d0286e5acff9ce 100644
--- a/tensorflow/compiler/mlir/BUILD
+++ b/tensorflow/compiler/mlir/BUILD
@@ -59,6 +59,7 @@ cc_library(
         "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_passes",  # buildcleaner:keep
         "//tensorflow/compiler/mlir/tensorflow/transforms/host_runtime:lower_cluster_to_runtime_ops",
         "//tensorflow/compiler/mlir/tensorflow/transforms/host_runtime:runtime_passes",
+        "//tensorflow/compiler/mlir/tensorflow/transforms/sparsecore:sparsecore_passes",
         "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util",
         "//tensorflow/compiler/mlir/tf2xla/internal/passes:clustering_passes",
         "//tensorflow/compiler/mlir/tf2xla/internal/passes:mlir_to_graph_passes",
@@ -69,6 +70,7 @@ cc_library(
         "//tensorflow/compiler/mlir/tosa:tfl_passes",
         "@llvm-project//mlir:AllPassesAndDialects",
         "@llvm-project//mlir:MlirOptLib",
+        "@llvm-project//mlir:Support",
         "@llvm-project//mlir:Transforms",
         "@local_xla//xla/mlir/framework/ir:xla_framework",
         "@local_xla//xla/mlir/framework/transforms:passes",
diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD
index 95b9e92fa4c97f..c3826f1bfb935c 100644
--- a/tensorflow/compiler/mlir/lite/BUILD
+++ b/tensorflow/compiler/mlir/lite/BUILD
@@ -349,7 +349,6 @@ cc_library(
         "transforms/passes.h",
         "utils/attribute_utils.h",
         "utils/utils.h",
-        "@llvm-project//mlir:include/mlir/Transforms/InliningUtils.h",
     ],
     deps = [
         ":converter_inc",
@@ -382,8 +381,10 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:InferTypeOpInterface",
+        "@llvm-project//mlir:InliningUtils",
         "@llvm-project//mlir:LoopLikeInterface",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TransformUtils",
     ],
@@ -703,6 +704,7 @@ cc_library(
         "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps",
         "//tensorflow/compiler/mlir/lite/stablehlo:tf_legalize_hlo",
         "//tensorflow/compiler/mlir/lite/stablehlo:tfl_legalize_hlo",
+        "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps",
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib",
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config",
         "//tensorflow/compiler/mlir/tensorflow",
@@ -738,11 +740,14 @@ cc_library(
         "@llvm-project//mlir:LoopLikeInterface",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:QuantOps",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@local_xla//xla:status",
         "@local_xla//xla:statusor",
         "@local_xla//xla/mlir_hlo",
+        "@stablehlo//:stablehlo_ops",
     ],
 )
 
@@ -854,12 +859,16 @@ cc_library(
     ],
     deps = [
         "convert_type",
+        ":op_quant_spec_getters_inc",
         ":tensorflow_lite",
         ":tensorflow_lite_passes_inc_gen",
+        ":tensorflow_lite_post_quantize_inc_gen",
+        ":tensorflow_lite_quantize_inc_gen",
         ":validators",
         "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps",
         "//tensorflow/compiler/mlir/lite/quantization/lite:tfl_to_std",
         "//tensorflow/compiler/mlir/quantization/common:uniform_quantized_types",
+        "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps",
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib",
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config",
         "//tensorflow/compiler/mlir/tensorflow:tensorflow_types",
@@ -878,6 +887,7 @@ cc_library(
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TransformUtils",
         "@local_xla//xla/mlir_hlo",
+        "@stablehlo//:stablehlo_ops",
     ],
 )
 
@@ -1146,6 +1156,7 @@ cc_library(
         ":size_utils",
         ":tensorflow_lite",
         "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps",
+        "//tensorflow/compiler/mlir/lite/stablehlo:legalize_stablehlo_composite_to_tfl_custom",
         "//tensorflow/compiler/mlir/lite/stablehlo:legalize_stablehlo_to_vhlo_pass",
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib",
         "//tensorflow/compiler/mlir/tensorflow",
@@ -1373,6 +1384,7 @@ cc_library(
         "//tensorflow/compiler/mlir/lite/quantization:quantization_passes",
         "//tensorflow/compiler/mlir/lite/quantization/tensorflow:tf_quantization_passes",
         "//tensorflow/compiler/mlir/lite/stablehlo:compose_uniform_quantized_type_pass",
+        "//tensorflow/compiler/mlir/lite/stablehlo:composite_lowering",
         "//tensorflow/compiler/mlir/lite/stablehlo:legalize_tf_xla_call_module_to_stablehlo_pass",
         "//tensorflow/compiler/mlir/lite/stablehlo:rename_entrypoint_to_main",
         "//tensorflow/compiler/mlir/lite/stablehlo:tf_legalize_hlo",  # buildcleaner: keep
@@ -1411,9 +1423,9 @@ cc_library(
         "//tensorflow/compiler/mlir/lite/metrics:error_collector",
         "//tensorflow/compiler/mlir/lite/metrics:error_collector_inst",
         "//tensorflow/compiler/mlir/lite/quantization/stablehlo:quantization",
+        "//tensorflow/compiler/mlir/lite/stablehlo:legalize_stablehlo_composite_to_tfl_custom",
         "//tensorflow/compiler/mlir/lite/stablehlo:legalize_stablehlo_to_vhlo_pass",
         "//tensorflow/compiler/mlir/lite/stablehlo:op_stat_pass",
-        "//tensorflow/compiler/mlir/lite/stablehlo:stablehlo_tfl",
         "//tensorflow/compiler/mlir/lite/stablehlo:stablehlo_util",
         "//tensorflow/compiler/mlir/lite/stablehlo:transforms",
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config",
diff --git a/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h b/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h
index dd4f59ebe3a889..69ec0bbbcee3dc 100644
--- a/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h
+++ b/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h
@@ -99,6 +99,9 @@ struct PassConfig {
   // When set to true, StableHLO Quantizer is run. The full configuration for
   // the quantizer is at `TocoFlags::quantization_config`.
   bool enable_stablehlo_quantizer = false;
+
+  // Enables the attempt to directly lower composites into tflite ops.
+  bool enable_composite_direct_lowering = false;
 };
 
 inline llvm::raw_ostream& operator<<(llvm::raw_ostream& os,
diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/BUILD b/tensorflow/compiler/mlir/lite/experimental/tac/BUILD
index 21bf8f739aea78..248a55c7fe17e1 100644
--- a/tensorflow/compiler/mlir/lite/experimental/tac/BUILD
+++ b/tensorflow/compiler/mlir/lite/experimental/tac/BUILD
@@ -1,9 +1,9 @@
-load("//tensorflow:strict.default.bzl", "py_strict_library")
 load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")
 load(
     "@llvm-project//mlir:tblgen.bzl",
     "gentbl_cc_library",
 )
+load("//tensorflow:strict.default.bzl", "py_strict_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
@@ -82,9 +82,12 @@ cc_library(
         "//tensorflow/compiler/mlir/lite:tensorflow_lite",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:ArithDialect",
+        "@llvm-project//mlir:BytecodeOpInterface",
+        "@llvm-project//mlir:CallOpInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:QuantOps",
+        "@llvm-project//mlir:SideEffectInterfaces",
     ],
 )
 
@@ -197,6 +200,7 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:QuantOps",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
 )
@@ -215,6 +219,7 @@ cc_library(
         "//tensorflow/compiler/mlir/lite/experimental/tac/hardwares:target_hardware",
         "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:CallOpInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
@@ -259,10 +264,13 @@ cc_library(
         "@com_google_protobuf//:protobuf_headers",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:ArithDialect",
+        "@llvm-project//mlir:CallOpInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
     alwayslink = 1,
diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/examples/BUILD b/tensorflow/compiler/mlir/lite/experimental/tac/examples/BUILD
index 57fb5ea9eef10d..c5707a5f888885 100644
--- a/tensorflow/compiler/mlir/lite/experimental/tac/examples/BUILD
+++ b/tensorflow/compiler/mlir/lite/experimental/tac/examples/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/BUILD b/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/BUILD
index 7ee0b43b84d98c..5b3f2836feec99 100644
--- a/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/BUILD
+++ b/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow:tensorflow.default.bzl", "pybind_extension")
 load("//tensorflow:tensorflow.bzl", "VERSION")
+load("//tensorflow:tensorflow.default.bzl", "pybind_extension")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc
index 1a9ff8016649ef..b98d3220ee15a8 100644
--- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc
+++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc
@@ -181,6 +181,8 @@ static StatusOr<tflite::TensorType> GetTFLiteType(Type type,
     return tflite::TensorType_FLOAT32;
   } else if (type.isF16()) {
     return tflite::TensorType_FLOAT16;
+  } else if (type.isBF16()) {
+    return tflite::TensorType_BFLOAT16;
   } else if (type.isF64()) {
     return tflite::TensorType_FLOAT64;
   } else if (type.isa<mlir::TF::StringType>()) {
@@ -577,9 +579,6 @@ class Translator {
         module.getContext()->getOrLoadDialect<mlir::TF::TensorFlowDialect>();
     tfl_dialect_ = module.getContext()
                        ->getOrLoadDialect<mlir::TFL::TensorFlowLiteDialect>();
-    stablehlo_dialect_ =
-        module.getContext()
-            ->getOrLoadDialect<mlir::stablehlo::StablehloDialect>();
     vhlo_dialect_ =
         module.getContext()->getOrLoadDialect<mlir::vhlo::VhloDialect>();
     // Right now the TF executor dialect is still needed to build NodeDef.
@@ -834,7 +833,6 @@ class Translator {
   // dialect is not registered.
   const Dialect* tf_dialect_;
   const Dialect* tfl_dialect_;
-  const Dialect* stablehlo_dialect_;
   const Dialect* vhlo_dialect_;
 
   // The failed ops during legalization.
@@ -1996,35 +1994,6 @@ std::optional<BufferOffset<tflite::Operator>> Translator::BuildOperator(
     return offset;
   }
 
-  // EXPERIMENTAL: If the source is in stablehlo dialect, also create them as
-  // builtin ops
-  if (dialect == stablehlo_dialect_) {
-    // for stablehlo ops with kernels, we directly serialize them whenever
-    // possible
-    if (auto shlo_op = llvm::dyn_cast<mlir::stablehlo::ScatterOp>(inst)) {
-      return BuildStablehloScatterOp(shlo_op, operands, results);
-    }
-    if (auto shlo_op =
-            llvm::dyn_cast<mlir::stablehlo::RngBitGeneratorOp>(inst)) {
-      return BuildStablehloRngBitGeneratorOp(shlo_op, operands, results);
-    }
-    if (auto shlo_op = llvm::dyn_cast<mlir::stablehlo::GatherOp>(inst)) {
-      return BuildStablehloGatherOp(shlo_op, operands, results);
-    }
-    if (auto shlo_op = llvm::dyn_cast<mlir::stablehlo::ReduceWindowOp>(inst)) {
-      return BuildStablehloReduceWindowOp(shlo_op, operands, results);
-    }
-    if (auto shlo_op = llvm::dyn_cast<mlir::stablehlo::PadOp>(inst)) {
-      return BuildStablehloPadOp(shlo_op, operands, results);
-    }
-    if (auto shlo_op = llvm::dyn_cast<mlir::stablehlo::AddOp>(inst)) {
-      return BuildStablehloOperatorwithoutOptions(
-          shlo_op, operands, results, tflite::BuiltinOperator_STABLEHLO_ADD);
-    }
-    return inst->emitOpError("is not part of the stablehlo support yet."),
-           std::nullopt;
-  }
-
   if (dialect == vhlo_dialect_) {
     mlir::VhloToStablehloTypeConverter vhlo_type_converter;
     if (auto vhlo_op = llvm::dyn_cast<mlir::vhlo::ScatterOpV1>(inst)) {
diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
index 55388c86dfc7bf..481f5573058b8c 100644
--- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
+++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
@@ -3926,10 +3926,10 @@ def TFL_CastOp : TFL_Op<"cast", [
   }];
 
   let arguments = (ins
-    TFL_TensorOf<[F16, F32, F64, I1, TFL_I4, I16, UI16, I32, UI32, I64, TFL_Quint8, UI8, I8, Complex<F<32>>]>:$input
+    TFL_TensorOf<[F16, BF16, F32, F64, I1, TFL_I4, I16, UI16, I32, UI32, I64, TFL_Quint8, UI8, I8, Complex<F<32>>]>:$input
   );
 
-  let results = (outs TFL_TensorOf<[F16, F32, F64, I1, I16, UI16, I32, UI32, I64, TFL_Quint8, UI8, I8, Complex<F<32>>]>:$output);
+  let results = (outs TFL_TensorOf<[F16, BF16, F32, F64, I1, I16, UI16, I32, UI32, I64, TFL_Quint8, UI8, I8, Complex<F<32>>]>:$output);
 
   // TFLite's cast op does not utilize CastOptions, instead derives types
   // from the TfLiteTensors.
diff --git a/tensorflow/compiler/mlir/lite/metrics/BUILD b/tensorflow/compiler/mlir/lite/metrics/BUILD
index dfdb63ce59ef5c..6218a2fb30a829 100644
--- a/tensorflow/compiler/mlir/lite/metrics/BUILD
+++ b/tensorflow/compiler/mlir/lite/metrics/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -33,6 +33,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
     ],
 )
 
@@ -43,11 +44,15 @@ tf_cc_test(
         "testdata/strided_slice.mlir",
     ],
     deps = [
+        ":error_collector",
         ":error_collector_inst",
         ":types_util",
         "//tensorflow/compiler/mlir/tensorflow",
         "//tensorflow/core:test",
+        "//tensorflow/core/platform:errors",
         "//tensorflow/core/platform:resource_loader",
+        "//tensorflow/lite/python/metrics:converter_error_data_proto_cc",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_googletest//:gtest_main",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:FuncDialect",
diff --git a/tensorflow/compiler/mlir/lite/metrics/error_collector_inst.cc b/tensorflow/compiler/mlir/lite/metrics/error_collector_inst.cc
index 9a6c173f8c4f9d..6e31d8cb21f29a 100644
--- a/tensorflow/compiler/mlir/lite/metrics/error_collector_inst.cc
+++ b/tensorflow/compiler/mlir/lite/metrics/error_collector_inst.cc
@@ -21,7 +21,11 @@ limitations under the License.
 #include "absl/strings/match.h"
 #include "absl/strings/str_split.h"
 #include "mlir/IR/Diagnostics.h"  // from @llvm-project
+#include "mlir/IR/Location.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/lite/metrics/error_collector.h"
+#include "tensorflow/compiler/mlir/lite/metrics/types_util.h"
 
 namespace mlir {
 namespace TFL {
diff --git a/tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h b/tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h
index 322ec2e852d8cc..b5d66c622ab389 100644
--- a/tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h
+++ b/tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h
@@ -20,7 +20,9 @@ limitations under the License.
 #include <unordered_map>
 #include <utility>
 
+#include "mlir/IR/Diagnostics.h"  // from @llvm-project
 #include "mlir/IR/Location.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/IR/Operation.h"  // from @llvm-project
 #include "mlir/Pass/PassInstrumentation.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/metrics/error_collector.h"
diff --git a/tensorflow/compiler/mlir/lite/metrics/error_collector_inst_test.cc b/tensorflow/compiler/mlir/lite/metrics/error_collector_inst_test.cc
index ee433b0ded933c..f7d20783b6ea81 100644
--- a/tensorflow/compiler/mlir/lite/metrics/error_collector_inst_test.cc
+++ b/tensorflow/compiler/mlir/lite/metrics/error_collector_inst_test.cc
@@ -23,19 +23,29 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "llvm/Support/MemoryBuffer.h"
+#include "absl/status/statusor.h"
+#include "llvm/Support/SMLoc.h"
 #include "llvm/Support/SourceMgr.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/DialectRegistry.h"  // from @llvm-project
+#include "mlir/IR/Location.h"  // from @llvm-project
 #include "mlir/IR/Operation.h"  // from @llvm-project
+#include "mlir/IR/OwningOpRef.h"  // from @llvm-project
 #include "mlir/Parser/Parser.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
 #include "mlir/Support/FileUtilities.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "mlir/Support/TypeID.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/lite/metrics/error_collector.h"
 #include "tensorflow/compiler/mlir/lite/metrics/types_util.h"
-#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h"
+#include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/resource_loader.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/lite/python/metrics/converter_error_data.pb.h"
 #include "tsl/platform/statusor.h"
 
 namespace mlir {
diff --git a/tensorflow/compiler/mlir/lite/metrics/types_util.cc b/tensorflow/compiler/mlir/lite/metrics/types_util.cc
index 96a167b3254ba6..b47347ceb03827 100644
--- a/tensorflow/compiler/mlir/lite/metrics/types_util.cc
+++ b/tensorflow/compiler/mlir/lite/metrics/types_util.cc
@@ -16,8 +16,11 @@ limitations under the License.
 
 #include <string>
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/Location.h"  // from @llvm-project
+#include "tensorflow/lite/python/metrics/converter_error_data.pb.h"
 
 namespace mlir {
 namespace TFL {
diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc
index 8cf08dea534d5c..3e50192fa0640d 100644
--- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc
+++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc
@@ -204,6 +204,8 @@ Status ConvertSavedModelToTFLiteFlatBuffer(
   pass_config.legalize_custom_tensor_list_ops =
       toco_flags.legalize_custom_tensor_list_ops();
   pass_config.enable_stablehlo_quantizer = toco_flags.has_quantization_config();
+  pass_config.enable_composite_direct_lowering =
+      toco_flags.enable_composite_direct_lowering();
 
   if (toco_flags.qdq_conversion_mode() == "STATIC") {
     pass_config.quant_specs.qdq_conversion_mode =
diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/BUILD b/tensorflow/compiler/mlir/lite/quantization/ir/BUILD
index 727fb03d833964..a6d6c61444548e 100644
--- a/tensorflow/compiler/mlir/lite/quantization/ir/BUILD
+++ b/tensorflow/compiler/mlir/lite/quantization/ir/BUILD
@@ -1,6 +1,6 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -77,24 +77,22 @@ cc_library(
     srcs = [
         "ConvertConst.cc",
         "ConvertSimQuant.cc",
-        "FakeQuantSupport.cc",
         "QuantOps.cc",
         "QuantizeUtils.cc",
-        "UniformSupport.cc",
     ],
     hdrs = [
-        "FakeQuantSupport.h",
         "Passes.h",
         "QuantOps.h",
         "QuantizeUtils.h",
-        "UniformSupport.h",
     ],
     compatible_with = get_compatible_with_portable(),
     deps = [
         ":QuantOpsIncGen",
         ":QuantPassIncGen",
+        "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:ArithDialect",
+        "@llvm-project//mlir:BytecodeOpInterface",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:InferTypeOpInterface",
diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/ConvertConst.cc b/tensorflow/compiler/mlir/lite/quantization/ir/ConvertConst.cc
index ae9b67e9e60af6..3de159a1414429 100644
--- a/tensorflow/compiler/mlir/lite/quantization/ir/ConvertConst.cc
+++ b/tensorflow/compiler/mlir/lite/quantization/ir/ConvertConst.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/lite/quantization/ir/Passes.h"
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.h"
-#include "tensorflow/compiler/mlir/lite/quantization/ir/UniformSupport.h"
 
 using namespace mlir;
 using namespace mlir::quantfork;
diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/ConvertSimQuant.cc b/tensorflow/compiler/mlir/lite/quantization/ir/ConvertSimQuant.cc
index f64c400d4fb155..e99addc5b5f8a5 100644
--- a/tensorflow/compiler/mlir/lite/quantization/ir/ConvertSimQuant.cc
+++ b/tensorflow/compiler/mlir/lite/quantization/ir/ConvertSimQuant.cc
@@ -16,10 +16,10 @@ limitations under the License.
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
-#include "tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.h"
 #include "tensorflow/compiler/mlir/lite/quantization/ir/Passes.h"
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
-#include "tensorflow/compiler/mlir/lite/quantization/ir/UniformSupport.h"
+#include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h"
+#include "tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h"
 
 using namespace mlir;
 using namespace mlir::quantfork;
diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.cc b/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.cc
index 67c1c7d9284f2b..919c711272b2c1 100644
--- a/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.cc
+++ b/tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
 #include "mlir/Support/LLVM.h"  // from @llvm-project
-#include "tensorflow/compiler/mlir/lite/quantization/ir/UniformSupport.h"
+#include "tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h"
 
 using namespace mlir;
 using namespace mlir::quantfork;
diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD
index ad7c1905440297..66df4f528aa43d 100644
--- a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD
+++ b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD
@@ -88,9 +88,11 @@ cc_library(
     deps = [
         "//tensorflow/compiler/mlir/lite:tensorflow_lite",
         "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps",
+        "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps",
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib",
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Support",
diff --git a/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD
index df286611f3e356..f96d4961e733b4 100644
--- a/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD
+++ b/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD
@@ -18,12 +18,14 @@ cc_library(
         "//tensorflow/compiler/mlir/lite/stablehlo:tf_stablehlo",
         "//tensorflow/compiler/mlir/quantization/stablehlo:passes",
         "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc",
+        "//tensorflow/compiler/mlir/quantization/stablehlo/cc:config",
         "//tensorflow/compiler/mlir/quantization/stablehlo/cc:static_range_ptq",
         "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib",
         "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes",
         "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_freeze_variables",
         "//tensorflow/core/protobuf:for_core_protos_cc",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:string_view",
diff --git a/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc
index ccba41d07e103b..08f5ecd4851b7e 100644
--- a/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc
+++ b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
@@ -29,6 +30,7 @@ limitations under the License.
 #include "tensorflow/cc/saved_model/constants.h"
 #include "tensorflow/cc/saved_model/loader.h"
 #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
@@ -41,6 +43,7 @@ namespace tensorflow {
 namespace {
 
 using ::mlir::quant::stablehlo::StaticRangePtqComponent;
+using ::stablehlo::quantization::PopulateDefaults;
 using ::stablehlo::quantization::QuantizationConfig;
 using ::tensorflow::SignatureDef;
 using ::tensorflow::quantization::PyFunctionLibrary;
@@ -79,7 +82,7 @@ absl::StatusOr<mlir::ModuleOp> RunQuantization(
     const SavedModelBundle* saved_model_bundle,
     const absl::string_view saved_model_dir,
     const std::unordered_set<std::string>& saved_model_tags,
-    QuantizationConfig& quantization_config,
+    const QuantizationConfig& quantization_config,
     const PyFunctionLibrary* quantization_py_function_lib,
     mlir::ModuleOp module_op) {
   if (saved_model_bundle == nullptr) {
@@ -94,10 +97,11 @@ absl::StatusOr<mlir::ModuleOp> RunQuantization(
         "be nullptr.");
   }
 
-  if (!quantization_config.has_calibration_options()) {
-    *quantization_config.mutable_calibration_options() =
-        mlir::quant::stablehlo::GetDefaultCalibrationOptions();
-  }
+  LOG(INFO) << "User-provided quantization config: "
+            << quantization_config.DebugString();
+  const QuantizationConfig updated_config =
+      ExpandPresets(PopulateDefaults(quantization_config));
+  LOG(INFO) << "Updated quantization config: " << updated_config.DebugString();
 
   const absl::flat_hash_map<std::string, SignatureDef> signature_def_map =
       GetSignatureDefMapFromBundle(*saved_model_bundle);
@@ -131,8 +135,9 @@ absl::StatusOr<mlir::ModuleOp> RunQuantization(
       module_op.getContext(), quantization_py_function_lib, saved_model_dir,
       /*signature_keys=*/exported_names, saved_model_tags, signature_def_map,
       GetFunctionAliases(*saved_model_bundle));
-  const absl::StatusOr<mlir::ModuleOp> quantized_module_op =
-      static_range_ptq_component.Run(module_op, quantization_config);
+
+  absl::StatusOr<mlir::ModuleOp> quantized_module_op =
+      static_range_ptq_component.Run(module_op, updated_config);
   if (!quantized_module_op.ok()) {
     return absl::InternalError("Failed to run quantization. Status msg: " +
                                quantized_module_op.status().ToString());
diff --git a/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h
index ef6496315e8e61..c55d59cad0f1a0 100644
--- a/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h
+++ b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h
@@ -50,7 +50,7 @@ absl::StatusOr<mlir::ModuleOp> RunQuantization(
     const SavedModelBundle* saved_model_bundle,
     absl::string_view saved_model_dir,
     const std::unordered_set<std::string>& saved_model_tags,
-    stablehlo::quantization::QuantizationConfig& quantization_config,
+    const stablehlo::quantization::QuantizationConfig& quantization_config,
     const tensorflow::quantization::PyFunctionLibrary*
         quantization_py_function_lib,
     mlir::ModuleOp module_op);
diff --git a/tensorflow/compiler/mlir/lite/sparsity/BUILD b/tensorflow/compiler/mlir/lite/sparsity/BUILD
index 4f2e681a986f65..fce754995766d5 100644
--- a/tensorflow/compiler/mlir/lite/sparsity/BUILD
+++ b/tensorflow/compiler/mlir/lite/sparsity/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/BUILD
index 92b6d2c9abb7b3..bd83f16de105f8 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/BUILD
+++ b/tensorflow/compiler/mlir/lite/stablehlo/BUILD
@@ -33,30 +33,6 @@ cc_library(
     alwayslink = 1,
 )
 
-cc_library(
-    name = "stablehlo_tfl",
-    srcs = [
-        "transforms/stablehlo_tfl_pass.cc",
-    ],
-    hdrs = [
-        "transforms/stablehlo_tfl_pass.h",
-    ],
-    copts = [
-        "-Ithird_party",
-    ],
-    deps = [
-        "//tensorflow/compiler/mlir/lite:tensorflow_lite",
-        "@flatbuffers",
-        "@llvm-project//mlir:FuncDialect",
-        "@llvm-project//mlir:IR",
-        "@llvm-project//mlir:Pass",
-        "@llvm-project//mlir:Support",
-        "@llvm-project//mlir:TransformUtils",
-        "@stablehlo//:stablehlo_ops",
-    ],
-    alwayslink = 1,
-)
-
 cc_library(
     name = "stablehlo_util",
     srcs = [
@@ -110,6 +86,7 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@local_xla//xla/mlir_hlo",
     ],
@@ -133,6 +110,7 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@stablehlo//:stablehlo_ops",
     ],
@@ -162,10 +140,12 @@ cc_library(
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:ShapeDialect",
         "@llvm-project//mlir:TensorDialect",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@local_xla//xla/mlir_hlo",
         "@local_xla//xla/mlir_hlo:hlo_dialect_registration",
         "@local_xla//xla/mlir_hlo:mhlo_passes",
+        "@local_xla//xla/mlir_hlo:type_conversion",
         "@stablehlo//:chlo_ops",
         "@stablehlo//:register",
     ],
@@ -213,6 +193,7 @@ cc_library(
         ":drop_savedmodel_semantics",
         ":fold_broadcast_pass",
         ":fuse_convolution_pass",
+        ":legalize_stablehlo_custom_call_to_composite",
         ":legalize_tf_xla_call_module_to_stablehlo_pass",
         ":optimize",
         ":rename_entrypoint_to_main",
@@ -337,6 +318,7 @@ cc_library(
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:ShapeDialect",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@local_xla//xla/mlir_hlo",
     ],
@@ -361,6 +343,7 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@local_xla//xla/mlir_hlo",
     ],
@@ -389,6 +372,7 @@ cc_library(
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:ShapeDialect",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@local_xla//xla/mlir_hlo",
     ],
@@ -441,6 +425,7 @@ cc_library(
         "@llvm-project//mlir:QuantOps",
         "@llvm-project//mlir:ShapeDialect",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@stablehlo//:stablehlo_ops",
         "@stablehlo//:stablehlo_serialization",
@@ -449,6 +434,32 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "legalize_stablehlo_composite_to_tfl_custom",
+    srcs = [
+        "transforms/legalize_stablehlo_composite_to_tfl_custom.cc",
+    ],
+    hdrs = [
+        "transforms/passes.h",
+        "transforms/passes.h.inc",
+    ],
+    copts = [
+        "-Ithird_party",
+    ],
+    deps = [
+        "//tensorflow/compiler/mlir/lite:tensorflow_lite",
+        "@flatbuffers",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:FuncDialect",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
+        "@stablehlo//:stablehlo_ops",
+    ],
+    alwayslink = 1,
+)
+
 cc_library(
     name = "legalize_stablehlo_to_vhlo_pass",
     srcs = [
@@ -492,6 +503,31 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "legalize_stablehlo_custom_call_to_composite",
+    srcs = [
+        "transforms/legalize_stablehlo_custom_call_to_composite.cc",
+    ],
+    hdrs = [
+        "transforms/passes.h",
+        "transforms/passes.h.inc",
+    ],
+    copts = [
+        "-Ithird_party",
+    ],
+    deps = [
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:FuncDialect",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
+        "@llvm-project//mlir:Transforms",
+        "@stablehlo//:stablehlo_ops",
+    ],
+    alwayslink = 1,
+)
+
 cc_library(
     name = "optimize",
     srcs = [
@@ -509,6 +545,7 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@local_xla//xla/mlir_hlo",
     ],
@@ -648,6 +685,7 @@ cc_library(
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:QuantOps",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@local_xla//xla/mlir_hlo",
         "@stablehlo//:broadcast_utils",
@@ -655,6 +693,50 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "composite_lowering",
+    srcs = [
+        "transforms/composite_lowering_pass.cc",
+    ],
+    hdrs = [
+        "transforms/passes.h",
+    ],
+    copts = [
+        "-Ithird_party",
+    ],
+    deps = [
+        ":composite_lowering_inc_gen",
+        ":passes_inc_gen",
+        "//tensorflow/compiler/mlir/lite:tensorflow_lite",
+        "@llvm-project//mlir:FuncDialect",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
+        "@local_xla//xla/mlir_hlo",
+    ],
+    alwayslink = True,
+)
+
+gentbl_cc_library(
+    name = "composite_lowering_inc_gen",
+    compatible_with = get_compatible_with_portable(),
+    tbl_outs = [
+        (
+            ["-gen-rewriters"],
+            "transforms/generated_composite_lowering.inc",
+        ),
+    ],
+    tblgen = "@llvm-project//mlir:mlir-tblgen",
+    td_file = "transforms/composite_lowering_patterns.td",
+    deps = [
+        "//tensorflow/compiler/mlir/lite:tensorflow_lite_ops_td_files",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:FuncTdFiles",
+        "@local_xla//xla/mlir_hlo:hlo_ops_td_files",
+    ],
+)
+
 tf_cc_binary(
     name = "odml_to_stablehlo",
     srcs = [
@@ -667,7 +749,6 @@ tf_cc_binary(
     deps = [
         ":check_accepted_ops_pass",
         ":op_stat_pass",
-        ":stablehlo_tfl",
         ":stablehlo_util",
         ":transforms",
         "//tensorflow/cc/saved_model:loader",
@@ -675,7 +756,6 @@ tf_cc_binary(
         "//tensorflow/compiler/mlir:passes",
         "//tensorflow/compiler/mlir/lite:flatbuffer_export",
         "//tensorflow/compiler/mlir/lite:tf_to_tfl_flatbuffer",
-        "//tensorflow/compiler/mlir/lite/stablehlo/serializer:flatbuffer_export",
         "//tensorflow/compiler/mlir/quantization/tensorflow:quantize_preprocess",
         "//tensorflow/compiler/mlir/quantization/tensorflow:tf_quant_ops",
         "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibrator_singleton_impl",
@@ -709,11 +789,12 @@ tf_cc_binary(
         ":compose_uniform_quantized_type_pass",
         ":fold_broadcast_pass",
         ":fuse_convolution_pass",
+        ":legalize_stablehlo_composite_to_tfl_custom",
+        ":legalize_stablehlo_custom_call_to_composite",
         ":legalize_stablehlo_to_vhlo_pass",
         ":legalize_tf_xla_call_module_to_stablehlo_pass",
         ":optimize",
         ":passes_inc_gen",
-        ":stablehlo_tfl",
         ":tf_legalize_hlo",
         ":tf_stablehlo",
         ":tfl_legalize_hlo",
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc
index dfcb9de5cc717a..f1d6b237ac2ef6 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc
@@ -50,10 +50,8 @@ limitations under the License.
 #include "tensorflow/cc/saved_model/loader.h"
 #include "tensorflow/compiler/mlir/init_mlir.h"
 #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h"
-#include "tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_export.h"
 #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/check_accepted_ops_pass.h"
 #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.h"
-#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_tfl_pass.h"
 #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_util.h"
 #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.h"
 #include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h"
@@ -191,17 +189,6 @@ tensorflow::StatusOr<OwningOpRef<mlir::ModuleOp>> ImportSavedModelOrMLIR(
                           saved_model_bundle);
 }
 
-tensorflow::Status ConvertStableHLOToFlatbuffer(mlir::ModuleOp module,
-                                                std::string* flatbuffer_str) {
-  mlir::odml::FlatbufferExportOptions options;
-  if (!mlir::odml::MlirToFlatBufferTranslateFunction(module, options,
-                                                     flatbuffer_str)) {
-    return tensorflow::errors::Aborted("Unable to export flatbuffer");
-  }
-
-  return ::tensorflow::OkStatus();
-}
-
 tensorflow::Status ExportModule(mlir::ModuleOp module,
                                 const std::string& output_filename,
                                 bool elide_large_elements_attrs) {
@@ -212,20 +199,6 @@ tensorflow::Status ExportModule(mlir::ModuleOp module,
     return tensorflow::errors::Aborted("Unable to write to output path.");
   }
 
-  // Export TFLite Flatbuffer as output
-  if (export_type == "tflite") {
-    std::string flatbuffer_str;
-    auto status =
-        mlir::odml::ConvertStableHLOToFlatbuffer(module, &flatbuffer_str);
-    if (!status.ok()) {
-      return status;
-    }
-
-    output->os() << flatbuffer_str;
-    output->keep();
-    return ::tensorflow::OkStatus();
-  }
-
   // Export StableHLO MLIR as output
   std::string result;
   llvm::raw_string_ostream os(result);
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/serializer/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/serializer/BUILD
deleted file mode 100644
index a93ec34c1bfa81..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/serializer/BUILD
+++ /dev/null
@@ -1,64 +0,0 @@
-load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
-
-package(
-    # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
-    default_visibility = [
-        "//visibility:public",
-    ],
-    licenses = ["notice"],
-)
-
-cc_library(
-    name = "flatbuffer_translator",
-    srcs = [
-        "flatbuffer_translator.cc",
-    ],
-    hdrs = [
-        "flatbuffer_operator.h",
-        "flatbuffer_translator.h",
-    ],
-    compatible_with = get_compatible_with_portable(),
-    deps = [
-        "//tensorflow/compiler/mlir:op_or_arg_name_mapper",
-        "//tensorflow/compiler/mlir/tensorflow",
-        "//tensorflow/compiler/mlir/tensorflow:convert_tensor",
-        "//tensorflow/compiler/mlir/tensorflow:dynamic_shape_utils",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/platform:logging",
-        "//tensorflow/lite/stablehlo/schema:schema_fbs",
-        "//tensorflow/lite/toco:toco_flags_proto_cc",
-        "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/container:flat_hash_set",
-        "@com_google_absl//absl/strings",
-        "@flatbuffers",
-        "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:ArithDialect",
-        "@llvm-project//mlir:FuncDialect",
-        "@llvm-project//mlir:IR",
-        "@llvm-project//mlir:Support",
-        "@llvm-project//mlir:TranslateLib",
-        "@local_xla//xla:statusor",
-        "@stablehlo//:stablehlo_ops",
-    ],
-)
-
-cc_library(
-    name = "flatbuffer_export",
-    srcs = [
-        "flatbuffer_export.cc",
-    ],
-    hdrs = ["flatbuffer_export.h"],
-    compatible_with = get_compatible_with_portable(),
-    deps = [
-        ":flatbuffer_translator",
-        "//tensorflow/compiler/mlir:op_or_arg_name_mapper",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/platform:logging",
-        "//tensorflow/lite/toco:toco_flags_proto_cc",
-        "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/container:flat_hash_set",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@llvm-project//mlir:IR",
-    ],
-)
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_export.cc
deleted file mode 100644
index a35f7821e68bbd..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_export.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_export.h"
-
-#include <stddef.h>
-#include <stdlib.h>
-
-#include <algorithm>
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "absl/container/flat_hash_map.h"
-#include "absl/container/flat_hash_set.h"
-#include "absl/strings/str_format.h"
-#include "absl/strings/string_view.h"
-#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
-#include "tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_translator.h"
-#include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace mlir {
-namespace odml {
-
-bool MlirToFlatBufferTranslateFunction(mlir::ModuleOp module,
-                                       const FlatbufferExportOptions& options,
-                                       std::string* serialized_flatbuffer) {
-  auto maybe_translated = Translator::Translate(
-      module, options.toco_flags, options.saved_model_tags,
-      options.op_or_arg_name_mapper, options.metadata);
-  if (!maybe_translated) return false;
-  *serialized_flatbuffer = std::move(*maybe_translated);
-  return true;
-}
-
-}  // namespace odml
-}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_export.h b/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_export.h
deleted file mode 100644
index ae980f6f6522ad..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_export.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_SERIALIZER_FLATBUFFER_EXPORT_H_
-#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_SERIALIZER_FLATBUFFER_EXPORT_H_
-
-#include <map>
-#include <string>
-#include <unordered_set>
-
-#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
-#include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h"
-#include "tensorflow/lite/toco/toco_flags.pb.h"
-
-namespace mlir {
-namespace odml {
-
-// Options for exporting to Flatbuffer.
-struct FlatbufferExportOptions {
-  // TocoFlags proto. The following fields are migrated.
-  // bool emit_builtin_tflite_ops  -> !toco_flags.force_select_tf_ops()
-  // bool emit_select_tf_ops       -> toco_flags.enable_select_tf_ops()
-  // bool emit_custom_ops          -> toco_flags.allow_custom_ops()
-  // bool allow_all_select_tf_ops  -> toco_flags.allow_all_select_tf_ops()
-  // std::set<> select_user_tf_ops -> toco_flags.select_user_tf_ops()
-  toco::TocoFlags toco_flags;
-  // When exporting from SavedModel, this will have the requested tags.
-  std::unordered_set<std::string> saved_model_tags;
-  // Metadata key/value pairs to write to the flatbuffer.
-  std::map<std::string, std::string> metadata;
-  // OpOrArgNameMapper to convert location of the op to name in flatbuffer.
-  // If not set, a default mapper will be used.
-  tensorflow::OpOrArgNameMapper* op_or_arg_name_mapper = nullptr;
-};
-
-// Translates the given MLIR `module` into a FlatBuffer and stores the
-// serialized flatbuffer into the string.
-// Returns true on successful exporting, false otherwise.
-bool MlirToFlatBufferTranslateFunction(mlir::ModuleOp module,
-                                       const FlatbufferExportOptions& options,
-                                       std::string* serialized_flatbuffer);
-
-}  // namespace odml
-}  // namespace mlir
-
-#endif  // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_SERIALIZER_FLATBUFFER_EXPORT_H_
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_operator.h b/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_operator.h
deleted file mode 100644
index 453f7f508c39d6..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_operator.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// prototype for stablehlo serialization, WIP
-// WARNING: converting to stablehlo file is experimental feature, and no runtime
-// support is provided
-
-#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_SERIALIZER_FLATBUFFER_OPERATOR_H_
-#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_SERIALIZER_FLATBUFFER_OPERATOR_H_
-
-#include <cstdint>
-#include <optional>
-#include <vector>
-
-#include "llvm/ADT/APInt.h"
-#include "mlir/IR/Attributes.h"  // from @llvm-project
-#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
-#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
-#include "mlir/IR/MLIRContext.h"  // from @llvm-project
-#include "mlir/IR/Operation.h"  // from @llvm-project
-#include "mlir/Support/LLVM.h"  // from @llvm-project
-
-namespace mlir {
-namespace odml {
-
-// TODO(zichuanwei@): support float16/bfloat16 & int4
-
-// Function calls with a non-specialized type will result to a linker error.
-template <typename T>
-inline std::vector<T> GetVector(DenseElementsAttr elements);
-
-// TODO(zichuanwei@): for each type, we need to make sure the element type
-// matches the expected type otherwise an error should be thrown, but for now
-// we're just returning empty vector
-template <>
-inline std::vector<bool> GetVector(DenseElementsAttr elements) {
-  auto type = elements.getType();
-  auto elemType = type.getElementType();
-  if (elemType.isSignlessInteger(1)) {
-    auto vec = llvm::to_vector(
-        llvm::map_range(elements.getValues<bool>(),
-                        [&](bool value) -> uint8_t { return value ? 1 : 0; }));
-    return std::vector<bool>(vec.begin(), vec.end());
-  }
-
-  return std::vector<bool>();
-}
-
-template <>
-inline std::vector<int8_t> GetVector(DenseElementsAttr elements) {
-  auto type = elements.getType();
-  auto elemType = type.getElementType();
-  if (elemType.isSignlessInteger(8)) {
-    auto vec = llvm::to_vector(llvm::map_range(
-        elements.getValues<APInt>(),
-        [&](APInt value) -> int8_t { return value.getSExtValue(); }));
-    return std::vector<int8_t>(vec.begin(), vec.end());
-  }
-
-  return std::vector<int8_t>();
-}
-
-template <>
-inline std::vector<int16_t> GetVector(DenseElementsAttr elements) {
-  auto type = elements.getType();
-  auto elemType = type.getElementType();
-  if (elemType.isSignlessInteger(16)) {
-    auto vec = llvm::to_vector(llvm::map_range(
-        elements.getValues<APInt>(),
-        [&](APInt value) -> int16_t { return value.getSExtValue(); }));
-    return std::vector<int16_t>(vec.begin(), vec.end());
-  }
-
-  return std::vector<int16_t>();
-}
-
-template <>
-inline std::vector<int32_t> GetVector(DenseElementsAttr elements) {
-  auto type = elements.getType();
-  auto elemType = type.getElementType();
-  if (elemType.isSignlessInteger(32)) {
-    auto vec = llvm::to_vector(llvm::map_range(
-        elements.getValues<APInt>(),
-        [&](APInt value) -> int32_t { return value.getSExtValue(); }));
-    return std::vector<int32_t>(vec.begin(), vec.end());
-  }
-
-  return std::vector<int32_t>();
-}
-
-template <>
-inline std::vector<int64_t> GetVector(DenseElementsAttr elements) {
-  auto type = elements.getType();
-  auto elemType = type.getElementType();
-  if (elemType.isSignlessInteger(64)) {
-    auto vec = llvm::to_vector(llvm::map_range(
-        elements.getValues<APInt>(),
-        [&](APInt value) -> int64_t { return value.getSExtValue(); }));
-    return std::vector<int64_t>(vec.begin(), vec.end());
-  }
-
-  return std::vector<int64_t>();
-}
-
-template <>
-inline std::vector<float> GetVector(DenseElementsAttr elements) {
-  auto type = elements.getType();
-  auto elemType = type.getElementType();
-  if (elemType.isF32()) {
-    auto vec = llvm::to_vector(llvm::map_range(
-        elements.getValues<APFloat>(),
-        [&](APFloat value) -> float { return value.convertToFloat(); }));
-    return std::vector<float>(vec.begin(), vec.end());
-  }
-
-  return std::vector<float>();
-}
-
-template <>
-inline std::vector<double> GetVector(DenseElementsAttr elements) {
-  auto type = elements.getType();
-  auto elemType = type.getElementType();
-  if (elemType.isF64()) {
-    auto vec = llvm::to_vector(llvm::map_range(
-        elements.getValues<APFloat>(),
-        [&](APFloat value) -> double { return value.convertToFloat(); }));
-    return std::vector<double>(vec.begin(), vec.end());
-  }
-
-  return std::vector<double>();
-}
-
-// Handles the case when the DenseElementsAttr doesn't exist, and when it
-// doesn't returns a vector of length `default_size` all with the same value
-// `default_value`.
-template <typename T>
-static inline std::vector<T> GetOptionalVector(
-    std::optional<DenseElementsAttr> elements, int64_t default_size,
-    int64_t default_value) {
-  if (elements.has_value()) {
-    return GetVector<T>(elements.value());
-  }
-  return std::vector<T>(default_size, default_value);
-}
-
-// Handles the case when the SmallVector doesn't exist, and when it
-// doesn't returns a vector of length `default_size` all with the same value
-// `default_value`.
-template <typename T>
-static inline std::vector<T> GetOptionalVector(
-    std::optional<ArrayRef<T>> values, int64_t default_size,
-    int64_t default_value) {
-  if (values.has_value()) {
-    return std::vector<T>(values->begin(), values->end());
-  }
-  return std::vector<T>(default_size, default_value);
-}
-
-}  // namespace odml
-}  // namespace mlir
-#endif  // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_SERIALIZER_FLATBUFFER_OPERATOR_H_
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_translator.cc b/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_translator.cc
deleted file mode 100644
index fb5e2fadab907b..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_translator.cc
+++ /dev/null
@@ -1,904 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// prototype for stablehlo serialization, WIP
-// WARNING: converting to stablehlo file is experimental feature, and no runtime
-// support is provided
-
-#include "tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_translator.h"
-
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <limits>
-#include <map>
-#include <optional>
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "flatbuffers/flatbuffer_builder.h"  // from @flatbuffers
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Casting.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"  // from @llvm-project
-#include "mlir/IR/Attributes.h"  // from @llvm-project
-#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
-#include "mlir/IR/BuiltinTypeInterfaces.h"  // from @llvm-project
-#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
-#include "mlir/IR/Location.h"  // from @llvm-project
-#include "mlir/IR/Operation.h"  // from @llvm-project
-#include "mlir/IR/Types.h"  // from @llvm-project
-#include "mlir/IR/Value.h"  // from @llvm-project
-#include "mlir/Support/LogicalResult.h"  // from @llvm-project
-#include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
-#include "tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_operator.h"
-#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h"
-#include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h"
-#include "xla/statusor.h"
-#include "tensorflow/lite/stablehlo/schema/schema_generated.h"
-
-#define kStablehloOptionalTensor (-1)
-
-using llvm::isa;
-using llvm::StringRef;
-using llvm::Twine;
-using mlir::ElementsAttr;
-using mlir::ModuleOp;
-using mlir::Operation;
-using mlir::StringAttr;
-using mlir::TensorType;
-using mlir::Value;
-using mlir::func::FuncOp;
-using tensorflow::OpOrArgLocNameMapper;
-using tensorflow::OpOrArgNameMapper;
-using xla::StatusOr;
-
-namespace mlir {
-namespace odml {
-
-// TODO(b/267689361) this and the following functions should be automatically
-// generated similar to operator_converters.inc in tflite
-static flatbuffers::Offset<::stablehlo::flatbuf::Operator> CreateAddOperator(
-    mlir::stablehlo::AddOp& hlo_op, flatbuffers::FlatBufferBuilder* fbb,
-    uint32_t opcode_index, const std::vector<int32_t>& operands,
-    const std::vector<int32_t>& results) {
-  auto inputs = fbb->CreateVector(operands);
-  auto outputs = fbb->CreateVector(results);
-
-  return ::stablehlo::flatbuf::CreateOperator(*fbb, opcode_index, inputs,
-                                              outputs);
-}
-
-static flatbuffers::Offset<::stablehlo::flatbuf::Operator>
-CreateReshapeOperator(mlir::stablehlo::ReshapeOp& hlo_op,
-                      flatbuffers::FlatBufferBuilder* fbb,
-                      uint32_t opcode_index,
-                      const std::vector<int32_t>& operands,
-                      const std::vector<int32_t>& results) {
-  auto inputs = fbb->CreateVector(operands);
-  auto outputs = fbb->CreateVector(results);
-
-  return ::stablehlo::flatbuf::CreateOperator(*fbb, opcode_index, inputs,
-                                              outputs);
-}
-
-static flatbuffers::Offset<::stablehlo::flatbuf::Operator> CreateDivOperator(
-    mlir::stablehlo::DivOp& hlo_op, flatbuffers::FlatBufferBuilder* fbb,
-    uint32_t opcode_index, const std::vector<int32_t>& operands,
-    const std::vector<int32_t>& results) {
-  auto inputs = fbb->CreateVector(operands);
-  auto outputs = fbb->CreateVector(results);
-
-  return ::stablehlo::flatbuf::CreateOperator(*fbb, opcode_index, inputs,
-                                              outputs);
-}
-
-static flatbuffers::Offset<::stablehlo::flatbuf::Operator>
-CreateSubtractOperator(mlir::stablehlo::SubtractOp& hlo_op,
-                       flatbuffers::FlatBufferBuilder* fbb,
-                       uint32_t opcode_index,
-                       const std::vector<int32_t>& operands,
-                       const std::vector<int32_t>& results) {
-  auto inputs = fbb->CreateVector(operands);
-  auto outputs = fbb->CreateVector(results);
-
-  return ::stablehlo::flatbuf::CreateOperator(*fbb, opcode_index, inputs,
-                                              outputs);
-}
-
-static flatbuffers::Offset<::stablehlo::flatbuf::Operator> CreateMulOperator(
-    mlir::stablehlo::MulOp hlo_op, flatbuffers::FlatBufferBuilder* fbb,
-    uint32_t opcode_index, const std::vector<int32_t>& operands,
-    const std::vector<int32_t>& results) {
-  auto inputs = fbb->CreateVector(operands);
-  auto outputs = fbb->CreateVector(results);
-
-  return ::stablehlo::flatbuf::CreateOperator(*fbb, opcode_index, inputs,
-                                              outputs);
-}
-static flatbuffers::Offset<::stablehlo::flatbuf::Operator> CreateMaxOperator(
-    mlir::stablehlo::MaxOp& hlo_op, flatbuffers::FlatBufferBuilder* fbb,
-    uint32_t opcode_index, const std::vector<int32_t>& operands,
-    const std::vector<int32_t>& results) {
-  auto inputs = fbb->CreateVector(operands);
-  auto outputs = fbb->CreateVector(results);
-
-  return ::stablehlo::flatbuf::CreateOperator(*fbb, opcode_index, inputs,
-                                              outputs);
-}
-
-static flatbuffers::Offset<::stablehlo::flatbuf::Operator>
-CreateConvertOperator(mlir::stablehlo::ConvertOp& hlo_op,
-                      flatbuffers::FlatBufferBuilder* fbb,
-                      uint32_t opcode_index,
-                      const std::vector<int32_t>& operands,
-                      const std::vector<int32_t>& results) {
-  auto inputs = fbb->CreateVector(operands);
-  auto outputs = fbb->CreateVector(results);
-
-  return ::stablehlo::flatbuf::CreateOperator(*fbb, opcode_index, inputs,
-                                              outputs);
-}
-
-static flatbuffers::Offset<::stablehlo::flatbuf::Operator> CreateDotOperator(
-    mlir::stablehlo::DotOp& hlo_op, flatbuffers::FlatBufferBuilder* fbb,
-    uint32_t opcode_index, const std::vector<int32_t>& operands,
-    const std::vector<int32_t>& results) {
-  auto inputs = fbb->CreateVector(operands);
-  auto outputs = fbb->CreateVector(results);
-
-  return ::stablehlo::flatbuf::CreateOperator(*fbb, opcode_index, inputs,
-                                              outputs);
-}
-
-static flatbuffers::Offset<::stablehlo::flatbuf::Operator> CreateClampOperator(
-    mlir::stablehlo::ClampOp& hlo_op, flatbuffers::FlatBufferBuilder* fbb,
-    uint32_t opcode_index, const std::vector<int32_t>& operands,
-    const std::vector<int32_t>& results) {
-  auto inputs = fbb->CreateVector(operands);
-  auto outputs = fbb->CreateVector(results);
-
-  return ::stablehlo::flatbuf::CreateOperator(*fbb, opcode_index, inputs,
-                                              outputs);
-}
-
-static flatbuffers::Offset<::stablehlo::flatbuf::Operator>
-CreateLogisticOperator(mlir::stablehlo::LogisticOp& hlo_op,
-                       flatbuffers::FlatBufferBuilder* fbb,
-                       uint32_t opcode_index,
-                       const std::vector<int32_t>& operands,
-                       const std::vector<int32_t>& results) {
-  auto inputs = fbb->CreateVector(operands);
-  auto outputs = fbb->CreateVector(results);
-
-  return ::stablehlo::flatbuf::CreateOperator(*fbb, opcode_index, inputs,
-                                              outputs);
-}
-
-static flatbuffers::Offset<::stablehlo::flatbuf::Operator>
-CreateConcatenateOperator(mlir::stablehlo::ConcatenateOp& hlo_op,
-                          flatbuffers::FlatBufferBuilder* fbb,
-                          uint32_t opcode_index,
-                          const std::vector<int32_t>& operands,
-                          const std::vector<int32_t>& results) {
-  auto inputs = fbb->CreateVector(operands);
-  auto outputs = fbb->CreateVector(results);
-
-  auto options = ::stablehlo::flatbuf::CreateConcatenateOptions(
-      *fbb, hlo_op.getDimension());
-
-  return ::stablehlo::flatbuf::CreateOperator(
-      *fbb, opcode_index, inputs, outputs,
-      ::stablehlo::flatbuf::OperatorOptions_ConcatenateOptions,
-      options.Union());
-}
-
-static flatbuffers::Offset<::stablehlo::flatbuf::Operator>
-CreateConvolutionOperator(mlir::stablehlo::ConvolutionOp& hlo_op,
-                          flatbuffers::FlatBufferBuilder* fbb,
-                          uint32_t opcode_index,
-                          const std::vector<int32_t>& operands,
-                          const std::vector<int32_t>& results) {
-  auto inputs = fbb->CreateVector(operands);
-  auto outputs = fbb->CreateVector(results);
-
-  // converting from mlir struct to std
-  std::vector<int64_t> window_strides_vec =
-      GetOptionalVector<int64_t>(hlo_op.getWindowStrides(), 0, 0);
-  std::vector<int64_t> padding_vec =
-      GetOptionalVector<int64_t>(hlo_op.getPadding(), 0, 0);
-  std::vector<int64_t> lhs_dilation_vec =
-      GetOptionalVector<int64_t>(hlo_op.getLhsDilation(), 0, 0);
-  std::vector<int64_t> rhs_dilation_vec =
-      GetOptionalVector<int64_t>(hlo_op.getRhsDilation(), 0, 0);
-  std::vector<bool> window_reversal_vec =
-      GetOptionalVector<bool>(hlo_op.getWindowReversal(), 0, 0);
-  const int64_t feature_group_count = hlo_op.getFeatureGroupCount();
-  const int64_t batch_group_count = hlo_op.getBatchGroupCount();
-
-  auto conv_dimension_numbers = hlo_op.getDimensionNumbersAttr();
-
-  std::vector<int64_t> input_spatial_dimensions_vec =
-      conv_dimension_numbers.getInputSpatialDimensions().vec();
-  std::vector<int64_t> kernel_spatial_dimensions_vec =
-      conv_dimension_numbers.getKernelSpatialDimensions().vec();
-  std::vector<int64_t> output_spatial_dimensions_vec =
-      conv_dimension_numbers.getOutputSpatialDimensions().vec();
-  const int64_t input_batch_dimension =
-      conv_dimension_numbers.getInputBatchDimension();
-  const int64_t input_feature_dimension =
-      conv_dimension_numbers.getInputFeatureDimension();
-  const int64_t kernel_input_feature_dimension =
-      conv_dimension_numbers.getKernelInputFeatureDimension();
-  const int64_t kernel_output_feature_dimension =
-      conv_dimension_numbers.getKernelOutputFeatureDimension();
-  const int64_t output_batch_dimension =
-      conv_dimension_numbers.getOutputBatchDimension();
-  const int64_t output_feature_dimension =
-      conv_dimension_numbers.getOutputFeatureDimension();
-
-  // serialize all vectors to flatbuffer
-  auto window_strides = fbb->CreateVector(window_strides_vec);
-  auto padding = fbb->CreateVector(padding_vec);
-  auto lhs_dilation = fbb->CreateVector(lhs_dilation_vec);
-  auto rhs_dilation = fbb->CreateVector(rhs_dilation_vec);
-  auto input_spatial_dimensions =
-      fbb->CreateVector(input_spatial_dimensions_vec);
-  auto kernel_spatial_dimensions =
-      fbb->CreateVector(kernel_spatial_dimensions_vec);
-  auto output_spatial_dimensions =
-      fbb->CreateVector(output_spatial_dimensions_vec);
-  auto window_reversal = fbb->CreateVector(window_reversal_vec);
-
-  auto options = ::stablehlo::flatbuf::CreateConvolutionOptions(
-      *fbb, window_strides, padding, lhs_dilation, rhs_dilation,
-      window_reversal, input_batch_dimension, input_feature_dimension,
-      input_spatial_dimensions, kernel_input_feature_dimension,
-      kernel_output_feature_dimension, kernel_spatial_dimensions,
-      output_batch_dimension, output_feature_dimension,
-      output_spatial_dimensions, feature_group_count, batch_group_count);
-
-  return ::stablehlo::flatbuf::CreateOperator(
-      *fbb, opcode_index, inputs, outputs,
-      ::stablehlo::flatbuf::OperatorOptions_ConvolutionOptions,
-      options.Union());
-}
-
-static flatbuffers::Offset<::stablehlo::flatbuf::Operator>
-CreateReduceWindowOperator(mlir::stablehlo::ReduceWindowOp& hlo_op,
-                           flatbuffers::FlatBufferBuilder* fbb,
-                           uint32_t opcode_index,
-                           const std::vector<int32_t>& operands,
-                           const std::vector<int32_t>& results,
-                           const int subgraph_idx) {
-  auto inputs = fbb->CreateVector(operands);
-  auto outputs = fbb->CreateVector(results);
-
-  // TODO(zichuanwei@): instead of create these vectors let's just create
-  // Flatbuffers vector directly
-  std::vector<int64_t> window_dimension_vec(
-      GetOptionalVector<int64_t>(hlo_op.getWindowDimensions(), 0, 0));
-  std::vector<int64_t> window_strides_vec(
-      GetOptionalVector<int64_t>(hlo_op.getWindowStrides(), 0, 0));
-  std::vector<int64_t> base_dilations_vec(
-      GetOptionalVector<int64_t>(hlo_op.getBaseDilations(), 0, 0));
-  std::vector<int64_t> window_dilations_vec(
-      GetOptionalVector<int64_t>(hlo_op.getWindowDilations(), 0, 0));
-  std::vector<int64_t> padding_vec(
-      GetOptionalVector<int64_t>(hlo_op.getPadding(), 0, 0));
-
-  auto window_dimension = fbb->CreateVector(window_dimension_vec);
-  auto window_strides = fbb->CreateVector(window_strides_vec);
-  auto base_dilations = fbb->CreateVector(base_dilations_vec);
-  auto window_dilations = fbb->CreateVector(window_dilations_vec);
-  auto padding = fbb->CreateVector(padding_vec);
-
-  auto options = ::stablehlo::flatbuf::CreateReduceWindowOptions(
-      *fbb, window_dimension, window_strides, base_dilations, window_dilations,
-      padding, subgraph_idx);
-
-  return ::stablehlo::flatbuf::CreateOperator(
-      *fbb, opcode_index, inputs, outputs,
-      ::stablehlo::flatbuf::OperatorOptions_ReduceWindowOptions,
-      options.Union());
-}
-
-static flatbuffers::Offset<::stablehlo::flatbuf::Operator>
-CreateBroadcastInDimOperator(mlir::stablehlo::BroadcastInDimOp& hlo_op,
-                             flatbuffers::FlatBufferBuilder* fbb,
-                             uint32_t opcode_index,
-                             const std::vector<int32_t>& operands,
-                             const std::vector<int32_t>& results) {
-  auto inputs = fbb->CreateVector(operands);
-  auto outputs = fbb->CreateVector(results);
-
-  auto dims = hlo_op.getBroadcastDimensions();
-  auto broadcast_dimension =
-      fbb->CreateVector(std::vector<int64_t>(dims.begin(), dims.end()));
-
-  auto options = ::stablehlo::flatbuf::CreateBroadcastInDimOptions(
-      *fbb, broadcast_dimension);
-
-  return ::stablehlo::flatbuf::CreateOperator(
-      *fbb, opcode_index, inputs, outputs,
-      ::stablehlo::flatbuf::OperatorOptions_BroadcastInDimOptions,
-      options.Union());
-}
-
-static flatbuffers::Offset<::stablehlo::flatbuf::Operator>
-CreateResizeBilinearOperator(mlir::stablehlo::CustomCallOp& hlo_op,
-                             flatbuffers::FlatBufferBuilder* fbb,
-                             uint32_t opcode_index,
-                             const std::vector<int32_t>& operands,
-                             const std::vector<int32_t>& results) {
-  auto inputs = fbb->CreateVector(operands);
-  auto outputs = fbb->CreateVector(results);
-
-  auto align_corners =
-      hlo_op->getAttr("align_corners").dyn_cast<mlir::BoolAttr>();
-  assert(align_corners);
-  auto half_pixel_center =
-      hlo_op->getAttr("half_pixel_centers").dyn_cast<mlir::BoolAttr>();
-  assert(half_pixel_center);
-
-  auto options = ::stablehlo::flatbuf::CreateResizeBilinearOptions(
-      *fbb, align_corners.getValue(), half_pixel_center.getValue());
-
-  return ::stablehlo::flatbuf::CreateOperator(
-      *fbb, opcode_index, inputs, outputs,
-      ::stablehlo::flatbuf::OperatorOptions_ResizeBilinearOptions,
-      options.Union());
-}
-
-std::optional<flatbuffers::Offset<::stablehlo::flatbuf::Operator>>
-CreateFlatBufferOperator(mlir::Operation* op, uint32_t opcode_index,
-                         const std::vector<int32_t>& operands,
-                         const std::vector<int32_t>& results,
-                         flatbuffers::FlatBufferBuilder* fbb,
-                         int subgraph_idx = 0) {
-  if (auto hlo_op = llvm::dyn_cast<mlir::stablehlo::AddOp>(op))
-    return CreateAddOperator(hlo_op, fbb, opcode_index, operands, results);
-  if (auto hlo_op = llvm::dyn_cast<mlir::stablehlo::DotOp>(op))
-    return CreateDotOperator(hlo_op, fbb, opcode_index, operands, results);
-  if (auto hlo_op = llvm::dyn_cast<mlir::stablehlo::LogisticOp>(op))
-    return CreateLogisticOperator(hlo_op, fbb, opcode_index, operands, results);
-  if (auto hlo_op = llvm::dyn_cast<mlir::stablehlo::DivOp>(op))
-    return CreateDivOperator(hlo_op, fbb, opcode_index, operands, results);
-  if (auto hlo_op = llvm::dyn_cast<mlir::stablehlo::SubtractOp>(op))
-    return CreateSubtractOperator(hlo_op, fbb, opcode_index, operands, results);
-  if (auto hlo_op = llvm::dyn_cast<mlir::stablehlo::MulOp>(op))
-    return CreateMulOperator(hlo_op, fbb, opcode_index, operands, results);
-  if (auto hlo_op = llvm::dyn_cast<mlir::stablehlo::MaxOp>(op))
-    return CreateMaxOperator(hlo_op, fbb, opcode_index, operands, results);
-  if (auto hlo_op = llvm::dyn_cast<mlir::stablehlo::ReshapeOp>(op))
-    return CreateReshapeOperator(hlo_op, fbb, opcode_index, operands, results);
-  if (auto hlo_op = llvm::dyn_cast<mlir::stablehlo::ConvolutionOp>(op))
-    return CreateConvolutionOperator(hlo_op, fbb, opcode_index, operands,
-                                     results);
-  if (auto hlo_op = llvm::dyn_cast<mlir::stablehlo::ReduceWindowOp>(op))
-    return CreateReduceWindowOperator(hlo_op, fbb, opcode_index, operands,
-                                      results, subgraph_idx);
-  if (auto hlo_op = llvm::dyn_cast<mlir::stablehlo::BroadcastInDimOp>(op))
-    return CreateBroadcastInDimOperator(hlo_op, fbb, opcode_index, operands,
-                                        results);
-  if (auto hlo_op = llvm::dyn_cast<mlir::stablehlo::CustomCallOp>(op))
-    return CreateResizeBilinearOperator(hlo_op, fbb, opcode_index, operands,
-                                        results);
-  if (auto hlo_op = llvm::dyn_cast<mlir::stablehlo::ClampOp>(op))
-    return CreateClampOperator(hlo_op, fbb, opcode_index, operands, results);
-  if (auto hlo_op = llvm::dyn_cast<mlir::stablehlo::ConcatenateOp>(op))
-    return CreateConcatenateOperator(hlo_op, fbb, opcode_index, operands,
-                                     results);
-  if (auto hlo_op = llvm::dyn_cast<mlir::stablehlo::ConvertOp>(op))
-    return CreateConvertOperator(hlo_op, fbb, opcode_index, operands, results);
-  return std::nullopt;
-}
-
-static absl::StatusOr<::stablehlo::flatbuf::DataType> GetDataType(
-    Type type, bool is_signed = true) {
-  if (type.isF16()) return ::stablehlo::flatbuf::DataType_FLOAT16;
-  if (type.isF32()) return ::stablehlo::flatbuf::DataType_FLOAT32;
-  if (type.isF64()) return ::stablehlo::flatbuf::DataType_FLOAT64;
-  if (type.isSignlessInteger(8)) return ::stablehlo::flatbuf::DataType_INT8;
-  if (type.isSignlessInteger(16)) return ::stablehlo::flatbuf::DataType_INT16;
-  if (type.isSignlessInteger(32)) return ::stablehlo::flatbuf::DataType_INT32;
-  if (type.isSignlessInteger(64)) return ::stablehlo::flatbuf::DataType_INT64;
-  if (type.isUnsignedInteger(8)) return ::stablehlo::flatbuf::DataType_UINT8;
-  if (type.isUnsignedInteger(16)) return ::stablehlo::flatbuf::DataType_UINT16;
-  if (type.isUnsignedInteger(32)) return ::stablehlo::flatbuf::DataType_UINT32;
-  if (type.isUnsignedInteger(64)) return ::stablehlo::flatbuf::DataType_UINT64;
-  std::string type_str;
-  llvm::raw_string_ostream str_stream(type_str);
-  str_stream << type;
-  LOG(ERROR) << "unsupported datatype" << type_str;
-  return tensorflow::errors::InvalidArgument("unsupported datatype" + type_str);
-}
-
-std::optional<::stablehlo::flatbuf::OperatorCode> GetOpCode(
-    mlir::Operation* op) {
-  if (isa<mlir::stablehlo::AddOp>(op))
-    return ::stablehlo::flatbuf::OperatorCode_ADD;
-  if (isa<mlir::stablehlo::DotOp>(op))
-    return ::stablehlo::flatbuf::OperatorCode_DOT;
-  if (isa<mlir::stablehlo::SubtractOp>(op))
-    return ::stablehlo::flatbuf::OperatorCode_SUBTRACT;
-  if (isa<mlir::stablehlo::DivOp>(op))
-    return ::stablehlo::flatbuf::OperatorCode_DIVIDE;
-  if (isa<mlir::stablehlo::LogisticOp>(op))
-    return ::stablehlo::flatbuf::OperatorCode_LOGISTIC;
-  if (isa<mlir::stablehlo::MulOp>(op))
-    return ::stablehlo::flatbuf::OperatorCode_MULTIPLY;
-  if (isa<mlir::stablehlo::MaxOp>(op))
-    return ::stablehlo::flatbuf::OperatorCode_MAXIMUM;
-  if (isa<mlir::stablehlo::ReshapeOp>(op))
-    return ::stablehlo::flatbuf::OperatorCode_RESHAPE;
-  if (isa<mlir::stablehlo::ConvolutionOp>(op))
-    return ::stablehlo::flatbuf::OperatorCode_CONVOLUTION;
-  if (isa<mlir::stablehlo::BroadcastInDimOp>(op))
-    return ::stablehlo::flatbuf::OperatorCode_BROADCAST_IN_DIM;
-  if (isa<mlir::stablehlo::ReduceWindowOp>(op))
-    return ::stablehlo::flatbuf::OperatorCode_REDUCE_WINDOW;
-  if (isa<mlir::stablehlo::ClampOp>(op))
-    return ::stablehlo::flatbuf::OperatorCode_CLAMP;
-  if (isa<mlir::stablehlo::ConcatenateOp>(op))
-    return ::stablehlo::flatbuf::OperatorCode_CONCATENATE;
-  if (isa<mlir::stablehlo::ConvertOp>(op))
-    return ::stablehlo::flatbuf::OperatorCode_CONVERT;
-
-  // For now we assume the incoming custom op is a resize_bilinear, it is
-  // expected any other custom op will cause the program to error out
-  if (isa<mlir::stablehlo::CustomCallOp>(op))
-    return ::stablehlo::flatbuf::OperatorCode_RESIZE_BILINEAR;
-
-  op->emitError(Twine("unsupported op type " + op->getName().getStringRef()));
-  return std::nullopt;
-}
-
-static bool IsConst(Operation* op) {
-  return isa<mlir::func::ConstantOp, mlir::arith::ConstantOp,
-             mlir::stablehlo::ConstantOp>(op);
-}
-
-std::optional<std::string> Translator::Translate(
-    ModuleOp module, const toco::TocoFlags& toco_flags,
-    const std::unordered_set<std::string>& tags,
-    OpOrArgNameMapper* op_or_arg_name_mapper,
-    const std::map<std::string, std::string>& metadata) {
-  OpOrArgLocNameMapper default_op_or_arg_name_mapper;
-  if (!op_or_arg_name_mapper)
-    op_or_arg_name_mapper = &default_op_or_arg_name_mapper;
-  // TODO(b/267689626): sanity checkers not implemented
-  Translator translator(module, toco_flags, tags, op_or_arg_name_mapper,
-                        metadata);
-  return translator.TranslateInternal();
-}
-
-std::optional<std::string> Translator::TranslateInternal() {
-  // A list of named regions in the module with main function being the first in
-  // the list. The main function is required as the first subgraph in the model
-  // is entry point for the model.
-  std::vector<std::pair<std::string, Region*>> named_regions;
-  named_regions.reserve(std::distance(module_.begin(), module_.end()));
-
-  int subgraph_idx = 0;
-
-  // Entry functions for signature defs.
-  std::vector<FuncOp> entry_functions;
-  std::vector<FuncOp> non_entry_functions;
-  FuncOp main_fn = module_.lookupSymbol<FuncOp>("main");
-  if (main_fn != nullptr) {
-    // Treat the main function as a signature def when the given main function
-    // contains on the tf.entry_function attribute.
-    auto attrs =
-        main_fn->getAttrOfType<mlir::DictionaryAttr>(tf_entry_function_);
-    if (attrs && !attrs.empty()) {
-      entry_functions.push_back(main_fn);
-    } else {
-      non_entry_functions.push_back(main_fn);
-    }
-  }
-
-  // Walk over the module collection ops with functions and while ops.
-  module_.walk([&](FuncOp fn) {
-    if (main_fn == fn) return WalkResult::advance();
-    auto attrs = fn->getAttrOfType<mlir::DictionaryAttr>("tf.entry_function");
-    if (attrs && !attrs.empty()) {
-      entry_functions.push_back(fn);
-    } else {
-      non_entry_functions.push_back(fn);
-    }
-    return WalkResult::advance();
-  });
-
-  // collect all reduce window ops, this is only a temporary hack
-  // in the future, we should have a function to walk over all ops that have
-  // regions contained, the logic in stablehlo is a bit different from tfl
-  // dialect in that all subgraphs in tflite a enclosed in func op where
-  // stablehlo op maintain their own regions
-  std::vector<mlir::stablehlo::ReduceWindowOp> reduce_window;
-  module_.walk([&](mlir::stablehlo::ReduceWindowOp op) {
-    reduce_window.push_back(op);
-    return WalkResult::advance();
-  });
-
-  // Assign the subgraph index. Among the given functions, it will put entry
-  // functions at the beginning of the list of the subgrahs.
-  for (auto fn : entry_functions) {
-    subgraph_index_map_[fn.getName().str()] = subgraph_idx++;
-    named_regions.emplace_back(fn.getName().str(), &fn.getBody());
-  }
-  for (auto fn : non_entry_functions) {
-    subgraph_index_map_[fn.getName().str()] = subgraph_idx++;
-    named_regions.emplace_back(fn.getName().str(), &fn.getBody());
-  }
-
-  // add regions of reduce_window ops into subgraph map. the name will be
-  // stablehlo.reduce_window as mlir::region is not assicoate with a name
-  for (auto op : reduce_window) {
-    reduce_window_subgraph_map_[op] = subgraph_idx++;
-    named_regions.emplace_back(op.getOperationName().str(), &op.getBody());
-  }
-
-  // Build subgraph for each of the named regions.
-  std::vector<BufferOffset<::stablehlo::flatbuf::SubGraph>> subgraphs;
-  subgraphs.reserve(named_regions.size());
-  int first_failed_func = -1;
-
-  // When we export each function in the module op, intentionally, we export the
-  // entry functions at the beginning of the subgraph list and the
-  // subgraph_index is the index in entry functions and at the same, is the
-  // index in the subgraph list.
-  int subgraph_index = 0;
-  for (const auto& it : llvm::enumerate(named_regions)) {
-    auto subgraph_or =
-        BuildSubGraph(it.value().first, it.value().second, subgraph_index);
-    if (!subgraph_or) {
-      if (first_failed_func == -1)
-        // Record the index of the first region that cannot be converted.
-        // Keep looping through all subgraphs in the module to make sure that
-        // we collect the list of missing ops from the entire module.
-        first_failed_func = it.index();
-    } else {
-      subgraphs.push_back(*subgraph_or);
-      ++subgraph_index;
-    }
-  }
-  // TODO(b/267801705) : Add schema version
-  auto model = ::stablehlo::flatbuf::CreateModel(
-      builder_, 0, builder_.CreateVector(opcodes_),
-      builder_.CreateVector(subgraphs), builder_.CreateVector(buffers_));
-  ::stablehlo::flatbuf::FinishModelBuffer(builder_, model);
-  // There is a limit of 2GB for a flatbuffer.
-  if (builder_.GetSize() > 2147483648) {
-    LOG(ERROR) << "Model size is bigger than 2gb";
-    return std::nullopt;
-  }
-
-  // Return serialized string for the built FlatBuffer.
-  return std::string(reinterpret_cast<const char*>(builder_.GetBufferPointer()),
-                     builder_.GetSize());
-}
-
-std::optional<BufferOffset<::stablehlo::flatbuf::Tensor>>
-Translator::BuildTensor(Value value, const std::string& name,
-                        unsigned buffer_idx) {
-  auto type = value.getType().cast<TensorType>();
-
-  auto check_shape =
-      [&](llvm::ArrayRef<int64_t> shape_ref) -> mlir::LogicalResult {
-    auto is_out_of_range = [](int64_t dim) {
-      return dim > std::numeric_limits<int32_t>::max();
-    };
-
-    if (std::any_of(shape_ref.begin(), shape_ref.end(), is_out_of_range))
-      return mlir::emitError(
-          value.getLoc(),
-          "result shape dimensions out of 32 bit int type range");
-
-    return mlir::success();
-  };
-
-  std::vector<int32_t> shape;
-  std::vector<int32_t> shape_signature;
-  auto* inst = value.getDefiningOp();
-
-  bool is_variable = !(inst && IsConst(inst));
-  if (type.hasStaticShape()) {
-    llvm::ArrayRef<int64_t> shape_ref = type.getShape();
-    if (mlir::failed(check_shape(shape_ref))) return std::nullopt;
-
-    shape = std::vector<int32_t>(shape_ref.begin(), shape_ref.end());
-  } else if (inst && IsConst(inst)) {
-    // Const op can have a result of dynamic shaped type (e.g. due to constant
-    // folding), but we can still derive the shape of a constant tensor for
-    // its attribute type.
-
-    auto tensor_attr = inst->getAttr("value").cast<mlir::TypedAttr>();
-    llvm::ArrayRef<int64_t> shape_ref =
-        tensor_attr.getType().cast<TensorType>().getShape();
-    if (mlir::failed(check_shape(shape_ref))) return std::nullopt;
-
-    shape = std::vector<int32_t>(shape_ref.begin(), shape_ref.end());
-  } else if (type.hasRank()) {
-    llvm::ArrayRef<int64_t> shape_ref = type.getShape();
-    if (mlir::failed(check_shape(shape_ref))) return std::nullopt;
-
-    shape.reserve(shape_ref.size());
-    for (auto& dim : shape_ref) {
-      // translate dynamic shapes from mlir to tfl values
-      shape.push_back(
-          dim == mlir::ShapedType::kDynamic ? 1 : static_cast<int>(dim));
-      shape_signature.push_back(static_cast<int>(
-          dim == mlir::ShapedType::kDynamic ? tensorflow::kTFDynamicSize
-                                            : dim));
-    }
-  }
-
-  Type element_type = type.getElementType();
-  auto status = GetDataType(element_type);
-  if (!status.ok()) return std::nullopt;
-  ::stablehlo::flatbuf::DataType data_type = GetDataType(element_type).value();
-
-  return ::stablehlo::flatbuf::CreateTensor(
-      builder_, builder_.CreateVector(shape), data_type,
-      (is_variable ? 0 : buffer_idx), builder_.CreateString(name));
-}
-
-void Translator::InitializeNamesFromAttribute(FuncOp fn, bool* has_input_attr) {
-  auto dict_attr = fn->getAttrOfType<mlir::DictionaryAttr>(tf_entry_function_);
-  if (!dict_attr) return;
-
-  llvm::SmallVector<llvm::StringRef, 2> input_names;
-  llvm::SmallVector<llvm::StringRef, 2> output_names;
-  if (auto str = dict_attr.get("inputs").dyn_cast_or_null<mlir::StringAttr>()) {
-    str.getValue().split(input_names, ',', /*MaxSplit=*/-1,
-                         /*KeepEmpty=*/false);
-    if (input_names.size() != fn.getNumArguments()) {
-      fn.emitWarning() << "invalid entry function specification";
-      return;
-    }
-    for (const auto& it : llvm::enumerate(fn.getArguments())) {
-      name_mapper_.InitOpName(it.value(), input_names[it.index()].trim());
-    }
-    *has_input_attr = true;
-  }
-
-  if (auto str =
-          dict_attr.get("outputs").dyn_cast_or_null<mlir::StringAttr>()) {
-    str.getValue().split(output_names, ',', /*MaxSplit=*/-1,
-                         /*KeepEmpty=*/false);
-    auto term = fn.back().getTerminator();
-    if (output_names.size() != term->getNumOperands()) {
-      fn.emitWarning() << "output names (" << output_names.size()
-                       << ") != terminator operands (" << term->getNumOperands()
-                       << ")";
-      return;
-    }
-    for (const auto& it : llvm::enumerate(term->getOperands())) {
-      name_mapper_.InitOpName(it.value(), output_names[it.index()].trim());
-    }
-  }
-}
-
-std::string Translator::UniqueName(mlir::Value val) {
-  return std::string(name_mapper_.GetUniqueName(val));
-}
-
-std::optional<BufferOffset<::stablehlo::flatbuf::SubGraph>>
-Translator::BuildSubGraph(const std::string& name, Region* region, int index) {
-  bool has_input_attr = false;
-  if (auto fn = dyn_cast<FuncOp>(region->getParentOp())) {
-    InitializeNamesFromAttribute(fn, &has_input_attr);
-  }
-  std::vector<BufferOffset<::stablehlo::flatbuf::Tensor>> tensors;
-  llvm::DenseMap<Value, int> tensor_index_map;
-
-  // Builds tensor and buffer for argument or operation result. Returns false
-  // on failure.
-  auto build_tensor_and_buffer = [&](Value value, const int subgraph_index,
-                                     const std::string& tensor_name) {
-    // NoneType represents optional and may be skipped here.
-    if (value.getType().isa<NoneType>()) {
-      return true;
-    }
-
-    tensor_index_map.insert({value, tensors.size()});
-    tensor_index_map_[subgraph_index][tensor_name] = tensors.size();
-    auto tensor_or = BuildTensor(value, tensor_name, buffers_.size());
-    if (!tensor_or) return false;
-    tensors.push_back(*tensor_or);
-
-    if (value.getDefiningOp()) {
-      auto buffer_or = BuildBuffer(value);
-      if (!buffer_or) return false;
-      buffers_.push_back(*buffer_or);
-    } else {
-      // TODO(b/267802872): Tflite will create a buffer entry for every tensor
-      // regardless constant or not. in stablehlo serialization, we don't plan
-      // to keep this behaviour
-      buffers_.push_back(empty_buffer_);
-    }
-    return true;
-  };
-
-  std::vector<BufferOffset<::stablehlo::flatbuf::Operator>> operators;
-
-  // Maps positions of operations in bb to positions in operators
-  llvm::DenseMap<int, int> operation_index_to_operator_index;
-  std::vector<Operation*> operators_in_mlir;
-  auto& bb = region->front();
-
-  // Main function's arguments are first passed to `input` op so they don't
-  // have associated tensor and buffer. Build FlatBuffer tensor and buffer for
-  // other functions.
-  for (unsigned i = 0, e = bb.getNumArguments(); i < e; ++i) {
-    mlir::BlockArgument arg = bb.getArgument(i);
-    std::string tensor_name;
-    if (has_input_attr)
-      tensor_name = std::string(name_mapper_.GetUniqueName(arg));
-    if (tensor_name.empty()) tensor_name = absl::StrCat("arg", i);
-    if (!build_tensor_and_buffer(arg, index, tensor_name)) return std::nullopt;
-  }
-
-  bool failed_once = false;
-  for (const auto& item : llvm::enumerate(bb)) {
-    Operation& inst = item.value();
-    const int operation_index = item.index();
-    if (inst.hasTrait<mlir::OpTrait::IsTerminator>()) break;
-
-    for (auto val : inst.getResults()) {
-      std::string tensor_name = UniqueName(val);
-      // For "tfl.numeric_verify" op, the name is used to find out the original
-      // activation tensor rather than its own unique name in the visualization
-      // or debugging tools.
-      // auto builtin_code = GetOpCode(&inst);
-      if (!build_tensor_and_buffer(val, index, tensor_name))
-        return std::nullopt;
-    }
-
-    // Skip constant ops as they don't represent flatbuffer operator.
-    if (IsConst(&inst)) continue;
-
-    // Fetch operand and result tensor indices.
-    std::vector<int32_t> results;
-    results.reserve(inst.getNumResults());
-    for (auto result : inst.getResults()) {
-      results.push_back(tensor_index_map.lookup(result));
-    }
-    Operation* real_inst = &inst;
-    std::vector<int32_t> operands;
-    operands.reserve(real_inst->getNumOperands());
-    for (auto operand : real_inst->getOperands()) {
-      if (operand.getType().isa<NoneType>())
-        operands.push_back(kStablehloOptionalTensor);
-      else
-        operands.push_back(tensor_index_map.lookup(operand));
-    }
-
-    if (auto flat_operator = BuildOperator(real_inst, operands, results)) {
-      operation_index_to_operator_index.try_emplace(operation_index,
-                                                    operators.size());
-      operators.push_back(*flat_operator);
-      operators_in_mlir.push_back(real_inst);
-    } else {
-      failed_once = true;
-    }
-  }
-  if (index + 1 > subgraph_op_inst_map_.size()) {
-    subgraph_op_inst_map_.resize(index + 1);
-  }
-  subgraph_op_inst_map_[index] = operators_in_mlir;
-  if (failed_once) return std::nullopt;
-
-  // Get input and output tensor indices for the subgraph.
-  std::vector<int32_t> inputs, outputs;
-  for (auto arg : bb.getArguments()) {
-    inputs.push_back(tensor_index_map[arg]);
-  }
-  for (auto result : bb.getTerminator()->getOperands()) {
-    outputs.push_back(tensor_index_map[result]);
-  }
-  return ::stablehlo::flatbuf::CreateSubGraph(
-      builder_, builder_.CreateVector(tensors), builder_.CreateVector(inputs),
-      builder_.CreateVector(outputs), builder_.CreateVector(operators),
-      /*name=*/builder_.CreateString(name));
-}
-
-std::optional<BufferOffset<::stablehlo::flatbuf::Buffer>>
-Translator::BuildBuffer(mlir::Value value) {
-  auto inst = value.getDefiningOp();
-  ElementsAttr attr;
-
-  if (auto cst = dyn_cast<mlir::arith::ConstantOp>(inst)) {
-    // arith::ConstantOp have ElementAttr at this point due to validation of the
-    // TFLite module.
-    attr = cst.getValue().cast<ElementsAttr>();
-  } else if (auto cst = dyn_cast<mlir::stablehlo::ConstantOp>(inst)) {
-    attr = cst.getValue();
-  } else {
-    return empty_buffer_;
-  }
-
-  tensorflow::Tensor tensor;
-  auto status = tensorflow::ConvertToTensor(attr, &tensor);
-  if (!status.ok()) {
-    inst->emitError(
-        Twine("failed to convert value attribute to tensor with error: " +
-              status.ToString()));
-    return std::nullopt;
-  }
-
-  absl::string_view tensor_data = tensor.tensor_data();
-  auto buffer_data = builder_.CreateVector(
-      reinterpret_cast<const uint8_t*>(tensor_data.data()), tensor_data.size());
-  return ::stablehlo::flatbuf::CreateBuffer(builder_, buffer_data);
-}
-
-uint32_t Translator::GetOpcodeIndex(
-    const std::string& op_name, ::stablehlo::flatbuf::OperatorCode op_code) {
-  auto it = opcode_index_map_.insert({op_name, 0});
-
-  // If the insert succeeded, the opcode has not been created already. Create a
-  // new operator code and update its index value in the map.
-  if (it.second) {
-    it.first->second = opcodes_.size();
-    opcodes_.push_back(op_code);
-  }
-  return it.first->second;
-}
-
-std::optional<BufferOffset<::stablehlo::flatbuf::Operator>>
-Translator::BuildOperator(Operation* inst, std::vector<int32_t> operands,
-                          const std::vector<int32_t>& results) {
-  const auto* dialect = inst->getDialect();
-  if (!dialect) {
-    inst->emitOpError("dialect is not registered");
-    return std::nullopt;
-  }
-
-  if (dialect == stablehlo_dialect_) {
-    auto op_code = GetOpCode(inst);
-    if (op_code == std::nullopt) {
-      return inst->emitOpError("op code not found"), std::nullopt;
-    }
-
-    auto opcode_index =
-        GetOpcodeIndex(inst->getName().getStringRef().str(), op_code.value());
-    std::optional<flatbuffers::Offset<::stablehlo::flatbuf::Operator>> offset;
-    if (op_code == ::stablehlo::flatbuf::OperatorCode_REDUCE_WINDOW) {
-      offset = CreateFlatBufferOperator(
-          inst, opcode_index, operands, results, &builder_,
-          reduce_window_subgraph_map_
-              [llvm::dyn_cast<mlir::stablehlo::ReduceWindowOp>(inst)]);
-    } else {
-      offset = CreateFlatBufferOperator(inst, opcode_index, operands, results,
-                                        &builder_);
-    }
-    if (!offset) {
-      inst->emitOpError("is not a supported stablehlo op");
-    }
-    return offset;
-  }
-
-  return inst->emitOpError("a stableHLO op"), std::nullopt;
-}
-
-}  // namespace odml
-}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_translator.h b/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_translator.h
deleted file mode 100644
index d9d1b7b0a17d81..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_translator.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_SERIALIZER_FLATBUFFER_TRANSLATOR_H_
-#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_SERIALIZER_FLATBUFFER_TRANSLATOR_H_
-
-#include <stddef.h>
-#include <stdlib.h>
-
-#include <algorithm>
-#include <cstdint>
-#include <map>
-#include <memory>
-#include <optional>
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "absl/container/flat_hash_map.h"
-#include "absl/container/flat_hash_set.h"
-#include "absl/strings/string_view.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
-#include "mlir/Tools/mlir-translate/Translation.h"  // from @llvm-project
-#include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
-#include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h"
-#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/lite/stablehlo/schema/schema_generated.h"
-#include "tensorflow/lite/toco/toco_flags.pb.h"
-
-template <typename T>
-using BufferOffset = flatbuffers::Offset<T>;
-
-template <typename T>
-using VectorBufferOffset = flatbuffers::Offset<flatbuffers::Vector<T>>;
-
-using CustomOptionsOffset = VectorBufferOffset<uint8_t>;
-
-// Use initial buffer size in flatbuffer builder to be same as the initial size
-// used by the TOCO export. (It does not explain rationale for this choice.)
-// This number is currently inherited from Tflite
-constexpr size_t kInitialBufferSize = 10240;
-
-namespace mlir {
-namespace odml {
-
-// Translates an MLIR module in mhlo dialect to TFLite FlatBuffer.
-class Translator {
- public:
-  // Translates the given MLIR module into TFLite FlatBuffer format and returns
-  // the serialized output. Returns std::nullopt on unsupported, invalid inputs
-  // or internal error.
-  static std::optional<std::string> Translate(
-      ModuleOp module, const toco::TocoFlags& toco_flags,
-      const std::unordered_set<std::string>& tags,
-      tensorflow::OpOrArgNameMapper* op_or_arg_name_mapper,
-      const std::map<std::string, std::string>& metadata);
-
- private:
-  enum class OpType : char { kStablehloOp };
-  explicit Translator(ModuleOp module, const toco::TocoFlags& toco_flags,
-                      const std::unordered_set<std::string>& saved_model_tags,
-                      tensorflow::OpOrArgNameMapper* op_or_arg_name_mapper,
-                      const std::map<std::string, std::string>& metadata)
-      : module_(module),
-        name_mapper_(*op_or_arg_name_mapper),
-        builder_(kInitialBufferSize),
-        saved_model_tags_(saved_model_tags) {
-    // The first buffer must be empty according to the schema definition.
-    empty_buffer_ = ::stablehlo::flatbuf::CreateBuffer(builder_);
-    buffers_.push_back(empty_buffer_);
-    stablehlo_dialect_ =
-        module.getContext()
-            ->getOrLoadDialect<mlir::stablehlo::StablehloDialect>();
-    // Right now the TF executor dialect is still needed to build NodeDef.
-    module.getContext()
-        ->getOrLoadDialect<mlir::tf_executor::TensorFlowExecutorDialect>();
-  }
-
-  std::optional<std::string> TranslateInternal();
-
-  // Returns TFLite buffer populated with constant value if the operation is
-  // TFLite constant operation. Otherwise, returns an empty buffer. Emits error
-  // and returns std::nullopt on failure.
-  std::optional<BufferOffset<::stablehlo::flatbuf::Buffer>> BuildBuffer(
-      Value value);
-
-  // Builds TFLite tensor from the given value. `buffer_idx` is index of the
-  // corresponding buffer. Emits error and returns std::nullopt on failure.
-  std::optional<BufferOffset<::stablehlo::flatbuf::Tensor>> BuildTensor(
-      Value value, const std::string& name, unsigned buffer_idx);
-
-  // Returns opcode index for op identified by the op_name, if already
-  // available. Otherwise, creates a new OperatorCode using the given `builtin`
-  // operator and associates it with `op_name`.
-  uint32_t GetOpcodeIndex(const std::string& op_name,
-                          ::stablehlo::flatbuf::OperatorCode op_code);
-
-  // Builds operator for the given operation with specified operand and result
-  // tensor indices. Emits an error and returns std::nullopt on failure.
-  std::optional<BufferOffset<::stablehlo::flatbuf::Operator>> BuildOperator(
-      Operation* inst, std::vector<int32_t> operands,
-      const std::vector<int32_t>& results);
-
-  // Build a subgraph with a given name out of the region either corresponding
-  // to a function's body or while op. Modifies *region by calling
-  // ExtractControlEdges.
-  std::optional<BufferOffset<::stablehlo::flatbuf::SubGraph>> BuildSubGraph(
-      const std::string& name, Region* region, int index);
-
-  // Uses the tf.entry_function attribute (if set) to initialize the op to name
-  // mapping.
-  void InitializeNamesFromAttribute(mlir::func::FuncOp fn,
-                                    bool* has_input_attr);
-
-  // Returns a unique name for `val`.
-  std::string UniqueName(mlir::Value val);
-
-  ModuleOp module_;
-
-  tensorflow::OpOrArgNameMapper& name_mapper_;
-
-  flatbuffers::FlatBufferBuilder builder_;
-  BufferOffset<::stablehlo::flatbuf::Buffer> empty_buffer_;
-
-  std::vector<BufferOffset<::stablehlo::flatbuf::Buffer>> buffers_;
-  // Maps subgraph index and tensor name in the graph to the tensor index.
-  absl::flat_hash_map<int, absl::flat_hash_map<std::string, int>>
-      tensor_index_map_;
-
-  // Maps op name to index of the corresponding OperatorCode in opcodes_ vector.
-  absl::flat_hash_map<std::string, uint32_t> opcode_index_map_;
-  std::vector<int32_t> opcodes_;
-
-  // Maps function name to index of the corresponding subgraph in the FlatBuffer
-  // model.
-  absl::flat_hash_map<std::string, int> subgraph_index_map_;
-  absl::flat_hash_set<OpType> enabled_op_types_;
-
-  // maps between reduce_window op and their corresponding subgraphs
-  std::map<mlir::stablehlo::ReduceWindowOp, int> reduce_window_subgraph_map_;
-
-  // Points to stablehlo dialects & mhlo dialects, respectively. nullptr if the
-  // dialect is not registered.
-  Dialect* stablehlo_dialect_;
-
-  // Set of saved model tags, if any.
-  const std::unordered_set<std::string> saved_model_tags_;
-  // Map of key value pairs of metadata to export.
-  const std::map<std::string, std::string> metadata_;
-  // A mapping table to mlir::Operation objects for TFL subgraph and operator
-  // index in a flatbuffer.
-  std::vector<std::vector<Operation*>> subgraph_op_inst_map_;
-
-  const std::string tf_entry_function_ = "tf.entry_function";
-};
-
-}  // namespace odml
-}  // namespace mlir
-
-#endif  // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_SERIALIZER_FLATBUFFER_TRANSLATOR_H_
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/tests/BUILD
index 79cb17374fa940..dd691a25be14d9 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/BUILD
+++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow:tensorflow.default.bzl", "filegroup")
+load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/composite-lowering.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/composite-lowering.mlir
new file mode 100644
index 00000000000000..5924d0dce396c4
--- /dev/null
+++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/composite-lowering.mlir
@@ -0,0 +1,34 @@
+// RUN: odml-to-stablehlo-opt -composite-lowering -verify-diagnostics %s | FileCheck %s
+
+func.func @hardswish(%arg0: tensor<2xf32>) -> (tensor<*xf32>) {
+  %0 = mhlo.composite "aten.hardswish.default" %arg0 {decomposition = @XlaCallModule_aten.hardswish.default.impl_0} : (tensor<2xf32>) -> tensor<2xf32>
+  %1 = "tf.Identity"(%0) {device = ""} : (tensor<2xf32>) -> tensor<*xf32>
+  %2 = "tf.Identity"(%1) {device = ""} : (tensor<*xf32>) -> tensor<*xf32>
+  return %2 : tensor<*xf32>
+}
+func.func private @XlaCallModule_aten.hardswish.default.impl_0(%arg0: tensor<2xf32>) -> tensor<2xf32> {
+  %0 = mhlo.constant dense<6.000000e+00> : tensor<f32>
+  %1 = "mhlo.broadcast_in_dim"(%0) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>) -> tensor<2xf32>
+  %2 = mhlo.constant dense<3.40282347E+38> : tensor<f32>
+  %3 = "mhlo.broadcast_in_dim"(%2) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>) -> tensor<2xf32>
+  %4 = mhlo.constant dense<3.000000e+00> : tensor<f32>
+  %5 = "mhlo.broadcast_in_dim"(%4) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>) -> tensor<2xf32>
+  %6 = mhlo.constant dense<0.000000e+00> : tensor<f32>
+  %7 = "mhlo.broadcast_in_dim"(%6) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>) -> tensor<2xf32>
+  %8 = mhlo.constant dense<-3.40282347E+38> : tensor<f32>
+  %9 = "mhlo.broadcast_in_dim"(%8) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>) -> tensor<2xf32>
+  %10 = mhlo.add %arg0, %5 : tensor<2xf32>
+  %11 = mhlo.clamp %7, %10, %3 : tensor<2xf32>
+  %12 = mhlo.clamp %9, %11, %1 : tensor<2xf32>
+  %13 = mhlo.multiply %arg0, %12 : tensor<2xf32>
+  %14 = mhlo.divide %13, %1 : tensor<2xf32>
+  return %14 : tensor<2xf32>
+}
+
+// CHECK-LABEL:   func.func @hardswish(
+// CHECK-SAME:                         %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<*xf32> {
+// CHECK:           %[[VAL_1:.*]] = "tfl.hard_swish"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32>
+// CHECK:           %[[VAL_2:.*]] = "tf.Identity"(%[[VAL_1]]) {device = ""} : (tensor<2xf32>) -> tensor<*xf32>
+// CHECK:           %[[VAL_3:.*]] = "tf.Identity"(%[[VAL_2]]) {device = ""} : (tensor<*xf32>) -> tensor<*xf32>
+// CHECK:           return %[[VAL_3]] : tensor<*xf32>
+// CHECK:         }
\ No newline at end of file
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tf-fb-tf.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tf-fb-tf.mlir
deleted file mode 100644
index d0da1f09fa5ae1..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tf-fb-tf.mlir
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | flatbuffer_translate -mlir-to-tflite-flatbuffer - -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck %s
-
-module {
-func.func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> {
-  %0 = stablehlo.add %arg0, %arg0 : tensor<2xi32>
-  %1 = stablehlo.subtract %0, %arg0 : tensor<2xi32>
-  func.return %1 : tensor<2xi32>
-}
-}
-
-// CHECK:       func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> attributes {tf.entry_function = {inputs = "arg0", outputs = "tfl.custom1"}} {
-// CHECK-NEXT:    %0 = "tfl.custom"(%arg0, %arg0) {custom_code = "stablehlo.add", custom_option = #tfl<const_bytes : "0x00000100002401">} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
-// CHECK-NEXT:    %1 = "tfl.custom"(%0, %arg0) {custom_code = "stablehlo.subtract", custom_option = #tfl<const_bytes : "0x00000100002401">} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
-// CHECK-NEXT:    return %1 : tensor<2xi32>
-// CHECK-NEXT:  }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-add.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-add.mlir
deleted file mode 100644
index b0eb02192f4dad..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-add.mlir
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | FileCheck %s
-
-module {
-func.func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> {
-  %0 = stablehlo.add %arg0, %arg0 : tensor<2xi32>
-  func.return %0 : tensor<2xi32>
-}
-}
-
-// CHECK:       module {
-// CHECK-NEXT:    func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> {
-// CHECK-NEXT:      %0 = "tfl.custom"(%arg0, %arg0) {custom_code = "stablehlo.add", custom_option = #tfl<const_bytes : "0x00000100002401">} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
-// CHECK-NEXT:      return %0 : tensor<2xi32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:  }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-broadcast_in_dim.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-broadcast_in_dim.mlir
deleted file mode 100644
index 85653de898aa01..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-broadcast_in_dim.mlir
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | FileCheck %s
-
-module {
-func.func @main(%arg0: tensor<1x2xi32>) -> tensor<1x2x2xi32> {
-  %0= "stablehlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = array<i64: 1, 2>} : (tensor<1x2xi32>) -> tensor<1x2x2xi32>
-  func.return %0 : tensor<1x2x2xi32>
-}
-}
-
-// CHECK:       module {
-// CHECK-NEXT:    func @main(%arg0: tensor<1x2xi32>) -> tensor<1x2x2xi32> {
-// CHECK-NEXT:      %0 = "tfl.custom"(%arg0) {custom_code = "stablehlo.broadcast_in_dim", custom_option = #tfl<const_bytes : "0x62726F6164636173745F64696D656E73696F6E73000201020119010101072C022401">} : (tensor<1x2xi32>) -> tensor<1x2x2xi32>
-// CHECK-NEXT:      return %0 : tensor<1x2x2xi32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:  }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-clamp.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-clamp.mlir
deleted file mode 100644
index 2d0051afde986b..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-clamp.mlir
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | FileCheck %s
-
-module {
-func.func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> {
-  %0 = "stablehlo.clamp"(%arg0, %arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
-  func.return %0 : tensor<2xi32>
-}
-}
-
-// CHECK:       module {
-// CHECK-NEXT:    func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> {
-// CHECK-NEXT:      %0 = "tfl.custom"(%arg0, %arg0, %arg0) {custom_code = "stablehlo.clamp", custom_option = #tfl<const_bytes : "0x00000100002401">} : (tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
-// CHECK-NEXT:      return %0 : tensor<2xi32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:  }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-compare.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-compare.mlir
deleted file mode 100644
index 44b69ab933039f..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-compare.mlir
+++ /dev/null
@@ -1,19 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | FileCheck %s
-
-module {
-func.func @main(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>, %arg2: tensor<2xf32>, %arg3: tensor<2xf32>) -> tensor<2xi1> {
-  %0 = stablehlo.compare LT, %arg0, %arg1 : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1>
-  %1 = stablehlo.compare LT, %arg0, %arg1, TOTALORDER : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1>
-  %2 = stablehlo.compare GT, %arg2, %arg3 : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xi1>
-  func.return %2 : tensor<2xi1>
-}
-}
-
-// CHECK:      module {
-// CHECK-NEXT:   func.func @main(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>, %arg2: tensor<2xf32>, %arg3: tensor<2xf32>) -> tensor<2xi1> {
-// CHECK-NEXT:     %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "stablehlo.compare", custom_option = #tfl<const_bytes : "0x636F6D70617269736F6E5F646972656374696F6E00024C5400011A0101010814022401">} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1>
-// CHECK-NEXT:     %1 = "tfl.custom"(%arg0, %arg1) {custom_code = "stablehlo.compare", custom_option = #tfl<const_bytes : "0x636F6D706172655F74797065000A544F54414C4F5244455200636F6D70617269736F6E5F646972656374696F6E00024C540002331B0201022A0A1414042401">} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1>
-// CHECK-NEXT:     %2 = "tfl.custom"(%arg2, %arg3) {custom_code = "stablehlo.compare", custom_option = #tfl<const_bytes : "0x636F6D70617269736F6E5F646972656374696F6E0002475400011A0101010814022401">} : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xi1>
-// CHECK-NEXT:     return %2 : tensor<2xi1>
-// CHECK-NEXT:   }
-// CHECK-NEXT: }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-composite.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-composite.mlir
new file mode 100644
index 00000000000000..41a94b929c0f47
--- /dev/null
+++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-composite.mlir
@@ -0,0 +1,37 @@
+// RUN: odml-to-stablehlo-opt %s -stablehlo-composite-legalize-tfl-custom | FileCheck %s
+// RUN: tf_tfl_translate --enable-hlo-to-tf-conversion --input-mlir %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck %s --check-prefix=CHECK-ROUNDTRIP
+
+module {
+  func.func public @main(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>, %arg2: tensor<1x100x32x4xf32>,
+      %arg3: tensor<1x500x4x4xf32>, %arg4: tensor<1x500x4x4xf32>, %arg5: tensor<1x1x100x500xf32>, %arg6: tensor<f32>)
+      -> (tensor<3x3xf32>, tensor<1x100x32x4xf32>) {
+    // CHECK-ROUNDTRIP: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "odml.update_kv_cache", custom_option = #tfl<const_bytes : "0x6B765F63616368655F6D617800010E00020001000100F40105032501">} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32>
+    // CHECK-ROUNDTRIP: %1 = "tfl.custom"(%arg2, %arg3, %arg4, %arg5, %arg6) {custom_code = "odml.scaled_dot_product_attention", custom_option = #tfl<const_bytes : "0x00000100002401">} : (tensor<1x100x32x4xf32>, tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<1x1x100x500xf32>, tensor<f32>) -> tensor<1x100x32x4xf32>
+    %0 = func.call @test_kv_cache(%arg0, %arg1) : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32>
+    %1 = func.call @test_sdpa(%arg2, %arg3, %arg4, %arg5, %arg6) : (tensor<1x100x32x4xf32>,  tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<1x1x100x500xf32>, tensor<f32>) -> tensor<1x100x32x4xf32>
+    return %0, %1 : tensor<3x3xf32>, tensor<1x100x32x4xf32>
+  }
+
+  // CHECK-LABEL: func.func private @test_kv_cache
+  func.func private @test_kv_cache(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<3x3xf32> {
+    // CHECK: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "odml.update_kv_cache", custom_option = #tfl<const_bytes : "0x6B765F63616368655F6D617800010E00020001000100F40105032501">} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32>
+    %0 = stablehlo.composite "odml.update_kv_cache" %arg0, %arg1 {composite_attributes = {kv_cache_max = 500 : i64}, decomposition = @odml.update_kv_cache.impl} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32>
+    return %0 : tensor<3x3xf32>
+  }
+  func.func private @odml.update_kv_cache.impl(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<3x3xf32> {
+    // No decomposition provided for test case.
+    return %arg0 : tensor<3x3xf32>
+  }
+
+  // CHECK-LABEL: func.func private @test_sdpa
+  func.func private @test_sdpa(%arg0: tensor<1x100x32x4xf32>, %arg1: tensor<1x500x4x4xf32>, %arg2: tensor<1x500x4x4xf32>, %arg3: tensor<1x1x100x500xf32>, %arg4: tensor<f32>) -> tensor<1x100x32x4xf32> {
+    // CHECK:  %0 = "tfl.custom"(%arg0, %arg1, %arg2, %arg3, %arg4) {custom_code = "odml.scaled_dot_product_attention", custom_option = #tfl<const_bytes : "0x00000100002401">} : (tensor<1x100x32x4xf32>, tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<1x1x100x500xf32>, tensor<f32>) -> tensor<1x100x32x4xf32>
+    %0 = stablehlo.composite "odml.scaled_dot_product_attention" %arg0, %arg1, %arg2, %arg3, %arg4 {decomposition = @odml.scaled_dot_product_attention.impl} : (tensor<1x100x32x4xf32>, tensor<1x500x4x4xf32>, tensor<1x500x4x4xf32>, tensor<1x1x100x500xf32>, tensor<f32>) -> tensor<1x100x32x4xf32>
+    return %0 : tensor<1x100x32x4xf32>
+  }
+  func.func private @odml.scaled_dot_product_attention.impl(%arg0: tensor<1x100x32x4xf32>, %arg1: tensor<1x500x4x4xf32>, %arg2: tensor<1x500x4x4xf32>, %arg3: tensor<1x1x100x500xf32>, %arg4: tensor<f32>) -> tensor<1x100x32x4xf32> {
+    // No decomposition provided for test case.
+    return %arg0 : tensor<1x100x32x4xf32>
+  }
+
+}
\ No newline at end of file
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-concat.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-concat.mlir
deleted file mode 100644
index 4be83175a417e1..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-concat.mlir
+++ /dev/null
@@ -1,18 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | FileCheck %s
-
-module {
-func.func @main(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<6x3xf32> {
-  %1 = "stablehlo.concatenate"(%arg0, %arg1) {dimension = 0 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<6x3xf32>
-  func.return %1 : tensor<6x3xf32>
-}
-}
-
-// CHECK:      module {
-// CHECK-NEXT:    func @main(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<6x3xf32> {
-// CHECK-NEXT:      %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "stablehlo.concatenate", custom_option = #tfl<const_bytes : "0x64696D656E73696F6E00010B0101010004022401">} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<6x3xf32>
-// CHECK-NEXT:      return %0 : tensor<6x3xf32>
-// CHECK-NEXT:    }
-// CHECK-NEXT: }
-
-
-
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-constant.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-constant.mlir
deleted file mode 100644
index 62c2253869c725..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-constant.mlir
+++ /dev/null
@@ -1,17 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | FileCheck %s
-
-module {
-func.func @main() -> tensor<2xf32> {
-  %0 = stablehlo.constant dense<2> : tensor<i32>
-  %1 = stablehlo.constant dense<[10.0, 11.0]> : tensor<2xf32>
-  func.return %1 : tensor<2xf32>
-}
-}
-
-// CHECK:       module {
-// CHECK-NEXT:    func.func @main() -> tensor<2xf32> {
-// CHECK-NEXT:      %0 = "tfl.custom"() {custom_code = "stablehlo.constant", custom_option = #tfl<const_bytes : "0x76616C75650001020109010101062C022401">} : () -> tensor<i32>
-// CHECK-NEXT:      %1 = "tfl.custom"() {custom_code = "stablehlo.constant", custom_option = #tfl<const_bytes : "0x76616C756500000002000000000020410000304101150101010D36022401">} : () -> tensor<2xf32>
-// CHECK-NEXT:      return %1 : tensor<2xf32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:  }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-conv.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-conv.mlir
deleted file mode 100644
index aa7742c15e4c42..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-conv.mlir
+++ /dev/null
@@ -1,27 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | FileCheck -dump-input always %s
-
-module {
-func.func @main(%arg0: tensor<8x8x1x207xf32>, %arg1: tensor<3x3x16x207xf32>) -> tensor<16x8x8x1xf32> {
-  %0 = "stablehlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64,
-    dimension_numbers = #stablehlo.conv<raw
-      input_batch_dimension = 2,
-      input_feature_dimension = 3,
-      input_spatial_dimensions = [0, 1],
-      kernel_input_feature_dimension = 3,
-      kernel_output_feature_dimension = 2,
-      kernel_spatial_dimensions = [0, 1],
-      output_batch_dimension = 3,
-      output_feature_dimension = 0,
-      output_spatial_dimensions = [1, 2]
-    >, feature_group_count = 1 : i64, lhs_dilation = array<i64: 1, 1>, padding = dense<1> : tensor<2x2xi64>, precision_config = [#stablehlo<precision DEFAULT>, #stablehlo<precision DEFAULT>], rhs_dilation = array<i64: 1, 1>, window_strides = array<i64: 1, 1>, window_reversal = array<i1: true, false>} :
-       (tensor<8x8x1x207xf32>, tensor<3x3x16x207xf32>) -> tensor<16x8x8x1xf32>
-  func.return %0 : tensor<16x8x8x1xf32>
-}
-}
-
-// CHECK: module {
-// CHECK-NEXT:   func.func @main(%arg0: tensor<8x8x1x207xf32>, %arg1: tensor<3x3x16x207xf32>) -> tensor<16x8x8x1xf32> {
-// CHECK-NEXT:     %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "stablehlo.convolution", custom_option = #tfl<const_bytes : "0x62617463685F67726F75705F636F756E740064696D656E73696F6E5F6E756D62657273000200010404020001040402010204040902031103020F03000D040428040428040428666561747572655F67726F75705F636F756E74006C68735F64696C6174696F6E0002010170616464696E67000401010101707265636973696F6E5F636F6E666967000744454641554C54000744454641554C540002120A7268735F64696C6174696F6E0002010177696E646F775F726576657273616C0002010077696E646F775F737472696465730002010109D3C28F7C6D613C2D1B09010901AC017A70493A28170428042C2C3C2C902C122401">} : (tensor<8x8x1x207xf32>, tensor<3x3x16x207xf32>) -> tensor<16x8x8x1xf32>
-// CHECK-NEXT:     return %0 : tensor<16x8x8x1xf32>
-// CHECK-NEXT:   }
-// CHECK-NEXT: }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-dot.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-dot.mlir
deleted file mode 100644
index ef715f778e8292..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-dot.mlir
+++ /dev/null
@@ -1,22 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | FileCheck %s
-
-module {
-func.func @main(%arg0: tensor<72x2048xf32>, %arg1: tensor<2048x512xf32>) -> tensor<72x512xf32> {
-  %0 = "stablehlo.dot"(%arg0, %arg1) {
-    dimension_numbers = #stablehlo.dot<
-      lhs_batching_dimensions = [0, 1],
-      rhs_batching_dimensions = [1, 2],
-      lhs_contracting_dimensions = [0, 1],
-      rhs_contracting_dimensions = [1, 2]
-    >} :
-       (tensor<72x2048xf32>, tensor<2048x512xf32>) -> tensor<72x512xf32>
-  func.return %0 : tensor<72x512xf32>
-}
-}
-
-// CHECK:      module {
-// CHECK-NEXT:    func.func @main(%arg0: tensor<72x2048xf32>, %arg1: tensor<2048x512xf32>) -> tensor<72x512xf32> {
-// CHECK-NEXT:    %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "stablehlo.dot", custom_option = #tfl<const_bytes : "0x64696D656E73696F6E5F6E756D626572730002000104040201020404020001040402010204040414100C082828282801300101010D28022401">} : (tensor<72x2048xf32>, tensor<2048x512xf32>) -> tensor<72x512xf32>
-// CHECK-NEXT:    return %0 : tensor<72x512xf32>
-// CHECK-NEXT:    }
-// CHECK-NEXT: }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-gather.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-gather.mlir
deleted file mode 100644
index 47c716c0ca5243..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-gather.mlir
+++ /dev/null
@@ -1,23 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | FileCheck %s
-
-module {
-func.func @main(%arg0: tensor<1x128x256xf32>, %arg1: tensor<30x1x2xi32>) -> tensor<30x1x256xf32> {
-  %0 = "stablehlo.gather"(%arg0, %arg1) {
-    dimension_numbers = #stablehlo.gather<
-      offset_dims = [2],
-      collapsed_slice_dims = [0, 1],
-      start_index_map = [0, 1],
-      index_vector_dim = 2>,
-    indices_are_sorted = false,
-    slice_sizes = array<i64: 1, 1, 256>} :
-       (tensor<1x128x256xf32>, tensor<30x1x2xi32>) -> tensor<30x1x256xf32>
-  func.return %0 : tensor<30x1x256xf32>
-}
-}
-
-// CHECK:      module {
-// CHECK-NEXT:   func.func @main(%arg0: tensor<1x128x256xf32>, %arg1: tensor<30x1x2xi32>) -> tensor<30x1x256xf32> {
-// CHECK-NEXT:     %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "stablehlo.gather", custom_option = #tfl<const_bytes : "0x64696D656E73696F6E5F6E756D626572730001020402000104040200010404040D0B070228282804696E64696365735F6172655F736F7274656400736C6963655F73697A65730000030001000100000103512A1803010337000F28042D062401">} : (tensor<1x128x256xf32>, tensor<30x1x2xi32>) -> tensor<30x1x256xf32>
-// CHECK-NEXT:     return %0 : tensor<30x1x256xf32>
-// CHECK-NEXT:   }
-// CHECK-NEXT: }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-max.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-max.mlir
deleted file mode 100644
index e8ccfcaee07805..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-max.mlir
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | FileCheck %s
-
-module {
-func.func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> {
-  %0 = stablehlo.maximum %arg0, %arg0 : tensor<2xi32>
-  func.return %0 : tensor<2xi32>
-}
-}
-
-// CHECK:       module {
-// CHECK-NEXT:    func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> {
-// CHECK-NEXT:      %0 = "tfl.custom"(%arg0, %arg0) {custom_code = "stablehlo.maximum", custom_option = #tfl<const_bytes : "0x00000100002401">} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
-// CHECK-NEXT:      return %0 : tensor<2xi32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:  }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-mul.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-mul.mlir
deleted file mode 100644
index b4bcbc455f2d24..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-mul.mlir
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | FileCheck %s
-
-module {
-func.func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> {
-  %0 = stablehlo.multiply %arg0, %arg0 : tensor<2xi32>
-  func.return %0 : tensor<2xi32>
-}
-}
-
-// CHECK:       module {
-// CHECK-NEXT:    func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> {
-// CHECK-NEXT:      %0 = "tfl.custom"(%arg0, %arg0) {custom_code = "stablehlo.multiply", custom_option = #tfl<const_bytes : "0x00000100002401">} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
-// CHECK-NEXT:      return %0 : tensor<2xi32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:  }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-pad.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-pad.mlir
deleted file mode 100644
index bffb1da2b07117..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-pad.mlir
+++ /dev/null
@@ -1,19 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | FileCheck %s
-
-module {
-func.func @main(%arg0: tensor<8x128xf32>, %arg1: tensor<f32>) -> tensor<11x131xf32> {
-  %0 = "stablehlo.pad"(%arg0, %arg1) {
-    edge_padding_low = array<i64: 1, 0>,
-    edge_padding_high = array<i64: 2, 3>,
-    interior_padding = array<i64: 0, 0>
-  } : (tensor<8x128xf32>, tensor<f32>) -> tensor<11x131xf32>
-  func.return %0 : tensor<11x131xf32>
-}
-}
-
-// CHECK:       module {
-// CHECK-NEXT:    func @main(%arg0: tensor<8x128xf32>, %arg1: tensor<f32>) -> tensor<11x131xf32> {
-// CHECK-NEXT:      %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "stablehlo.pad", custom_option = #tfl<const_bytes : "0x656467655F70616464696E675F6869676800020203656467655F70616464696E675F6C6F7700020100696E746572696F725F70616464696E6700020000033E2A17030103311E0B2C2C2C062401">} : (tensor<8x128xf32>, tensor<f32>) -> tensor<11x131xf32>
-// CHECK-NEXT:      return %0 : tensor<11x131xf32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:  }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-reshape.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-reshape.mlir
deleted file mode 100644
index 281f14bf8b844e..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-reshape.mlir
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | FileCheck %s
-
-module {
-func.func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> {
-  %0 = "stablehlo.reshape"(%arg0) : (tensor<2xi32>) -> tensor<2xi32>
-  func.return %0 : tensor<2xi32>
-}
-}
-
-// CHECK:       module {
-// CHECK-NEXT:    func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> {
-// CHECK-NEXT:      %0 = "tfl.custom"(%arg0) {custom_code = "stablehlo.reshape", custom_option = #tfl<const_bytes : "0x00000100002401">} : (tensor<2xi32>) -> tensor<2xi32>
-// CHECK-NEXT:      return %0 : tensor<2xi32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:  }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-rsqrt.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-rsqrt.mlir
deleted file mode 100644
index f352e19959cba1..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-rsqrt.mlir
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | FileCheck %s
-
-module {
-func.func @main(%arg0: tensor<2xf32>) -> tensor<2xf32> {
-  %0 = "stablehlo.rsqrt"(%arg0) : (tensor<2xf32>) -> tensor<2xf32>
-  func.return %0 : tensor<2xf32>
-}
-}
-
-// CHECK:       module {
-// CHECK-NEXT:    func @main(%arg0: tensor<2xf32>) -> tensor<2xf32> {
-// CHECK-NEXT:      %0 = "tfl.custom"(%arg0) {custom_code = "stablehlo.rsqrt", custom_option = #tfl<const_bytes : "0x00000100002401">} : (tensor<2xf32>) -> tensor<2xf32>
-// CHECK-NEXT:      return %0 : tensor<2xf32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:  }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-scatter.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-scatter.mlir
deleted file mode 100644
index 5bd79227f576b8..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-scatter.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | FileCheck %s
-
-module {
-func.func @main(%arg0: tensor<3xi32>, %arg1: tensor<1x1xi32>, %arg2: tensor<1xi32>) -> tensor<3xi32> {
-  %0 = "stablehlo.scatter"(%arg0, %arg1, %arg2) ({
-  ^bb0(%arg3: tensor<i32>, %arg4: tensor<i32>):
-    "stablehlo.return"(%arg4) : (tensor<i32>) -> ()
-  }) {
-    scatter_dimension_numbers = #stablehlo.scatter<
-      update_window_dims = [],
-      inserted_window_dims = [0],
-      scatter_dims_to_operand_dims = [0],
-      index_vector_dim = 1>,
-    indices_are_sorted = false,
-    unique_indices = false} :
-       (tensor<3xi32>, tensor<1x1xi32>, tensor<1xi32>) -> tensor<3xi32>
-  func.return %0 : tensor<3xi32>
-}
-}
-
-// CHECK:      module {
-// CHECK-NEXT:   func.func @main(%arg0: tensor<3xi32>, %arg1: tensor<1x1xi32>, %arg2: tensor<1xi32>) -> tensor<3xi32> {
-// CHECK-NEXT:     %0 = "tfl.custom"(%arg0, %arg1, %arg2) {custom_code = "stablehlo.scatter", custom_option = #tfl<const_bytes : "0x696E64696365735F6172655F736F7274656400736361747465725F64696D656E73696F6E5F6E756D626572730000010004010004040707050128282804756E697175655F696E646963657300034D3B12030103001F00042804062401">} : (tensor<3xi32>, tensor<1x1xi32>, tensor<1xi32>) -> tensor<3xi32>
-// CHECK-NEXT:     return %0 : tensor<3xi32>
-// CHECK-NEXT:   }
-// CHECK-NEXT: }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-sub.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-sub.mlir
deleted file mode 100644
index bc4f72fd2bcd48..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-sub.mlir
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | FileCheck %s
-
-module {
-func.func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> {
-  %0 = stablehlo.subtract %arg0, %arg0 : tensor<2xi32>
-  func.return %0 : tensor<2xi32>
-}
-}
-
-// CHECK:       module {
-// CHECK-NEXT:    func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> {
-// CHECK-NEXT:      %0 = "tfl.custom"(%arg0, %arg0) {custom_code = "stablehlo.subtract", custom_option = #tfl<const_bytes : "0x00000100002401">} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
-// CHECK-NEXT:      return %0 : tensor<2xi32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:  }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl.mlir
deleted file mode 100644
index 8898fac4288218..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl.mlir
+++ /dev/null
@@ -1,17 +0,0 @@
-// RUN: odml-to-stablehlo-opt %s -stablehlo-tfl | FileCheck %s
-
-module {
-func.func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> {
-  %0 = stablehlo.add %arg0, %arg0 : tensor<2xi32>
-  %1 = stablehlo.subtract %0, %arg0 : tensor<2xi32>
-  func.return %1 : tensor<2xi32>
-}
-}
-
-// CHECK:       module {
-// CHECK-NEXT:    func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> {
-// CHECK-NEXT:      %0 = "tfl.custom"(%arg0, %arg0) {custom_code = "stablehlo.add", custom_option = #tfl<const_bytes : "0x00000100002401">} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
-// CHECK-NEXT:      %1 = "tfl.custom"(%0, %arg0) {custom_code = "stablehlo.subtract", custom_option = #tfl<const_bytes : "0x00000100002401">} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
-// CHECK-NEXT:      return %1 : tensor<2xi32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:  }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/odml-to-stablehlo-smuggle-resize.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/odml-to-stablehlo-smuggle-resize.mlir
index ec8ab139054e63..4a0f6a5d5e673b 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/odml-to-stablehlo-smuggle-resize.mlir
+++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/odml-to-stablehlo-smuggle-resize.mlir
@@ -1,10 +1,12 @@
 // RUN: odml_to_stablehlo %s -skip-resize -smuggle-disallowed-ops -o - | FileCheck %s
+// RUN: odml-to-stablehlo-opt %s --smuggle-disallowed-ops-pass | FileCheck %s --check-prefix=CHECK-OPT
 
 // CHECK-LABEL: @main
 module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 975 : i32}, tf_saved_model.semantics}  {
   func.func @serving_default(%arg0: tensor<1x32x32x128xf32> {tf_saved_model.index_path = ["a"]}) -> (tensor<1x64x64x128xf32> {tf_saved_model.index_path = ["b"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "c:0", outputs = "d:0"}, tf_saved_model.exported_names = ["serving_default"]} {
       %0  = "tf.Const"() {value = dense<[56, 904]> : tensor<2xi32>} : () -> tensor<2xi32>
       // CHECK: %1 = stablehlo.custom_call @tf.ResizeBilinear(%arg0, %0) {align_corners = false, device = "", half_pixel_centers = true} : (tensor<1x32x32x128xf32>, tensor<2xi32>) -> tensor<1x64x64x128xf32>
+      // CHECK-OPT: %0 = stablehlo.custom_call @tf.ResizeBilinear(%arg0, %cst) {align_corners = false, device = "", half_pixel_centers = true} : (tensor<1x32x32x128xf32>, tensor<2xi32>) -> tensor<1x64x64x128xf32>
       %1 = "tf.ResizeBilinear"(%arg0, %0) {
         align_corners = false, device = "", half_pixel_centers = true
       } : (tensor<1x32x32x128xf32>, tensor<2xi32>) -> tensor<1x64x64x128xf32>
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/stablehlo-custom-call-legalize-composite.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/stablehlo-custom-call-legalize-composite.mlir
new file mode 100644
index 00000000000000..b2b12c4c47b579
--- /dev/null
+++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/stablehlo-custom-call-legalize-composite.mlir
@@ -0,0 +1,18 @@
+// RUN: odml-to-stablehlo-opt %s -stablehlo-custom-call-legalize-composite | FileCheck %s
+
+// CHECK-LABEL: module
+module {
+  // CHECK-LABEL: @main
+  func.func @main(%arg0: tensor<1xf32>, %arg1: tensor<2xf32>) {
+    // CHECK: stablehlo.custom_call @foo
+    stablehlo.custom_call @foo() : () -> ()
+    // CHECK-NOT: stablehlo.custom_call
+    // CHECK: stablehlo.composite "odml.foo" %arg0, %arg1 {composite_attributes = {bar = 500 : i64}, decomposition = @foo.impl} : (tensor<1xf32>, tensor<2xf32>) -> (tensor<2xf32>, tensor<1xf32>)
+    %1:2 = stablehlo.custom_call @stablehlo.composite(%arg0, %arg1) {called_computations = [@foo.impl], composite.backend_config = {attributes = {bar = 500 : i64}, name = "odml.foo"}} : (tensor<1xf32>, tensor<2xf32>) -> (tensor<2xf32>, tensor<1xf32>)
+    return
+  }
+  // CHECK-LABEL: func private @foo.impl
+  func.func private @foo.impl(%arg0: tensor<1xf32>, %arg1: tensor<2xf32>) -> (tensor<2xf32>, tensor<1xf32>) {
+    return %arg1, %arg0 : tensor<2xf32>, tensor<1xf32>
+  }
+}
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/uniform-quantized-stablehlo-to-tfl.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/uniform-quantized-stablehlo-to-tfl.mlir
index 9a9ea66195f7cb..7107f7dcb08a45 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/tests/uniform-quantized-stablehlo-to-tfl.mlir
+++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/uniform-quantized-stablehlo-to-tfl.mlir
@@ -186,12 +186,10 @@ func.func @convolution_upstream_srq_strides(%arg0: tensor<1x3x3x4x!quant.uniform
 }
 // CHECK-LABEL: convolution_upstream_srq_strides
 // CHECK-SAME: %[[ARG:.+]]: tensor<1x3x3x4x!quant.uniform<i8:f32, 1.000000e+00:-100>>
-// CHECK-DAG: %[[CONST_0:.+]] = "tfl.pseudo_const"() {value = dense<{{\[\[0, 0\], \[1, 1\], \[1, 1\], \[0, 0\]\]}}> : tensor<4x2xi32>} : () -> tensor<4x2xi32>
 // CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x4x!quant.uniform<i8<-127:127>:f32:0, {2.000000e+02,3.000000e+03}>>, value = dense<3> : tensor<2x3x3x4xi8>} : () -> tensor<2x3x3x4x!quant.uniform<i8<-127:127>:f32:0, {2.000000e+02,3.000000e+03}>>
 // CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform<i32:f32:0, {2.000000e+02,3.000000e+03}>>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform<i32:f32:0, {2.000000e+02,3.000000e+03}>>
-// CHECK: %[[PAD:.+]] = "tfl.pad"(%[[ARG]], %[[CONST_0]]) : (tensor<1x3x3x4x!quant.uniform<i8:f32, 1.000000e+00:-100>>, tensor<4x2xi32>) -> tensor<1x5x5x4x!quant.uniform<i8:f32, 1.000000e+00:-100>>
 // Tests that the stride_w is set to 2.
-// CHECK: %[[CONV2D:.+]] = "tfl.conv_2d"(%[[PAD]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 2 : i32} : (tensor<1x5x5x4x!quant.uniform<i8:f32, 1.000000e+00:-100>>, tensor<2x3x3x4x!quant.uniform<i8<-127:127>:f32:0, {2.000000e+02,3.000000e+03}>>, tensor<2x!quant.uniform<i32:f32:0, {2.000000e+02,3.000000e+03}>>) -> tensor<1x3x2x2x!quant.uniform<i8:f32, 4.000000e+00>>
+// CHECK: %[[CONV2D:.+]] = "tfl.conv_2d"(%[[ARG]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 2 : i32} : (tensor<1x3x3x4x!quant.uniform<i8:f32, 1.000000e+00:-100>>, tensor<2x3x3x4x!quant.uniform<i8<-127:127>:f32:0, {2.000000e+02,3.000000e+03}>>, tensor<2x!quant.uniform<i32:f32:0, {2.000000e+02,3.000000e+03}>>) -> tensor<1x3x2x2x!quant.uniform<i8:f32, 4.000000e+00>>
 // CHECK: return %[[CONV2D]] : tensor<1x3x2x2x!quant.uniform<i8:f32, 4.000000e+00>>
 
 // -----
@@ -766,7 +764,7 @@ func.func @conv_with_bias_same_padding_srq_depthwise(%arg0: tensor<1x4x5x3x!quan
 
 // -----
 
-// Tests that a quantized stablehlo.transpose is converted to tfl.transpose.
+// Tests that a quantized `stablehlo.transpose` is converted to `tfl.transpose`.
 
 func.func @transpose(
     %arg0: tensor<2x3x4x!quant.uniform<i8:f32, 2.000000e+00:-1>>
@@ -783,19 +781,19 @@ func.func @transpose(
 
 // -----
 
-// Tests that a float stablehlo.transpose is not converted to tfl.transpose.
+// Tests that a float `stablehlo.transpose` is not converted to `tfl.transpose`.
 
-func.func @float_transpose(%arg0: tensor<2x3x4xf32>) -> tensor<4x3x2xf32> {
+func.func @transpose_float(%arg0: tensor<2x3x4xf32>) -> tensor<4x3x2xf32> {
   %0 = stablehlo.transpose %arg0, dims = [2, 1, 0] : (tensor<2x3x4xf32>) -> tensor<4x3x2xf32>
   return %0 : tensor<4x3x2xf32>
 }
-// CHECK-LABEL: float_transpose
+// CHECK-LABEL: transpose_float
 // CHECK-NOT: tfl.transpose
 // CHECK: stablehlo.transpose
 
 // -----
 
-// Tests that a quantized stablehlo.reshape is converted to tfl.reshape.
+// Tests that a quantized `stablehlo.reshape` is converted to `tfl.reshape`.
 
 func.func @reshape(
     %arg0: tensor<2x3x4x!quant.uniform<i8:f32, 2.000000e+00:-1>>
@@ -812,19 +810,19 @@ func.func @reshape(
 
 // -----
 
-// Tests that a float stablehlo.reshape is not converted to tfl.reshape.
+// Tests that a float `stablehlo.reshape` is not converted to `tfl.reshape`.
 
-func.func @float_reshape(%arg0: tensor<2x3x4xf32>) -> tensor<6x4xf32> {
+func.func @reshape_float(%arg0: tensor<2x3x4xf32>) -> tensor<6x4xf32> {
   %0 = stablehlo.reshape %arg0 : (tensor<2x3x4xf32>) -> tensor<6x4xf32>
   return %0 : tensor<6x4xf32>
 }
-// CHECK-LABEL: float_reshape
+// CHECK-LABEL: reshape_float
 // CHECK-NOT: tfl.reshape
 // CHECK: stablehlo.reshape
 
 // -----
 
-// Tests that a quantized stablehlo.select is converted to tfl.select_v2.
+// Tests that a quantized `stablehlo.select` is converted to `tfl.select_v2`.
 
 func.func @select(
     %arg0: tensor<1x3xi1>,
@@ -846,19 +844,20 @@ func.func @select(
 
 // -----
 
-// Tests that a float stablehlo.select is not converted to tfl.select_v2.
+// Tests that a float `stablehlo.select` is not converted to `tfl.select_v2`.
 
-func.func @float_select(%arg0: tensor<1x3xi1>, %arg1: tensor<1x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> {
+func.func @select_float(%arg0: tensor<1x3xi1>, %arg1: tensor<1x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> {
   %0 = "stablehlo.select"(%arg0, %arg1, %arg2) : (tensor<1x3xi1>, tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32>
   return %0 : tensor<1x3xf32>
 }
-// CHECK-LABEL: float_select
+// CHECK-LABEL: select_float
 // CHECK-NOT: tfl.select_v2
 // CHECK: stablehlo.select
 
 // -----
 
-// Tests that a quantized stablehlo.concatenate is converted to tfl.concatenation.
+// Tests that a quantized `stablehlo.concatenate` is converted to
+// `tfl.concatenation`.
 
 func.func @concatenate(
     %arg0: tensor<3x2x!quant.uniform<i8:f32, 2.000000e+00:-1>>,
@@ -878,20 +877,21 @@ func.func @concatenate(
 
 // -----
 
-// Tests that a float stablehlo.concatenate is not converted to tfl.concatenation.
+// Tests that a float `stablehlo.concatenate` is not converted to
+// `tfl.concatenation`.
 
-func.func @float_concatenate(%arg0: tensor<3x2xf32>, %arg1: tensor<1x2xf32>) -> tensor<4x2xf32> {
+func.func @concatenate_float(%arg0: tensor<3x2xf32>, %arg1: tensor<1x2xf32>) -> tensor<4x2xf32> {
   %0 = "stablehlo.concatenate"(%arg0, %arg1) {dimension = 0 : i64} : (tensor<3x2xf32>, tensor<1x2xf32>) -> tensor<4x2xf32>
   return %0 : tensor<4x2xf32>
 }
-// CHECK-LABEL: float_concatenate
+// CHECK-LABEL: concatenate_float
 // CHECK-NOT: tfl.concatenation
 // CHECK: stablehlo.concatenate
 
 // -----
 
-// Tests that a quantized stablehlo.pad without interior padding is converted to
-// tfl.padv2.
+// Tests that a quantized `stablehlo.pad` without interior padding is
+// converted to `tfl.padv2`.
 
 func.func @pad_without_interior_padding(
     %arg0: tensor<2x3x!quant.uniform<i8:f32, 2.000000e+00:-1>>,
@@ -913,8 +913,8 @@ func.func @pad_without_interior_padding(
 
 // -----
 
-// Tests that a quantized stablehlo.pad with interior padding is converted to
-// tfl.dilate and tfl.padv2.
+// Tests that a quantized `stablehlo.pad` with interior padding is converted to
+// `tfl.dilate` and `tfl.padv2`.
 
 func.func @pad_with_interior_padding(
     %arg0: tensor<2x3x!quant.uniform<i8:f32, 2.000000e+00:-1>>,
@@ -939,20 +939,20 @@ func.func @pad_with_interior_padding(
 
 // -----
 
-// Tests that a float stablehlo.pad is not converted to tfl.padv2.
+// Tests that a float `stablehlo.pad` is not converted to `tfl.padv2`.
 
-func.func @float_pad(%arg0: tensor<2x3xf32>, %arg1: tensor<f32>) -> tensor<4x5xf32> {
+func.func @pad_float(%arg0: tensor<2x3xf32>, %arg1: tensor<f32>) -> tensor<4x5xf32> {
   %0 = stablehlo.pad %arg0, %arg1, low = [0, 1], high = [2, 1], interior = [0, 0] : (tensor<2x3xf32>, tensor<f32>) -> tensor<4x5xf32>
   return %0 : tensor<4x5xf32>
 }
-// CHECK-LABEL: float_pad
+// CHECK-LABEL: pad_float
 // CHECK-NOT: tfl.padv2
 // CHECK: stablehlo.pad
 
 // -----
 
-// Tests that a quantized stablehlo.slice is converted to tfl.slice when stride
-// is 1.
+// Tests that a quantized `stablehlo.slice` is converted to
+// `tfl.slice` when stride is 1.
 
 func.func @slice(
     %arg0: tensor<3x4x!quant.uniform<i8:f32, 2.000000e+00:-1>>
@@ -975,8 +975,8 @@ func.func @slice(
 
 // -----
 
-// Tests that a quantized stablehlo.slice is converted to tfl.strided_slice when
-// stride is not 1.
+// Tests that a quantized `stablehlo.slice` is converted to `tfl.strided_slice`
+// when stride is not 1.
 
 func.func @strided_slice(
     %arg0: tensor<3x6x!quant.uniform<i8:f32, 2.000000e+00:-1>>
@@ -1003,9 +1003,9 @@ func.func @strided_slice(
 
 // -----
 
-// Tests that a float stablehlo.slice is not converted to tfl.slice.
+// Tests that a float `stablehlo.slice` is not converted to `tfl.slice`.
 
-func.func @float_slice(%arg0: tensor<3x4xf32>) -> tensor<2x2xf32> {
+func.func @slice_float(%arg0: tensor<3x4xf32>) -> tensor<2x2xf32> {
   %0 = "stablehlo.slice"(%arg0) {
     start_indices = array<i64: 1, 2>,
     limit_indices = array<i64: 3, 4>,
@@ -1013,15 +1013,15 @@ func.func @float_slice(%arg0: tensor<3x4xf32>) -> tensor<2x2xf32> {
   } : (tensor<3x4xf32>) -> tensor<2x2xf32>
   return %0 : tensor<2x2xf32>
 }
-// CHECK-LABEL: float_slice
+// CHECK-LABEL: slice_float
 // CHECK-NOT: tfl.slice
 // CHECK-NOT: tfl.strided_slice
 // CHECK: stablehlo.slice
 
 // -----
 
-// Tests that a quantized stablehlo.broadcast_in_dim is converted to
-// tfl.broadcast_to.
+// Tests that a quantized `stablehlo.broadcast_in_dim` is converted to
+// `tfl.broadcast_to`.
 
 func.func @broadcast_in_dim(
     %arg0: tensor<1x2x!quant.uniform<i8:f32, 2.000000e+00:3>>
@@ -1040,8 +1040,8 @@ func.func @broadcast_in_dim(
 
 // -----
 
-// Tests that a quantized stablehlo.broadcast_in_dim is converted to
-// tfl.transpose and tfl.broadcast_to when broadcast_dimensions is not in
+// Tests that a quantized `stablehlo.broadcast_in_dim` is converted to
+// `tfl.transpose` and `tfl.broadcast_to` when `broadcast_dimensions` is not in
 // ascending order.
 
 func.func @broadcast_in_dim_with_transpose(
@@ -1064,8 +1064,8 @@ func.func @broadcast_in_dim_with_transpose(
 
 // -----
 
-// Tests that a quantized stablehlo.broadcast_in_dim is converted to
-// tfl.expand_dims and tfl.broadcast_to when input rank is smaller than output
+// Tests that a quantized `stablehlo.broadcast_in_dim` is converted to
+// tfl.expand_dims and `tfl.broadcast_to` when input rank is smaller than output
 // rank.
 
 func.func @broadcast_in_dim_with_expand(
@@ -1088,9 +1088,10 @@ func.func @broadcast_in_dim_with_expand(
 
 // -----
 
-// Tests that a quantized stablehlo.broadcast_in_dim is converted to
-// tfl.transpose, tfl.expand_dims and tfl.broadcast_to when broadcast_dimensions
-// is not in ascending order and input rank is smaller than output rank.
+// Tests that a quantized `stablehlo.broadcast_in_dim` is converted to
+// `tfl.transpose`, `tfl.expand_dims` and `tfl.broadcast_to` when
+// `broadcast_dimensions` is not in ascending order and input rank is smaller
+// than output rank.
 
 func.func @broadcast_in_dim_with_transpose_and_expand(
     %arg0: tensor<2x3x4x!quant.uniform<i8:f32, 2.000000e+00:3>>
@@ -1114,15 +1115,16 @@ func.func @broadcast_in_dim_with_transpose_and_expand(
 
 // -----
 
-// Tests that a float stablehlo.broadcast_in_dim is not converted to tfl.broadcast_to.
+// Tests that a float `stablehlo.broadcast_in_dim` is not converted to
+// `tfl.broadcast_to`.
 
-func.func @float_broadcast_in_dim(%arg0: tensor<1x2xf32>) -> tensor<3x2xf32> {
+func.func @broadcast_in_dim_float(%arg0: tensor<1x2xf32>) -> tensor<3x2xf32> {
   %0 = "stablehlo.broadcast_in_dim"(%arg0) {
     broadcast_dimensions = array<i64: 0, 1>
   } : (tensor<1x2xf32>) -> tensor<3x2xf32>
   return %0 : tensor<3x2xf32>
 }
-// CHECK-LABEL: float_broadcast_in_dim
+// CHECK-LABEL: broadcast_in_dim_float
 // CHECK-NOT: tfl.broadcast_to
 // CHECK-NOT: tfl.transpose
 // CHECK-NOT: tfl.expand_dims
@@ -1130,8 +1132,8 @@ func.func @float_broadcast_in_dim(%arg0: tensor<1x2xf32>) -> tensor<3x2xf32> {
 
 // -----
 
-// Test that a quantized stablehlo.reduce_window with max is converted to
-// tfl.max_pool_2d.
+// Tests that a quantized `stablehlo.reduce_window` with max is converted to
+// `tfl.max_pool_2d`.
 
 func.func @reduce_window_with_max(
   %arg0: tensor<2x9x10x3x!quant.uniform<i8:f32, 3.000000e-01:-5>>,
@@ -1155,8 +1157,8 @@ func.func @reduce_window_with_max(
 
 // -----
 
-// Test that a quantized stablehlo.reduce_window with max whose rank is not 4
-// is not converted to tfl.max_pool_2d.
+// Tests that a quantized `stablehlo.reduce_window `with max whose rank is not 4
+// is not converted to `tfl.max_pool_2d`.
 
 func.func @reduce_window_not_4d(
   %arg0: tensor<3x2x9x10x3x!quant.uniform<i8:f32, 3.000000e-01:-5>>,
@@ -1176,8 +1178,8 @@ func.func @reduce_window_not_4d(
 
 // -----
 
-// Test that a quantized stablehlo.reduce_window with max that takes multiple
-// inputs is not converted to tfl.max_pool_2d.
+// Tests that a quantized `stablehlo.reduce_window` with max that takes multiple
+// inputs is not converted to `tfl.max_pool_2d`.
 
 func.func @reduce_window_not_binary(
   %arg0: tensor<3x2x9x10x3x!quant.uniform<i8:f32, 3.000000e-01:-5>>,
@@ -1200,10 +1202,10 @@ func.func @reduce_window_not_binary(
 
 // -----
 
-// Test that a float stablehlo.reduce_window with max is not converted to
-// tfl.max_pool_2d.
+// Tests that a float `stablehlo.reduce_window` with max is not converted to
+// `tfl.max_pool_2d`.
 
-func.func @float_reduce_window_with_max(
+func.func @reduce_window_with_max_float(
   %arg0: tensor<2x9x10x3xf32>,
   %arg1: tensor<f32>
 ) -> tensor<2x4x3x3xf32> {
@@ -1215,13 +1217,14 @@ func.func @float_reduce_window_with_max(
   return %0 : tensor<2x4x3x3xf32>
 }
 
-// CHECK-LABEL: float_reduce_window_with_max
+// CHECK-LABEL: reduce_window_with_max_float
 // CHECK: stablehlo.reduce_window
 // CHECK-NOT: tfl.max_pool_2d
 
 // -----
 
-// Test that a quantized stablehlo.dynamic_reshape is converted to tfl.reshape.
+// Tests that a quantized `stablehlo.dynamic_reshape` is converted to
+// `tfl.reshape`.
 
 func.func @dynamic_reshape(
     %arg0: tensor<?x3x!quant.uniform<i8:f32, 3.000000e-01:-5>>,
@@ -1242,20 +1245,21 @@ func.func @dynamic_reshape(
 
 // -----
 
-// Test that a float stablehlo.dynamic_reshape is not converted to tfl.reshape.
+// Tests that a float `stablehlo.dynamic_reshape` is not converted to
+// `tfl.reshape`.
 
-func.func @float_dynamic_reshape(%arg0: tensor<?x3xf32>, %arg1: tensor<2xi32>) -> tensor<?x?xf32> {
+func.func @dynamic_reshape_float(%arg0: tensor<?x3xf32>, %arg1: tensor<2xi32>) -> tensor<?x?xf32> {
   %0 = "stablehlo.dynamic_reshape"(%arg0, %arg1) : (tensor<?x3xf32>, tensor<2xi32>) -> tensor<?x?xf32>
   return %0 : tensor<?x?xf32>
 }
 
-// CHECK-LABEL: func @float_dynamic_reshape
+// CHECK-LABEL: func @dynamic_reshape_float
 // CHECK: stablehlo.dynamic_reshape
 // CHECK-NOT: tfl.reshape
 
 // -----
 
-// Test that a quantized stablehlo.gather is converted to tfl.gather_nd.
+// Tests that a quantized `stablehlo.gather` is converted to tfl.gather_nd.
 
 func.func @gather(
     %arg0: tensor<3x4x2x2x!quant.uniform<i8:f32, 3.000000e-01:-5>>,
@@ -1284,8 +1288,8 @@ func.func @gather(
 
 // -----
 
-// Test that a quantized stablehlo.gather with unsorted start_index_map is not
-// converted to tfl.gather_nd (condition 1 is not satisfied).
+// Tests that a quantized `stablehlo.gather` with unsorted start_index_map is
+// not converted to `tfl.gather_nd` (condition 1 is not satisfied).
 
 func.func @gather_start_index_map_not_sorted(
     %arg0: tensor<3x4x2x2x!quant.uniform<i8:f32, 3.000000e-01:-5>>,
@@ -1313,7 +1317,7 @@ func.func @gather_start_index_map_not_sorted(
 
 // -----
 
-// Test that a quantized stablehlo.gather is not converted to tfl.gather_nd
+// Tests that a quantized `stablehlo.gather` is not converted to tfl.gather_nd
 // when index_vector_dim is not the last dimension of start_indices (condition 2
 // is not satisfied).
 
@@ -1343,7 +1347,7 @@ func.func @gather_start_index_vector_dim_not_at_last(
 
 // -----
 
-// Test that a quantized stablehlo.gather is not converted to tfl.gather_nd
+// Tests that a quantized `stablehlo.gather` is not converted to tfl.gather_nd
 // when offset_dims are not the last dimensions of the output (condition 3 is
 // not satisfied).
 
@@ -1373,7 +1377,7 @@ func.func @gather_offset_dims_not_at_last(
 
 // -----
 
-// Test that a quantized stablehlo.gather is not converted to tfl.gather_nd
+// Tests that a quantized `stablehlo.gather` is not converted to tfl.gather_nd
 // when shape of slice is not same with shape of offset (condition 4 is not
 // satisfied).
 
@@ -1403,9 +1407,9 @@ func.func @gather_different_slice_and_offset(
 
 // -----
 
-// Test that a float stablehlo.gather is not converted to tfl.gather_nd.
+// Tests that a float `stablehlo.gather` is not converted to `tfl.gather_nd`.
 
-func.func @float_gather(%arg0: tensor<3x4x2x2xf32>, %arg1: tensor<2x3x2xi64>) -> tensor<2x3x2x2xf32> {
+func.func @gather_float(%arg0: tensor<3x4x2x2xf32>, %arg1: tensor<2x3x2xi64>) -> tensor<2x3x2x2xf32> {
   %0 = "stablehlo.gather"(%arg0, %arg1) {
     dimension_numbers = #stablehlo.gather<
       offset_dims = [2, 3],
@@ -1418,7 +1422,161 @@ func.func @float_gather(%arg0: tensor<3x4x2x2xf32>, %arg1: tensor<2x3x2xi64>) ->
   return %0 : tensor<2x3x2x2xf32>
 }
 
-// CHECK-LABEL: func @float_gather
+// CHECK-LABEL: func @gather_float
 // CHECK: stablehlo.gather
 // CHECK-NOT: tfl.gather_nd
 // CHECK-NOT: tfl.gather
+
+// -----
+
+// Tests that a quantized `stablehlo.dynamic_slice` is converted to `tfl.slice`.
+
+// CHECK-LABEL: func @dynamic_slice
+// CHECK-SAME: %[[ARG0:.+]]: tensor<4x4x!quant.uniform<i8:f32, 3.000000e-01:-5>>, %[[ARG1:.+]]: tensor<i64>, %[[ARG2:.+]]: tensor<i64>
+func.func @dynamic_slice(
+    %arg0: tensor<4x4x!quant.uniform<i8:f32, 3.000000e-01:-5>>,
+    %arg1: tensor<i64>,
+    %arg2: tensor<i64>
+  ) -> tensor<2x1x!quant.uniform<i8:f32, 3.000000e-01:-5>> {
+  %0 = "stablehlo.dynamic_slice"(%arg0, %arg1, %arg2) {
+    slice_sizes = array<i64: 2, 1>
+  } : (
+    tensor<4x4x!quant.uniform<i8:f32, 3.000000e-01:-5>>, tensor<i64>,
+    tensor<i64>
+  ) -> tensor<2x1x!quant.uniform<i8:f32, 3.000000e-01:-5>>
+  return %0 : tensor<2x1x!quant.uniform<i8:f32, 3.000000e-01:-5>>
+}
+
+
+// CHECK-DAG: %[[SLICE_SIZE:.+]] = arith.constant dense<[2, 1]> : tensor<2xi64>
+// CHECK-DAG: %[[ZERO:.+]] = arith.constant dense<0> : tensor<1xi64>
+// CHECK-DAG: %[[MAX1:.+]] = arith.constant dense<2> : tensor<1xi64>
+// CHECK-DAG: %[[MAX2:.+]] = arith.constant dense<3> : tensor<1xi64>
+// CHECK: %[[BITCAST1:.+]] = "tfl.bitcast"(%[[ARG1]]) : (tensor<i64>) -> tensor<1xi64>
+// CHECK: %[[MIN1:.+]] = "tfl.minimum"(%[[BITCAST1]], %[[MAX1]]) : (tensor<1xi64>, tensor<1xi64>) -> tensor<1xi64>
+// CHECK: %[[BITCAST2:.+]] = "tfl.bitcast"(%[[ARG2]]) : (tensor<i64>) -> tensor<1xi64>
+// CHECK: %[[MIN2:.+]] = "tfl.minimum"(%[[BITCAST2]], %[[MAX2]]) : (tensor<1xi64>, tensor<1xi64>) -> tensor<1xi64>
+// CHECK: %[[CONCAT:.+]] = "tfl.concatenation"(%[[MIN1]], %[[MIN2]]) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1xi64>, tensor<1xi64>) -> tensor<2xi64>
+// CHECK: %[[MAX:.+]] = "tfl.maximum"(%[[CONCAT]], %[[ZERO]]) : (tensor<2xi64>, tensor<1xi64>) -> tensor<2xi64>
+// CHECK: %[[SLICE:.+]] = "tfl.slice"(%[[ARG0]], %[[MAX]], %[[SLICE_SIZE]])
+// CHECK-SAME: (tensor<4x4x!quant.uniform<i8:f32, 3.000000e-01:-5>>, tensor<2xi64>, tensor<2xi64>) -> tensor<2x1x!quant.uniform<i8:f32, 3.000000e-01:-5>>
+
+// -----
+
+// Tests that a float `stablehlo.dynamic_slice` is not converted to `tfl.slice`.
+
+func.func @dynamic_slice_float(%arg0: tensor<4x4xf32>, %arg1: tensor<i64>, %arg2: tensor<i64>) -> tensor<2x1xf32> {
+  %0 = "stablehlo.dynamic_slice"(%arg0, %arg1, %arg2) {
+    slice_sizes = array<i64: 2, 1>
+  } : (tensor<4x4xf32>, tensor<i64>, tensor<i64>) -> tensor<2x1xf32>
+  return %0 : tensor<2x1xf32>
+}
+
+// CHECK-LABEL: func @dynamic_slice_float
+// CHECK: stablehlo.dynamic_slice
+// CHECK-NOT: tfl.bitcast
+// CHECK-NOT: tfl.minimum
+// CHECK-NOT: tfl.maximum
+// CHECK-NOT: tfl.slice
+
+// -----
+
+// Tests that `stablehlo.add` with both operands int8 UniformQuantizedType is
+// properly converted into `tfl.add`.
+
+func.func @add(%arg0: tensor<1x3x!quant.uniform<i8:f32, 1.000000e+0:8>>, %arg1: tensor<1x3x!quant.uniform<i8:f32, 2.000000e+0:8>>) -> tensor<1x3x!quant.uniform<i8:f32, 3.000000e+0:8>> {
+  %0 = stablehlo.add %arg0, %arg1 : (tensor<1x3x!quant.uniform<i8:f32, 1.000000e+0:8>>, tensor<1x3x!quant.uniform<i8:f32, 2.000000e+0:8>>) -> tensor<1x3x!quant.uniform<i8:f32, 3.000000e+0:8>>
+  return %0 : tensor<1x3x!quant.uniform<i8:f32, 3.000000e+0:8>>
+}
+
+// CHECK-LABEL: func @add
+// CHECK: %[[ADD:.+]] = tfl.add(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<1x3x!quant.uniform<i8:f32, {{.*}}>>, tensor<1x3x!quant.uniform<i8:f32, {{.*}}>>) -> tensor<1x3x!quant.uniform<i8:f32, {{.*}}>>
+// CHECK: return %[[ADD]]
+
+// -----
+
+// Tests that `stablehlo.add` with int32 UniformQuantizedPerAxisTypes is
+// not converted.
+
+func.func @add_i32(%arg0: tensor<1x3x!quant.uniform<i32:f32:1, {1.000000e+0, 1.000000e+0, 1.000000e+0}>>, %arg1: tensor<1x3x!quant.uniform<i32:f32:1, {2.000000e+0, 2.000000e+0, 2.000000e+0}>>) -> tensor<1x3x!quant.uniform<i32:f32:1, {2.000000e+0, 2.000000e+0, 2.000000e+0}>> {
+  %0 = stablehlo.add %arg0, %arg1 : (tensor<1x3x!quant.uniform<i32:f32:1, {1.000000e+0, 1.000000e+0, 1.000000e+0}>>, tensor<1x3x!quant.uniform<i32:f32:1, {2.000000e+0, 2.000000e+0, 2.000000e+0}>>) -> tensor<1x3x!quant.uniform<i32:f32:1, {2.000000e+0, 2.000000e+0, 2.000000e+0}>>
+  return %0 : tensor<1x3x!quant.uniform<i32:f32:1, {2.000000e+0, 2.000000e+0, 2.000000e+0}>>
+}
+
+// CHECK-LABEL: func @add_i32
+// CHECK: stablehlo.add
+// CHECK-NOT: tfl.add
+
+// -----
+
+// Tests that a quantized `stablehlo.constant` is converted into `tfl.qconst`.
+
+// CHECK-LABEL: func @quantized_constant
+func.func @quantized_constant() -> tensor<1x2x4x5x!quant.uniform<i8:f32, 1.000000e+0>> {
+  %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform<i8:f32, 1.000000e+0>>
+  return %0 : tensor<1x2x4x5x!quant.uniform<i8:f32, 1.000000e+0>>
+}
+
+// CHECK: %[[QCONST:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x4x5x!quant.uniform<i8:f32, 1.000000e+00>>, value = dense<1> : tensor<1x2x4x5xi8>}
+// CHECK-SAME: () -> tensor<1x2x4x5x!quant.uniform<i8:f32, 1.000000e+00>>
+// CHECK: return %[[QCONST]]
+
+// -----
+
+// Tests that a float `stablehlo.constant` is not converted into `tfl.qconst`.
+
+// CHECK-LABEL: func @float_constant
+func.func @float_constant() -> tensor<1x2x4x5xf32> {
+  %0 = stablehlo.constant() {value = dense<1.0> : tensor<1x2x4x5xf32>} : () -> tensor<1x2x4x5xf32>
+  return %0 : tensor<1x2x4x5xf32>
+}
+
+// CHECK: stablehlo.constant
+// CHECK-NOT: tfl.pseudo_qconst
+// CHECK-NOT: tfl.pseudo_const
+// CHECK-NOT: arith.constant
+
+// -----
+
+// Tests that a hybrid quantized dot_general is splitted into dequantize and float
+// dot_general.
+
+// CHECK-LABEL: func @dot_general_hybrid
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x2x3x4xf32>
+func.func @dot_general_hybrid(%arg0: tensor<1x2x3x4xf32>) -> tensor<1x2x3x5xf32> {
+  %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform<i8:f32, 1.000000e+0>>
+  %1 = "stablehlo.dot_general"(%arg0, %0) {
+    dot_dimension_numbers = #stablehlo.dot<
+      lhs_batching_dimensions = [0, 1],
+      rhs_batching_dimensions = [0, 1],
+      lhs_contracting_dimensions = [3],
+      rhs_contracting_dimensions = [2]>,
+      precision_config = [#stablehlo<precision DEFAULT>, #stablehlo<precision DEFAULT>]
+  } : (tensor<1x2x3x4xf32>, tensor<1x2x4x5x!quant.uniform<i8:f32, 1.000000e+0>>) -> tensor<1x2x3x5xf32>
+  return %1 : tensor<1x2x3x5xf32>
+}
+
+// CHECK: %[[WEIGHT:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x4x5x!quant.uniform<i8:f32, 1.000000e+00>>, value = dense<1> : tensor<1x2x4x5xi8>}
+// CHECK: %[[DQ:.+]] = "tfl.dequantize"(%[[WEIGHT]]) : (tensor<1x2x4x5x!quant.uniform<i8:f32, 1.000000e+00>>) -> tensor<1x2x4x5xf32>
+// CHECK: %[[DOT:.+]] = stablehlo.dot_general %[[ARG0]], %[[DQ]], batching_dims = [0, 1] x [0, 1], contracting_dims = [3] x [2], precision = [DEFAULT, DEFAULT] : (tensor<1x2x3x4xf32>, tensor<1x2x4x5xf32>) -> tensor<1x2x3x5xf32>
+// CHECK: return %[[DOT]]
+
+// -----
+
+// Tests that a hybrid quantized convolution is splitted into dequantize and
+// float convolution.
+
+// CHECK-LABEL: func @convolution_hybrid
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x3x3x4xf32>
+func.func @convolution_hybrid(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x2xf32> {
+  %0 = stablehlo.constant() {value = dense<3> : tensor<3x3x4x2xi8>} : () -> tensor<3x3x4x2x!quant.uniform<i8:f32:3, {2.000000e+2, 3.000000e+3}>>
+  %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x2x!quant.uniform<i8:f32:3, {2.000000e+2, 3.000000e+3}>>) -> tensor<1x3x3x2xf32>
+  return %1 : tensor<1x3x3x2xf32>
+}
+
+// CHECK: %[[WEIGHT:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<3x3x4x2x!quant.uniform<i8:f32:3, {2.000000e+02,3.000000e+03}>>, value = dense<3> : tensor<3x3x4x2xi8>}
+// CHECK: %[[DQ:.+]] = "tfl.dequantize"(%[[WEIGHT]]) : (tensor<3x3x4x2x!quant.uniform<i8:f32:3, {2.000000e+02,3.000000e+03}>>) -> tensor<3x3x4x2xf32>
+// CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[ARG0]], %[[DQ]])
+// CHECK{LITERAL}: dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64}
+// CHECK-SAME: (tensor<1x3x3x4xf32>, tensor<3x3x4x2xf32>) -> tensor<1x3x3x2xf32>
+// CHECK: return %[[CONV]]
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_pass.cc
new file mode 100644
index 00000000000000..0dc354f998d246
--- /dev/null
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_pass.cc
@@ -0,0 +1,79 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/Pass/PassRegistry.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "mlir/Support/TypeID.h"  // from @llvm-project
+#include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
+#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h"
+#include "tensorflow/compiler/mlir/lite/transforms/passes.h"  // IWYU pragma: keep
+#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"  // IWYU pragma: keep
+
+namespace mlir {
+namespace odml {
+
+namespace {
+
+// This file is generated from `passes.td` and provides the implementation base
+// class.
+#define GEN_PASS_DEF_COMPOSITELOWERINGPASS
+#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h.inc"
+
+class CompositeLoweringPass
+    : public impl::CompositeLoweringPassBase<CompositeLoweringPass> {
+ public:
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(CompositeLoweringPass);
+
+  void runOnOperation() override;
+};
+
+#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/generated_composite_lowering.inc"
+
+void CompositeLoweringPass::runOnOperation() {
+  MLIRContext& context = getContext();
+  RewritePatternSet patterns(&getContext());
+
+  populateWithGenerated(patterns);
+
+  ConversionTarget target(context);
+  target.addLegalDialect<TFL::TensorFlowLiteDialect>();
+
+  if (failed(applyPartialConversion(getOperation(), target,
+                                    std::move(patterns)))) {
+    getOperation().emitError("Composite lowering pass failed.");
+    signalPassFailure();
+  }
+}
+
+}  // namespace
+
+// Creates an instance of the pass.
+std::unique_ptr<OperationPass<ModuleOp>> CreateCompositeLoweringPass() {
+  return std::make_unique<CompositeLoweringPass>();
+}
+
+// Registers the pass implementation
+static PassRegistration<CompositeLoweringPass> pass;
+
+}  // namespace odml
+}  // namespace mlir
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_pass.h
similarity index 57%
rename from third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel.h
rename to tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_pass.h
index 2ac358b4ee56c5..0bb758ad9f154b 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel.h
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_pass.h
@@ -1,4 +1,4 @@
-/* Copyright 2024 The OpenXLA Authors.
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,14 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_STREAM_EXECUTOR_GPU_GPU_TIMER_KERNEL_H_
-#define XLA_STREAM_EXECUTOR_GPU_GPU_TIMER_KERNEL_H_
+#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_LOWERING_PASS_H_
+#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_LOWERING_PASS_H_
 
-namespace stream_executor::gpu {
-enum struct GpuSemaphoreState { Hold, Release, TimedOut };
-namespace delay_kernel {
-void* kernel();  // returns a pointer to a CUDA C++ device function
-}  // namespace delay_kernel
-}  // namespace stream_executor::gpu
+namespace mlir {
+namespace odml {
 
-#endif  // XLA_STREAM_EXECUTOR_GPU_GPU_TIMER_KERNEL_H_
+std::unique_ptr<Pass> CreateCompositeLoweringPass();
+
+}  // namespace odml
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_COMPOSITE_LOWERING_PASS_H_
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_patterns.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_patterns.td
new file mode 100644
index 00000000000000..1b62b6fcc4aeae
--- /dev/null
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/composite_lowering_patterns.td
@@ -0,0 +1,28 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Pattern definition file for direct lowering of mhlo composites to tflite ops.
+
+include "mlir/IR/OpBase.td"
+include "mlir/Dialect/Func/IR/FuncOps.td"
+include "mhlo/IR/hlo_ops.td"
+include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td"
+
+
+def LegalizeHardSwishComposite: Pat<
+                    (MHLO_CompositeOp:$old_value
+                    (variadic $input),
+                    ConstantStrAttr<StrAttr, "aten.hardswish.default">, $_, $_, $_),
+                    (TFL_HardSwishOp $input)>;
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/fold_broadcast_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/fold_broadcast_pass.cc
index 066bc83ad90217..847738e5cc7cbe 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/fold_broadcast_pass.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/fold_broadcast_pass.cc
@@ -177,8 +177,7 @@ class FoldBroadcastInDimBeforeBinaryElementwiseOp
     // When the operand other than the broadcast op is not a const op, we
     // should not fold broadcast op.
     auto binary_op_const_operand =
-        lhs_bcast_op ? rhs.template getDefiningOp<mhlo::ConstantOp>()
-                     : lhs.template getDefiningOp<mhlo::ConstantOp>();
+        (lhs_bcast_op ? rhs : lhs).template getDefiningOp<mhlo::ConstantOp>();
     if (!binary_op_const_operand) return failure();
     auto bcast_op = lhs_bcast_op ? lhs_bcast_op : rhs_bcast_op;
     auto const_op =
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_composite_to_tfl_custom.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_composite_to_tfl_custom.cc
new file mode 100644
index 00000000000000..a35f5ba324e3f4
--- /dev/null
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_composite_to_tfl_custom.cc
@@ -0,0 +1,137 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstddef>
+#include <memory>
+#include <string>
+
+#include "flatbuffers/flexbuffers.h"  // from @flatbuffers
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Casting.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/Builders.h"  // from @llvm-project
+#include "mlir/IR/Operation.h"  // from @llvm-project
+#include "mlir/Pass/PassRegistry.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
+#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
+#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h"
+
+#define DEBUG_TYPE "composite-to-custom"
+
+namespace mlir {
+namespace odml {
+
+#define GEN_PASS_DEF_LEGALIZECOMPOSITETOCUSTOMOPPASS
+#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h.inc"
+
+namespace {
+bool IsSupportedComposite(::mlir::stablehlo::CompositeOp op) {
+  // List of supported composites to represent using CustomOp.
+  return llvm::is_contained(
+      {"odml.update_kv_cache", "odml.scaled_dot_product_attention"},
+      op.getName());
+}
+
+TFL::ConstBytesAttr CustomOption(OpBuilder* builder,
+                                 const std::string& content) {
+  return TFL::ConstBytesAttr::get(builder->getContext(),
+                                  StringRef(content.data(), content.size()));
+}
+
+LogicalResult BuildOption(flexbuffers::Builder* fbb, Operation* op,
+                          NamedAttribute pair) {
+  const char* key = pair.getName().data();
+  const auto attr = pair.getValue();
+
+  if (attr.isa<::mlir::IntegerAttr>()) {
+    fbb->Int(key, attr.dyn_cast<mlir::IntegerAttr>().getInt());
+    return success();
+  }
+
+  if (attr.isa<::mlir::FloatAttr>()) {
+    fbb->Double(key, attr.dyn_cast<mlir::FloatAttr>().getValueAsDouble());
+    return success();
+  }
+
+  return op->emitWarning("serialization not supported for : ") << key;
+}
+
+TFL::CustomOp BuildCustomOp(stablehlo::CompositeOp composite,
+                            const std::string& custom_option_buffer) {
+  OpBuilder builder(composite->getContext());
+  builder.setInsertionPoint(composite);
+  return builder.create<TFL::CustomOp>(
+      composite->getLoc(), composite->getResultTypes(),
+      composite->getOperands(), composite.getName(),
+      CustomOption(&builder, custom_option_buffer));
+}
+
+}  // namespace
+
+// Legalize stablehlo::CompositeOp to TFL::CustomOp for runtime-supported
+// composites. See `IsSupportedComposite` for list of supported ops.
+//
+// Example:
+//   %0 = stablehlo.composite "odml.some_op" <args> {
+//      composite_attrs = {<attrs>},
+//      version = 0 : i32
+//   }
+//   ==>
+//   %0 = tfl.custom(<args>) {
+//     custom_code = "odml.some_op",
+//     custom_option = #tfl<const_bytes : "flexbuffer_serialized_attrs">
+//   }
+struct LegalizeCompositeToCustomOpPass
+    : public impl::LegalizeCompositeToCustomOpPassBase<
+          LegalizeCompositeToCustomOpPass> {
+  using LegalizeCompositeToCustomOpPassBase::
+      LegalizeCompositeToCustomOpPassBase;
+
+  void runOnOperation() override {
+    func::FuncOp fn = getOperation();
+    fn.walk([&](Operation* op) {
+      // Process only StableHLO composite ops.
+      auto composite = llvm::dyn_cast<stablehlo::CompositeOp>(op);
+      if (!composite || !IsSupportedComposite(composite)) return;
+
+      // Build flexbuffer options.
+      std::string custom_option_buffer;
+      auto fbb = std::make_unique<flexbuffers::Builder>();
+      size_t map_start = fbb->StartMap();
+      for (const NamedAttribute& pair : composite.getCompositeAttributes()) {
+        // Allows skipping unsupported attributes, will warn.
+        (void)BuildOption(fbb.get(), op, pair);
+      }
+      fbb->EndMap(map_start);
+      fbb->Finish();
+      custom_option_buffer.assign(fbb->GetBuffer().begin(),
+                                  fbb->GetBuffer().end());
+
+      // Build TFL custom op, replace composite with custom op.
+      TFL::CustomOp tfl_custom_op =
+          BuildCustomOp(composite, custom_option_buffer);
+      composite->replaceAllUsesWith(tfl_custom_op);
+      composite->erase();
+    });
+  }
+};
+
+static PassRegistration<LegalizeCompositeToCustomOpPass> pass;
+
+}  // namespace odml
+}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_custom_call_to_composite.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_custom_call_to_composite.cc
new file mode 100644
index 00000000000000..4cfb0e04e96af4
--- /dev/null
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_custom_call_to_composite.cc
@@ -0,0 +1,110 @@
+/* Copyright 2022 The StableHLO Authors.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <utility>
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/Builders.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/IR/Visitors.h"  // from @llvm-project
+#include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "mlir/Pass/PassRegistry.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
+#include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
+#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h"
+
+namespace mlir {
+namespace odml {
+
+#define GEN_PASS_DEF_LEGALIZESTABLEHLOCUSTOMCALLTOCOMPOSITEPASS
+#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h.inc"
+
+struct ReplaceCustomCallWithComposite final
+    : OpRewritePattern<mlir::stablehlo::CustomCallOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  explicit ReplaceCustomCallWithComposite(MLIRContext *context)
+      : OpRewritePattern<mlir::stablehlo::CustomCallOp>(context) {}
+
+  LogicalResult matchAndRewrite(mlir::stablehlo::CustomCallOp op,
+                                PatternRewriter &rewriter) const override {
+    auto backendConfig =
+        op->getAttr("composite.backend_config").dyn_cast<DictionaryAttr>();
+    if (!backendConfig)
+      return op->emitError(
+          "custom_call has no 'composite.backend_config' attribute or the "
+          "attribute is not a dictionary");
+
+    auto name = backendConfig.get("name").dyn_cast<StringAttr>();
+    if (!name)
+      return op->emitError(
+          "backend_config has no 'name' key or the name value is not a string");
+
+    auto attrs = backendConfig.get("attributes").dyn_cast<DictionaryAttr>();
+    if (!attrs)
+      return op->emitError(
+          "backend_config has no 'attributes' key or the attributes value is "
+          "not a dictionary");
+
+    auto calledComputations = op.getCalledComputations();
+    if (!calledComputations || calledComputations.size() != 1)
+      return op->emitError("expected exactly one called_computation");
+
+    auto decomposition = calledComputations[0].cast<FlatSymbolRefAttr>();
+
+    auto composite = rewriter.create<mlir::stablehlo::CompositeOp>(
+        op.getLoc(), op.getResultTypes(), op.getOperands(), name.str(), attrs,
+        decomposition.getValue());
+    rewriter.replaceOp(op, composite.getResults());
+    return success();
+  }
+};
+
+struct LegalizeStablehloCustomCallToCompositePass
+    : public impl::LegalizeStablehloCustomCallToCompositePassBase<
+          LegalizeStablehloCustomCallToCompositePass> {
+  using LegalizeStablehloCustomCallToCompositePassBase::
+      LegalizeStablehloCustomCallToCompositePassBase;
+
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+
+    ConversionTarget target(*context);
+    target.addLegalDialect<mlir::stablehlo::StablehloDialect>();
+    target.addLegalDialect<mlir::func::FuncDialect>();
+    target.addDynamicallyLegalOp<mlir::stablehlo::CustomCallOp>(
+        [&](mlir::stablehlo::CustomCallOp op) {
+          return op.getCallTargetName() != "stablehlo.composite";
+        });
+
+    RewritePatternSet patterns(context);
+    patterns.add<ReplaceCustomCallWithComposite>(context);
+
+    if (failed(applyPartialConversion(getOperation(), target,
+                                      std::move(patterns)))) {
+      return signalPassFailure();
+    }
+  }
+};
+
+static PassRegistration<LegalizeStablehloCustomCallToCompositePass>
+    pass_shlo_sc2c;
+
+}  // namespace odml
+}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h
index 8df2d3503f3632..49e8b673f63374 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h
@@ -59,6 +59,9 @@ std::unique_ptr<OperationPass<ModuleOp>> CreateUnfoldSplatConstantPass();
 // Create a pass that legalizes MHLO to TFLite dialect.
 std::unique_ptr<OperationPass<ModuleOp>> CreateLegalizeHloToTfLitePass();
 
+// Creates a pass that lowers stablehlo composite ops to tflite ops.
+std::unique_ptr<OperationPass<ModuleOp>> CreateCompositeLoweringPass();
+
 // Adds the HLO to TF rewrite patterns to the specified pattern list.
 void PopulateLegalizeHloToTfPatterns(RewritePatternSet* patterns,
                                      MLIRContext* context);
@@ -67,8 +70,7 @@ void PopulateLegalizeHloToTfPatterns(RewritePatternSet* patterns,
 void PopulateLegalizeHloToTFLitePatterns(RewritePatternSet* patterns,
                                          MLIRContext* context);
 
-#define GEN_PASS_DECL_LEGALIZESTABLEHLOTOVHLOPASS
-#define GEN_PASS_DECL_LEGALIZEVHLOTOSTABLEHLOPASS
+#define GEN_PASS_DECL
 #define GEN_PASS_REGISTRATION
 #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h.inc"
 
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.td
index 002990601a9efb..a535d3aa867c80 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.td
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.td
@@ -95,8 +95,22 @@ def LegalizeVhloToStablehloPass : Pass<"vhlo-legalize-stablehlo", "ModuleOp"> {
   let dependentDialects = ["mlir::stablehlo::StablehloDialect"];
 }
 
+def LegalizeCompositeToCustomOpPass : Pass<"stablehlo-composite-legalize-tfl-custom", "func::FuncOp"> {
+  let summary = "Legalize supported StableHLO CompositeOps to TFL CustomOp";
+  let dependentDialects = ["TFL::TensorFlowLiteDialect"];
+}
+def LegalizeStablehloCustomCallToCompositePass : Pass<"stablehlo-custom-call-legalize-composite", "ModuleOp"> {
+  let summary = "Legalize StableHLO custom call ops where the call target is 'stablehlo.composite' to composite ops.";
+  let dependentDialects = ["mlir::stablehlo::StablehloDialect"];
+}
 
 def UnfoldSplatConstantPass : Pass<"unfold-splat-constant-pass", "ModuleOp"> {
   let summary = "Replaces a splat constant tensor with a BroadcastInDim op.";
   let constructor = "mlir::odml::CreateUnfoldSplatConstantPass()";
 }
+
+def CompositeLoweringPass : Pass<"composite-lowering", "ModuleOp"> {
+  let summary = "Lowers mhlo composites directly to tflite ops (when possible).";
+  let dependentDialects = ["mlir::mhlo::MhloDialect", "TFL::TensorFlowLiteDialect"];
+  let constructor = "mlir::odml::CreateCompositeLoweringPass()";
+}
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.cc
index 033ec78751e6b6..06754ea72b580c 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "llvm/ADT/StringRef.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/DialectRegistry.h"  // from @llvm-project
 #include "mlir/IR/PatternMatch.h"  // from @llvm-project
 #include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
@@ -70,6 +71,9 @@ class SmuggleDisallowedOpsPass
   StringRef getDescription() const final {
     return "Smuggle disallowed ops via stablehlo.custom_calls";
   }
+  void getDependentDialects(DialectRegistry& registry) const final {
+    registry.insert<mlir::stablehlo::StablehloDialect>();
+  }
 
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
@@ -77,7 +81,7 @@ class SmuggleDisallowedOpsPass
     patterns.add<SmuggleOpPattern<TF::ResizeNearestNeighborOp>>(&getContext());
 
     ConversionTarget target(getContext());
-    target.addIllegalDialect<TF::TensorFlowDialect>();
+    target.addIllegalOp<TF::ResizeBilinearOp, TF::ResizeNearestNeighborOp>();
     target.addLegalDialect<mlir::stablehlo::StablehloDialect>();
     if (failed(applyPartialConversion(getOperation(), target,
                                       std::move(patterns)))) {
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_tfl_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_tfl_pass.cc
deleted file mode 100644
index b120ca89c290d4..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_tfl_pass.cc
+++ /dev/null
@@ -1,279 +0,0 @@
-/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_tfl_pass.h"
-
-#include <memory>
-#include <string>
-#include <utility>
-
-#include "flatbuffers/flatbuffers.h"  // from @flatbuffers
-#include "flatbuffers/flexbuffers.h"  // from @flatbuffers
-#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
-#include "mlir/IR/Attributes.h"  // from @llvm-project
-#include "mlir/IR/Block.h"  // from @llvm-project
-#include "mlir/IR/Builders.h"  // from @llvm-project
-#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
-#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
-#include "mlir/IR/Diagnostics.h"  // from @llvm-project
-#include "mlir/IR/MLIRContext.h"  // from @llvm-project
-#include "mlir/IR/Operation.h"  // from @llvm-project
-#include "mlir/IR/OperationSupport.h"  // from @llvm-project
-#include "mlir/IR/PatternMatch.h"  // from @llvm-project
-#include "mlir/IR/Value.h"  // from @llvm-project
-#include "mlir/Pass/Pass.h"  // from @llvm-project
-#include "mlir/Pass/PassRegistry.h"  // from @llvm-project
-#include "mlir/Support/LLVM.h"  // from @llvm-project
-#include "mlir/Support/LogicalResult.h"  // from @llvm-project
-#include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
-#include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
-#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
-
-namespace mlir {
-namespace odml {
-
-class StablehloToTflPass
-    : public mlir::PassWrapper<StablehloToTflPass,
-                               mlir::OperationPass<mlir::func::FuncOp>> {
- public:
-  explicit StablehloToTflPass() : PassWrapper() {}
-  StringRef getArgument() const final { return "stablehlo-tfl"; }
-  StringRef getDescription() const final {
-    return "This pass will legalize StableHLO Ops to TFLite custom Ops.";
-  }
-
- private:
-  void runOnOperation() override;
-
-  void getDependentDialects(DialectRegistry& registry) const override {
-    registry.insert<TFL::TensorFlowLiteDialect>();
-  }
-  inline TFL::ConstBytesAttr CustomOption(OpBuilder* builder,
-                                          const std::string& content) {
-    return TFL::ConstBytesAttr::get(builder->getContext(),
-                                    StringRef(content.data(), content.size()));
-  }
-
-  void AddIntegerArray(flexbuffers::Builder* fbb,
-                       ::llvm::ArrayRef<int64_t> vec) {
-    auto start_input_dim = fbb->StartVector();
-    for (auto int_value : vec) {
-      fbb->Add(int_value);
-    }
-    fbb->EndVector(start_input_dim, /*typed=*/false, /*fixed=*/false);
-  }
-};
-
-void StablehloToTflPass::runOnOperation() {
-  func::FuncOp fn = getOperation();
-  OpBuilder builder(fn.getContext());
-  fn.walk([&](Operation* op) {
-    // Process only StableHLO ops.
-    if (op->getDialect()->getNamespace() != "stablehlo") return;
-
-    // Build options.
-    std::string custom_option_buffer;
-    auto fbb = std::make_unique<flexbuffers::Builder>();
-    size_t map_start = fbb->StartMap();
-    for (auto pair : op->getAttrDictionary().getValue()) {
-      const char* key = pair.getName().data();
-      const auto attr = pair.getValue();
-
-      if (attr.isa<::mlir::IntegerAttr>()) {
-        fbb->Int(key, attr.dyn_cast<mlir::IntegerAttr>().getInt());
-        continue;
-      }
-
-      if (attr.isa<::mlir::FloatAttr>()) {
-        fbb->Double(key, attr.dyn_cast<mlir::FloatAttr>().getValueAsDouble());
-        continue;
-      }
-
-      if (attr.isa<::mlir::ElementsAttr>()) {
-        auto start = fbb->StartVector(key);
-        auto array_attr = attr.dyn_cast<mlir::ElementsAttr>();
-        const auto ftype = array_attr.getElementType();
-        if (ftype.isInteger(16) || ftype.isInteger(32) || ftype.isInteger(64) ||
-            ftype.isInteger(128) || ftype.isInteger(1)) {
-          for (auto value : array_attr.getValues<IntegerAttr>()) {
-            auto int_value =
-                value.dyn_cast_or_null<mlir::IntegerAttr>().getInt();
-            fbb->Add(int_value);
-          }
-        } else if (ftype.isF32() || ftype.isF64() || ftype.isF128()) {
-          for (auto value : array_attr.getValues<FloatAttr>()) {
-            auto double_value =
-                value.dyn_cast_or_null<mlir::FloatAttr>().getValueAsDouble();
-            fbb->Add(double_value);
-          }
-        } else {
-          emitWarning(op->getLoc(), "serialization of ElementsAttr for ")
-              << key << " only supports Integer and Float.";
-        }
-        fbb->EndVector(start, /*typed=*/true, /*fixed=*/false);
-        continue;
-      }
-
-      if (attr.isa<::mlir::DenseI64ArrayAttr>()) {
-        auto array_attr = attr.dyn_cast<mlir::DenseI64ArrayAttr>();
-        auto start = fbb->StartVector(key);
-        for (auto int_value : array_attr.asArrayRef()) {
-          fbb->Add(int_value);
-        }
-        fbb->EndVector(start, /*typed=*/true, /*fixed=*/false);
-        continue;
-      }
-
-      if (attr.isa<::mlir::DenseBoolArrayAttr>()) {
-        auto array_attr = attr.dyn_cast<mlir::DenseBoolArrayAttr>();
-        auto start = fbb->StartVector(key);
-        for (auto bool_value : array_attr.asArrayRef()) {
-          fbb->Add(bool_value);
-        }
-        fbb->EndVector(start, /*typed=*/true, /*fixed=*/false);
-        continue;
-      }
-
-      if (attr.isa<::mlir::StringAttr>()) {
-        fbb->String(key, attr.dyn_cast<mlir::StringAttr>().data());
-        continue;
-      }
-
-      if (attr.isa<::mlir::ArrayAttr>()) {
-        auto start = fbb->StartVector(key);
-        auto array_attr = attr.dyn_cast<mlir::ArrayAttr>();
-        if (array_attr.size() > 1 && !array_attr[0].isa<mlir::StringAttr>() &&
-            !array_attr[0].isa<mlir::stablehlo::PrecisionAttr>()) {
-          emitWarning(op->getLoc(), "serialization of ArrayAttr for ")
-              << key << " only supports Strings.";
-          continue;
-        }
-        for (auto value : array_attr) {
-          if (value.isa<mlir::stablehlo::PrecisionAttr>()) {
-            auto string_value =
-                mlir::stablehlo::stringifyPrecision(
-                    value.cast<mlir::stablehlo::PrecisionAttr>().getValue())
-                    .data();
-            fbb->Add(string_value);
-          } else {
-            auto string_value =
-                value.dyn_cast_or_null<mlir::StringAttr>().data();
-            fbb->Add(string_value);
-          }
-        }
-        fbb->EndVector(start, /*typed=*/true, /*fixed=*/false);
-        continue;
-      }
-
-      if (attr.isa<::mlir::stablehlo::ConvDimensionNumbersAttr>()) {
-        auto dimension_attr =
-            attr.dyn_cast<::mlir::stablehlo::ConvDimensionNumbersAttr>();
-        auto start = fbb->StartVector(key);
-        fbb->Add(dimension_attr.getInputBatchDimension());
-        fbb->Add(dimension_attr.getInputFeatureDimension());
-        AddIntegerArray(fbb.get(), dimension_attr.getInputSpatialDimensions());
-        fbb->Add(dimension_attr.getKernelInputFeatureDimension());
-        fbb->Add(dimension_attr.getKernelOutputFeatureDimension());
-        AddIntegerArray(fbb.get(), dimension_attr.getKernelSpatialDimensions());
-        fbb->Add(dimension_attr.getOutputBatchDimension());
-        fbb->Add(dimension_attr.getOutputFeatureDimension());
-        AddIntegerArray(fbb.get(), dimension_attr.getOutputSpatialDimensions());
-        fbb->EndVector(start, /*typed=*/false, /*fixed=*/false);
-        continue;
-      }
-
-      if (attr.isa<::mlir::stablehlo::GatherDimensionNumbersAttr>()) {
-        auto dimension_attr =
-            attr.dyn_cast<::mlir::stablehlo::GatherDimensionNumbersAttr>();
-        auto start = fbb->StartVector(key);
-        AddIntegerArray(fbb.get(), dimension_attr.getOffsetDims());
-        AddIntegerArray(fbb.get(), dimension_attr.getCollapsedSliceDims());
-        AddIntegerArray(fbb.get(), dimension_attr.getStartIndexMap());
-        fbb->Add(dimension_attr.getIndexVectorDim());
-        fbb->EndVector(start, /*typed=*/false, /*fixed=*/false);
-        continue;
-      }
-
-      if (attr.isa<::mlir::stablehlo::ScatterDimensionNumbersAttr>()) {
-        auto dimension_attr =
-            attr.dyn_cast<::mlir::stablehlo::ScatterDimensionNumbersAttr>();
-        auto start = fbb->StartVector(key);
-        AddIntegerArray(fbb.get(), dimension_attr.getUpdateWindowDims());
-        AddIntegerArray(fbb.get(), dimension_attr.getInsertedWindowDims());
-        AddIntegerArray(fbb.get(),
-                        dimension_attr.getScatterDimsToOperandDims());
-        fbb->Add(dimension_attr.getIndexVectorDim());
-        fbb->EndVector(start, /*typed=*/false, /*fixed=*/false);
-        continue;
-      }
-
-      if (attr.isa<::mlir::stablehlo::DotDimensionNumbersAttr>()) {
-        auto dimension_attr =
-            attr.dyn_cast<::mlir::stablehlo::DotDimensionNumbersAttr>();
-        auto start = fbb->StartVector(key);
-        AddIntegerArray(fbb.get(), dimension_attr.getLhsBatchingDimensions());
-        AddIntegerArray(fbb.get(), dimension_attr.getRhsBatchingDimensions());
-        AddIntegerArray(fbb.get(),
-                        dimension_attr.getLhsContractingDimensions());
-        AddIntegerArray(fbb.get(),
-                        dimension_attr.getRhsContractingDimensions());
-        fbb->EndVector(start, /*typed=*/false, /*fixed=*/false);
-        continue;
-      }
-
-      if (attr.isa<::mlir::stablehlo::ComparisonDirectionAttr>()) {
-        auto string_value =
-            mlir::stablehlo::stringifyComparisonDirection(
-                attr.cast<mlir::stablehlo::ComparisonDirectionAttr>()
-                    .getValue())
-                .str();
-        fbb->String(key, string_value);
-        continue;
-      }
-
-      if (attr.isa<::mlir::stablehlo::ComparisonTypeAttr>()) {
-        auto string_value =
-            mlir::stablehlo::stringifyComparisonType(
-                attr.cast<mlir::stablehlo::ComparisonTypeAttr>().getValue())
-                .str();
-        fbb->String(key, string_value);
-        continue;
-      }
-
-      // default
-      emitWarning(op->getLoc(), "serialization not supported for : ") << key;
-    }
-    fbb->EndMap(map_start);
-    fbb->Finish();
-    custom_option_buffer.assign(fbb->GetBuffer().begin(),
-                                fbb->GetBuffer().end());
-
-    // Build custom op.
-    builder.setInsertionPoint(op);
-    auto tfl_custom_op = builder.create<TFL::CustomOp>(
-        op->getLoc(), op->getResultTypes(), op->getOperands(),
-        op->getName().getStringRef(),
-        CustomOption(&builder, custom_option_buffer));
-    op->replaceAllUsesWith(tfl_custom_op);
-    op->erase();
-  });
-}
-std::unique_ptr<OperationPass<func::FuncOp>> CreateStablehloToTflPass() {
-  return std::make_unique<StablehloToTflPass>();
-}
-
-static PassRegistration<StablehloToTflPass> pass;
-
-}  // namespace odml
-}  // namespace mlir
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_tfl_pass.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_tfl_pass.h
deleted file mode 100644
index 9445b770f10562..00000000000000
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_tfl_pass.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_STABLEHLO_TFL_PASS_H_
-#define TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_STABLEHLO_TFL_PASS_H_
-
-#include <memory>
-#include <string>
-
-#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
-#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
-#include "mlir/Pass/Pass.h"  // from @llvm-project
-
-namespace mlir {
-namespace odml {
-
-// Creates a pass which transforms StableHLO Ops to TFL Ops.
-std::unique_ptr<OperationPass<func::FuncOp>> CreateStablehloToTflPass();
-
-}  // namespace odml
-}  // namespace mlir
-
-#endif  // TENSORFLOW_COMPILER_MLIR_LITE_STABLEHLO_TRANSFORMS_STABLEHLO_TFL_PASS_H_
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.cc
index e0a1dbc3bf9445..fcacfcf4984db1 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.cc
@@ -41,6 +41,7 @@ limitations under the License.
 #include "xla/mlir_hlo/mhlo/IR/register.h"
 #include "xla/mlir_hlo/mhlo/transforms/passes.h"
 #include "xla/mlir_hlo/mhlo/transforms/rewriters.h"
+#include "xla/mlir_hlo/mhlo/utils/type_conversion.h"
 
 namespace mlir {
 namespace odml {
@@ -104,8 +105,8 @@ void TFToMhloPass::runOnOperation() {
   mhlo::Tf2XlaTypeConverter converter;
   mhlo::PopulateLegalizeTfWithTf2XlaPatterns(
       "XLA_CPU_JIT", patterns, context, converter, /*prefer_tf2xla=*/false);
-  chlo::populateDecomposeChloPatterns(context, &patterns);
-  chlo::populateChloBroadcastingPatterns(context, &patterns);
+  stablehlo::StablehloToHloTypeConverter hlo_converter;
+  chlo::populateChloToHloPatterns(context, &hlo_converter, &patterns);
   chlo::ConstantLikeOp::getCanonicalizationPatterns(patterns, context);
 
   ConversionTarget target(*context);
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc
index aaa1236a6d9470..8fed8f3f01ed54 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc
@@ -62,6 +62,7 @@ using ::mlir::quant::CreateI32F32UniformQuantizedType;
 using ::mlir::quant::CreateI8F32UniformQuantizedPerAxisType;
 using ::mlir::quant::CreateI8F32UniformQuantizedType;
 using ::mlir::quant::FindUserOfType;
+using ::mlir::quant::GetElementType;
 using ::mlir::quant::IsI32F32UniformQuantizedPerAxisType;
 using ::mlir::quant::IsI32F32UniformQuantizedType;
 using ::mlir::quant::IsI8F32UniformQuantizedPerAxisType;
@@ -142,10 +143,7 @@ TFL::QConstOp CreateTransposedTflConstOpForFilter(
   Type new_filter_quantized_type;
 
   if (is_per_channel) {
-    auto filter_quantized_type = filter_constant_op.getResult()
-                                     .getType()
-                                     .cast<TensorType>()
-                                     .getElementType()
+    auto filter_quantized_type = GetElementType(filter_constant_op.getResult())
                                      .cast<UniformQuantizedPerAxisType>();
     new_filter_quantized_type = CreateI8F32UniformQuantizedPerAxisType(
         filter_constant_op->getLoc(), *rewriter.getContext(),
@@ -153,10 +151,7 @@ TFL::QConstOp CreateTransposedTflConstOpForFilter(
         filter_quantized_type.getZeroPoints(),
         /*quantization_dimension=*/0, /*narrow_range=*/true);
   } else {
-    auto filter_quantized_type = filter_constant_op.getResult()
-                                     .getType()
-                                     .cast<TensorType>()
-                                     .getElementType()
+    auto filter_quantized_type = GetElementType(filter_constant_op.getResult())
                                      .cast<UniformQuantizedType>();
     new_filter_quantized_type = CreateI8F32UniformQuantizedType(
         filter_constant_op->getLoc(), *rewriter.getContext(),
@@ -224,9 +219,7 @@ TFL::QConstOp CreateTflConstOpForDummyBias(
   Type bias_quantized_type;
   if (is_per_channel) {
     const auto filter_quantized_element_type =
-        filter_const_op.getResult()
-            .getType()
-            .getElementType()
+        GetElementType(filter_const_op.getResult())
             .cast<UniformQuantizedPerAxisType>();
 
     // The storage type is i32 for bias, which is the precision used for
@@ -238,9 +231,7 @@ TFL::QConstOp CreateTflConstOpForDummyBias(
         /*quantization_dimension=*/0);
   } else {
     const auto filter_quantized_element_type =
-        filter_const_op.getResult()
-            .getType()
-            .getElementType()
+        GetElementType(filter_const_op.getResult())
             .cast<UniformQuantizedType>();
 
     // The storage type is i32 for bias, which is the precision used for
@@ -277,8 +268,9 @@ arith::ConstantOp CreateI32ShapeConstantOp(const TensorType op_type,
 }
 
 // Returns the desired qi8 per-tensor quantized output type for a given gemm op.
-Type GetOutputType(Operation* op, MLIRContext& ctx, const bool has_i32_output,
-                   const bool fuse_bias_constant) {
+Type GetQuantizedOutputType(Operation* op, PatternRewriter& rewriter,
+                            const bool has_i32_output,
+                            const bool fuse_bias_constant) {
   Operation* uniform_quantize_op;
   if (!has_i32_output) return op->getResult(0).getType();
   if (fuse_bias_constant) {
@@ -289,17 +281,15 @@ Type GetOutputType(Operation* op, MLIRContext& ctx, const bool has_i32_output,
   }
   // StableHLO Quantizer outputs an i32 type. Rewrite to i8 type result
   // to meet TFLite op requirement.
-  auto result_quantized_type = uniform_quantize_op->getResult(0)
-                                   .getType()
-                                   .cast<TensorType>()
-                                   .getElementType()
+  auto result_quantized_type = GetElementType(uniform_quantize_op->getResult(0))
                                    .cast<UniformQuantizedType>();
   auto new_result_quantized_type = CreateI8F32UniformQuantizedType(
-      uniform_quantize_op->getLoc(), ctx, result_quantized_type.getScale(),
-      result_quantized_type.getZeroPoint());
+      uniform_quantize_op->getLoc(), *rewriter.getContext(),
+      result_quantized_type.getScale(), result_quantized_type.getZeroPoint());
   // Omit any bias and requantize ops as `tfl.{gemm_op}` outputs a
   // fused `qi8` type.
-  FindUserOfType<>(uniform_quantize_op)->setOperand(0, op->getResult(0));
+  rewriter.replaceAllUsesWith(uniform_quantize_op->getResult(0),
+                              op->getResult(0));
   return op->getResult(0).getType().cast<TensorType>().clone(
       new_result_quantized_type);
 }
@@ -315,8 +305,7 @@ class RewriteUniformQuantizeOp
   // detailed limitations
   // (https://github.com/tensorflow/tensorflow/blob/8f145d579aa0ee7f4187af32dbbf4e12fdabbffe/tensorflow/lite/kernels/quantize.cc#L105).
   LogicalResult match(stablehlo::UniformQuantizeOp op) const override {
-    const Type input_element_type =
-        op.getOperand().getType().cast<TensorType>().getElementType();
+    const Type input_element_type = GetElementType(op.getOperand());
     if (!(input_element_type.isa<FloatType>() ||
           IsI32F32UniformQuantizedType(input_element_type) ||
           IsI32F32UniformQuantizedPerAxisType(input_element_type))) {
@@ -328,10 +317,7 @@ class RewriteUniformQuantizeOp
 
     // Output type of `UniformQuantizeOp` is guaranteed to be a quantized
     // tensor with integer storage type.
-    const auto output_storage_type = op.getResult()
-                                         .getType()
-                                         .cast<TensorType>()
-                                         .getElementType()
+    const auto output_storage_type = GetElementType(op.getResult())
                                          .cast<QuantizedType>()
                                          .getStorageType()
                                          .cast<IntegerType>();
@@ -363,10 +349,7 @@ class RewriteUniformDequantizeOp
   // detailed limitations
   // (https://github.com/tensorflow/tensorflow/blob/8f145d579aa0ee7f4187af32dbbf4e12fdabbffe/tensorflow/lite/kernels/dequantize.cc#L52).
   LogicalResult match(stablehlo::UniformDequantizeOp op) const override {
-    const auto input_storage_type = op.getOperand()
-                                        .getType()
-                                        .cast<TensorType>()
-                                        .getElementType()
+    const auto input_storage_type = GetElementType(op.getOperand())
                                         .cast<QuantizedType>()
                                         .getStorageType()
                                         .cast<IntegerType>();
@@ -377,11 +360,8 @@ class RewriteUniformDequantizeOp
     }
 
     // Output type is guaranteed to be a float tensor for a valid StableHLO.
-    const auto output_element_type = op.getResult()
-                                         .getType()
-                                         .cast<TensorType>()
-                                         .getElementType()
-                                         .cast<FloatType>();
+    const auto output_element_type =
+        GetElementType(op.getResult()).cast<FloatType>();
     if (!output_element_type.isa<Float32Type>()) {
       LLVM_DEBUG(llvm::dbgs() << "Uniform dequantize op's output element type "
                                  "should be f32. Got: "
@@ -448,8 +428,7 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp
         op.getDotDimensionNumbers();
     const bool is_batch_matmul =
         !dot_dimension_nums.getLhsBatchingDimensions().empty();
-    const Type elem_type =
-        op.getResult().getType().cast<TensorType>().getElementType();
+    const Type elem_type = GetElementType(op.getResult());
     const bool has_i32_output = IsI32F32UniformQuantizedType(elem_type) ||
                                 IsI32F32UniformQuantizedPerAxisType(elem_type);
 
@@ -479,8 +458,7 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp
 
   void rewrite(stablehlo::DotGeneralOp op,
                PatternRewriter& rewriter) const override {
-    const Type output_type =
-        op.getResult().getType().cast<TensorType>().getElementType();
+    const Type output_type = GetElementType(op.getResult());
     const bool has_i32_output =
         IsI32F32UniformQuantizedType(output_type) ||
         IsI32F32UniformQuantizedPerAxisType(output_type);
@@ -656,8 +634,7 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp
   static LogicalResult MatchOutput(const Value output,
                                    const bool has_i32_output,
                                    const bool is_batch_matmul) {
-    const Type output_element_type =
-        output.getType().cast<TensorType>().getElementType();
+    const Type output_element_type = GetElementType(output);
     if (has_i32_output) {
       if (is_batch_matmul &&
           !IsI32F32UniformQuantizedType(output_element_type)) {
@@ -760,11 +737,8 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp
     TFL::QConstOp filter_constant_op = CreateTflConstOpForFilter(
         rhs_value.getDefiningOp(), rewriter, /*is_per_channel=*/true);
 
-    const double input_scale = lhs_value.getType()
-                                   .cast<TensorType>()
-                                   .getElementType()
-                                   .cast<UniformQuantizedType>()
-                                   .getScale();
+    const double input_scale =
+        GetElementType(lhs_value).cast<UniformQuantizedType>().getScale();
     TFL::QConstOp bias_tfl_op;
     bool fuse_bias_constant =
         FindUserOfType<stablehlo::AddOp>(op) && has_i32_output;
@@ -800,16 +774,10 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp
         Operation* add_op = FindUserOfType<stablehlo::AddOp>(op);
         uniform_quantize_op = FindUserOfType<TFL::QuantizeOp>(add_op);
         const auto filter_quantized_type =
-            op->getOperand(1)
-                .getType()
-                .cast<TensorType>()
-                .getElementType()
+            GetElementType(op->getOperand(1))
                 .cast<UniformQuantizedPerAxisType>();
         const SmallVector<double> bias_scales = GetBiasScales(
-            /*input_scale=*/op->getOperand(0)
-                .getType()
-                .cast<TensorType>()
-                .getElementType()
+            /*input_scale=*/GetElementType(op->getOperand(0))
                 .cast<UniformQuantizedType>()
                 .getScale(),
             /*filter_scales=*/filter_quantized_type.getScales());
@@ -821,10 +789,7 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp
         const auto bias_quantized_type =
             CreateI32F32UniformQuantizedPerAxisType(
                 op->getLoc(), *op->getContext(), std::move(bias_scales),
-                op->getResult(0)
-                    .getType()
-                    .cast<TensorType>()
-                    .getElementType()
+                GetElementType(op->getResult(0))
                     .cast<UniformQuantizedPerAxisType>()
                     .getZeroPoints(),
                 /*quantization_dimension=*/0);
@@ -841,11 +806,9 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp
         uniform_quantize_op = FindUserOfType<TFL::QuantizeOp>(op);
       }
 
-      const auto result_quantized_type = uniform_quantize_op->getResult(0)
-                                             .getType()
-                                             .cast<TensorType>()
-                                             .getElementType()
-                                             .cast<UniformQuantizedType>();
+      const auto result_quantized_type =
+          GetElementType(uniform_quantize_op->getResult(0))
+              .cast<UniformQuantizedType>();
       const auto new_result_quantized_type = CreateI8F32UniformQuantizedType(
           uniform_quantize_op->getLoc(), *rewriter.getContext(),
           result_quantized_type.getScale(),
@@ -856,8 +819,8 @@ class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp
       // fused `qi8` type.
       FindUserOfType<>(uniform_quantize_op)->setOperand(0, op->getResult(0));
     } else {
-      output_type = GetOutputType(op, *rewriter.getContext(), has_i32_output,
-                                  fuse_bias_constant);
+      output_type = GetQuantizedOutputType(op, rewriter, has_i32_output,
+                                           fuse_bias_constant);
     }
     return output_type;
   }
@@ -898,8 +861,8 @@ class RewriteQuantizedConvolutionOp
  public:
   using OpRewritePattern<stablehlo::ConvolutionOp>::OpRewritePattern;
   LogicalResult match(stablehlo::ConvolutionOp op) const override {
-    const bool has_i32_output = IsI32F32UniformQuantizedPerAxisType(
-        op.getResult().getType().cast<TensorType>().getElementType());
+    const bool has_i32_output =
+        IsI32F32UniformQuantizedPerAxisType(GetElementType(op.getResult()));
     const bool fuse_bias_constant =
         FindUserOfType<stablehlo::AddOp>(op) && has_i32_output;
     stablehlo::ConvDimensionNumbersAttr dimension_numbers =
@@ -965,8 +928,8 @@ class RewriteQuantizedConvolutionOp
 
   void rewrite(stablehlo::ConvolutionOp op,
                PatternRewriter& rewriter) const override {
-    const bool has_i32_output = IsI32F32UniformQuantizedPerAxisType(
-        op.getResult().getType().cast<TensorType>().getElementType());
+    const bool has_i32_output =
+        IsI32F32UniformQuantizedPerAxisType(GetElementType(op.getResult()));
     stablehlo::ConvDimensionNumbersAttr dimension_numbers =
         op.getDimensionNumbers();
 
@@ -993,8 +956,8 @@ class RewriteQuantizedConvolutionOp
       input_value = pad_op.getResult();
     }
 
-    const Type output_type = GetOutputType(op, *rewriter.getContext(),
-                                           has_i32_output, fuse_bias_constant);
+    const Type output_type = GetQuantizedOutputType(
+        op, rewriter, has_i32_output, fuse_bias_constant);
     const auto [stride_h, stride_w] = GetStrides(op);
     const auto [dilation_h_factor, dilation_w_factor] = GetDilationFactors(op);
     if (is_depthwise) {
@@ -1110,8 +1073,7 @@ class RewriteQuantizedConvolutionOp
   }
 
   static LogicalResult MatchOutput(Value output) {
-    const Type output_element_type =
-        output.getType().cast<TensorType>().getElementType();
+    const Type output_element_type = GetElementType(output);
     if (!IsI32F32UniformQuantizedPerAxisType(output_element_type) &&
         !IsI8F32UniformQuantizedType(output_element_type)) {
       LLVM_DEBUG(
@@ -1290,7 +1252,7 @@ class RewriteQuantizedConvolutionOp
     // output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides[i])
     auto get_output_dim_for_same_padding = [](int64_t input_dim,
                                               int64_t stride_dim) -> int64_t {
-      return std::ceil(input_dim / stride_dim);
+      return std::ceil(input_dim / static_cast<double>(stride_dim));
     };
     return output_height ==
                get_output_dim_for_same_padding(input_height, stride_height) &&
@@ -1397,10 +1359,7 @@ class RewriteQuantizedConvolutionOp
     Value filter_value = op.getOperand(1);
     Operation* filter_op = filter_value.getDefiningOp();
     auto filter_uniform_quantized_type =
-        filter_value.getType()
-            .cast<TensorType>()
-            .getElementType()
-            .cast<UniformQuantizedPerAxisType>();
+        GetElementType(filter_value).cast<UniformQuantizedPerAxisType>();
     auto filter_constant_value_attr = cast<DenseIntElementsAttr>(
         cast<stablehlo::ConstantOp>(filter_value.getDefiningOp()).getValue());
     const DenseIntElementsAttr new_filter_value_attr =
@@ -1440,10 +1399,7 @@ class RewriteQuantizedConvolutionOp
       const SmallVector<int64_t, 1> bias_shape, const bool has_i32_output,
       const bool fuse_bias_constant) const {
     const SmallVector<double> bias_scales = GetBiasScales(
-        /*input_scale=*/op.getOperand(0)
-            .getType()
-            .cast<TensorType>()
-            .getElementType()
+        /*input_scale=*/GetElementType(op.getOperand(0))
             .cast<UniformQuantizedType>()
             .getScale(),
         /*filter_scales=*/new_filter_quantized_type.getScales());
@@ -2032,20 +1988,186 @@ class RewriteQuantizedGatherOp : public OpRewritePattern<stablehlo::GatherOp> {
   }
 };
 
+// Rewrites quantized stablehlo.dynamic_slice to tfl.slice.
+// TODO: b/322428814 - Add StableHLO quantizer integration tests for ODML.
+class RewriteQuantizedDynamicSliceOp
+    : public OpRewritePattern<stablehlo::DynamicSliceOp> {
+ public:
+  using OpRewritePattern<stablehlo::DynamicSliceOp>::OpRewritePattern;
+
+  LogicalResult match(stablehlo::DynamicSliceOp op) const override {
+    if (!IsQuantizedTensorType(op.getOperand().getType()) ||
+        !IsQuantizedTensorType(op.getResult().getType())) {
+      return failure();
+    }
+
+    return success(quant::HasStaticShape(op.getOperand()));
+  }
+
+  void rewrite(stablehlo::DynamicSliceOp op,
+               PatternRewriter& rewriter) const override {
+    Type output = op.getResult().getType();
+    Value input = op.getOperand();
+    TensorType operand_type = input.getType().cast<TensorType>();
+    ArrayRef<int64_t> operand_shape = operand_type.getShape();
+    const int64_t rank = operand_type.getRank();
+    const Type i64_type = rewriter.getI64Type();
+
+    ArrayRef<int64_t> slice_sizes = op.getSliceSizes();
+    TensorType single_element_type =
+        operand_type.cloneWith({static_cast<int64_t>(1)}, i64_type);
+
+    SmallVector<Value> start_indices(rank);
+    for (auto [i, start_index] : llvm::enumerate(op.getStartIndices())) {
+      // Start indices should be casted from tensor<i64> to tensor<1xi64>.
+      auto cast = rewriter.create<TFL::BitcastOp>(
+          op->getLoc(), single_element_type, start_index);
+      int64_t upper_limit_idx = operand_shape[i] - slice_sizes[i];
+      auto upper_limit_attr =
+          DenseIntElementsAttr::get(single_element_type, {upper_limit_idx});
+      auto upper_limit_cst =
+          rewriter.create<arith::ConstantOp>(op->getLoc(), upper_limit_attr);
+      // Dynamic start indices should be clamped with upper limit of
+      // `shape(operand) - slice_sizes)` as per semantics of
+      // `stablehlo.dynamic_slice`.
+      // (https://github.com/openxla/stablehlo/blob/main/docs/spec.md#dynamic_slice)
+      start_indices[i] =
+          rewriter.create<TFL::MinimumOp>(op->getLoc(), cast, upper_limit_cst);
+    }
+
+    Value concatenated = start_indices[0];
+    if (rank > 1) {
+      SmallVector<int64_t> begin_shape{rank};
+      Type begin_type = operand_type.cloneWith(begin_shape, i64_type);
+      concatenated = rewriter.create<TFL::ConcatenationOp>(
+          op->getLoc(), begin_type, start_indices, /*axis=*/0,
+          /*fused_activation_function=*/rewriter.getStringAttr("NONE"));
+    }
+
+    // Clamp with lower limit.
+    auto lower_limit_attr = DenseIntElementsAttr::get(
+        single_element_type, {static_cast<int64_t>(0)});
+    auto lower_limit_cst =
+        rewriter.create<arith::ConstantOp>(op->getLoc(), lower_limit_attr);
+    // Dynamic start indices should be clamped with lower limit of
+    // 0 as per semantics of `stablehlo.dynamic_slice`.
+    // (https://github.com/openxla/stablehlo/blob/main/docs/spec.md#dynamic_slice)
+    auto begin = rewriter.create<TFL::MaximumOp>(op->getLoc(), concatenated,
+                                                 lower_limit_cst);
+
+    SmallVector<int64_t> size_len{rank};
+    TensorType size_type = operand_type.cloneWith(size_len, i64_type);
+    auto size_attr = DenseIntElementsAttr::get(size_type, slice_sizes);
+    auto size = rewriter.create<arith::ConstantOp>(op.getLoc(), size_attr);
+
+    rewriter.replaceOpWithNewOp<TFL::SliceOp>(op, output, input, begin, size);
+  }
+};
+
+class RewriteQuantizedAddOp : public OpRewritePattern<stablehlo::AddOp> {
+ public:
+  using OpRewritePattern<stablehlo::AddOp>::OpRewritePattern;
+
+  LogicalResult match(stablehlo::AddOp op) const override {
+    return success(IsI8F32UniformQuantizedType(GetElementType(op.getLhs())) &&
+                   IsI8F32UniformQuantizedType(GetElementType(op.getRhs())));
+  }
+
+  void rewrite(stablehlo::AddOp op, PatternRewriter& rewriter) const override {
+    TFL::QConstOp lhs_qconst_op;
+    TFL::QConstOp rhs_qconst_op;
+
+    auto GetBroadcastedConstOp = [&](Value operand) -> TFL::QConstOp {
+      if (auto broadcast_op = dyn_cast_or_null<stablehlo::BroadcastInDimOp>(
+              operand.getDefiningOp())) {
+        auto stablehlo_const_op = dyn_cast_or_null<stablehlo::ConstantOp>(
+            broadcast_op.getOperand().getDefiningOp());
+        auto const_uniform_quantized_type =
+            stablehlo_const_op.getResult().getType().cast<ShapedType>();
+        return rewriter.create<TFL::QConstOp>(
+            op.getLoc(), TypeAttr::get(const_uniform_quantized_type),
+            cast<DenseIntElementsAttr>(stablehlo_const_op.getValue()));
+      }
+      return nullptr;
+    };
+
+    lhs_qconst_op = GetBroadcastedConstOp(op.getLhs());
+    rhs_qconst_op = GetBroadcastedConstOp(op.getRhs());
+
+    rewriter.replaceOpWithNewOp<TFL::AddOp>(
+        op, op.getResult().getType(),
+        lhs_qconst_op ? lhs_qconst_op : op.getOperand(0),
+        rhs_qconst_op ? rhs_qconst_op : op.getOperand(1),
+        /*fused_activation_function=*/rewriter.getStringAttr("NONE"));
+  }
+};
+
+// Rewrites quantized `stablehlo.constant` to `tfl.pseudo_qconst`.
+class RewriteQuantizedConstantOp
+    : public OpRewritePattern<stablehlo::ConstantOp> {
+ public:
+  using OpRewritePattern<stablehlo::ConstantOp>::OpRewritePattern;
+
+  LogicalResult match(stablehlo::ConstantOp op) const override {
+    return success(IsQuantizedTensorType(op.getOutput().getType()));
+  }
+
+  void rewrite(stablehlo::ConstantOp op,
+               PatternRewriter& rewriter) const override {
+    rewriter.replaceOpWithNewOp<TFL::QConstOp>(
+        op, /*qtype=*/TypeAttr::get(op.getOutput().getType()),
+        /*value=*/op.getValue());
+  }
+};
+
+// Splits dot-like hybrid quantized StableHLO ops into `tfl.dequantize` and
+// float StableHLO op. Legalization of float StableHLO op depends on existing
+// passes for conversion of StableHLO -> MHLO -> TF -> TFL.
+template <typename OpType>
+class RewriteHybridQuantizedDotLikeOp : public OpRewritePattern<OpType> {
+ public:
+  using OpRewritePattern<OpType>::OpRewritePattern;
+
+  LogicalResult match(OpType op) const override {
+    if (op->getNumOperands() != 2 || op->getNumResults() != 1) {
+      return failure();
+    }
+    // Lhs and result should not be quantized and rhs should be quantized.
+    return success(!IsQuantizedTensorType(op->getOperand(0).getType()) &&
+                   IsQuantizedTensorType(op->getOperand(1).getType()) &&
+                   !IsQuantizedTensorType(op->getResult(0).getType()));
+  }
+
+  void rewrite(OpType op, PatternRewriter& rewriter) const override {
+    Value rhs = op.getOperand(1);
+    Type lhs_element_type =
+        op.getOperand(0).getType().template cast<TensorType>().getElementType();
+    Type dequantized_rhs_type =
+        quant::CloneTypeWithNewElementType(rhs.getType(), lhs_element_type);
+    auto dq = rewriter.create<TFL::DequantizeOp>(
+        op->getLoc(), /*output=*/dequantized_rhs_type,
+        /*input=*/rhs);
+    rewriter.replaceAllUsesExcept(rhs, dq.getOutput(), dq);
+  }
+};
+
 void UniformQuantizedStableHloToTflPass::runOnOperation() {
   func::FuncOp func_op = getOperation();
   MLIRContext& ctx = getContext();
 
   RewritePatternSet patterns(&ctx);
-  patterns.add<RewriteUniformQuantizeOp, RewriteUniformDequantizeOp,
+  patterns.add<RewriteHybridQuantizedDotLikeOp<stablehlo::ConvolutionOp>,
+               RewriteHybridQuantizedDotLikeOp<stablehlo::DotGeneralOp>,
+               RewriteUniformDequantizeOp, RewriteUniformQuantizeOp,
+               RewriteQuantizedAddOp, RewriteQuantizedBroadcastInDimOp,
+               RewriteQuantizedConcatenateOp, RewriteQuantizedConstantOp,
+               RewriteQuantizedConvolutionOp,
                RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp,
-               RewriteQuantizedConvolutionOp, RewriteQuantizedTransposeOp,
-               RewriteQuantizedReshapeOp, RewriteQuantizedSelectOp,
-               RewriteQuantizedConcatenateOp, RewriteQuantizedPadOp,
-               RewriteQuantizedSliceOp, RewriteQuantizedBroadcastInDimOp,
-               RewriteQuantizedReduceWindowOpWithMax,
-               RewriteQuantizedDynamicReshapeOp, RewriteQuantizedGatherOp>(
-      &ctx);
+               RewriteQuantizedDynamicReshapeOp, RewriteQuantizedDynamicSliceOp,
+               RewriteQuantizedGatherOp, RewriteQuantizedPadOp,
+               RewriteQuantizedReduceWindowOpWithMax, RewriteQuantizedReshapeOp,
+               RewriteQuantizedSelectOp, RewriteQuantizedSliceOp,
+               RewriteQuantizedTransposeOp>(&ctx);
 
   if (failed(applyPatternsAndFoldGreedily(func_op, std::move(patterns)))) {
     func_op.emitError() << "Failed to convert stablehlo ops with uniform "
diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD
index e1687b22816be0..2afbe2a0d2c766 100644
--- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD
+++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD
@@ -1,6 +1,6 @@
+load("//tensorflow:tensorflow.bzl", "tf_native_cc_binary")
 load("//tensorflow:tensorflow.default.bzl", "filegroup")
 load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
-load("//tensorflow:tensorflow.bzl", "tf_native_cc_binary")
 
 # copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
 
diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/cast_bf16.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/cast_bf16.mlir
new file mode 100644
index 00000000000000..56068d605016e7
--- /dev/null
+++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/cast_bf16.mlir
@@ -0,0 +1,12 @@
+// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck %s
+// Ensure cast with bfloat16 roundtrip exactly
+
+func.func @main(tensor<4x5xbf16>) -> tensor<4x5xbf16> {
+^bb0(%arg0: tensor<4x5xbf16>):
+  // CHECK-LABEL: @main
+  // CHECK:  (tensor<4x5xbf16>) -> tensor<4x5xf32>
+  // CHECK-NEXT:  (tensor<4x5xf32>) -> tensor<4x5xbf16>
+  %0 = "tfl.cast" (%arg0) : (tensor<4x5xbf16>) -> tensor<4x5xf32> loc("cast1")
+  %1 = "tfl.cast" (%0) : (tensor<4x5xf32>) -> tensor<4x5xbf16> loc("cast2")
+  func.return %1 : tensor<4x5xbf16>
+}
diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir
index 685efd5be0ca2d..a0b9f90a879507 100644
--- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir
@@ -1875,6 +1875,18 @@ func.func @matmul_batchv3_unknown_dim(%arg0: tensor<?x10x15xf32>, %arg1: tensor<
 // CHECK: "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<?x10x15xf32>, tensor<15x17xf32>) -> tensor<?x10x17xf32>
 }
 
+func.func @matmul_batchv3_unknown_dim_bf16(%arg0: tensor<?x4x5xbf16>, %arg1: tensor<5x6xf32>) -> tensor<?x4x6xbf16> {
+  %0 = "tf.Cast"(%arg0) : (tensor<?x4x5xbf16>) -> tensor<?x4x5xf32>
+  %1 = "tf.BatchMatMulV3"(%0, %arg1) {Ta = "tfdtype$DT_FLOAT", Tb = "tfdtype$DT_FLOAT", device = "/device:CPU:0", name = "MatMul", adj_x = false, adj_y = false} :
+(tensor<?x4x5xf32>, tensor<5x6xf32>) -> tensor<?x4x6xf32>
+  %2 = "tf.Cast"(%1) : (tensor<?x4x6xf32>) -> tensor<?x4x6xbf16>
+  func.return %2 : tensor<?x4x6xbf16>
+// CHECK-LABEL: matmul_batchv3_unknown_dim_bf16
+// CHECK: [[CST:%.*]] = "tfl.cast"(%arg0) : (tensor<?x4x5xbf16>) -> tensor<?x4x5xf32>
+// CHECK: [[BMM:%.*]] = "tfl.batch_matmul"([[CST]], %arg1) {adj_x = false, adj_y = false} : (tensor<?x4x5xf32>, tensor<5x6xf32>) -> tensor<?x4x6xf32>
+// CHECK: "tfl.cast"([[BMM]]) : (tensor<?x4x6xf32>) -> tensor<?x4x6xbf16>
+}
+
 // -----
 
 func.func @select_v2_with_6d_broadcasting(%arg0: tensor<1x1x1x1x3x1xi1>, %arg1 : tensor<1x1x1x1x1x4xf32>, %arg2 : tensor<1x1x1x2x1x1xf32>) -> tensor<1x1x1x2x3x4xf32> {
diff --git a/tensorflow/compiler/mlir/lite/tests/legalize_jax_random.mlir b/tensorflow/compiler/mlir/lite/tests/legalize_jax_random.mlir
index d7d77f2e77a97b..76f453d1d3a8aa 100644
--- a/tensorflow/compiler/mlir/lite/tests/legalize_jax_random.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/legalize_jax_random.mlir
@@ -3,31 +3,31 @@
 
 // CHECK-LABEL:   func @tfl_wrapped_jax_random_normal(
 // CHECK-SAME:                                        %[[RNG:.*]]: tensor<2xui32>) -> tuple<tensor<3x4xf32>> {
-// CHECK:           %[[VAL_0:.*]] = mhlo.constant dense<[3, 4]> : tensor<2xi32>
+// CHECK:           %[[VAL_0:.*]] = stablehlo.constant dense<[3, 4]> : tensor<2xi32>
 // CHECK:           %[[VAL_1:.*]] = "tfl.custom"(%[[VAL_0]]) {custom_code = "RandomStandardNormal", custom_option = #tfl<const_bytes : "0x">} : (tensor<2xi32>) -> tensor<3x4xf32>
-// CHECK:           %[[VAL_2:.*]] = mhlo.tuple %[[VAL_1]] : tuple<tensor<3x4xf32>>
+// CHECK:           %[[VAL_2:.*]] = stablehlo.tuple %[[VAL_1]] : tuple<tensor<3x4xf32>>
 // CHECK:           return %[[VAL_2]] : tuple<tensor<3x4xf32>>
 // CHECK:         }
 func.func @tfl_wrapped_jax_random_normal(%arg0: tensor<2xui32>) -> tuple<tensor<3x4xf32>> {
   // This is a fake jax random normal body.
-  %0 = mhlo.constant dense<0.0> : tensor<12xf32>
-  %1 = "mhlo.reshape"(%0) : (tensor<12xf32>) -> tensor<3x4xf32>
-  %2 = "mhlo.tuple"(%1) : (tensor<3x4xf32>) -> tuple<tensor<3x4xf32>>
+  %0 = stablehlo.constant dense<0.0> : tensor<12xf32>
+  %1 = "stablehlo.reshape"(%0) : (tensor<12xf32>) -> tensor<3x4xf32>
+  %2 = "stablehlo.tuple"(%1) : (tensor<3x4xf32>) -> tuple<tensor<3x4xf32>>
   func.return %2 : tuple<tensor<3x4xf32>>
 }
 
 
 // CHECK-LABEL:   func @tfl_wrapped_jax_random_uniform(
 // CHECK-SAME:                                         %[[RNG:.*]]: tensor<2xui32>) -> tuple<tensor<1x2xf32>> {
-// CHECK:           %[[VAL_0:.*]] = mhlo.constant dense<[1, 2]> : tensor<2xi32>
+// CHECK:           %[[VAL_0:.*]] = stablehlo.constant dense<[1, 2]> : tensor<2xi32>
 // CHECK:           %[[VAL_1:.*]] = "tfl.custom"(%[[VAL_0]]) {custom_code = "RandomUniform", custom_option = #tfl<const_bytes : "0x">} : (tensor<2xi32>) -> tensor<1x2xf32>
-// CHECK:           %[[VAL_2:.*]] = mhlo.tuple %[[VAL_1]] : tuple<tensor<1x2xf32>>
+// CHECK:           %[[VAL_2:.*]] = stablehlo.tuple %[[VAL_1]] : tuple<tensor<1x2xf32>>
 // CHECK:           return %[[VAL_2]] : tuple<tensor<1x2xf32>>
 // CHECK:         }
 func.func @tfl_wrapped_jax_random_uniform(%arg0: tensor<2xui32>) -> tuple<tensor<1x2xf32>> {
   // This is a fake jax random uniform body.
-  %0 = mhlo.constant dense<0.0> : tensor<2xf32>
-  %1 = "mhlo.reshape"(%0) : (tensor<2xf32>) -> tensor<1x2xf32>
-  %2 = "mhlo.tuple"(%1) : (tensor<1x2xf32>) -> tuple<tensor<1x2xf32>>
+  %0 = stablehlo.constant dense<0.0> : tensor<2xf32>
+  %1 = "stablehlo.reshape"(%0) : (tensor<2xf32>) -> tensor<1x2xf32>
+  %2 = "stablehlo.tuple"(%1) : (tensor<1x2xf32>) -> tuple<tensor<1x2xf32>>
   func.return %2 : tuple<tensor<1x2xf32>>
 }
diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/cast_bf16.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/cast_bf16.mlir
new file mode 100644
index 00000000000000..83255ca39a4472
--- /dev/null
+++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/cast_bf16.mlir
@@ -0,0 +1,74 @@
+// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -emit-custom-ops -o - | flatbuffer_to_string - | FileCheck %s
+
+func.func @main(tensor<4x5xbf16>) -> tensor<4x5xbf16> {
+^bb0(%arg0: tensor<4x5xbf16>):
+
+// CHECK:  {
+// CHECK-NEXT:      version: 3,
+// CHECK-NEXT:      operator_codes: [ {
+// CHECK-NEXT:        deprecated_builtin_code: 53,
+// CHECK-NEXT:        version: 7,
+// CHECK-NEXT:        builtin_code: CAST
+// CHECK-NEXT:      } ],
+// CHECK-NEXT:      subgraphs: [ {
+// CHECK-NEXT:        tensors: [ {
+// CHECK-NEXT:          shape: [ 4, 5 ],
+// CHECK-NEXT:          type: BFLOAT16,
+// CHECK-NEXT:          buffer: 1,
+// CHECK-NEXT:          name: "arg0",
+// CHECK-NEXT:          quantization: {
+// CHECK-EMPTY:
+// CHECK-NEXT:          },
+// CHECK-NEXT:          has_rank: true
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          shape: [ 4, 5 ],
+// CHECK-NEXT:          buffer: 2,
+// CHECK-NEXT:          name: "cast1",
+// CHECK-NEXT:          quantization: {
+// CHECK-EMPTY:
+// CHECK-NEXT:          },
+// CHECK-NEXT:          has_rank: true
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          shape: [ 4, 5 ],
+// CHECK-NEXT:          type: BFLOAT16,
+// CHECK-NEXT:          buffer: 3,
+// CHECK-NEXT:          name: "cast2",
+// CHECK-NEXT:          quantization: {
+// CHECK-EMPTY:
+// CHECK-NEXT:          },
+// CHECK-NEXT:          has_rank: true
+// CHECK-NEXT:        } ],
+// CHECK-NEXT:        inputs: [ 0 ],
+// CHECK-NEXT:        outputs: [ 2 ],
+// CHECK-NEXT:        operators: [ {
+// CHECK-NEXT:          inputs: [ 0 ],
+// CHECK-NEXT:          outputs: [ 1 ]
+// CHECK-NEXT:        }, {
+// CHECK-NEXT:          inputs: [ 1 ],
+// CHECK-NEXT:          outputs: [ 2 ]
+// CHECK-NEXT:        } ],
+// CHECK-NEXT:        name: "main"
+// CHECK-NEXT:      } ],
+// CHECK-NEXT:      description: "MLIR Converted.",
+// CHECK-NEXT:      buffers: [ {
+// CHECK-EMPTY:
+// CHECK-NEXT:      }, {
+// CHECK-EMPTY:
+// CHECK-NEXT:      }, {
+// CHECK-EMPTY:
+// CHECK-NEXT:      }, {
+// CHECK-EMPTY:
+// CHECK-NEXT:      }, {
+// CHECK-NEXT:        data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]
+// CHECK-NEXT:      } ],
+// CHECK-NEXT:      metadata: [ {
+// CHECK-NEXT:        name: "min_runtime_version",
+// CHECK-NEXT:        buffer: 4
+// CHECK-NEXT:      } ],
+// CHECK-NEXT:      signature_defs: [  ]
+// CHECK-NEXT:    }
+
+  %0 = "tfl.cast" (%arg0) : (tensor<4x5xbf16>) -> tensor<4x5xf32> loc("cast1")
+  %1 = "tfl.cast" (%0) : (tensor<4x5xf32>) -> tensor<4x5xbf16> loc("cast2")
+  func.return %1 : tensor<4x5xbf16>
+}
diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir
index 8548151458a26c..75c1a791eeca73 100644
--- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/optimize.mlir
@@ -773,6 +773,22 @@ func.func @FuseReshapeAroundBMMLHS(%arg0: tensor<6x5x1024xf32>) -> tensor<6x5x81
   // CHECK: return %0 : tensor<6x5x8192xf32>
 }
 
+// CHECK-LABEL: @FuseReshapeAroundBMMLHSNegative
+func.func @FuseReshapeAroundBMMLHSNegative(%arg0: tensor<1x64xf32>, %arg1: tensor<1x64x1024xf32> ) -> (tensor<1x1024xf32> )  {
+  %cst = arith.constant dense<[1, 1024]> : tensor<2xi32>
+  %cst_0 = arith.constant dense<[1, 1, 64]> : tensor<3xi32>
+  %0 = "tfl.reshape"(%arg0, %cst_0) : (tensor<1x64xf32>, tensor<3xi32>) -> tensor<1x1x64xf32>
+  %1 = "tfl.batch_matmul"(%0, %arg1) {adj_x = false, adj_y = false} : (tensor<1x1x64xf32>, tensor<1x64x1024xf32>) -> tensor<1x1x1024xf32>
+  %2 = "tfl.reshape"(%1, %cst) : (tensor<1x1x1024xf32>, tensor<2xi32>) -> tensor<1x1024xf32>
+  return %2 : tensor<1x1024xf32>
+  // CHECK: %cst = arith.constant dense<[1, 1024]> : tensor<2xi32>
+  // CHECK: %cst_0 = arith.constant dense<[1, 1, 64]> : tensor<3xi32>
+  // CHECK: %0 = "tfl.reshape"(%arg0, %cst_0) : (tensor<1x64xf32>, tensor<3xi32>) -> tensor<1x1x64xf32>
+  // CHECK: %1 = "tfl.batch_matmul"(%0, %arg1) {adj_x = false, adj_y = false} : (tensor<1x1x64xf32>, tensor<1x64x1024xf32>) -> tensor<1x1x1024xf32>
+  // CHECK: %2 = "tfl.reshape"(%1, %cst) : (tensor<1x1x1024xf32>, tensor<2xi32>) -> tensor<1x1024xf32>
+  // CHECK: return %2 : tensor<1x1024xf32>
+}
+
 // CHECK-LABEL: @FuseReshapeAroundBMMNagativeTest
 func.func @FuseReshapeAroundBMMNagativeTest(%arg0: tensor<5x4x1x1024xf32>, %arg1: tensor<5x1024x8192xf32>) -> tensor<5x4x1x8192xf32> {
   %cst = arith.constant dense_resource<__elided__> : tensor<3xi32>
diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc
index 655aee59b77378..f4aa97069655e8 100644
--- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc
+++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc
@@ -144,21 +144,18 @@ void AddPreQuantizationStableHloToTfPasses(
   pass_manager.addPass(
       mlir::odml::CreateLegalizeTFXlaCallModuleToStablehloPass());
 
-  // Add CHLO to StableHLO Decompositions:
-  // This is needed since we are relying on XlaCallModule uses MHLO
-  // specific features like mhlo::ErfOp which aren't supported
-  // in StableHLO, but we have CHLO->StableHLO decompositions to legalize.
-  pass_manager.addPass(mlir::mhlo::createHloLegalizeToStablehloPass());
-  pass_manager.addPass(
-      mlir::stablehlo::experimental::createChloRecomposeOpsPass());
-  pass_manager.addNestedPass<mlir::func::FuncOp>(
-      mlir::mhlo::createChloLegalizeToHloBasisOpsPass());
-  pass_manager.addNestedPass<mlir::func::FuncOp>(
-      mlir::mhlo::createChloLegalizeToHloPass());
-  pass_manager.addNestedPass<mlir::func::FuncOp>(
-      mlir::mhlo::createShapeLegalizeToHloPass());
+  // Legalize MHLO to StableHLO should be moved closer to where it is needed
+  // There are some entry points that start with HLO->MHLO like
+  // jax_to_tfl_flatbuffer.cc which can likely be updated to emit StableHLO
+  // to be consistent with other entrypoints.
   pass_manager.addPass(mlir::mhlo::createHloLegalizeToStablehloPass());
 
+  // Decompose CHLO into StableHLO ops
+  // TODO(b/331843141): There are some CHLO's like TopK which we could instead
+  // lower to TFL ops.
+  mlir::stablehlo::experimental::createChloLegalizeToStablehloPipeline(
+      pass_manager);
+
   // The following two passes find specific uniform quantization patterns in
   // StableHLO and converts them to TFLite ops that accept or produce uniform
   // quantized types. They only target a specific set of models that contain
@@ -174,7 +171,6 @@ void AddPreQuantizationStableHloToTfPasses(
   pass_manager.addNestedPass<mlir::func::FuncOp>(
       mlir::odml::CreateUniformQuantizedStableHloToTflPass());
 
-  pass_manager.addPass(mlir::mhlo::createStablehloLegalizeToHloPass());
   // Legalize jax random to tflite custom op.
   // The CreateLegalizeJaxRandom Pass has to stay at because we need to replace
   // the random function body before being inlined.
@@ -182,6 +178,7 @@ void AddPreQuantizationStableHloToTfPasses(
       mlir::TFL::CreateLegalizeJaxRandomPass());
 
   // Canonicalize, CSE etc.
+  pass_manager.addPass(mlir::mhlo::createStablehloLegalizeToHloPass());
   pass_manager.addNestedPass<mlir::func::FuncOp>(
       mlir::createCanonicalizerPass());
   pass_manager.addNestedPass<mlir::func::FuncOp>(mlir::createCSEPass());
@@ -231,6 +228,10 @@ void AddPostQuantizationStableHloToTfPasses(
     pass_manager.addPass(mlir::mhlo::createStablehloLegalizeToHloPass());
   }
 
+  if (pass_config.enable_composite_direct_lowering) {
+    pass_manager.addPass(mlir::odml::CreateCompositeLoweringPass());
+  }
+
   // TFLite dialect passes.
   if (!pass_config.disable_hlo_to_tfl_conversion) {
     pass_manager.addPass(mlir::odml::CreateLegalizeHloToTfLitePass());
@@ -252,6 +253,16 @@ void AddPostQuantizationStableHloToTfPasses(
 
   // Legalize all remaining mhlo ops to stableHLO
   pass_manager.addPass(mlir::mhlo::createHloLegalizeToStablehloPass());
+
+  // Translate "stablehlo.custom_call @stablehlo.composite" to
+  // "stablehlo.composite"
+  // TODO: b/330741524 - clean this up when "stablehlo.composite" is emitted
+  // directly. Additionally remove the composite to custom once ODML long term
+  // solution lands.
+  pass_manager.addPass(
+      mlir::odml::createLegalizeStablehloCustomCallToCompositePass());
+  pass_manager.addNestedPass<mlir::func::FuncOp>(
+      mlir::odml::createLegalizeCompositeToCustomOpPass());
 }
 
 // This is the early part of the conversion in isolation. This enables a caller
diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc
index 7a3f06bb376784..ac4de7f82b23d0 100644
--- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc
+++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc
@@ -497,6 +497,13 @@ absl::Status ConvertTFExecutorToTFLOrFlatbuffer(
     options.metadata.insert(
         MetadataForReducedPrecisionSupport(quant_specs.support_mask));
   }
+  pass_manager.clear();
+  pass_manager.addPass(mlir::odml::createLegalizeStablehloToVhloPass());
+  if (failed(pass_manager.run(module))) {
+    return status_handler.Combine(
+        absl::InvalidArgumentError("VHLO lowering failed"));
+  }
+
   if (!tflite::MlirToFlatBufferTranslateFunction(
           module, options, &translated_result, serialize_stablehlo_ops)) {
     return status_handler.Combine(
diff --git a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc
index 208c20492c10f6..2f015e61d58fe6 100644
--- a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc
@@ -27,10 +27,10 @@ limitations under the License.
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
 #include "mlir/Support/TypeID.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
-#include "tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.h"
 #include "tensorflow/compiler/mlir/lite/transforms/passes.h"
 #include "tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h"  // IWYU pragma: keep
 #include "tensorflow/compiler/mlir/lite/utils/utils.h"
+#include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h"
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h"
 
 //===----------------------------------------------------------------------===//
diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_jax_random.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_jax_random.cc
index 72120f1502f021..e8bae6eb64280f 100644
--- a/tensorflow/compiler/mlir/lite/transforms/legalize_jax_random.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/legalize_jax_random.cc
@@ -47,10 +47,10 @@ limitations under the License.
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
+#include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
 #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
-#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
 
 namespace mlir {
 namespace TFL {
@@ -99,7 +99,7 @@ void LegalizeJaxRandomPass::runOnOperation() {
   }
   auto result_shape_attr = builder.getI32TensorAttr(result_shape_i32);
   Value result_shape_tensor =
-      builder.create<mhlo::ConstantOp>(result_shape_attr);
+      builder.create<stablehlo::ConstantOp>(result_shape_attr);
   auto custom_code =
       IsJaxRandomUniform(func) ? "RandomUniform" : "RandomStandardNormal";
 
@@ -112,7 +112,7 @@ void LegalizeJaxRandomPass::runOnOperation() {
                                  ValueRange(result_shape_tensor_vec),
                                  custom_code, attr)
           .getResult(0);
-  Value tulple_result = builder.create<mhlo::TupleOp>(random_result);
+  Value tulple_result = builder.create<stablehlo::TupleOp>(random_result);
   builder.create<mlir::func::ReturnOp>(tulple_result);
 }
 }  // namespace
diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc
index 7f55ca054383fa..401f34e6e7943c 100644
--- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc
@@ -49,9 +49,7 @@ limitations under the License.
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
-#include "tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.h"
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
-#include "tensorflow/compiler/mlir/lite/quantization/ir/UniformSupport.h"
 #include "tensorflow/compiler/mlir/lite/transforms/passes.h"
 #include "tensorflow/compiler/mlir/lite/utils/attribute_utils.h"
 #include "tensorflow/compiler/mlir/lite/utils/constant_utils.h"
diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td
index c96266da31ddc9..0b068972c8fd30 100644
--- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td
+++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td
@@ -1529,7 +1529,8 @@ def FuseReshapesAroundBatchMatMulLHS1: Pat<
       $rhs, $adj_x, $adj_y, $bool_attr),
     (Arith_ConstantOp $s1)),
   (TFL_BatchMatMulOp $input, $rhs, $adj_x, $adj_y, $bool_attr),
-  [(HasRank<3> $rhs),
+  [(HasRankAtLeast<3> $input),
+   (HasRank<3> $rhs),
    (HasRank<3> $initial_shape_change),
    (IsBroadcastDimEqualToOne $rhs),
    (IsBroadcastDimEqualToOne $input),
diff --git a/tensorflow/compiler/mlir/lite/transforms/passes.td b/tensorflow/compiler/mlir/lite/transforms/passes.td
index 988ad189a6ec00..eefb109d2b966e 100644
--- a/tensorflow/compiler/mlir/lite/transforms/passes.td
+++ b/tensorflow/compiler/mlir/lite/transforms/passes.td
@@ -108,7 +108,7 @@ def LegalizeHashTablesPass : Pass<"tfl-legalize-hashtables-tf", "mlir::ModuleOp"
 def LegalizeJaxRandomPass : Pass<"tfl-legalize-random", "mlir::func::FuncOp"> {
   let summary = "Replace jax.random.uniform/normal with tfl.custom.";
   let constructor = "CreateLegalizeJaxRandomPass()";
-  let dependentDialects = ["TFL::TensorFlowLiteDialect"];
+  let dependentDialects = ["TFL::TensorFlowLiteDialect", "stablehlo::StablehloDialect"];
 }
 
 def LegalizeTFPass : Pass<"tfl-legalize-tf", "mlir::func::FuncOp"> {
diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc
index 6694db441b6566..ce11ca73970136 100644
--- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc
@@ -43,7 +43,6 @@ limitations under the License.
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
-#include "tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.h"
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
 #include "tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.h"
 #include "tensorflow/compiler/mlir/lite/transforms/passes.h"
diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h
index da5a941179deb8..e102c6bedd4328 100644
--- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h
+++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h
@@ -38,8 +38,8 @@ limitations under the License.
 #include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
-#include "tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.h"
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
+#include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h"
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h"
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_traits.h"
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h"
diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc
index 41a3144da6fe87..9f0a7fbafff450 100644
--- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc
@@ -59,9 +59,7 @@ limitations under the License.
 #include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
-#include "tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.h"
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
-#include "tensorflow/compiler/mlir/lite/quantization/ir/UniformSupport.h"
 #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h"
 #include "tensorflow/compiler/mlir/lite/transforms/dilated_conv.h"
 #include "tensorflow/compiler/mlir/lite/transforms/passes.h"
diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc b/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc
index 33580d1ea95dbc..0d9db051ef27ff 100644
--- a/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "mlir/IR/SymbolTable.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
 #include "mlir/Transforms/RegionUtils.h"  // from @llvm-project
+#include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
 #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
 #include "tensorflow/compiler/mlir/lite/transforms/passes.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h"
diff --git a/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc b/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc
index 5ce7638f4e4da1..96d75cca30a48d 100644
--- a/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc
+++ b/tensorflow/compiler/mlir/lite/utils/const_tensor_utils.cc
@@ -345,22 +345,41 @@ StatusOr<mlir::ElementsAttr> ConvertFloatBuffer(
   switch (elem_type.getIntOrFloatBitWidth()) {
     case 16: {
       assert(bytes_len % 2 == 0);
-      assert(elem_type.isF16());
+      // Supports both BF16 and F16.
+      assert(elem_type.isF16() || elem_type.isBF16());
       int elem_count = bytes_len / 2;
-      std::vector<Eigen::half> values;
-      values.reserve(elem_count);
 
-      const char* data = reinterpret_cast<const char*>(buffer.data());
+      if (elem_type.isF16()) {
+        std::vector<Eigen::half> values;
+        values.reserve(elem_count);
 
-      for (int i = 0; i < elem_count; i++) {
-        uint16_t bit_repr =
-            llvm::support::endian::readNext<uint16_t, llvm::endianness::native,
-                                            llvm::support::unaligned>(data);
-        values.push_back(Eigen::numext::bit_cast<Eigen::half>(bit_repr));
-      }
+        const char* data = reinterpret_cast<const char*>(buffer.data());
 
-      return mlir::ElementsAttr(
-          DenseElementsAttr::get(shaped_type, ArrayRef<Eigen::half>(values)));
+        for (int i = 0; i < elem_count; i++) {
+          uint16_t bit_repr = llvm::support::endian::readNext<
+              uint16_t, llvm::endianness::native, llvm::support::unaligned>(
+              data);
+          values.push_back(Eigen::numext::bit_cast<Eigen::half>(bit_repr));
+        }
+
+        return mlir::ElementsAttr(
+            DenseElementsAttr::get(shaped_type, ArrayRef<Eigen::half>(values)));
+      } else {
+        std::vector<Eigen::bfloat16> values;
+        values.reserve(elem_count);
+
+        const char* data = reinterpret_cast<const char*>(buffer.data());
+
+        for (int i = 0; i < elem_count; i++) {
+          uint16_t bit_repr = llvm::support::endian::readNext<
+              uint16_t, llvm::endianness::native, llvm::support::unaligned>(
+              data);
+          values.push_back(Eigen::numext::bit_cast<Eigen::bfloat16>(bit_repr));
+        }
+
+        return mlir::ElementsAttr(DenseElementsAttr::get(
+            shaped_type, ArrayRef<Eigen::bfloat16>(values)));
+      }
     }
     case 32: {
       assert(bytes_len % 4 == 0);
diff --git a/tensorflow/compiler/mlir/lite/utils/convert_type.cc b/tensorflow/compiler/mlir/lite/utils/convert_type.cc
index 9b215e77b89529..e09030ceb7515f 100644
--- a/tensorflow/compiler/mlir/lite/utils/convert_type.cc
+++ b/tensorflow/compiler/mlir/lite/utils/convert_type.cc
@@ -34,6 +34,8 @@ namespace errors = tensorflow::errors;
 tflite::TensorType ConvertTypeToTensorType(mlir::Type type) {
   if (type.isF16()) {
     return tflite::TensorType_FLOAT16;
+  } else if (type.isBF16()) {
+    return tflite::TensorType_BFLOAT16;
   } else if (type.isF32()) {
     return tflite::TensorType_FLOAT32;
   } else if (type.isF64()) {
@@ -81,6 +83,8 @@ mlir::Type ConvertElementType(tflite::TensorType type, mlir::Builder builder) {
   switch (type) {
     case tflite::TensorType_FLOAT16:
       return builder.getF16Type();
+    case tflite::TensorType_BFLOAT16:
+      return builder.getBF16Type();
     case tflite::TensorType_FLOAT32:
       return builder.getF32Type();
     case tflite::TensorType_FLOAT64:
@@ -128,6 +132,8 @@ tensorflow::DataType TflTypeToTfType(tflite::TensorType type) {
       return tensorflow::DT_COMPLEX128;
     case tflite::TensorType_FLOAT16:
       return tensorflow::DT_HALF;
+    case tflite::TensorType_BFLOAT16:
+      return tensorflow::DT_BFLOAT16;
     case tflite::TensorType_FLOAT32:
       return tensorflow::DT_FLOAT;
     case tflite::TensorType_FLOAT64:
@@ -170,6 +176,8 @@ absl::StatusOr<tflite::TensorType> TfTypeToTflType(tensorflow::DataType type) {
       return tflite::TensorType_COMPLEX128;
     case tensorflow::DT_HALF:
       return tflite::TensorType_FLOAT16;
+    case tensorflow::DT_BFLOAT16:
+      return tflite::TensorType_BFLOAT16;
     case tensorflow::DT_FLOAT:
       return tflite::TensorType_FLOAT32;
     case tensorflow::DT_DOUBLE:
diff --git a/tensorflow/compiler/mlir/quantization/common/BUILD b/tensorflow/compiler/mlir/quantization/common/BUILD
index 8e4c39b8d5f1b7..8091fe21ef56ff 100644
--- a/tensorflow/compiler/mlir/quantization/common/BUILD
+++ b/tensorflow/compiler/mlir/quantization/common/BUILD
@@ -41,6 +41,7 @@ cc_library(
         "//tensorflow/core:framework_lite",
         "//tensorflow/core/ir/types:Dialect",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
@@ -48,6 +49,7 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Support",
+        "@local_tsl//tsl/platform:protobuf",
     ],
 )
 
@@ -60,6 +62,7 @@ tf_cc_test(
         ":test_base",
         "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc",
         "//tensorflow/compiler/mlir/tensorflow",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:string_view",
@@ -68,6 +71,7 @@ tf_cc_test(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Support",
+        "@local_tsl//tsl/platform:protobuf",
         "@local_tsl//tsl/platform:status_matchers",
         "@stablehlo//:stablehlo_ops",
     ],
@@ -109,6 +113,7 @@ cc_library(
     hdrs = ["test_base.h"],
     compatible_with = get_compatible_with_portable(),
     deps = [
+        ":func",
         "//tensorflow/compiler/mlir/lite:tensorflow_lite",
         "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps",
         "//tensorflow/compiler/mlir/quantization/stablehlo/cc:context",
@@ -122,6 +127,7 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Parser",
         "@llvm-project//mlir:QuantOps",
+        "@llvm-project//mlir:Support",
         "@stablehlo//:stablehlo_ops",
     ],
 )
diff --git a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h
index 42ecca536f54a5..852902e229a9fc 100644
--- a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h
+++ b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h
@@ -15,6 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_ATTRS_AND_CONSTRAINTS_H_
 #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_ATTRS_AND_CONSTRAINTS_H_
 
+#include <array>
 #include <cstdint>
 #include <optional>
 #include <type_traits>
@@ -40,10 +41,19 @@ namespace mlir::quant {
 
 constexpr char kAttrMapAttribute[] = "attr_map";
 
-// TODO: b/238829558 - Populate quantization config based on the
-// QuantizationOptions proto.
-// TODO: b/263449239 - Put the OpSet aliases separately within each file
-using OpSet = tensorflow::quantization::OpSet;
+// Permutation from the NHWC tensor format to NCHW. This is an inverse
+// permutation of `kNchwToNhwcPermutation`.
+inline constexpr std::array<int64_t, 4> kNhwcToNchwPermutation = {0, 3, 1, 2};
+
+// Permutation from the NCHW tensor format to NHWC. This is an inverse
+// permutation of `kNchwToNhwcPermutation`.
+inline constexpr std::array<int64_t, 4> kNchwToNhwcPermutation = {0, 2, 3, 1};
+
+// Permutation from the OIHW (== (output features, input features, height,
+// width)) tensor format to HWIO. This is commonly used to transpose convolution
+// weights represented as OIHW format to HWIO, which is more desirable for
+// certain downstream optimization passes (e.g. XLA).
+inline constexpr std::array<int64_t, 4> kOihwToHwioPermutation = {2, 3, 1, 0};
 
 // Returns true if the value has static shape.
 bool HasStaticShape(Value value);
diff --git a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints_test.cc b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints_test.cc
index 6ec7285a8e7406..f6e633aa4c7861 100644
--- a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints_test.cc
+++ b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints_test.cc
@@ -72,7 +72,7 @@ constexpr absl::string_view kModuleMultipleUses = R"mlir(
   module {
     func.func @main(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} {
       %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32>
-      %1 = stablehlo.subtract %0, %arg2 : tensor<1x3xf32>
+      %1 = stablehlo.subtract %arg2, %0 : tensor<1x3xf32>
       %2 = stablehlo.add %0, %arg2 : tensor<1x3xf32>
       return %2 : tensor<1x3xf32>
     }
@@ -411,9 +411,8 @@ TEST_F(AttrsAndConstraintsTest, HasQuantizableTraitFalse) {
 }
 
 TEST_F(AttrsAndConstraintsTest, IsHybridQuantizedOpTrue) {
-  OwningOpRef<ModuleOp> module_op_ref =
-      ParseModuleOpString(kModuleHybridQuantized);
-  func::FuncOp main_fn = FindMainFuncOp(*module_op_ref);
+  OwningOpRef<ModuleOp> module_op = ParseModuleOpString(kModuleHybridQuantized);
+  func::FuncOp main_fn = FindMainFuncOp(*module_op);
   ASSERT_THAT(main_fn, NotNull());
 
   Operation* dot_general = FindOperationOfType<DotGeneralOp>(main_fn);
@@ -421,9 +420,8 @@ TEST_F(AttrsAndConstraintsTest, IsHybridQuantizedOpTrue) {
 }
 
 TEST_F(AttrsAndConstraintsTest, IsHybridQuantizedOpFalse) {
-  OwningOpRef<ModuleOp> module_op_ref =
-      ParseModuleOpString(kModuleXlaCallModule);
-  func::FuncOp main_fn = FindMainFuncOp(*module_op_ref);
+  OwningOpRef<ModuleOp> module_op = ParseModuleOpString(kModuleXlaCallModule);
+  func::FuncOp main_fn = FindMainFuncOp(*module_op);
   ASSERT_THAT(main_fn, NotNull());
 
   Operation* call_op = FindOperationOfType<TF::XlaCallModuleOp>(main_fn);
@@ -453,17 +451,25 @@ constexpr absl::string_view kModuleDotGeneralBatchMatmul = R"mlir(
 )mlir";
 
 TEST_F(AttrsAndConstraintsTest, DotGeneralFullyConnectedReturnsQuantDim) {
-  OwningOpRef<ModuleOp> module_op_ref =
+  OwningOpRef<ModuleOp> module_op =
       ParseModuleOpString(kModuleDotGeneralFullyConnected);
-  func::FuncOp main_fn = FindMainFuncOp(*module_op_ref);
+  ASSERT_TRUE(module_op);
+
+  func::FuncOp main_fn = FindMainFuncOp(*module_op);
+  ASSERT_THAT(main_fn, NotNull());
+
   auto dot_general_op = *main_fn.getOps<DotGeneralOp>().begin();
   EXPECT_THAT(GetDotGeneralQuantizationDim(dot_general_op), Optional(1));
 }
 
 TEST_F(AttrsAndConstraintsTest, DotGeneralBatchMatmulReturnsNullQuantDim) {
-  OwningOpRef<ModuleOp> module_op_ref =
+  OwningOpRef<ModuleOp> module_op =
       ParseModuleOpString(kModuleDotGeneralBatchMatmul);
-  func::FuncOp main_fn = FindMainFuncOp(*module_op_ref);
+  ASSERT_TRUE(module_op);
+
+  func::FuncOp main_fn = FindMainFuncOp(*module_op);
+  ASSERT_THAT(main_fn, NotNull());
+
   auto dot_general_op = *main_fn.getOps<DotGeneralOp>().begin();
   EXPECT_THAT(GetDotGeneralQuantizationDim(dot_general_op), Eq(std::nullopt));
 }
diff --git a/tensorflow/compiler/mlir/quantization/common/ir/BUILD b/tensorflow/compiler/mlir/quantization/common/ir/BUILD
index 1f62dff9711d80..615f54f70d2373 100644
--- a/tensorflow/compiler/mlir/quantization/common/ir/BUILD
+++ b/tensorflow/compiler/mlir/quantization/common/ir/BUILD
@@ -57,15 +57,24 @@ gentbl_cc_library(
 cc_library(
     name = "QuantOps",
     srcs = [
+        "FakeQuantSupport.cc",
         "QuantOps.cc",
+        "UniformSupport.cc",
+    ],
+    hdrs = [
+        "FakeQuantSupport.h",
+        "QuantOps.h",
+        "UniformSupport.h",
     ],
-    hdrs = ["QuantOps.h"],
     compatible_with = get_compatible_with_portable(),
     deps = [
         ":QuantOpsIncGen",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:BytecodeOpInterface",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:InferTypeOpInterface",
         "@llvm-project//mlir:QuantOps",
+        "@llvm-project//mlir:SideEffectInterfaces",
+        "@llvm-project//mlir:Support",
     ],
 )
diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.cc b/tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.cc
similarity index 88%
rename from tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.cc
rename to tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.cc
index 9b662ebdca8461..292e0eeb3cce71 100644
--- a/tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.cc
+++ b/tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.cc
@@ -13,12 +13,22 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.h"
+#include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
 
 #include "mlir/Dialect/Quant/QuantTypes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
+#include "mlir/IR/Diagnostics.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/Types.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 
-using namespace mlir;
-using namespace mlir::quantfork;
+namespace mlir::quantfork {
 
 static bool getDefaultStorageParams(unsigned numBits, bool narrowRange,
                                     bool isSigned, MLIRContext *ctx,
@@ -121,9 +131,11 @@ static void getNudgedScaleAndZeroPoint(int64_t qmin, int64_t qmax, double rmin,
   assert(nudgedZeroPoint <= qmax);
 }
 
-quant::UniformQuantizedType mlir::quantfork::fakeQuantAttrsToType(
-    Location loc, unsigned numBits, double rmin, double rmax, bool narrowRange,
-    Type expressedType, bool isSigned) {
+quant::UniformQuantizedType fakeQuantAttrsToType(Location loc, unsigned numBits,
+                                                 double rmin, double rmax,
+                                                 bool narrowRange,
+                                                 Type expressedType,
+                                                 bool isSigned) {
   MLIRContext *ctx = expressedType.getContext();
   unsigned flags = isSigned ? quant::QuantizationFlags::Signed : 0;
   Type storageType;
@@ -152,7 +164,7 @@ quant::UniformQuantizedType mlir::quantfork::fakeQuantAttrsToType(
                                                  nudgedZeroPoint, qmin, qmax);
 }
 
-quant::UniformQuantizedPerAxisType mlir::quantfork::fakeQuantAttrsToType(
+quant::UniformQuantizedPerAxisType fakeQuantAttrsToType(
     Location loc, unsigned numBits, int32_t quantizedDimension,
     ArrayRef<double> rmins, ArrayRef<double> rmaxs, bool narrowRange,
     Type expressedType, bool isSigned) {
@@ -198,3 +210,5 @@ quant::UniformQuantizedPerAxisType mlir::quantfork::fakeQuantAttrsToType(
       loc, flags, storageType, expressedType, scales, zeroPoints,
       quantizedDimension, qmin, qmax);
 }
+
+}  // namespace mlir::quantfork
diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.h b/tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h
similarity index 93%
rename from tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.h
rename to tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h
index 6072172eaebe38..335f80782a5e20 100644
--- a/tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.h
+++ b/tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h
@@ -41,8 +41,8 @@ limitations under the License.
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_IR_FAKEQUANTSUPPORT_H_
-#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_IR_FAKEQUANTSUPPORT_H_
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_FAKEQUANTSUPPORT_H_
+#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_FAKEQUANTSUPPORT_H_
 
 #include "mlir/Dialect/Quant/QuantTypes.h"  // from @llvm-project
 
@@ -71,4 +71,4 @@ quant::UniformQuantizedPerAxisType fakeQuantAttrsToType(
 }  // namespace quantfork
 }  // namespace mlir
 
-#endif  // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_IR_FAKEQUANTSUPPORT_H_
+#endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_FAKEQUANTSUPPORT_H_
diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/UniformSupport.cc b/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.cc
similarity index 97%
rename from tensorflow/compiler/mlir/lite/quantization/ir/UniformSupport.cc
rename to tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.cc
index e5c3dd35a27981..5a200241af00dd 100644
--- a/tensorflow/compiler/mlir/lite/quantization/ir/UniformSupport.cc
+++ b/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/mlir/lite/quantization/ir/UniformSupport.h"
+#include "tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h"
 
 #include <numeric>
 
diff --git a/tensorflow/compiler/mlir/lite/quantization/ir/UniformSupport.h b/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h
similarity index 97%
rename from tensorflow/compiler/mlir/lite/quantization/ir/UniformSupport.h
rename to tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h
index 064afb0b36aa13..b6f65e455d0c09 100644
--- a/tensorflow/compiler/mlir/lite/quantization/ir/UniformSupport.h
+++ b/tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_IR_UNIFORMSUPPORT_H_
-#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_IR_UNIFORMSUPPORT_H_
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_UNIFORMSUPPORT_H_
+#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_UNIFORMSUPPORT_H_
 
 #include <utility>
 
@@ -237,4 +237,4 @@ class UniformQuantizedPerAxisValueConverter {
 }  // namespace quantfork
 }  // namespace mlir
 
-#endif  // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_IR_UNIFORMSUPPORT_H_
+#endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_IR_UNIFORMSUPPORT_H_
diff --git a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc
index 9c700ed50bc4d0..050bf45d7b5a46 100644
--- a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc
+++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <string>
 
 #include "absl/container/flat_hash_set.h"
+#include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/match.h"
@@ -69,30 +70,34 @@ constexpr int64_t kDefaultVersion = 9;
 constexpr StringRef kPlatformCpu = "CPU";
 // Name of `tf.XlaCallModule`'s dictionary attribute for keeping the
 // deserialized stablehlo module's attributes.
-constexpr llvm::StringRef kStablehloModuleAttrsAttrName =
-    "_stablehlo_module_attrs";
+constexpr StringRef kStablehloModuleAttrsAttrName = "_stablehlo_module_attrs";
 // Attribute required for running shape refinement pass enabled in XlaCallModule
 // version 8 and above.
-constexpr llvm::StringRef kUsesShapePolymorphismAttr =
-    "jax.uses_shape_polymorphism";
+constexpr StringRef kUsesShapePolymorphismAttr = "jax.uses_shape_polymorphism";
 
-// Checks if the op is inside a lifted function.
-bool IsInLiftedFunc(Operation& op) {
-  return op.getParentOfType<func::FuncOp>()->hasAttr(kFusedFunctionAttr);
+bool IsInLiftedFunc(Operation* op) {
+  if (op == nullptr) return false;
+  return op->getParentOfType<func::FuncOp>()->hasAttr(kFusedFunctionAttr);
+}
+
+bool IsInStableHloOpRegion(Operation* op) {
+  if (op == nullptr) return false;
+  auto parent_op = op->getParentOp();
+  return parent_op != nullptr && stablehlo::IsStablehloOp(parent_op);
 }
 
 // Inserts the function to the symbol table of the module thread-safely.
 StringAttr InsertToSymbolTable(Operation& module, Operation& function,
-                               const std::string& func_name) {
+                               const StringRef func_name) {
   static tensorflow::mutex* mtx = new tensorflow::mutex();
   tensorflow::mutex_lock lock(*mtx);
 
   SymbolTable symbol_table(&module);
-  std::string unique_name = func_name;
+  std::string unique_name = func_name.str();
   int32_t uniquing_counter = 0;
   while (symbol_table.lookup(unique_name) != nullptr) {
     ++uniquing_counter;
-    unique_name = func_name + "_" + std::to_string(uniquing_counter);
+    unique_name = absl::StrCat(func_name.str(), "_", uniquing_counter);
   }
   function.setAttr("sym_name",
                    StringAttr::get(module.getContext(), unique_name));
@@ -101,9 +106,11 @@ StringAttr InsertToSymbolTable(Operation& module, Operation& function,
 
 // Creates the TF::PartitionedCallOp with the given arguments and output types.
 // This function call op is for invoking the TF subgraphs.
-ValueRange createTFPartitionedCallOp(OpBuilder builder, Location location,
-                                     StringRef func_name,
-                                     TypeRange output_types, ValueRange args) {
+ValueRange CreateTFPartitionedCallOp(OpBuilder& builder,
+                                     const Location location,
+                                     const StringRef func_name,
+                                     const TypeRange output_types,
+                                     const ValueRange args) {
   TF::PartitionedCallOp call_op = builder.create<TF::PartitionedCallOp>(
       location, output_types, args,
       FlatSymbolRefAttr::get(builder.getStringAttr(func_name)),
@@ -112,7 +119,7 @@ ValueRange createTFPartitionedCallOp(OpBuilder builder, Location location,
   // Set the attribute to annotate this function call op as a quantizable spot.
   call_op->setAttr(
       kQuantTraitAttrName,
-      builder.getStringAttr(llvm::StringRef(
+      builder.getStringAttr(StringRef(
           std::string(QuantTraitValues[QuantizationTrait::FullyQuantizable]))));
 
   return call_op.getOutput();
@@ -120,10 +127,11 @@ ValueRange createTFPartitionedCallOp(OpBuilder builder, Location location,
 
 // Creates the TF::XlaCallModuleOp with the given arguments and output types.
 // This function call op is for invoking the StableHLO subgraphs.
-ValueRange createTFXlaCallModuleOp(OpBuilder builder, Location location,
-                                   StringRef func_name, TypeRange output_types,
-                                   ValueRange args) {
-  auto ctx = builder.getContext();
+ValueRange CreateTFXlaCallModuleOp(OpBuilder& builder, const Location location,
+                                   const StringRef func_name,
+                                   const TypeRange output_types,
+                                   const ValueRange args) {
+  MLIRContext* ctx = builder.getContext();
   // Collect the shapes of the output to fill up the Sout attribute.
   SmallVector<Attribute> shape_attrs;
   for (const Type result_type : output_types) {
@@ -133,7 +141,7 @@ ValueRange createTFXlaCallModuleOp(OpBuilder builder, Location location,
   auto empty_array_attr = ArrayAttr::get(ctx, {});
   auto platforms = ArrayAttr::get(ctx, {StringAttr::get(ctx, kPlatformCpu)});
 
-  TF::XlaCallModuleOp call_op = builder.create<TF::XlaCallModuleOp>(
+  auto call_op = builder.create<TF::XlaCallModuleOp>(
       location,
       /*output=*/output_types,
       /*args=*/args,
@@ -159,7 +167,7 @@ ValueRange createTFXlaCallModuleOp(OpBuilder builder, Location location,
   // Set the attribute to annotate this function call op as a quantizable spot.
   call_op->setAttr(
       kQuantTraitAttrName,
-      builder.getStringAttr(llvm::StringRef(
+      builder.getStringAttr(StringRef(
           std::string(QuantTraitValues[QuantizationTrait::FullyQuantizable]))));
 
   // Set jax.uses_shape_polymorphism=true to enable shape refinement at runtime.
@@ -172,27 +180,25 @@ ValueRange createTFXlaCallModuleOp(OpBuilder builder, Location location,
 }
 
 // Creates the function call op based on the given call_op_type argument.
-ValueRange createFunctionCallOp(OpBuilder builder, Location location,
-                                FunctionCallOpType call_op_type,
-                                StringRef func_name, TypeRange output_types,
-                                ValueRange args) {
+ValueRange CreateFunctionCallOp(OpBuilder& builder, const Location location,
+                                const FunctionCallOpType call_op_type,
+                                const StringRef func_name,
+                                const TypeRange output_types,
+                                const ValueRange args) {
   switch (call_op_type) {
     case FunctionCallOpType::TFXlaCallModuleOp:
-      return createTFXlaCallModuleOp(builder, location, func_name, output_types,
+      return CreateTFXlaCallModuleOp(builder, location, func_name, output_types,
                                      args);
     case FunctionCallOpType::TFPartitionedCallOp:
-      return createTFPartitionedCallOp(builder, location, func_name,
+      return CreateTFPartitionedCallOp(builder, location, func_name,
                                        output_types, args);
-    default:
-      llvm_unreachable("unhandled call op type");
   }
 }
 
 // Finds ops in the paths from arguments to results. The ops is listed in an
 // order that the former ops shouldn't have any dependencies on the later ones.
-llvm::SmallVector<Operation*> FindOpsFromArgumentsToResults(
-    const llvm::SmallVector<Value>& arguments,
-    const llvm::SmallVector<Value>& results) {
+SmallVector<Operation*> FindOpsFromArgumentsToResults(
+    const ArrayRef<Value> arguments, const ArrayRef<Value> results) {
   std::queue<Value> value_queue;
   for (Value result : results) {
     value_queue.push(result);
@@ -213,7 +219,7 @@ llvm::SmallVector<Operation*> FindOpsFromArgumentsToResults(
     Operation* defining_node = current_value.getDefiningOp();
     if (defining_node == nullptr) continue;
     op_stack.push(defining_node);
-    for (const auto& arg : defining_node->getOperands()) {
+    for (Value arg : defining_node->getOperands()) {
       if (!argument_set.contains(arg.getImpl())) {
         value_queue.push(arg);
       }
@@ -221,7 +227,7 @@ llvm::SmallVector<Operation*> FindOpsFromArgumentsToResults(
   }
 
   // Remove duplicate ops from the op stack.
-  llvm::SmallVector<Operation*> sorted_ops;
+  SmallVector<Operation*> sorted_ops;
   absl::flat_hash_set<Operation*> unique_ops;
   while (!op_stack.empty()) {
     Operation* current_op = op_stack.top();
@@ -243,9 +249,9 @@ llvm::SmallVector<Operation*> FindOpsFromArgumentsToResults(
 // "0:transpose_a,1:transpose_b", where 0 and 1 are the respective attribute
 // identifiers.
 // This function returns success if all attributes could be found.
-LogicalResult SetAttributeMap(
-    MLIRContext& context, const llvm::SmallVector<NamedAttribute>& attributes,
-    const llvm::SmallVector<Operation*>& ops) {
+LogicalResult SetAttributeMap(MLIRContext& context,
+                              const ArrayRef<NamedAttribute> attributes,
+                              const ArrayRef<Operation*> ops) {
   // A map to find which operation an attribute belongs to.
   // The key for this map uses the entire NamedAttribute object, i.e. the
   // {attribute_name, attribute_value} pair.
@@ -270,8 +276,8 @@ LogicalResult SetAttributeMap(
             attr_to_op_map.begin(), attr_to_op_map.end(), [&](auto attr_op) {
               return std::get<0>(attr_op).getName() == attribute.getName();
             }) == attr_to_op_map.end()) {
-      mlir::emitError(UnknownLoc::get(&context),
-                      "Could not find attribute: " + attribute.getName().str());
+      emitError(UnknownLoc::get(&context),
+                "Could not find attribute: " + attribute.getName().str());
       return failure();
     }
 
@@ -293,7 +299,7 @@ LogicalResult SetAttributeMap(
 
       // Append "<identifier>:<attribute_name>". Ex) "0:transpose_a".
       const std::string identifier = std::to_string(idx);
-      const mlir::StringAttr attribute_name = attribute.getName();
+      const StringAttr attribute_name = attribute.getName();
       absl::StrAppend(&new_attr_map_str, identifier, ":", attribute_name.str());
       owner_op->setAttr(kAttrMapAttribute,
                         StringAttr::get(&context, new_attr_map_str));
@@ -303,14 +309,14 @@ LogicalResult SetAttributeMap(
 }
 
 // Creates a function to wrap the section between arguments and results.
-llvm::SmallVector<Value, 4> LiftAsFunctionCall(
-    OpBuilder builder, Location location, FunctionCallOpType call_op_type,
-    StringRef func_name, const llvm::SmallVector<Value>& arguments,
-    const llvm::SmallVector<Value>& results,
-    const llvm::SmallVector<NamedAttribute>& attributes) {
+SmallVector<Value, 4> LiftAsFunctionCall(
+    OpBuilder& builder, const Location location,
+    const FunctionCallOpType call_op_type, const StringRef func_name,
+    const ArrayRef<Value> arguments, const ArrayRef<Value> results,
+    const ArrayRef<NamedAttribute> attributes) {
   MLIRContext* context = builder.getContext();
   if (results.empty()) {
-    mlir::emitError(UnknownLoc::get(context), "No result values specified");
+    emitError(UnknownLoc::get(context), "No result values specified");
     return {};
   }
   Operation* result_op = results[0].getDefiningOp();
@@ -324,10 +330,11 @@ llvm::SmallVector<Value, 4> LiftAsFunctionCall(
   TypeRange result_types{ValueRange{results}};
   auto func_type = FunctionType::get(context, arg_types, result_types);
 
-  llvm::SmallVector<Location> arg_locs;
-  for (const auto& arg : arguments) {
+  SmallVector<Location> arg_locs;
+  for (Value arg : arguments) {
     arg_locs.push_back(arg.getLoc());
   }
+
   auto wrap_func = builder.create<func::FuncOp>(location, func_name, func_type);
   wrap_func.setVisibility(SymbolTable::Visibility::Private);
   // The callee function for TF::XlaCallModuleOp must have this attribute.
@@ -361,34 +368,36 @@ llvm::SmallVector<Value, 4> LiftAsFunctionCall(
     builder.clone(*op, mapping);
   }
 
-  llvm::SmallVector<Value> return_values;
+  SmallVector<Value> return_values;
   for (Value result : results) {
     return_values.push_back(mapping.lookupOrNull(result));
   }
-  builder.create<mlir::func::ReturnOp>(location, return_values);
+  builder.create<func::ReturnOp>(location, return_values);
 
   // Create a function call to the newly created function.
   StringAttr new_func_name =
-      InsertToSymbolTable(*module, *wrap_func, func_name.str());
+      InsertToSymbolTable(*module, *wrap_func, func_name);
   builder.setInsertionPointAfter(result_op);
   ValueRange new_results =
-      createFunctionCallOp(builder, call_op_loc, call_op_type,
+      CreateFunctionCallOp(builder, call_op_loc, call_op_type,
                            new_func_name.getValue(), result_types, arguments);
-  return llvm::SmallVector<Value, 4>(new_results.begin(), new_results.end());
+  return SmallVector<Value, 4>(new_results.begin(), new_results.end());
 }
 
-llvm::SmallVector<Value, 4> LiftAsFunctionCall(
-    OpBuilder builder, Location location, FunctionCallOpType call_op_type,
-    StringRef func_name, const llvm::SmallVector<Value>& arguments,
-    const llvm::SmallVector<Value>& results) {
-  llvm::SmallVector<NamedAttribute> attributes;
+SmallVector<Value, 4> LiftAsFunctionCall(OpBuilder& builder,
+                                         const Location location,
+                                         const FunctionCallOpType call_op_type,
+                                         const StringRef func_name,
+                                         const ArrayRef<Value> arguments,
+                                         const ArrayRef<Value> results) {
+  SmallVector<NamedAttribute> attributes;
   return LiftAsFunctionCall(builder, location, call_op_type, func_name,
                             arguments, results, attributes);
 }
 
-llvm::SmallVector<Value> AppendToVector(
-    const llvm::SmallVector<Value>& arguments, Value append) {
-  llvm::SmallVector<Value> ret(arguments);
+SmallVector<Value> AppendToVector(const ArrayRef<Value> arguments,
+                                  Value append) {
+  SmallVector<Value> ret(arguments);
   ret.push_back(append);
   return ret;
 }
@@ -402,7 +411,7 @@ llvm::SmallVector<Value> AppendToVector(
 //    could process the following equation by setting the attributes properly:
 //    abc,cd->abd.
 // 4. The output should be in the form: [batch dims][lhs dims][rhs dims]
-bool IsEinsumSupportedByXlaDotV2(mlir::StringAttr equation_attr) {
+bool IsEinsumSupportedByXlaDotV2(StringAttr equation_attr) {
   StringRef equation = equation_attr.getValue();
 
   if (!absl::StrContains(equation, "->") || !absl::StrContains(equation, ",") ||
@@ -489,4 +498,15 @@ absl::StatusOr<Method> GetQuantizationMethod(
   return quantization_method;
 }
 
+Method GetQuantizationMethodOrDefault(TF::XlaCallModuleOp xla_call_module_op) {
+  absl::StatusOr<Method> method = GetQuantizationMethod(xla_call_module_op);
+  if (method.status().code() == absl::StatusCode::kInternal) {
+    // This indicates that the `Method` protobuf string is corrupt, but this
+    // function ignores it and returns the default instance.
+    xla_call_module_op->emitError(absl::StrCat(
+        "Failed to get quantization method: ", method.status().ToString()));
+  }
+  return method.ok() ? *method : Method::default_instance();
+}
+
 }  // namespace mlir::quant
diff --git a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h
index f2edd732f50cc5..bd7421d376102b 100644
--- a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h
+++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h
@@ -47,11 +47,16 @@ inline constexpr StringRef kQuantizationMethodAttr = "_quantization_method";
 // function lifting will happen.
 enum FunctionCallOpType { TFPartitionedCallOp = 0, TFXlaCallModuleOp = 1 };
 
-// Checks if the op is inside a lifted function.
-bool IsInLiftedFunc(Operation &op);
+// Checks if an op is inside a lifted function.
+// If the given op pointer is a nullptr, returns false.
+bool IsInLiftedFunc(Operation* op);
 
-// Checks if the given einsum op is supported for XlaDotV2 quantization.
-bool IsEinsumSupportedByXlaDotV2(mlir::StringAttr equation_attr);
+// Checks if the op is inside a StableHLO op with region.
+// If the given op pointer is a nullptr, returns false.
+bool IsInStableHloOpRegion(Operation* op);
+
+// Checks if a given einsum op is supported for XlaDotV2 quantization.
+bool IsEinsumSupportedByXlaDotV2(StringAttr equation_attr);
 
 // Gets the quantization method from the given `XlaCallModuleOp`. It is
 // retrieved from the `kQuantizationMethodAttr` string attribute. Returns
@@ -60,27 +65,35 @@ bool IsEinsumSupportedByXlaDotV2(mlir::StringAttr equation_attr);
 absl::StatusOr<::stablehlo::quantization::Method> GetQuantizationMethod(
     TF::XlaCallModuleOp xla_call_module_op);
 
+// Gets the quantization method from the given `XlaCallModuleOp`. It is
+// retrieved from the `kQuantizationMethodAttr` string attribute. Returns a
+// default instance of `Method` iff the attribute doesn't exist or the attribute
+// contains an invalid textproto for `Method`.
+::stablehlo::quantization::Method GetQuantizationMethodOrDefault(
+    TF::XlaCallModuleOp xla_call_module_op);
+
 // Creates a function to wrap the section between arguments and results.
 // The generated function call op type will be decided by the given call_op_type
 // argument. Currently, it supports TF::XlaCallModuleOp and
 // TF::PartitionedCallOp function call op generations.
-llvm::SmallVector<Value, 4> LiftAsFunctionCall(
-    OpBuilder builder, Location location, FunctionCallOpType call_op_type,
-    StringRef func_name, const llvm::SmallVector<Value> &arguments,
-    const llvm::SmallVector<Value> &results,
-    const llvm::SmallVector<NamedAttribute> &attributes);
+SmallVector<Value, 4> LiftAsFunctionCall(OpBuilder& builder, Location location,
+                                         FunctionCallOpType call_op_type,
+                                         StringRef func_name,
+                                         ArrayRef<Value> arguments,
+                                         ArrayRef<Value> results,
+                                         ArrayRef<NamedAttribute> attributes);
 
 // Same as above but with empty attributes.
-llvm::SmallVector<Value, 4> LiftAsFunctionCall(
-    OpBuilder builder, Location location, FunctionCallOpType call_op_type,
-    StringRef func_name, const llvm::SmallVector<Value> &arguments,
-    const llvm::SmallVector<Value> &results);
+SmallVector<Value, 4> LiftAsFunctionCall(OpBuilder& builder, Location location,
+                                         FunctionCallOpType call_op_type,
+                                         StringRef func_name,
+                                         ArrayRef<Value> arguments,
+                                         ArrayRef<Value> results);
 
 // Add the second argument to the first argument, which is expected to be an
 // argument list.
 // Used to attach bias to einsum argument list.
-llvm::SmallVector<Value> AppendToVector(
-    const llvm::SmallVector<Value> &arguments, Value append);
+SmallVector<Value> AppendToVector(ArrayRef<Value> arguments, Value append);
 
 }  // namespace mlir::quant
 
diff --git a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.td b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.td
index a4437b50ac0cf0..1ca03a803bef4d 100644
--- a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.td
+++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call.td
@@ -59,7 +59,11 @@ class NamedAttr<string attr_name> :
 // Checks if the value is not defined inside a lifted function by checking the
 // `tf_quant.composite_function` attribute.
 def IsNotInLiftedFunc :
-      Constraint<CPred<"!IsInLiftedFunc(*$0.getDefiningOp())">>;
+      Constraint<CPred<"!IsInLiftedFunc($0.getDefiningOp())">>;
+
+// Checks if the value is not inside a StableHLO op with region.
+def IsNotInStableHloOpRegion :
+      Constraint<CPred<"!IsInStableHloOpRegion($0.getDefiningOp())">>;
 
 // Checks if the given einsum op is supported for XlaDotV2 quantization.
 def IsEinsumSupportedByXlaDotV2 :
diff --git a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc
index 3d1285928f5f18..c37a997217d2b7 100644
--- a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc
+++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/algorithm/container.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
@@ -30,11 +31,13 @@ limitations under the License.
 #include "mlir/IR/SymbolTable.h"  // from @llvm-project
 #include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
 #include "tensorflow/compiler/mlir/quantization/common/func.h"
 #include "tensorflow/compiler/mlir/quantization/common/test_base.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
+#include "tsl/platform/protobuf.h"  // IWYU pragma: keep
 #include "tsl/platform/status_matchers.h"
 
 namespace mlir::quant {
@@ -43,10 +46,11 @@ namespace {
 using ::stablehlo::quantization::Method;
 using ::testing::HasSubstr;
 using ::testing::NotNull;
+using ::tsl::protobuf::util::MessageDifferencer;
 using ::tsl::testing::IsOk;
 using ::tsl::testing::StatusIs;
 
-using LiftAsFunctionCallTest = ::mlir::quant::QuantizationTestBase;
+using LiftAsFunctionCallTest = QuantizationTestBase;
 
 constexpr absl::string_view kModuleLifted = R"mlir(
   module {
@@ -65,10 +69,9 @@ TEST_F(LiftAsFunctionCallTest, LiftedFunctionSucceeds) {
       module_op->lookupSymbol<func::FuncOp>("composite_dot_general_fn_1");
   ASSERT_THAT(composite_dot_general_fn, NotNull());
 
-  Operation* dot_general_op =
-      FindOperationOfType<mlir::stablehlo::DotGeneralOp>(
-          composite_dot_general_fn);
-  EXPECT_TRUE(IsInLiftedFunc(*dot_general_op));
+  auto dot_general_op = FindOperationOfType<mlir::stablehlo::DotGeneralOp>(
+      composite_dot_general_fn);
+  EXPECT_TRUE(IsInLiftedFunc(dot_general_op));
 }
 
 constexpr absl::string_view kModuleStableHlo = R"mlir(
@@ -87,7 +90,7 @@ TEST_F(LiftAsFunctionCallTest, FunctionLiftedAsXlaCallModuleOp) {
   func::FuncOp main_fn = FindMainFuncOp(*module_op);
   ASSERT_THAT(main_fn, NotNull());
 
-  Operation* dot_general_op =
+  auto dot_general_op =
       FindOperationOfType<mlir::stablehlo::DotGeneralOp>(main_fn);
 
   const SmallVector<NamedAttribute>& attributes = {
@@ -97,19 +100,20 @@ TEST_F(LiftAsFunctionCallTest, FunctionLiftedAsXlaCallModuleOp) {
               1, mlir::stablehlo::PrecisionAttr::get(
                      ctx_.get(), mlir::stablehlo::Precision::DEFAULT)))),
   };
+  const SmallVector<Value> operands(dot_general_op->getOperands());
+  const SmallVector<Value> results(dot_general_op->getResults());
   Operation* lifted_op =
       LiftAsFunctionCall(builder_, dot_general_op->getLoc(),
                          FunctionCallOpType::TFXlaCallModuleOp,
-                         "composite_dot_general_fn",
-                         dot_general_op->getOperands(),
-                         dot_general_op->getResults(), attributes)[0]
+                         "composite_dot_general_fn", operands, results,
+                         attributes)[0]
           .getDefiningOp();
   const auto entry_function_symbol_ref =
       lifted_op->getAttrOfType<FlatSymbolRefAttr>("_entry_function");
   SymbolTable symbol_table(*module_op);
   auto entry_func = dyn_cast_or_null<func::FuncOp>(
       symbol_table.lookup(entry_function_symbol_ref.getValue()));
-  Operation* lifted_dot_general_op =
+  auto lifted_dot_general_op =
       FindOperationOfType<mlir::stablehlo::DotGeneralOp>(entry_func);
 
   EXPECT_TRUE(isa<TF::XlaCallModuleOp>(lifted_op));
@@ -129,13 +133,14 @@ TEST_F(LiftAsFunctionCallTest, FunctionNoAttrLiftedAsXlaCallModuleOp) {
   func::FuncOp main_fn = FindMainFuncOp(*module_op);
   ASSERT_THAT(main_fn, NotNull());
 
-  Operation* dot_general_op =
+  auto dot_general_op =
       FindOperationOfType<mlir::stablehlo::DotGeneralOp>(main_fn);
+  const SmallVector<Value> operands(dot_general_op->getOperands());
+  const SmallVector<Value> results(dot_general_op->getResults());
   Operation* lifted_op =
-      LiftAsFunctionCall(
-          builder_, dot_general_op->getLoc(),
-          FunctionCallOpType::TFXlaCallModuleOp, "composite_dot_general_fn",
-          dot_general_op->getOperands(), dot_general_op->getResults())[0]
+      LiftAsFunctionCall(builder_, dot_general_op->getLoc(),
+                         FunctionCallOpType::TFXlaCallModuleOp,
+                         "composite_dot_general_fn", operands, results)[0]
           .getDefiningOp();
   EXPECT_TRUE(isa<TF::XlaCallModuleOp>(lifted_op));
   EXPECT_EQ(lifted_op->getAttr("_original_entry_function").cast<StringAttr>(),
@@ -242,5 +247,109 @@ TEST_F(LiftAsFunctionCallTest,
                        HasSubstr("Failed to parse Method from textproto")));
 }
 
+constexpr absl::string_view kFunctionWithRegion =
+    R"mlir(
+  func.func @main(%arg0: tensor<i1>, %arg1: tensor<f32>, %arg2: tensor<f32>) -> tensor<f32> {
+    %if = "stablehlo.if"(%arg0) ({
+      %0 = stablehlo.add %arg1, %arg1 : tensor<f32>
+      stablehlo.return %0 : tensor<f32>
+    }, {
+      %1 = stablehlo.add %arg2, %arg2 : tensor<f32>
+      stablehlo.return %1 : tensor<f32>
+    }) : (tensor<i1>) -> (tensor<f32>)
+    %subtract = stablehlo.subtract %if, %if : tensor<f32>
+    return %subtract : tensor<f32>
+  }
+)mlir";
+
+TEST_F(LiftAsFunctionCallTest, IsInRegionSucceedsWhenOpInsideRegion) {
+  const OwningOpRef<ModuleOp> module_op =
+      ParseModuleOpString(kFunctionWithRegion);
+  ASSERT_TRUE(module_op);
+
+  func::FuncOp main_fn = FindMainFuncOp(*module_op);
+  ASSERT_THAT(main_fn, NotNull());
+
+  auto if_op = FindOperationOfType<mlir::stablehlo::IfOp>(main_fn);
+  Block& block = if_op->getRegion(0).front();
+  Operation& add_op = *absl::c_find_if(block, [](Operation& entry) {
+    return dyn_cast_or_null<::mlir::stablehlo::AddOp>(&entry);
+  });
+  EXPECT_TRUE(IsInStableHloOpRegion(&add_op));
+}
+
+TEST_F(LiftAsFunctionCallTest, IsInRegionFailsWhenOpNotInsideRegion) {
+  const OwningOpRef<ModuleOp> module_op =
+      ParseModuleOpString(kFunctionWithRegion);
+  ASSERT_TRUE(module_op);
+
+  func::FuncOp main_fn = FindMainFuncOp(*module_op);
+  ASSERT_THAT(main_fn, NotNull());
+
+  auto subtract_op = FindOperationOfType<mlir::stablehlo::SubtractOp>(main_fn);
+  EXPECT_FALSE(IsInStableHloOpRegion(subtract_op));
+}
+
+TEST_F(LiftAsFunctionCallTest,
+       GetQuantizationMethodOrDefaultReturnsCorrectMethod) {
+  // Function containing a simple `TF::XlaCallModuleOp` with a valid string
+  // attribute `_quantization_method` set to `"no_quantization { }"`.
+  constexpr absl::string_view kXlaCallModuleOpWithQuantizationMethodAttr =
+      R"mlir(
+    func.func @main(%arg0: tensor<1x1x3xf32>, %arg1: tensor<3x4xf32>) -> tensor<1x1x4xf32> {
+      %0 = "tf.XlaCallModule"(%arg0, %arg1) <{Sout = [#tf_type.shape<1x1x4>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}>
+          {
+            _entry_function = @composite_dot_general_fn_1,
+            _quantization_method = "no_quantization { }",
+            _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}
+          } : (tensor<1x1x3xf32>, tensor<3x4xf32>) -> tensor<1x1x4xf32>
+      return %0 : tensor<1x1x4xf32>
+    }
+  )mlir";
+
+  const OwningOpRef<ModuleOp> module_op =
+      ParseModuleOpString(kXlaCallModuleOpWithQuantizationMethodAttr);
+  ASSERT_TRUE(module_op);
+
+  FailureOr<TF::XlaCallModuleOp> xla_call_module_op =
+      FindFirstOpFromMainFunc<TF::XlaCallModuleOp>(*module_op);
+  ASSERT_TRUE(succeeded(xla_call_module_op));
+
+  // Test that `GetQuantizationMethodOrDefault` returns a valid `Method`
+  // corresponding to `"no_quantization {}"`.
+  const Method method = GetQuantizationMethodOrDefault(*xla_call_module_op);
+  EXPECT_TRUE(method.has_no_quantization());
+}
+
+TEST_F(
+    LiftAsFunctionCallTest,
+    GetQuantizationMethodOrDefaultReturnsDefaultWhenNoQuantizationMethodAttr) {
+  // Function containing a simple `TF::XlaCallModuleOp` that is missing the
+  // "_quantization_method" attribute.
+  constexpr absl::string_view kXlaCallModuleOpWithoutQuantizationMethodAttr =
+      R"mlir(
+    func.func @main(%arg0: tensor<1x1x3xf32>, %arg1: tensor<3x4xf32>) -> tensor<1x1x4xf32> {
+      %0 = "tf.XlaCallModule"(%arg0, %arg1) <{Sout = [#tf_type.shape<1x1x4>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}>
+          {
+            _entry_function = @composite_dot_general_fn_1,
+            _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}
+          } : (tensor<1x1x3xf32>, tensor<3x4xf32>) -> tensor<1x1x4xf32>
+      return %0 : tensor<1x1x4xf32>
+    }
+  )mlir";
+
+  const OwningOpRef<ModuleOp> module_op =
+      ParseModuleOpString(kXlaCallModuleOpWithoutQuantizationMethodAttr);
+  ASSERT_TRUE(module_op);
+
+  FailureOr<TF::XlaCallModuleOp> xla_call_module_op =
+      FindFirstOpFromMainFunc<TF::XlaCallModuleOp>(*module_op);
+  ASSERT_TRUE(succeeded(xla_call_module_op));
+
+  // Test that `GetQuantizationMethodOrDefault` returns the default instance.
+  const Method method = GetQuantizationMethodOrDefault(*xla_call_module_op);
+  EXPECT_TRUE(MessageDifferencer::Equals(method, Method::default_instance()));
+}
+
 }  // namespace
 }  // namespace mlir::quant
diff --git a/tensorflow/compiler/mlir/quantization/common/python/testing.py b/tensorflow/compiler/mlir/quantization/common/python/testing.py
index 78eb2409c70f89..211e08df7d9e4b 100644
--- a/tensorflow/compiler/mlir/quantization/common/python/testing.py
+++ b/tensorflow/compiler/mlir/quantization/common/python/testing.py
@@ -1,5 +1,3 @@
-"""Common testing utilities for quantization libraries."""
-
 # Copyright 2024 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""Common testing utilities for quantization libraries."""
 import itertools
+import os
 from typing import Any, Mapping, Sequence
 
 
@@ -36,3 +36,34 @@ def parameter_combinations(
     for curr in itertools.product(*parameters.values()):
       real_parameters.append(dict(zip(keys, curr)))
   return real_parameters
+
+
+def get_dir_size(path: str = '.') -> int:
+  """Get the total size of files and sub-directories under the path.
+
+  Args:
+    path: Path of a directory or a file to calculate the total size.
+
+  Returns:
+    Total size of the directory or a file.
+  """
+  total = 0
+  for root, _, files in os.walk(path):
+    for filename in files:
+      total += os.path.getsize(os.path.join(root, filename))
+  return total
+
+
+def get_size_ratio(path_a: str, path_b: str) -> float:
+  """Return the size ratio of the given paths.
+
+  Args:
+    path_a: Path of a directory or a file to be the nominator of the ratio.
+    path_b: Path of a directory or a file to be the denominator of the ratio.
+
+  Returns:
+    Ratio of size of path_a / size of path_b.
+  """
+  size_a = get_dir_size(path_a)
+  size_b = get_dir_size(path_b)
+  return size_a / size_b
diff --git a/tensorflow/compiler/mlir/quantization/common/python/testing_test.py b/tensorflow/compiler/mlir/quantization/common/python/testing_test.py
index 3366959456d5fe..9549e10898cedb 100644
--- a/tensorflow/compiler/mlir/quantization/common/python/testing_test.py
+++ b/tensorflow/compiler/mlir/quantization/common/python/testing_test.py
@@ -37,5 +37,27 @@ def test_parameter_combinations(self):
     self.assertIn({'shapes': [3, None], 'has_bias': False}, combinations)
 
 
+class FileSizeTestCase(test.TestCase):
+
+  def setUp(self):
+    super().setUp()
+
+    self.path_a = self.create_tempdir('dir_a').full_path
+    self.create_tempfile(file_path='dir_a/w.txt', content='abcd')
+
+    self.path_b = self.create_tempdir('dir_b').full_path
+    self.create_tempfile(file_path='dir_b/x.txt', content='1234')
+    self.create_tempfile(file_path='dir_b/y.txt', content='56')
+    self.create_tempfile(file_path='dir_b/z.txt', content='78')
+
+  def test_get_dir_size(self):
+    self.assertEqual(testing.get_dir_size(self.path_a), 4)
+    self.assertEqual(testing.get_dir_size(self.path_b), 8)
+
+  def test_get_size_ratio(self):
+    self.assertEqual(testing.get_size_ratio(self.path_a, self.path_b), 0.5)
+    self.assertEqual(testing.get_size_ratio(self.path_b, self.path_a), 2.0)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/BUILD b/tensorflow/compiler/mlir/quantization/common/quantization_lib/BUILD
index d41a189519fd6d..7c68bb0f0c4b04 100644
--- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/BUILD
+++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/BUILD
@@ -31,10 +31,12 @@ cc_library(
         ":quantization_config",
         ":quantization_interfaces_inc_gen",
         "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps",
+        "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/lite/kernels/internal:tensor_utils",
         "//tensorflow/lite/tools/optimize:quantization_utils",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -52,6 +54,7 @@ tf_cc_test(
     srcs = ["quantization_driver_test.cc"],
     deps = [
         ":quantization_lib",
+        "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps",
         "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints",
         "//tensorflow/compiler/mlir/quantization/common:func",
         "//tensorflow/compiler/mlir/quantization/common:test_base",
@@ -62,6 +65,8 @@ tf_cc_test(
         "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:QuantOps",
+        "@llvm-project//mlir:Support",
     ],
 )
 
diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.cc b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.cc
index 962c6656f55b65..327d109946e031 100644
--- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.cc
+++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/Casting.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"  // from @llvm-project
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/Dialect/Quant/QuantTypes.h"  // from @llvm-project
@@ -47,39 +46,44 @@ limitations under the License.
 
 namespace mlir {
 namespace quant {
-
 namespace {
-// This is used to identify an operand or result of an op. The second element
-// of this pair is the index of the operand or result.
-using OpValue = std::pair<mlir::Operation*, int>;
+
+constexpr int32_t kBiasMax = std::numeric_limits<int32_t>::max() / 2;
 
 // Uses the type of `value` to set the initial state of the index-th result if
 // `as_result` is true or index-th operand if `as_result` is false. The state
 // is immutable if the type is a quantized type. Returns the index of this
 // new state in the state vector.
-void InitializeStateForValue(Operation* op, const int index, const Value value,
-                             const bool as_result,
-                             std::vector<QuantState>* states,
-                             llvm::DenseMap<Value, int>* value_to_state,
-                             llvm::DenseMap<OpValue, int>* operand_states,
-                             llvm::DenseMap<OpValue, int>* result_states) {
-  const auto [cached, inserted] = value_to_state->insert({value, 0});
+void InitializeStateForValue(
+    Operation* op, const int index, const Value value, const bool as_result,
+    std::vector<QuantState>& states,
+    DenseMap<Value, QuantizationDriver::QuantStateIndex>& value_to_state,
+    DenseMap<QuantizationDriver::OpWithOperandIndex,
+             QuantizationDriver::QuantStateIndex>& operand_states,
+    DenseMap<QuantizationDriver::OpWithResultIndex,
+             QuantizationDriver::QuantStateIndex>& result_states) {
+  const auto [cached, inserted] = value_to_state.try_emplace(value, 0);
   if (!inserted) {
-    if (as_result)
-      (*result_states)[{op, index}] = cached->second;
-    else
-      (*operand_states)[{op, index}] = cached->second;
+    if (as_result) {
+      result_states[{op, index}] = cached->second;
+    } else {
+      operand_states[{op, index}] = cached->second;
+    }
     return;
   }
-  const QuantParams params =
-      quant::QuantizedType::getQuantizedElementType(value.getType());
-  const bool immutable = !HasQuantParams(params);
-  const int next_state_index = states->size();
-  states->push_back({params, immutable});
-  if (as_result)
-    (*result_states)[{op, index}] = next_state_index;
-  else
-    (*operand_states)[{op, index}] = next_state_index;
+
+  const QuantizedType quantized_type =
+      QuantizedType::getQuantizedElementType(value.getType());
+
+  const bool immutable = quantized_type != nullptr;
+  const QuantizationDriver::QuantStateIndex next_state_index = states.size();
+  states.push_back({quantized_type, immutable});
+  if (as_result) {
+    result_states[{op, index}] = next_state_index;
+  } else {
+    operand_states[{op, index}] = next_state_index;
+  }
+
   cached->second = next_state_index;
 }
 
@@ -87,32 +91,31 @@ void InitializeStateForValue(Operation* op, const int index, const Value value,
 
 void QuantizationDriver::InitializeArgState(const BlockArgument arg,
                                             const Value arg_value) {
-  const auto [cached, inserted] = value_to_state_.insert({arg_value, 0});
+  const auto [cached, inserted] = value_to_state_.try_emplace(arg_value, 0);
   if (!inserted) {
     arg_states_[arg] = cached->second;
     return;
   }
-  const QuantParams params =
-      quant::QuantizedType::getQuantizedElementType(arg_value.getType());
-  const bool immutable = !HasQuantParams(params);
-  const int next_state_index = states_.size();
-  states_.push_back({params, immutable});
+
+  const QuantizedType quantized_type =
+      QuantizedType::getQuantizedElementType(arg_value.getType());
+  const bool immutable = quantized_type != nullptr;
+  const QuantizationDriver::QuantStateIndex next_state_index = states_.size();
+  states_.push_back({quantized_type, immutable});
   arg_states_[arg] = next_state_index;
   cached->second = next_state_index;
 }
 
 void QuantizationDriver::InitializeOperandState(Operation* op, const int index,
                                                 const Value value) {
-  ::mlir::quant::InitializeStateForValue(op, index, value, /*as_result=*/false,
-                                         &states_, &value_to_state_,
-                                         &operand_states_, &result_states_);
+  InitializeStateForValue(op, index, value, /*as_result=*/false, states_,
+                          value_to_state_, operand_states_, result_states_);
 }
 
 void QuantizationDriver::InitializeResultState(Operation* op, const int index,
                                                const Value value) {
-  ::mlir::quant::InitializeStateForValue(op, index, value, /*as_result=*/true,
-                                         &states_, &value_to_state_,
-                                         &operand_states_, &result_states_);
+  InitializeStateForValue(op, index, value, /*as_result=*/true, states_,
+                          value_to_state_, operand_states_, result_states_);
 }
 
 std::unique_ptr<OpQuantSpec> QuantizationDriver::GetQuantSpec(Operation* op) {
@@ -133,11 +136,11 @@ bool QuantizationDriver::IsQuantized(Operation* op) {
 
 bool QuantizationDriver::SetConstantResultParams(Operation* op) {
   DenseFPElementsAttr attr;
-  const Value res = op->getResult(0);
-  if (!matchPattern(res, m_Constant(&attr))) {
+  const Value result = op->getResult(0);
+  if (!matchPattern(result, m_Constant(&attr))) {
     return false;
   }
-  // TODO(fengliuai): make storage_type_width and narrow_range configurable.
+  // TODO: b/323478683 - Make storage_type_width and narrow_range configurable.
   Type final_type;
   const auto it = optimized_weights_.find(op);
   const bool is_weight = it != optimized_weights_.end();
@@ -159,42 +162,44 @@ bool QuantizationDriver::SetConstantResultParams(Operation* op) {
     final_type = GetUniformQuantizedTypeForWeight(
         attr, /*symmetric=*/is_weight && is_signed_,
         /*num_bits=*/8, is_signed_,
-        /*narrow_range_=*/is_weight, legacy_float_scale_);
+        /*narrow_range=*/is_weight, legacy_float_scale_);
   }
-  if (const auto quant_type =
-          final_type.dyn_cast_or_null<quant::QuantizedType>()) {
-    return SetResultParams(op, 0, quant_type);
+  if (const auto quant_type = final_type.dyn_cast_or_null<QuantizedType>();
+      quant_type != nullptr) {
+    return SetResultParams(op, /*result_index=*/0, quant_type);
   }
   return false;
 }
 
-bool QuantizationDriver::SetResultParams(Operation* op, const int res_index,
-                                         const QuantParams params) {
-  auto& state = GetResultQuantState(op, res_index);
-  if (state.params == params) {
+bool QuantizationDriver::SetResultParams(Operation* op, const int result_index,
+                                         const QuantizedType quantized_type) {
+  QuantState& state = GetResultQuantState(op, result_index);
+  if (state.params == quantized_type) {
     return false;
   }
   if (!state.IsEmpty()) {
-    auto& rescales = GetResultRequantizeStates(op, res_index);
+    RequantizeStates& rescales = GetResultRequantizeStates(op, result_index);
     RequantizeState& rescale = rescales.emplace_back();
     rescale.pos = RequantizeState::ON_INPUT;
-    rescale.params = params;
+    rescale.params = quantized_type;
     return true;
   }
-  state.params = params;
-  AddUserToList(op, res_index);
+  state.params = quantized_type;
+  AddUserToList(op, result_index);
   return true;
 }
 
-QuantParams QuantizationDriver::GetBiasParams(
-    Operation* op, const int bias_index, const std::vector<int>& non_biases,
+QuantizedType QuantizationDriver::GetBiasParams(
+    Operation* op, const int bias_index,
+    const ArrayRef<int> non_bias_operand_indices,
     const AccumulatorScaleFunc func) {
   QuantState& bias_state = GetOperandQuantState(op, bias_index);
   if (!bias_state.IsEmpty()) {
     return bias_state.params;
   }
-  std::vector<QuantParams> op_types;
-  op_types.reserve(non_biases.size());
+  std::vector<QuantizedType> op_types{};
+  op_types.reserve(non_bias_operand_indices.size());
+
   int adjusted_quant_dim = -1;
   if (op->getNumOperands() > bias_index) {
     // Some kernels allow 1D bias, broadcasting it inside the kernel. In this
@@ -211,68 +216,75 @@ QuantParams QuantizationDriver::GetBiasParams(
     }
   }
 
-  for (int non_bias : non_biases) {
-    const QuantState& non_bias_type = GetOperandQuantState(op, non_bias);
-    op_types.push_back(non_bias_type.params);
+  for (const int non_bias_operand_index : non_bias_operand_indices) {
+    const QuantState& non_bias_state =
+        GetOperandQuantState(op, non_bias_operand_index);
+    op_types.push_back(non_bias_state.params);
   }
   return func(op_types, adjusted_quant_dim, legacy_float_scale_);
 }
 
-bool QuantizationDriver::SetOperandParams(Operation* op, const int index,
-                                          const QuantParams params,
+bool QuantizationDriver::SetOperandParams(Operation* op,
+                                          const int operand_index,
+                                          const QuantizedType quantized_type,
                                           const bool override) {
-  auto& state = GetOperandQuantState(op, index);
-  if (state.params == params) {
+  QuantState& state = GetOperandQuantState(op, operand_index);
+  if (state.params == quantized_type) {
     return false;
   }
 
   if (!state.IsEmpty() && !override) {
-    auto& rescales = GetOperandRequantizeStates(op, index);
+    RequantizeStates& rescales = GetOperandRequantizeStates(op, operand_index);
     for (RequantizeState& rescale : rescales) {
-      if (rescale.params == params) {
-        rescale.users.emplace_back(op, index);
+      if (rescale.params == quantized_type) {
+        rescale.users.emplace_back(op, operand_index);
         return true;
       }
     }
     RequantizeState& rescale = rescales.emplace_back();
     rescale.pos = RequantizeState::ON_OUTPUT;
-    rescale.params = params;
-    rescale.users.emplace_back(op, index);
+    rescale.params = quantized_type;
+    rescale.users.emplace_back(op, operand_index);
     return true;
   }
 
-  state.params = params;
-  AddOperandToList(op, index);
+  state.params = quantized_type;
+  AddOperandToList(op, operand_index);
   return true;
 }
 
-void QuantizationDriver::QuantizeOpResult(Operation* op, const int index,
-                                          const QuantParams params) {
+void QuantizationDriver::QuantizeOpResult(Operation* op, const int result_index,
+                                          const QuantizedType quantized_type) {
   builder_.setInsertionPointAfter(op);
-  const Value original_result = op->getResult(index);
-  QuantizeValue(original_result, params, op->getLoc());
+  const Value original_result = op->getResult(result_index);
+  QuantizeValue(original_result, quantized_type, op->getLoc());
 }
 
-void QuantizationDriver::QuantizeArg(BlockArgument arg, QuantParams params) {
+void QuantizationDriver::QuantizeArg(BlockArgument arg,
+                                     const QuantizedType quantized_type) {
   builder_.setInsertionPointToStart(arg.getOwner());
-  QuantizeValue(arg, params, builder_.getUnknownLoc());
+  QuantizeValue(arg, quantized_type, builder_.getUnknownLoc());
 }
 
-void QuantizationDriver::QuantizeValue(Value value, QuantParams params,
-                                       Location loc) {
+void QuantizationDriver::QuantizeValue(Value value,
+                                       QuantizedType quantized_type,
+                                       const Location loc) {
   const Type expressed_type = value.getType();
-  const Type new_type = params.castFromExpressedType(expressed_type);
-  // This value isn't an expressed type (float), skip.
-  if (!new_type) return;
+  const Type new_value_type =
+      quantized_type.castFromExpressedType(expressed_type);
+  // Skip if `value` or `value`'s element type doesn't match the expressed type
+  // of `quantized_type`.
+  if (new_value_type == nullptr) return;
+
   auto quantize =
-      builder_.create<quantfork::QuantizeCastOp>(loc, new_type, value);
+      builder_.create<quantfork::QuantizeCastOp>(loc, new_value_type, value);
   auto dequantize = builder_.create<quantfork::DequantizeCastOp>(
       loc, expressed_type, quantize.getResult());
 
   // This attribute is set to distinguish the quantize ops being added by the
   // quantization pass. These ops can be removed without losing original
   // program accuracy.
-  // TODO(fengliuai): make the attribute being part of op definition.
+  // TODO: b/323478683 - Make the attribute being part of op definition.
   quantize->setAttr(kVolatileOpAttrName, builder_.getUnitAttr());
 
   // `original_result` has a use to `quantize`, so this will replace that use
@@ -281,17 +293,18 @@ void QuantizationDriver::QuantizeValue(Value value, QuantParams params,
   quantize.getOperation()->replaceUsesOfWith(dequantize, value);
 }
 
-void QuantizationDriver::RequantizeOpResult(Operation* op, const int index,
-                                            RequantizeStates* states) {
-  if (states->empty()) return;
+void QuantizationDriver::RequantizeOpResult(Operation* op,
+                                            const int result_index,
+                                            RequantizeStates& states) {
+  if (states.empty()) return;
 
   builder_.setInsertionPointAfter(op);
-  Value value = op->getResult(index);
-  RequantizeState::RequantizePosition pos = states->front().pos;
+  Value value = op->getResult(result_index);
+  RequantizeState::RequantizePosition pos = states.front().pos;
   if (pos == RequantizeState::NO_REQUANTIZE) {
     return;
   }
-  for (auto& state : *states) {
+  for (const RequantizeState& state : states) {
     // Check that all requantization positions are the same for each state.
     // Unsure if this check is required.
     if (state.pos != pos) {
@@ -300,7 +313,7 @@ void QuantizationDriver::RequantizeOpResult(Operation* op, const int index,
   }
   if (pos == RequantizeState::ON_OUTPUT) {
     Operation* user = value.getUses().begin().getUser();
-    if (llvm::isa<quantfork::QuantizeCastOp>(user)) {
+    if (isa<quantfork::QuantizeCastOp>(user)) {
       // The requantize op is inserted between `quantize` and `dequantize` ops.
       value = user->getResult(0);
       builder_.setInsertionPointAfter(user);
@@ -310,12 +323,12 @@ void QuantizationDriver::RequantizeOpResult(Operation* op, const int index,
 }
 
 void QuantizationDriver::RequantizeArg(const BlockArgument arg,
-                                       RequantizeStates* states) {
+                                       RequantizeStates& states) {
   Value value = arg;
   builder_.setInsertionPointToStart(arg.getOwner());
   if (value.hasOneUse()) {
     Operation* user = value.use_begin().getUser();
-    if (auto q = llvm::dyn_cast<quantfork::QuantizeCastOp>(user)) {
+    if (auto q = dyn_cast<quantfork::QuantizeCastOp>(user)) {
       value = q.getResult();
       builder_.setInsertionPoint(arg.getOwner(), ++Block::iterator(user));
     }
@@ -323,14 +336,13 @@ void QuantizationDriver::RequantizeArg(const BlockArgument arg,
   RequantizeValue(value, states, builder_.getUnknownLoc());
 }
 
-void QuantizationDriver::RequantizeValue(Value value, RequantizeStates* states,
+void QuantizationDriver::RequantizeValue(Value value, RequantizeStates& states,
                                          const Location loc) {
-  if (states->empty() ||
-      states->front().pos == RequantizeState::NO_REQUANTIZE) {
+  if (states.empty() || states.front().pos == RequantizeState::NO_REQUANTIZE) {
     return;
   }
-  if (states->front().pos == RequantizeState::ON_INPUT) {
-    auto& state = states->front();
+  if (states.front().pos == RequantizeState::ON_INPUT) {
+    RequantizeState& state = states.front();
     const Type expressed_type = value.getType();
     // The value needs to be requantized. A Quantize op will be created to use
     // it as the operand and replace its uses.
@@ -350,7 +362,7 @@ void QuantizationDriver::RequantizeValue(Value value, RequantizeStates* states,
   if (!value.hasOneUse()) {
     return;
   }
-  auto dequant_op = llvm::dyn_cast_or_null<quantfork::DequantizeCastOp>(
+  auto dequant_op = dyn_cast_or_null<quantfork::DequantizeCastOp>(
       value.use_begin().getUser());
   if (!dequant_op) {
     return;
@@ -363,10 +375,9 @@ void QuantizationDriver::RequantizeValue(Value value, RequantizeStates* states,
   // Whether to replace quantization params of the first dequantize op
   // after the quantized value is produced.
   // If there is a use other than the requantize states, then we can't clobber.
-  bool clobber_first = num_uses <= states->size();
-  for (auto& state : *states) {
-    Type expressed_type =
-        quant::QuantizedType::castToExpressedType(value.getType());
+  bool clobber_first = num_uses <= states.size();
+  for (RequantizeState& state : states) {
+    Type expressed_type = QuantizedType::castToExpressedType(value.getType());
     if (!expressed_type) continue;
     // The value needs to be requantized. A Quantize op will be created to use
     // it as the operand and replace its uses.
@@ -384,8 +395,8 @@ void QuantizationDriver::RequantizeValue(Value value, RequantizeStates* states,
     } else {
       auto new_dequant_op = builder_.create<quantfork::DequantizeCastOp>(
           loc, dequant_op.getResult().getType(), requantize_op.getResult());
-      for (auto& op_index : state.users) {
-        op_index.first->setOperand(op_index.second, new_dequant_op.getResult());
+      for (auto [op, operand_idx] : state.users) {
+        op->setOperand(operand_idx, new_dequant_op.getResult());
       }
     }
   }
@@ -400,12 +411,12 @@ void QuantizationDriver::RequantizeValue(Value value, RequantizeStates* states,
 // - use the single input if it is ready, or,
 // - use the single output if it is ready, or,
 // - use the first ready one in the collection.
-QuantParams QuantizationDriver::GetQuantParamsForSameScaleConstraint(
+QuantizedType QuantizationDriver::GetQuantParamsForSameScaleConstraint(
     Operation* op) {
   // Two vector to collect Non-empty operands and results states.
   std::vector<QuantState*> mutable_states, immutable_states;
   for (int i = 0; i < op->getNumOperands(); ++i) {
-    auto& state = GetOperandQuantState(op, i);
+    QuantState& state = GetOperandQuantState(op, i);
     if (state.immutable) {
       immutable_states.push_back(&state);
     } else if (!state.IsEmpty()) {
@@ -422,7 +433,7 @@ QuantParams QuantizationDriver::GetQuantParamsForSameScaleConstraint(
   }
 
   for (int i = 0; i < op->getNumResults(); ++i) {
-    auto& state = GetResultQuantState(op, i);
+    QuantState& state = GetResultQuantState(op, i);
     if (state.immutable) {
       immutable_states.push_back(&state);
     } else if (!state.IsEmpty()) {
@@ -476,14 +487,11 @@ void QuantizationDriver::PreprocessConstantOps() {
 
     // The following loop will change the value uses, thus we cache all the uses
     // needs to be changed.
-    llvm::SmallVector<std::pair<Operation*, int>> uses;
-    for (auto& use : value.getUses()) {
+    SmallVector<std::pair<Operation*, int>> uses;
+    for (OpOperand& use : value.getUses()) {
       uses.push_back({use.getOwner(), use.getOperandNumber()});
     }
-    for (const auto& indexed_use : llvm::enumerate(uses)) {
-      Operation* user = indexed_use.value().first;
-      const int operand_num = indexed_use.value().second;
-
+    for (const auto [user, operand_num] : uses) {
       const std::unique_ptr<OpQuantSpec> spec = GetQuantSpec(user);
       const std::unique_ptr<OpQuantScaleSpec> scale_spec =
           GetQuantScaleSpec(user);
@@ -493,9 +501,9 @@ void QuantizationDriver::PreprocessConstantOps() {
       // other values. So any constants which are not bias, an operand of an
       // op with same scale requirements, and haven't been quantized are
       // weights.
-      if (biases.find(operand_num) == biases.end() &&
+      if (!biases.contains(operand_num) &&
           !scale_spec->has_same_scale_requirement &&
-          !llvm::dyn_cast<quantfork::QuantizeCastOp>(user)) {
+          !dyn_cast<quantfork::QuantizeCastOp>(user)) {
         // Needs to scan the content of weights to get the quantization
         // parameters if there are no quantization parameters (FakeQuant ops).
         // For this case, the weight will not be duplicated.
@@ -511,9 +519,9 @@ void QuantizationDriver::PreprocessConstantOps() {
         // other values. Duplicate this constant in case it is shared by
         // different users.
         if (uses.size() > 1) {
-          auto new_cst =
+          auto new_constant_op =
               builder_.create<arith::ConstantOp>(cst.getLoc(), cst.getValue());
-          user->setOperand(operand_num, new_cst);
+          user->setOperand(operand_num, new_constant_op);
         }
       }
     }
@@ -521,13 +529,13 @@ void QuantizationDriver::PreprocessConstantOps() {
 }
 
 void QuantizationDriver::SetupAllStates() {
-  for (auto arg : fn_.getArguments()) {
+  for (BlockArgument arg : fn_.getArguments()) {
     args_.push_back(arg);
     Value value = arg;
     // If the argument is quantized, it should only has one user.
     if (arg.hasOneUse()) {
       Operation* user = value.use_begin().getUser();
-      if (auto q = llvm::dyn_cast<quantfork::QuantizeCastOp>(user)) {
+      if (auto q = dyn_cast<quantfork::QuantizeCastOp>(user)) {
         value = q.getResult();
       }
     }
@@ -543,29 +551,29 @@ void QuantizationDriver::SetupAllStates() {
 
     for (int i = 0; i < op->getNumOperands(); ++i) {
       Value operand = op->getOperand(i);
-      if (auto* inst = operand.getDefiningOp()) {
+      if (Operation* inst = operand.getDefiningOp()) {
         // If the operand comes from a `quantfork::DequantizeCastOp`, we use
         // the quantized input of this `quantfork::DequantizeCastOp` to set the
         // state.
-        if (auto dq = llvm::dyn_cast<quantfork::DequantizeCastOp>(inst)) {
+        if (auto dq = dyn_cast<quantfork::DequantizeCastOp>(inst)) {
           operand = dq.getArg();
         }
       }
       InitializeOperandState(op, i, operand);
     }
 
-    for (int res = 0; res < op->getNumResults(); ++res) {
-      Value result = op->getResult(res);
+    for (int i = 0; i < op->getNumResults(); ++i) {
+      Value result = op->getResult(i);
       // If the result has been quantized, it should only be used by a
       // `quantfork::QuantizeCastOp`. For this case, we uses the quantized
       // result to create the state and mark it immutable.
       if (result.hasOneUse()) {
         Operation* user = result.use_begin().getUser();
-        if (auto q = llvm::dyn_cast<quantfork::QuantizeCastOp>(user)) {
+        if (auto q = dyn_cast<quantfork::QuantizeCastOp>(user)) {
           result = q.getResult();
         }
       }
-      InitializeResultState(op, res, result);
+      InitializeResultState(op, i, result);
     }
   });
 }
@@ -577,7 +585,7 @@ arith::ConstantOp QuantizationDriver::DuplicateConstantOpIfNeeded(
   }
   OpBuilder builder(op->getContext());
   builder.setInsertionPointAfter(op);
-  arith::ConstantOp new_op = llvm::cast<arith::ConstantOp>(builder.clone(*op));
+  arith::ConstantOp new_op = cast<arith::ConstantOp>(builder.clone(*op));
   target_op->getOpOperand(operand_index).set(new_op.getResult());
   InitializeOperandState(target_op, operand_index, new_op.getResult());
   InitializeResultState(new_op, 0, new_op.getResult());
@@ -585,13 +593,13 @@ arith::ConstantOp QuantizationDriver::DuplicateConstantOpIfNeeded(
 }
 
 bool QuantizationDriver::ShouldCheckBiasScale(
-    Operation* op, const int bias_index, const std::vector<int>& input_indices,
-    const QuantParams params, int& input_index, int& filter_index) {
+    Operation* op, const int bias_index, ArrayRef<int> input_indices,
+    const QuantizedType quantized_type, int& input_index, int& filter_index) {
   // For now, restrict scale adjustment to ops with affine quantized weights,
   // and having weights and biases as constants. This currently only applies to
   // FC and Conv* ops. Restriction for the weight can be relaxed if there are
   // needs for adjusting scale of variable weights.
-  auto affine_op = llvm::dyn_cast<AffineQuantizedOpInterface>(op);
+  auto affine_op = dyn_cast<AffineQuantizedOpInterface>(op);
   auto bias_op = op->getOperand(bias_index).getDefiningOp<arith::ConstantOp>();
   if (!affine_op || !bias_op || input_indices.size() != 2) return false;
   if (!bias_op.getValue().isa<DenseFPElementsAttr>()) return false;
@@ -607,22 +615,20 @@ bool QuantizationDriver::ShouldCheckBiasScale(
     return false;
   }
 
-  const auto input_state = GetOperandQuantState(op, input_index);
-  const auto filter_state = GetOperandQuantState(op, filter_index);
+  const QuantState& input_state = GetOperandQuantState(op, input_index);
+  const QuantState& filter_state = GetOperandQuantState(op, filter_index);
   // If quantization parameter for the filter is fixed, should return it as-is.
   // Only checks ops with 8-bit input and weights, and 32-bit biases.
-  if (!(input_state.params.getStorageTypeIntegralWidth() == 8 &&
-        filter_state.params.getStorageTypeIntegralWidth() == 8 &&
-        params.getStorageTypeIntegralWidth() == 32)) {
-    return false;
-  }
-  return true;
+  return input_state.params.getStorageTypeIntegralWidth() == 8 &&
+         filter_state.params.getStorageTypeIntegralWidth() == 8 &&
+         quantized_type.getStorageTypeIntegralWidth() == 32;
 }
 
 bool QuantizationDriver::SetBiasParamsWithAdjustments(
-    Operation* op, const int bias_index, const std::vector<int>& input_indices,
-    const QuantParams params) {
+    Operation* op, const int bias_index, ArrayRef<int> input_indices,
+    const QuantizedType params) {
   bool changed = false;
+
   int input_index;
   int filter_index;
   if (!ShouldCheckBiasScale(op, bias_index, input_indices, params, input_index,
@@ -630,8 +636,8 @@ bool QuantizationDriver::SetBiasParamsWithAdjustments(
     return SetOperandParams(op, bias_index, params);
   }
 
-  quant::QuantState input_state = GetOperandQuantState(op, input_index);
-  quant::QuantState filter_state = GetOperandQuantState(op, filter_index);
+  QuantState input_state = GetOperandQuantState(op, input_index);
+  QuantState filter_state = GetOperandQuantState(op, filter_index);
   auto bias_op = op->getOperand(bias_index).getDefiningOp<arith::ConstantOp>();
   const double input_scale =
       input_state.params.cast<UniformQuantizedType>().getScale();
@@ -639,15 +645,15 @@ bool QuantizationDriver::SetBiasParamsWithAdjustments(
   auto bias_values = bias_op.getValue().cast<DenseFPElementsAttr>();
   // Restrict maximum absolute value of bias within INT_MAX / 2, to make some
   // room for accumulator.
-  const int32_t kBiasMax = std::numeric_limits<int32_t>::max() / 2;
-  if (auto bias_params = params.dyn_cast<UniformQuantizedType>()) {
+  if (auto bias_quantized_type = params.dyn_cast<UniformQuantizedType>();
+      bias_quantized_type != nullptr) {
     double bias_half_range = 0.0f;
     for (auto bias : bias_values.getValues<APFloat>()) {
       if (bias_half_range < std::abs(bias.convertToFloat())) {
         bias_half_range = std::abs(bias.convertToFloat());
       }
     }
-    if (bias_half_range / bias_params.getScale() < kBiasMax) {
+    if (bias_half_range / bias_quantized_type.getScale() < kBiasMax) {
       return SetOperandParams(op, bias_index, params);
     }
     const double new_bias_scale =
@@ -659,30 +665,36 @@ bool QuantizationDriver::SetBiasParamsWithAdjustments(
             bias_op->getLoc(), params.getFlags(), params.getStorageType(),
             params.getExpressedType(), new_bias_scale, 0,
             params.getStorageTypeMin(), params.getStorageTypeMax()));
-    auto filter_op = DuplicateConstantOpIfNeeded(
+    arith::ConstantOp filter_op = DuplicateConstantOpIfNeeded(
         op->getOperand(filter_index).getDefiningOp<arith::ConstantOp>(), op,
         filter_index);
     if (!filter_op) {
       return SetOperandParams(op, bias_index, params);
     }
 
-    const auto filter_param = filter_state.params.cast<UniformQuantizedType>();
+    const auto filter_quantized_type =
+        filter_state.params.cast<UniformQuantizedType>();
     changed |= SetOperandParams(
         op, filter_index,
         UniformQuantizedType::getChecked(
-            filter_op->getLoc(), filter_param.getFlags(),
-            filter_param.getStorageType(), filter_param.getExpressedType(),
-            new_bias_scale / input_scale, 0, filter_param.getStorageTypeMin(),
-            filter_param.getStorageTypeMax()),
+            filter_op->getLoc(), filter_quantized_type.getFlags(),
+            filter_quantized_type.getStorageType(),
+            filter_quantized_type.getExpressedType(),
+            new_bias_scale / input_scale, 0,
+            filter_quantized_type.getStorageTypeMin(),
+            filter_quantized_type.getStorageTypeMax()),
         /*override=*/true);
-  } else if (auto bias_params =
-                 params.dyn_cast<quant::UniformQuantizedPerAxisType>()) {
-    const auto filter_params =
+  } else if (auto bias_quantized_type =
+                 params.dyn_cast<quant::UniformQuantizedPerAxisType>();
+             bias_quantized_type != nullptr) {
+    const auto filter_quantized_type =
         filter_state.params.cast<quant::UniformQuantizedPerAxisType>();
-    std::vector<double> new_bias_scales = bias_params.getScales().vec();
-    std::vector<double> new_filter_scales = filter_params.getScales().vec();
+    std::vector<double> new_bias_scales = bias_quantized_type.getScales().vec();
+    std::vector<double> new_filter_scales =
+        filter_quantized_type.getScales().vec();
+
     bool needs_adjustment = false;
-    for (int i = 0; i < bias_params.getScales().size(); ++i) {
+    for (int i = 0; i < bias_quantized_type.getScales().size(); ++i) {
       const float abs_bias = std::abs(bias_values.getValues<float>()[i]);
       if (abs_bias / new_bias_scales[i] > kBiasMax) {
         new_bias_scales[i] = static_cast<double>(abs_bias) / kBiasMax;
@@ -698,21 +710,23 @@ bool QuantizationDriver::SetBiasParamsWithAdjustments(
         quant::UniformQuantizedPerAxisType::getChecked(
             bias_op->getLoc(), params.getFlags(), params.getStorageType(),
             params.getExpressedType(), new_bias_scales,
-            bias_params.getZeroPoints(), bias_params.getQuantizedDimension(),
+            bias_quantized_type.getZeroPoints(),
+            bias_quantized_type.getQuantizedDimension(),
             params.getStorageTypeMin(), params.getStorageTypeMax()));
 
-    auto filter_op = DuplicateConstantOpIfNeeded(
+    arith::ConstantOp filter_op = DuplicateConstantOpIfNeeded(
         op->getOperand(filter_index).getDefiningOp<arith::ConstantOp>(), op,
         filter_index);
     changed |= SetOperandParams(
         op, filter_index,
         quant::UniformQuantizedPerAxisType::getChecked(
-            filter_op->getLoc(), filter_params.getFlags(),
-            filter_params.getStorageType(), filter_params.getExpressedType(),
-            new_filter_scales, filter_params.getZeroPoints(),
-            filter_params.getQuantizedDimension(),
-            filter_params.getStorageTypeMin(),
-            filter_params.getStorageTypeMax()),
+            filter_op->getLoc(), filter_quantized_type.getFlags(),
+            filter_quantized_type.getStorageType(),
+            filter_quantized_type.getExpressedType(), new_filter_scales,
+            filter_quantized_type.getZeroPoints(),
+            filter_quantized_type.getQuantizedDimension(),
+            filter_quantized_type.getStorageTypeMin(),
+            filter_quantized_type.getStorageTypeMax()),
         /*override=*/true);
   }
   return changed;
@@ -720,12 +734,12 @@ bool QuantizationDriver::SetBiasParamsWithAdjustments(
 
 // This method scans the operations in the function to setup the initial
 // states for quantization parameter propagation.
-// TODO(fengliuai): This algorithm assumes there are only one pair of
+// TODO: b/323478683 - This algorithm assumes there are only one pair of
 // `quantfork::QuantizeCastOp` and `quantfork::DequantizeCastOp` ops between two
 // quantizable ops. A sanity check should be applied.
 void QuantizationDriver::Initialize() {
   // Duplicate the bias constant, so the states can be setup correctly.
-  // TODO(fengliuai): Function definition should also be duplicated if there
+  // TODO: b/323478683 - Function definition should also be duplicated if there
   // are multiple call sites.
   PreprocessConstantOps();
 
@@ -736,21 +750,21 @@ void QuantizationDriver::Initialize() {
 // Propagates the quantization parameters to the operands, results, and biases.
 // TODO: b/323478683 - Do not use while loop to handle this logic.
 bool QuantizationDriver::PropagateParamsAndReturnIfChanged() {
-  // TODO(fengliuai): uses a typed indicator instead of a bool value.
+  // TODO: b/323478683 - Use a typed indicator instead of a bool value.
   bool changed = false;
   while (!work_list_.empty()) {
     Operation* op = work_list_.back();
     work_list_.pop_back();
 
     // This op has been quantized, so we should not consider it again.
-    if (llvm::is_contained(quantized_, op)) continue;
+    if (quantized_.contains(op)) continue;
     quantized_.insert(op);
 
-    if (auto cst = llvm::dyn_cast<arith::ConstantOp>(op)) {
+    if (auto constant_op = dyn_cast<arith::ConstantOp>(op); constant_op) {
       // If the workflow requires inferring ranges from the content
       // (post-training quantization) and it is weight (filter) and hasn't
       // been quantized, we infer the quantization parameters from the content.
-      if (infer_tensor_range_ && IsWeight(cst) && !IsQuantized(op)) {
+      if (infer_tensor_range_ && IsWeight(constant_op) && !IsQuantized(op)) {
         // The quantization parameters are determined by the content of the
         // constant.
         changed |= SetConstantResultParams(op);
@@ -761,7 +775,7 @@ bool QuantizationDriver::PropagateParamsAndReturnIfChanged() {
     std::unique_ptr<OpQuantScaleSpec> scale_spec = GetQuantScaleSpec(op);
 
     if (scale_spec->has_same_scale_requirement) {
-      const auto params = GetQuantParamsForSameScaleConstraint(op);
+      const QuantizedType params = GetQuantParamsForSameScaleConstraint(op);
       // The quantization parameters haven't been propagated to any operands
       // or results. Skip this node for now.
       if (!params) {
@@ -792,12 +806,13 @@ bool QuantizationDriver::PropagateParamsAndReturnIfChanged() {
       }
 
       // Use the final state to set all the results' parameters.
-      for (int res = 0; res < op->getNumResults(); ++res)
-        if (auto type = op->getResult(res).getType().dyn_cast<ShapedType>()) {
+      for (int i = 0; i < op->getNumResults(); ++i)
+        if (auto type = op->getResult(i).getType().dyn_cast<ShapedType>();
+            type != nullptr) {
           // Without this check, it will accidentally propagate the quantization
           // information by the shared non-float-tensors.
           if (type.getElementType().isa<FloatType>())
-            changed |= SetResultParams(op, res, params);
+            changed |= SetResultParams(op, i, params);
         }
     }
 
@@ -807,8 +822,8 @@ bool QuantizationDriver::PropagateParamsAndReturnIfChanged() {
         !is_qdq_conversion_) {
       // Infer ranges from the activation ops. This is usually required for
       // the post-training quantization workflow.
-      // TODO(fengliuai): different result can have different fixed range.
-      const auto params =
+      // TODO: b/323478683 - Different result can have different fixed range.
+      const QuantizedType params =
           scale_spec->fixed_output_range_func(is_signed_, bit_width_);
       for (auto i = 0; i < op->getNumResults(); ++i) {
         // The range is null if the result has been quantized.
@@ -818,16 +833,20 @@ bool QuantizationDriver::PropagateParamsAndReturnIfChanged() {
       }
     }
 
-    const auto spec = GetQuantSpec(op);
-    for (auto& it : spec->biases_params) {
-      const auto params =
-          GetBiasParams(op, it.first, it.second.first, it.second.second);
+    const std::unique_ptr<OpQuantSpec> spec = GetQuantSpec(op);
+    for (const auto& [bias_operand_idx, non_bias_params] :
+         spec->biases_params) {
+      const auto& [non_bias_operand_indices, accumulator_scale_func] =
+          non_bias_params;
+      const QuantizedType params =
+          GetBiasParams(op, bias_operand_idx, non_bias_operand_indices,
+                        accumulator_scale_func);
       if (!params) {
         quantized_.erase(op);
         continue;
       }
-      changed |=
-          SetBiasParamsWithAdjustments(op, it.first, it.second.first, params);
+      changed |= SetBiasParamsWithAdjustments(op, bias_operand_idx,
+                                              non_bias_operand_indices, params);
     }
   }
 
@@ -836,9 +855,9 @@ bool QuantizationDriver::PropagateParamsAndReturnIfChanged() {
 
 // Finalizes the arguments and result states in the function.
 void QuantizationDriver::Finalize() {
-  for (auto arg : args_) {
-    auto& state = GetArgQuantState(arg);
-    auto& requantizes = GetArgRequantizeStates(arg);
+  for (BlockArgument arg : args_) {
+    const QuantState& state = GetArgQuantState(arg);
+    RequantizeStates& requantizes = GetArgRequantizeStates(arg);
     if (state.IsEmpty() || (state.immutable && requantizes.empty())) {
       continue;
     }
@@ -848,25 +867,24 @@ void QuantizationDriver::Finalize() {
     }
 
     if (!requantizes.empty()) {
-      RequantizeArg(arg, &requantizes);
+      RequantizeArg(arg, requantizes);
     }
   }
 
-  for (auto it : result_states_) {
-    Operation* op = it.first.first;
-    const int res_index = it.first.second;
-    auto& state = GetResultQuantState(op, res_index);
-    auto& requantizes = GetResultRequantizeStates(op, res_index);
+  for (const auto& [op_with_result_idx, quant_state_idx] : result_states_) {
+    const auto [op, result_idx] = op_with_result_idx;
+    const QuantState& state = GetResultQuantState(op, result_idx);
+    RequantizeStates& requantizes = GetResultRequantizeStates(op, result_idx);
     if (state.IsEmpty() || (state.immutable && requantizes.empty())) {
       continue;
     }
 
     if (!state.immutable) {
-      QuantizeOpResult(op, res_index, state.params);
+      QuantizeOpResult(op, result_idx, state.params);
     }
 
     if (!requantizes.empty()) {
-      RequantizeOpResult(op, res_index, &requantizes);
+      RequantizeOpResult(op, result_idx, requantizes);
     }
   }
 }
@@ -885,7 +903,7 @@ void QuantizationDriver::Run() {
 }
 
 void ApplyQuantizationParamsPropagation(
-    const mlir::func::FuncOp func, const bool is_signed, const int bit_width,
+    const func::FuncOp func, const bool is_signed, const int bit_width,
     const bool disable_per_channel,
     const OpQuantSpecGetter op_quant_spec_getter,
     const bool infer_tensor_ranges, const bool legacy_float_scale,
@@ -897,7 +915,7 @@ void ApplyQuantizationParamsPropagation(
 }
 
 void ApplyQuantizationParamsPropagation(
-    const mlir::func::FuncOp func, const bool is_signed, const int bit_width,
+    const func::FuncOp func, const bool is_signed, const int bit_width,
     const bool disable_per_channel,
     const OpQuantSpecGetter op_quant_spec_getter,
     const OpQuantScaleSpecGetter op_quant_scale_spec_getter,
diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.h b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.h
index 59741f48307a16..070ecb75f5db5b 100644
--- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.h
+++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver.h
@@ -17,14 +17,13 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_QUANTIZATION_LIB_QUANTIZATION_DRIVER_H_
 
 #include <memory>
-#include <unordered_map>
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"  // from @llvm-project
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/Dialect/Quant/QuantTypes.h"  // from @llvm-project
@@ -40,20 +39,16 @@ limitations under the License.
 namespace mlir {
 namespace quant {
 
-static bool HasQuantParams(QuantParams p) {
-  return p == quant::QuantizedType();
-}
-
 // The state for each op result during the quantization parameters propagation.
 struct QuantState {
   // Quantization parameters propagated to an op result.
-  QuantParams params;
+  QuantizedType params;
   // A flag indicates this state (the params) shouldn't be changed after it is
   // initialized. This flag will be set to true if the quantization parameters
   // are from the quantization-aware training.
   const bool immutable;
 
-  bool IsEmpty() { return HasQuantParams(params); }
+  bool IsEmpty() const { return params == nullptr; }
 };
 
 // The state for rescaling the propagated quantization parameters. This can be
@@ -70,7 +65,7 @@ struct RequantizeState {
   } pos = NO_REQUANTIZE;
 
   // Quantization parameters will be used to add the requantize ops.
-  QuantParams params;
+  QuantizedType params;
 
   // Avoid clobbering all uses of the value, limit to just these ops.
   SmallVector<std::pair<Operation*, int>> users;
@@ -99,15 +94,25 @@ using RequantizeStates = SmallVector<RequantizeState>;
 //
 class QuantizationDriver {
  public:
-  explicit QuantizationDriver(func::FuncOp fn, bool is_signed, int bit_width,
-                              bool disable_per_channel,
+  // Type alias of int used to access `states_`.
+  using QuantStateIndex = int;
+
+  // (op, operand index) pair.
+  using OpWithOperandIndex = std::pair<Operation*, int>;
+
+  // (op, result index) pair.
+  using OpWithResultIndex = std::pair<Operation*, int>;
+
+  explicit QuantizationDriver(func::FuncOp func_op, const bool is_signed,
+                              const int bit_width,
+                              const bool disable_per_channel,
                               OpQuantSpecGetter op_quant_spec_getter,
                               OpQuantScaleSpecGetter op_quant_scale_spec_getter,
-                              bool infer_tensor_range,
-                              bool legacy_float_scale = false,
-                              bool is_qdq_conversion = false)
-      : fn_(fn),
-        builder_(fn.getBody()),
+                              const bool infer_tensor_range,
+                              const bool legacy_float_scale = false,
+                              const bool is_qdq_conversion = false)
+      : fn_(func_op),
+        builder_(func_op.getBody()),
         is_signed_(is_signed),
         bit_width_(bit_width),
         disable_per_channel_(disable_per_channel),
@@ -130,18 +135,25 @@ class QuantizationDriver {
   // result.
   void Finalize();
 
-  llvm::SmallVector<BlockArgument, 4> GetArgs() { return args_; }
+  SmallVector<BlockArgument, 4> GetArgs() { return args_; }
+
+  llvm::DenseMap<std::pair<mlir::Operation*, int>, int> GetResultStates() {
+    return result_states_;
+  }
+
+  DenseMap<OpWithResultIndex, QuantStateIndex> result_states_;
 
   // Returns the state of the block argument.
   QuantState& GetArgQuantState(BlockArgument arg) {
     return states_[arg_states_[arg]];
   }
 
- private:
-  // This is used to identify an operand or result of an op. The second element
-  // of this pair is the index of the operand or result.
-  using OpValue = std::pair<mlir::Operation*, int>;
+  // Returns the state of the index-th result of the op.
+  QuantState& GetResultQuantState(Operation* op, const int index) {
+    return states_[result_states_[{op, index}]];
+  }
 
+ private:
   // Duplicates the constant op if it has multiple uses, and replaces
   // target_op->operand[operand_index] with the newly created op. This also
   // replaces corresponsing quantization states.
@@ -153,13 +165,13 @@ class QuantizationDriver {
   // prevent overflow of quantized bias values. This also changes quantization
   // state of other inputs when needed.
   bool SetBiasParamsWithAdjustments(Operation* op, int bias_index,
-                                    const std::vector<int>& input_indices,
-                                    QuantParams params);
+                                    ArrayRef<int> input_indices,
+                                    QuantizedType params);
 
   // Checks preconditions to adjust bias scale.
   bool ShouldCheckBiasScale(Operation* op, int bias_index,
-                            const std::vector<int>& input_indices,
-                            QuantParams params, int& input_index,
+                            ArrayRef<int> input_indices,
+                            QuantizedType quantized_type, int& input_index,
                             int& filter_index);
 
   // Preprocesses the constants by doing the following:
@@ -187,84 +199,87 @@ class QuantizationDriver {
   bool IsQuantized(Operation* op);
 
   // Adds all the users of index-th result of op to the work list.
-  void AddUserToList(Operation* op, int index) {
+  void AddUserToList(Operation* op, const int index) {
     for (Operation* user : op->getResult(index).getUsers()) {
       work_list_.push_back(user);
     }
   }
 
   // Adds the defining op of index-th operand of op to the work list.
-  void AddOperandToList(Operation* op, int index) {
-    if (Operation* inst = op->getOperand(index).getDefiningOp()) {
-      work_list_.push_back(inst);
+  void AddOperandToList(Operation* op, const int index) {
+    if (Operation* operand_op = op->getOperand(index).getDefiningOp();
+        operand_op != nullptr) {
+      work_list_.push_back(operand_op);
     }
   }
 
   // Returns the quantization params for the bias input from the non-bias
   // operands which have their indexes in the `non_biases` vector. The returned
   // parameters are calculated by `func`.
-  QuantParams GetBiasParams(Operation* op, int bias_index,
-                            const std::vector<int>& non_biases,
-                            AccumulatorScaleFunc func);
-
-  // Sets the quantization parameters of the result to a fixed value. If any
-  // quantization parameters have been propagated, a `requantize` will happen on
-  // the input of propagated quantization.
-  bool SetResultParams(Operation* op, int index, QuantParams params);
-
-  // Sets the quantization parameters of the operand to a fixed value. If any
+  QuantizedType GetBiasParams(Operation* op, int bias_index,
+                              ArrayRef<int> non_bias_operand_indices,
+                              AccumulatorScaleFunc func);
+
+  // Sets the quantization parameters of the result to `quantized_type`. If
+  // any quantization parameters have been propagated, a requantize will
+  // happen on the input of propagated quantization. Returns `true` if internal
+  // state has been modified.
+  bool SetResultParams(Operation* op, int result_index,
+                       QuantizedType quantized_type);
+
+  // Sets the quantization parameters of the operand to `quantized_type`. If any
   // quantization parameters have been propagated, a `requantize` will happen on
   // the output of propagated quantization. When `override` is set, quantization
-  // state of the value is replaced instead of adding requantization.
-  bool SetOperandParams(Operation* op, int index, QuantParams params,
-                        bool override = false);
+  // state of the value is replaced instead of adding requantization. Returns
+  // `true` if internal state has been modified.
+  bool SetOperandParams(Operation* op, int operand_index,
+                        QuantizedType quantized_type, bool override = false);
 
   // Sets the quantization parameters of the constant result according to its
   // content.
   bool SetConstantResultParams(Operation* op);
 
-  // Inserts the Quantize and Dequantize ops for quantizing the index-th result
-  // of the op.
-  void QuantizeOpResult(Operation* op, int index, QuantParams params);
+  // Inserts the Quantize and Dequantize ops after `op`'s `index`-th result. The
+  // quantized element type for the result is `quantized_type`.
+  void QuantizeOpResult(Operation* op, int result_index,
+                        QuantizedType quantized_type);
 
-  void QuantizeArg(BlockArgument arg, QuantParams params);
+  // Inserts the Quantize and Dequantize ops after `arg`. The quantized element
+  // type for `arg` is `quantized_type`.
+  void QuantizeArg(BlockArgument arg, QuantizedType quantized_type);
 
-  // Inserts the Quantize and Dequantize ops to quantize the value and returns
-  // the Quantize op.
-  void QuantizeValue(Value value, QuantParams params, Location loc);
+  // Inserts the Quantize and Dequantize ops (i.e. QDQ) after `value`. The
+  // quantized element type for `value` is `quantized_type`.
+  void QuantizeValue(Value value, QuantizedType quantized_type, Location loc);
 
   // Inserts the Quantize ops for requantizing the index-th result of the op.
-  void RequantizeOpResult(Operation* op, int index, RequantizeStates* states);
+  void RequantizeOpResult(Operation* op, int result_index,
+                          RequantizeStates& states);
 
   // Inserts the Quantize ops for requantizing a block argument.
-  void RequantizeArg(BlockArgument arg, RequantizeStates* states);
+  void RequantizeArg(BlockArgument arg, RequantizeStates& states);
 
   // Inserts the Quantize and Dequantize ops to quantize the value and returns
   // the Quantize op.
-  void RequantizeValue(Value value, RequantizeStates* states, Location loc);
+  void RequantizeValue(Value value, RequantizeStates& states, Location loc);
 
   // Returns the quantization parameter satisfies the same scale
   // constraints for the op. Returns an empty option if this quantization
   // parameter doesn't exist.
-  QuantParams GetQuantParamsForSameScaleConstraint(Operation* op);
+  QuantizedType GetQuantParamsForSameScaleConstraint(Operation* op);
 
   // Returns the state of the index-th operand of the op.
-  QuantState& GetOperandQuantState(Operation* op, int index) {
+  QuantState& GetOperandQuantState(Operation* op, const int index) {
     return states_[operand_states_[{op, index}]];
   }
 
-  // Returns the state of the index-th result of the op.
-  QuantState& GetResultQuantState(Operation* op, int index) {
-    return states_[result_states_[{op, index}]];
-  }
-
   // Returns the states of the index-th operand of the op.
-  RequantizeStates& GetOperandRequantizeStates(Operation* op, int index) {
+  RequantizeStates& GetOperandRequantizeStates(Operation* op, const int index) {
     return rescale_states_[operand_states_[{op, index}]];
   }
 
   // Returns the states of the index-th result of the op.
-  RequantizeStates& GetResultRequantizeStates(Operation* op, int index) {
+  RequantizeStates& GetResultRequantizeStates(Operation* op, const int index) {
     return rescale_states_[result_states_[{op, index}]];
   }
 
@@ -278,10 +293,6 @@ class QuantizationDriver {
   // a new entry in the state vector.
   void InitializeArgState(BlockArgument arg, Value arg_value);
 
-  // Sets the state of index-th operand / result of op.
-  void InitializeStateForValue(Operation* op, int index, Value value,
-                               bool as_result);
-
   // Sets the state of the index-th operand of the op. If this operand is
   // cached, uses the cached result without creating new entry in the state
   // vector. Otherwise, allocate a new entry in the state vector.
@@ -301,12 +312,13 @@ class QuantizationDriver {
   // We should distinguish weights and bias constants. Biases are specified by
   // the quantization spec or are the operands of ops with same scale spec. The
   // rest are weights.
-  llvm::DenseSet<Operation*> weights_;
+  DenseSet<Operation*> weights_;
 
   // The weights require narrow_range quantization. This map collects all the
-  // weight operands defined by the op quant spec. If the value of the entry is
-  // positive, per-channel quantization is required.
-  llvm::DenseMap<Operation*, int> optimized_weights_;
+  // weight operands defined by the op quant spec. The value of each entry is
+  // the quantization dimension. If it is positive, per-channel quantization is
+  // required.
+  DenseMap<Operation*, int> optimized_weights_;
 
   // All the ops needs to propagate the quantization parameters to.
   std::vector<Operation*> work_list_;
@@ -319,18 +331,17 @@ class QuantizationDriver {
   // The map contains all the quantization parameters which are required to
   // satisfy the same operands and results constraint. The keys of this map are
   // the values from `operand_states_` and `result_state_`.
-  std::unordered_map<int, RequantizeStates> rescale_states_;
+  absl::flat_hash_map<QuantStateIndex, RequantizeStates> rescale_states_;
 
   // Maps of indexes to the propagation state vector from the ops operands,
   // results and arguments.
-  llvm::DenseMap<OpValue, int> operand_states_;
-  llvm::DenseMap<OpValue, int> result_states_;
-  llvm::DenseMap<BlockArgument, int> arg_states_;
-  llvm::DenseMap<Value, int> value_to_state_;
+  DenseMap<OpWithOperandIndex, QuantStateIndex> operand_states_;
+  DenseMap<BlockArgument, QuantStateIndex> arg_states_;
+  DenseMap<Value, QuantStateIndex> value_to_state_;
 
   // This vector is to preserve the arguments order, so the newly inserted
   // quantized ops for the arguments are deterministically ordered.
-  llvm::SmallVector<BlockArgument, 4> args_;
+  SmallVector<BlockArgument, 4> args_;
 
   OpQuantSpecGetter op_quant_spec_getter_;
   OpQuantScaleSpecGetter op_quant_scale_spec_getter_;
@@ -357,7 +368,7 @@ class QuantizationDriver {
 // Setting `infer_tensor_range` to true, to infer quantization parameters from
 // the activation ops and weight constants. This is only used for post-training
 // quantization.
-void ApplyQuantizationParamsPropagation(mlir::func::FuncOp func, bool is_signed,
+void ApplyQuantizationParamsPropagation(func::FuncOp func, bool is_signed,
                                         int bit_width, bool disable_per_channel,
                                         OpQuantSpecGetter op_quant_spec_getter,
                                         bool infer_tensor_ranges,
@@ -365,8 +376,8 @@ void ApplyQuantizationParamsPropagation(mlir::func::FuncOp func, bool is_signed,
                                         bool is_qdq_conversion);
 
 void ApplyQuantizationParamsPropagation(
-    mlir::func::FuncOp func, bool is_signed, int bit_width,
-    bool disable_per_channel, OpQuantSpecGetter op_quant_spec_getter,
+    func::FuncOp func, bool is_signed, int bit_width, bool disable_per_channel,
+    OpQuantSpecGetter op_quant_spec_getter,
     OpQuantScaleSpecGetter op_quant_scale_spec_getter, bool infer_tensor_ranges,
     bool legacy_float_scale, bool is_qdq_conversion);
 
diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver_test.cc b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver_test.cc
index 1942ae56b0aba4..cc82c09894b46b 100644
--- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver_test.cc
+++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_driver_test.cc
@@ -26,12 +26,16 @@ limitations under the License.
 #include "llvm/ADT/DenseMap.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"  // from @llvm-project
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/Dialect/Quant/QuantTypes.h"  // from @llvm-project
 #include "mlir/IR/Builders.h"  // from @llvm-project
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
 #include "mlir/IR/Operation.h"  // from @llvm-project
 #include "mlir/IR/OwningOpRef.h"  // from @llvm-project
 #include "mlir/IR/Value.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
 #include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h"  // IWYU pragma: keep
 #include "tensorflow/compiler/mlir/quantization/common/func.h"
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h"
@@ -80,7 +84,8 @@ std::unique_ptr<quant::OpQuantSpec> GetOpQuantSpec(
 
 TEST_F(ApplyQuantizationParamsPropagationTest,
        ConstsUsedMultipleTimesAreDuplicated) {
-  OwningOpRef<ModuleOp> module_op_ref = ParseModuleOpString(kModuleTFLite);
+  const OwningOpRef<ModuleOp> module_op_ref =
+      ParseModuleOpString(kModuleTFLite);
   func::FuncOp main_fn = FindMainFuncOp(*module_op_ref);
 
   auto op_quant_spec_getter = [&](Operation* op) {
@@ -97,14 +102,13 @@ TEST_F(ApplyQuantizationParamsPropagationTest,
 
   int64_t num_constant_op = 0;
   main_fn.walk([&](arith::ConstantOp cst) { ++num_constant_op; });
-  // TODO: b/323478683 - This should actually be 3. Bias parameter is
-  // duplicated one extra time. Tackle this in a follow-up cl.
   EXPECT_EQ(num_constant_op, 4);
 }
 
 TEST_F(ApplyQuantizationParamsPropagationTest,
        PropagateParamsCreatesQuantState) {
-  OwningOpRef<ModuleOp> module_op_ref = ParseModuleOpString(kModuleTFLite);
+  const OwningOpRef<ModuleOp> module_op_ref =
+      ParseModuleOpString(kModuleTFLite);
   func::FuncOp main_fn = FindMainFuncOp(*module_op_ref);
 
   auto op_quant_spec_getter = [&](Operation* op) {
@@ -120,16 +124,23 @@ TEST_F(ApplyQuantizationParamsPropagationTest,
   quantization_driver.Initialize();
   ASSERT_TRUE(quantization_driver.PropagateParamsAndReturnIfChanged());
   EXPECT_THAT(quantization_driver.GetArgs(), Not(IsEmpty()));
+
   for (const auto& arg : quantization_driver.GetArgs()) {
-    QuantState& state = quantization_driver.GetArgQuantState(arg);
-    // TODO: b/323478683 - Below should not be empty. Inspect further to see
-    // if there is a bug.
-    EXPECT_TRUE(state.IsEmpty());
+    const QuantState& state = quantization_driver.GetArgQuantState(arg);
+    EXPECT_TRUE(isa<quant::QuantizedType>(state.params));
+  }
+  for (const auto& result : quantization_driver.GetResultStates()) {
+    Operation* op = result.first.first;
+    const int res_index = result.first.second;
+    const QuantState state =
+        quantization_driver.GetResultQuantState(op, res_index);
+    EXPECT_TRUE(isa<quant::QuantizedType>(state.params));
   }
 }
 
 TEST_F(ApplyQuantizationParamsPropagationTest, FinalizeInsertsQDQOps) {
-  OwningOpRef<ModuleOp> module_op_ref = ParseModuleOpString(kModuleTFLite);
+  const OwningOpRef<ModuleOp> module_op_ref =
+      ParseModuleOpString(kModuleTFLite);
   func::FuncOp main_fn = FindMainFuncOp(*module_op_ref);
 
   auto op_quant_spec_getter = [&](Operation* op) {
@@ -146,8 +157,12 @@ TEST_F(ApplyQuantizationParamsPropagationTest, FinalizeInsertsQDQOps) {
       xla_call_module_op->getOperand(1).getDefiningOp();
   Operation* filter_qcast_op = filter_dcast_op->getOperand(0).getDefiningOp();
   ASSERT_NE(filter_qcast_op, nullptr);
-  // TODO: b/323478683 - Add check for `UniformQuantizedPerAxisType` below.
-  EXPECT_TRUE(filter_qcast_op->getResult(0).getType().isa<mlir::Type>());
+  EXPECT_TRUE(isa<quantfork::QuantizeCastOp>(filter_qcast_op));
+  EXPECT_TRUE(isa<quantfork::DequantizeCastOp>(filter_dcast_op));
+  EXPECT_TRUE(isa<UniformQuantizedPerAxisType>(filter_qcast_op->getResult(0)
+                                                   .getType()
+                                                   .cast<TensorType>()
+                                                   .getElementType()));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.cc b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.cc
index 5021805a879ef3..f6c561be98d49b 100644
--- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.cc
+++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.cc
@@ -46,10 +46,10 @@ limitations under the License.
 #include "mlir/IR/OpDefinition.h"  // from @llvm-project
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
-#include "tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.h"
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.h"
-#include "tensorflow/compiler/mlir/lite/quantization/ir/UniformSupport.h"
+#include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h"
+#include "tensorflow/compiler/mlir/quantization/common/ir/UniformSupport.h"
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_traits.h"
 #include "tensorflow/lite/kernels/internal/portable_tensor_utils.h"
 #include "tensorflow/lite/tools/optimize/quantization_utils.h"
diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h
index d95ba49cf8e800..e1d36df58a3fd9 100644
--- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h
+++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h
@@ -26,10 +26,10 @@ limitations under the License.
 #include <memory>
 #include <string>
 #include <type_traits>
-#include <unordered_map>
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/strings/string_view.h"
 #include "llvm/ADT/DenseMap.h"
@@ -54,8 +54,8 @@ limitations under the License.
 #include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
-#include "tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.h"
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
+#include "tensorflow/compiler/mlir/quantization/common/ir/FakeQuantSupport.h"
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h"
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_traits.h"
 #include "tensorflow/core/framework/types.pb.h"
@@ -86,11 +86,11 @@ inline constexpr double kNearZeroTolerance = 1.0e-6;
 using QuantParams = QuantizedType;
 using QuantSpec = QuantizationSpecs;
 using SignedInteger = std::pair<unsigned, unsigned>;  // bitwidth and sign
-using QuantParamsForResults = llvm::SmallVector<QuantParams, 4>;
+using QuantParamsForResults = llvm::SmallVector<QuantizedType, 4>;
 using AccumulatorScaleFunc =
-    std::function<QuantParams(const std::vector<QuantParams>&, int, bool)>;
+    std::function<QuantizedType(const std::vector<QuantizedType>&, int, bool)>;
 using BiasParamsMap =
-    std::unordered_map<int, std::pair<std::vector<int>, AccumulatorScaleFunc>>;
+    absl::flat_hash_map<int, std::pair<std::vector<int>, AccumulatorScaleFunc>>;
 // UniformQuantizedType GetFixedOutputRange(bool sign, int bit_width)
 using GetFixedOutputRangeFunc = std::function<UniformQuantizedType(bool, int)>;
 // bool RequiredSameOperandsAndResultsScale(bool sign, int $bit_width)
diff --git a/tensorflow/compiler/mlir/quantization/common/test_base.h b/tensorflow/compiler/mlir/quantization/common/test_base.h
index 46c069cc49011e..a1a770ff616dee 100644
--- a/tensorflow/compiler/mlir/quantization/common/test_base.h
+++ b/tensorflow/compiler/mlir/quantization/common/test_base.h
@@ -28,9 +28,11 @@ limitations under the License.
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/IR/OwningOpRef.h"  // from @llvm-project
 #include "mlir/Parser/Parser.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
 #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
+#include "tensorflow/compiler/mlir/quantization/common/func.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/context.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
@@ -69,6 +71,22 @@ class QuantizationTestBase : public Test {
     return nullptr;
   }
 
+  // Convenience function that returns the first operation of type `OpT` from
+  // the `@main` function in `module_op`. Useful when testing with a text
+  // representation of a `ModuleOp` containing a single function `@main`.
+  // Returns `failure` iff there is no `@main` or no such operation is found in
+  // `@main`.
+  template <typename OpT>
+  FailureOr<OpT> FindFirstOpFromMainFunc(ModuleOp module_op) {
+    func::FuncOp main_func_op = FindMainFuncOp(module_op);
+    if (main_func_op == nullptr) return failure();
+
+    auto ops = main_func_op.getOps<OpT>();
+    if (ops.empty()) return failure();
+
+    return *ops.begin();
+  }
+
   std::unique_ptr<MLIRContext> ctx_;
   OpBuilder builder_;
 };
diff --git a/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h b/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h
index 6c02f0d1dcbfd5..ab850c878ff0dd 100644
--- a/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h
+++ b/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/IR/Operation.h"  // from @llvm-project
 #include "mlir/IR/Types.h"  // from @llvm-project
+#include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 
 namespace mlir {
@@ -78,6 +79,12 @@ bool IsStorageTypeI32(QuantizedType quantized_type);
 
 bool IsExpressedTypeF32(QuantizedType quantized_type);
 
+// Given a value, extract the `ElementType`.
+// `value` should be a non-null `TensorType`.
+inline Type GetElementType(const Value value) {
+  return value.getType().cast<TensorType>().getElementType();
+}
+
 // Returns true iff `type` is a uniform quantized type whose storage type is
 // 8-bit integer and expressed type is f32.
 bool IsI8F32UniformQuantizedType(Type type);
diff --git a/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types_test.cc b/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types_test.cc
index 474c378acc1e0d..e9443a667fcef3 100644
--- a/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types_test.cc
+++ b/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "mlir/IR/Location.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/IR/OwningOpRef.h"  // from @llvm-project
+#include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
 #include "tensorflow/compiler/mlir/quantization/common/test_base.h"
@@ -725,6 +726,28 @@ TEST_F(IsOpNotQuantizedTest, FalseIfOpPartiallyQuantized) {
   EXPECT_FALSE(IsOpNotQuantized(*uniform_quantize_op_itr));
 }
 
+using UniformQuantizedTypeTest = QuantizationTestBase;
+
+TEST_F(UniformQuantizedTypeTest, GetElementTypeSucceeds) {
+  constexpr absl::string_view kQuantizeOp = R"mlir(
+    func.func @quantize(%arg0: tensor<2xf32>) -> tensor<2x!quant.uniform<i8:f32, 1.000000e+00:0>> {
+      %0 = stablehlo.uniform_quantize %arg0 : (tensor<2xf32>) -> tensor<2x!quant.uniform<i8:f32, 1.000000e+00:0>>
+      return %0 : tensor<2x!quant.uniform<i8:f32, 1.000000e+00:0>>
+    }
+  )mlir";
+
+  OwningOpRef<ModuleOp> module_op = ParseModuleOpString(kQuantizeOp);
+  ASSERT_TRUE(module_op);
+
+  auto func_op = module_op->lookupSymbol<func::FuncOp>("quantize");
+  ASSERT_THAT(func_op, NotNull());
+
+  auto uniform_quantize_op =
+      *func_op.getOps<::mlir::stablehlo::UniformQuantizeOp>().begin();
+  Value result = uniform_quantize_op.getResult();
+  EXPECT_THAT(GetElementType(result), NotNull());
+}
+
 }  // namespace
 }  // namespace quant
 }  // namespace mlir
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD
index c6be5b32248221..f6b1d8ac9f3493 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD
@@ -48,7 +48,9 @@ cc_library(
     srcs = [
         "passes/convert_func_to_bfloat16.cc",
         "passes/convert_xla_call_module_op_to_bfloat16.cc",
-        "passes/fold_constant_transpose_pass.cc",
+        "passes/defer_activation_transpose.cc",
+        "passes/fold_constant_transpose.cc",
+        "passes/insert_weight_param.cc",
         "passes/lift_quantizable_spots_as_functions.cc",
         "passes/lift_quantizable_spots_as_functions_fusion.inc",
         "passes/lift_quantizable_spots_as_functions_simple.inc",
@@ -56,7 +58,6 @@ cc_library(
         "passes/optimize_graph.cc",
         "passes/post_quantize.cc",
         "passes/prepare_quantize.cc",
-        "passes/prepare_quantize_hybrid.cc",
         "passes/quantize.cc",
         "passes/quantize_composite_functions.cc",
         "passes/quantize_weight.cc",
@@ -93,6 +94,7 @@ cc_library(
         "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps",
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib",
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config",
+        "//tensorflow/compiler/mlir/quantization/stablehlo/cc:permutation",
         "//tensorflow/compiler/mlir/quantization/stablehlo/ops:stablehlo_op_quant_spec",
         "//tensorflow/compiler/mlir/quantization/tensorflow:passes",
         "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc",
@@ -112,6 +114,7 @@ cc_library(
         "//tensorflow/lite/kernels:padding",
         "//tensorflow/lite/kernels/internal:quantization_util",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base:nullability",
         "@com_google_absl//absl/cleanup",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/random",
@@ -154,8 +157,10 @@ cc_library(
     ],
     compatible_with = get_compatible_with_portable(),
     deps = [
+        ":quantization_config_proto_cc",
         "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps",
         "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints",
+        "//tensorflow/compiler/mlir/quantization/common:lift_as_function_call",
         "//tensorflow/compiler/mlir/quantization/common:uniform_quantized_types",
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib",
         "//tensorflow/compiler/mlir/quantization/stablehlo/ops:stablehlo_op_quant_spec",
@@ -166,6 +171,8 @@ cc_library(
         "//tensorflow/core/platform:path",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
@@ -327,10 +334,10 @@ cc_library(
         "@llvm-project//mlir:QuantOps",
         "@llvm-project//mlir:ShapeDialect",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@local_xla//xla:xla_data_proto_cc",
         "@local_xla//xla/mlir_hlo",
-        "@local_xla//xla/mlir_hlo:chlo_legalize_to_hlo",
         "@local_xla//xla/mlir_hlo:mhlo_passes",
         "@local_xla//xla/translate/hlo_to_mhlo:attribute_importer",
         "@stablehlo//:chlo_ops",
@@ -514,6 +521,7 @@ cc_library(
         ":quantization_config_proto_cc",
         ":stablehlo_test_passes_inc_gen",
         "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps",
+        "//tensorflow/compiler/mlir/quantization/stablehlo/cc:config",
         "//tensorflow/compiler/mlir/quantization/stablehlo/cc:post_calibration",
         "//tensorflow/compiler/mlir/quantization/stablehlo/cc:pre_calibration",
         "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc",
@@ -524,7 +532,6 @@ cc_library(
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:string_view",
-        "@llvm-project//llvm:Support",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
@@ -595,6 +602,7 @@ tf_cc_test(
     deps = [
         ":stablehlo_type_utils",
         "@com_google_googletest//:gtest_main",
+        "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@stablehlo//:stablehlo_ops",
@@ -756,6 +764,7 @@ tf_cc_binary(
         ":test_passes",
         "//tensorflow/compiler/mlir:init_mlir",
         "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps",
+        "//tensorflow/compiler/mlir/quantization/stablehlo/cc:pass_pipeline",
         "//tensorflow/compiler/mlir/tensorflow",
         "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes",
         "//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes",
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD
index 7a36ad58dc34a4..77629c7719bf44 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD
@@ -111,13 +111,32 @@ cc_library(
     compatible_with = get_compatible_with_portable(),
     deps = [
         ":graph_def",
-        "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc",
         "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc",
-        "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc",
-        "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib",
+        "//tensorflow/compiler/mlir/quantization/tensorflow:tf_quant_ops",
         "//tensorflow/core:protos_all_cc",
-        "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/strings:string_view",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Support",
+    ],
+)
+
+cc_library(
+    name = "permutation",
+    hdrs = ["permutation.h"],
+    compatible_with = get_compatible_with_portable(),
+    deps = [
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:Support",
+    ],
+)
+
+tf_cc_test(
+    name = "permutation_test",
+    srcs = ["permutation_test.cc"],
+    deps = [
+        ":permutation",
+        "@com_google_googletest//:gtest_main",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:Support",
     ],
 )
 
@@ -127,10 +146,16 @@ cc_library(
     hdrs = ["saved_model_export.h"],
     compatible_with = get_compatible_with_portable(),
     deps = [
+        ":io",
         ":pass_pipeline",
+        ":saved_model_import",
         ":types",
+        "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc",
         "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc",
         "//tensorflow/compiler/mlir/quantization/tensorflow:passes",
+        "//tensorflow/compiler/mlir/quantization/tensorflow/cc:convert_asset_args",
+        "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes",
+        "//tensorflow/compiler/mlir/quantization/tensorflow/python:unfreeze_constants",
         "//tensorflow/compiler/mlir/tensorflow",
         "//tensorflow/compiler/mlir/tensorflow:export_graphdef",
         "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags",
@@ -140,6 +165,7 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/protobuf:for_core_protos_cc",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/log",
@@ -150,6 +176,7 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:statusor",
     ],
@@ -184,15 +211,26 @@ cc_library(
     compatible_with = get_compatible_with_portable(),
     deps = [
         ":types",
+        "//tensorflow/cc/saved_model:loader",
         "//tensorflow/cc/saved_model:reader",
+        "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc",
+        "//tensorflow/compiler/mlir/quantization/tensorflow:quantize_preprocess",
+        "//tensorflow/compiler/mlir/tensorflow:mlir_import_options",
+        "//tensorflow/compiler/mlir/tensorflow:translate_lib",
         "//tensorflow/core/protobuf:for_core_protos_cc",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:string_view",
+        "@com_google_absl//absl/types:span",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
@@ -261,6 +299,7 @@ tf_cc_test(
     name = "pre_calibration_test",
     srcs = ["pre_calibration_test.cc"],
     deps = [
+        ":config",
         ":pre_calibration",
         "//tensorflow/compiler/mlir/quantization/common:test_base",
         "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc",
@@ -276,6 +315,26 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "report",
+    srcs = ["report.cc"],
+    hdrs = ["report.h"],
+    compatible_with = get_compatible_with_portable(),
+    deps = [
+        "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc",
+    ],
+)
+
+tf_cc_test(
+    name = "report_test",
+    srcs = ["report_test.cc"],
+    deps = [
+        ":report",
+        "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "context",
     srcs = [],
@@ -357,3 +416,36 @@ cc_library(
         "@local_tsl//tsl/platform:statusor",
     ],
 )
+
+cc_library(
+    name = "weight_only_ptq",
+    srcs = ["weight_only_ptq.cc"],
+    hdrs = ["weight_only_ptq.h"],
+    compatible_with = get_compatible_with_portable(),
+    deps = [
+        ":component",
+        ":context",
+        ":pass_pipeline",
+        ":saved_model_export",
+        ":saved_model_import",
+        ":types",
+        "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc",
+        "//tensorflow/compiler/mlir/quantization/tensorflow:passes",
+        "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes",
+        "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core/protobuf:for_core_protos_cc",
+        "@com_google_absl//absl/base:nullability",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/log:die_if_null",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:string_view",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:statusor",
+        "@local_xla//xla/mlir_hlo:mhlo_passes",
+    ],
+)
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD
index 90afbe53209347..5783ffddd4f050 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD
@@ -25,14 +25,14 @@ cc_library(
     compatible_with = get_compatible_with_portable(),
     deps = [
         "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc",
-        "//tensorflow/compiler/mlir/quantization/stablehlo/cc:graph_def",
-        "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc",
+        "//tensorflow/compiler/mlir/quantization/tensorflow:tf_quant_ops",
         "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_statistics_proto_cc",
         "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibrator_singleton",
         "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib",
-        "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings:str_format",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Support",
     ],
 )
 
@@ -44,28 +44,18 @@ cc_library(
     deps = [
         ":representative_dataset",
         ":statistics",
-        "//tensorflow/cc/saved_model:loader",
         "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc",
         "//tensorflow/compiler/mlir/quantization/stablehlo/cc:component",
+        "//tensorflow/compiler/mlir/quantization/stablehlo/cc:debugger",
         "//tensorflow/compiler/mlir/quantization/stablehlo/cc:io",
         "//tensorflow/compiler/mlir/quantization/stablehlo/cc:saved_model_export",
-        "//tensorflow/compiler/mlir/quantization/stablehlo/cc:saved_model_import",
         "//tensorflow/compiler/mlir/quantization/stablehlo/cc:types",
         "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc",
         "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc",
-        "//tensorflow/compiler/mlir/quantization/tensorflow:quantize_preprocess",
-        "//tensorflow/compiler/mlir/quantization/tensorflow/cc:convert_asset_args",
-        "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes",
         "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib",
-        "//tensorflow/compiler/mlir/quantization/tensorflow/python:unfreeze_constants",
-        "//tensorflow/compiler/mlir/tensorflow:mlir_import_options",
-        "//tensorflow/compiler/mlir/tensorflow:translate_lib",
         "//tensorflow/core/protobuf:for_core_protos_cc",
-        "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/base:nullability",
         "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:die_if_null",
         "@com_google_absl//absl/status",
@@ -76,7 +66,6 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Support",
-        "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:statusor",
     ],
 )
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc
index 494eadc8463143..ba1671ceb696ca 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc
@@ -14,17 +14,13 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h"
 
-#include <memory>
 #include <string>
 #include <unordered_set>
 #include <utility>
 #include <vector>
 
-#include "absl/algorithm/container.h"
-#include "absl/base/attributes.h"
 #include "absl/base/nullability.h"
 #include "absl/container/flat_hash_map.h"
-#include "absl/container/flat_hash_set.h"
 #include "absl/log/die_if_null.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
@@ -37,119 +33,32 @@ limitations under the License.
 #include "mlir/IR/OwningOpRef.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
 #include "mlir/Support/LLVM.h"  // from @llvm-project
-#include "mlir/Support/LogicalResult.h"  // from @llvm-project
-#include "tensorflow/cc/saved_model/loader.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.h"
-#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
-#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.h"
-#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h"
-#include "tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
-#include "tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h"
-#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_import_options.h"
-#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
-#include "tsl/platform/errors.h"
 #include "tsl/platform/statusor.h"
 
 namespace mlir::quant::stablehlo {
-namespace {
 
 using ::stablehlo::quantization::AddCalibrationStatistics;
 using ::stablehlo::quantization::CreateRepresentativeDatasetFileMap;
+using ::stablehlo::quantization::DisableDebugging;
 using ::stablehlo::quantization::QuantizationConfig;
 using ::stablehlo::quantization::RepresentativeDatasetConfig;
 using ::stablehlo::quantization::io::CreateTmpDir;
 using ::stablehlo::quantization::io::GetLocalTmpFileName;
 using ::tensorflow::AssetFileDef;
-using ::tensorflow::MLIRImportOptions;
-using ::tensorflow::SavedModelBundle;
-using ::tensorflow::SavedModelSignatureDefsToMlirImport;
 using ::tensorflow::SignatureDef;
 using ::tensorflow::quantization::ExportedModel;
-using ::tensorflow::quantization::PreprocessAndFreezeGraph;
 using ::tensorflow::quantization::PyFunctionLibrary;
-using ::tensorflow::quantization::RunPasses;
-using ::tensorflow::quantization::UnfreezeConstantsAndSaveVariables;
-
-using ImportedMlirModuleOp =
-    std::pair<ModuleOp, std::unique_ptr<SavedModelBundle>>;
-
-// Loads a SavedModel at `saved_model_path` and converts it to `mlir::ModuleOp`.
-//
-// `tags` identify the `tensorflow::MetaGraphDef` to load from the SavedModel.
-// Similarly, `signature_keys` identify the functions (`SignatureDef`s) to load
-// within the `MetaGraphDef`. `ctx` is the `MLIRContext`, which should outlive
-// the returned `ModuleOp`, thus marked with the lifetime bound attribute.
-absl::StatusOr<ImportedMlirModuleOp> SavedModelToMlirModuleOp(
-    const absl::string_view saved_model_path,
-    const std::unordered_set<std::string>& tags,
-    const std::vector<std::string>& signature_keys,
-    MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND) {
-  MLIRImportOptions import_options;
-  import_options.upgrade_legacy = true;
-  import_options.lift_variables = false;
-  import_options.include_variables_in_initializers = true;
-
-  auto bundle = std::make_unique<SavedModelBundle>();
-
-  // Copy to eliminate the `const` qualifier so that `absl::MakeSpan` can be
-  // called on it.
-  std::vector<std::string> exported_names = signature_keys;
-  absl::StatusOr<OwningOpRef<ModuleOp>> module_op =
-      SavedModelSignatureDefsToMlirImport(saved_model_path, tags,
-                                          absl::MakeSpan(exported_names), &ctx,
-                                          import_options, &bundle);
-  if (!module_op.status().ok()) {
-    return absl::InternalError(absl::StrCat("Failed to import SavedModel: ",
-                                            module_op.status().ToString()));
-  }
-
-  return std::make_pair(module_op->release(), std::move(bundle));
-}
-
-// Sets up and runs the passes for exporting `module_op`. The behavior of the
-// exporting passes is controlled by `export_opts`. Returns `AssetFileDef`s that
-// associate the input arguments of @main and the asset file names. Asset file
-// names will be used to feed the corresponding tensors during initialization
-// upon model loading.
-absl::StatusOr<SmallVector<AssetFileDef>> RunExportPasses(
-    const ExportOptions& export_opts, MLIRContext& ctx, ModuleOp module_op) {
-  if (export_opts.unfreeze_constants) {
-    TF_RETURN_IF_ERROR(UnfreezeConstantsAndSaveVariables(
-        export_opts.checkpoint_dir, ctx, module_op));
-    LOG(INFO) << "Unfrozen constants and saved variables to checkpoint file: "
-              << export_opts.checkpoint_dir;
-  }
-
-  if (absl::Status pass_run_status = RunPasses(
-          /*name=*/
-          export_opts.debug_name,
-          /*add_passes_func=*/
-          [dup_constants = export_opts.duplicate_shape_determining_constants](
-              PassManager& pm) { AddExportPasses(pm, dup_constants); },
-          ctx, module_op);
-      !pass_run_status.ok()) {
-    return pass_run_status;
-  }
-
-  FailureOr<SmallVector<AssetFileDef>> asset_file_defs =
-      quant::ConvertAssetArgs(module_op);
-  if (failed(asset_file_defs)) {
-    return absl::InternalError("Failed to convert asset args.");
-  }
-
-  return *asset_file_defs;
-}
-
-}  // namespace
 
 CalibrationComponent::CalibrationComponent(
     absl::Nonnull<MLIRContext*> ctx,
@@ -171,6 +80,13 @@ absl::StatusOr<ExportedModel> CalibrationComponent::ExportToSavedModel(
     ModuleOp module_op, const absl::string_view dst_saved_model_path) {
   TF_ASSIGN_OR_RETURN(const std::string checkpoint_dir, GetLocalTmpFileName());
 
+  // Clone ModuleOp and function aliases so changes in this pipeline won't
+  // be reflected in the original values.
+  mlir::OwningOpRef<mlir::ModuleOp> cloned_module_ref(module_op.clone());
+
+  // Disable DumpTensor ops when running calibration.
+  DisableDebugging(*cloned_module_ref);
+
   // `duplicate_shape_determining_constants = false` because the
   // resulting graph of this step is not expected to be loaded on TPU.
   const ExportOptions export_opts = {
@@ -179,11 +95,11 @@ absl::StatusOr<ExportedModel> CalibrationComponent::ExportToSavedModel(
       /*debug_name=*/absl::StrCat(kName, kExportStepSuffix)};
 
   TF_ASSIGN_OR_RETURN(const SmallVector<AssetFileDef> asset_file_defs,
-                      RunExportPasses(export_opts, *ctx_, module_op));
+                      RunExportPasses(export_opts, *ctx_, *cloned_module_ref));
 
   TF_ASSIGN_OR_RETURN(ExportedModel exported_model,
                       ConvertMlirModuleToExportedModel(
-                          module_op, checkpoint_dir, function_aliases_,
+                          *cloned_module_ref, checkpoint_dir, function_aliases_,
                           {asset_file_defs.begin(), asset_file_defs.end()}));
 
   py_function_lib_->SaveExportedModel(dst_saved_model_path, exported_model,
@@ -193,35 +109,6 @@ absl::StatusOr<ExportedModel> CalibrationComponent::ExportToSavedModel(
   return exported_model;
 }
 
-absl::StatusOr<ModuleOp> CalibrationComponent::ImportCalibratedSavedModel(
-    const absl::string_view calibrated_saved_model_path) {
-  // Convert the SavedModelBundle to an MLIR module.
-  TF_ASSIGN_OR_RETURN(ImportedMlirModuleOp imported_module,
-                      SavedModelToMlirModuleOp(calibrated_saved_model_path,
-                                               tags_, signature_keys_, *ctx_));
-  ModuleOp module_op = imported_module.first;
-
-  UpdateFunctionAliases(function_aliases_, module_op);
-
-  // Collect the names of the functions that have aliases so that they may not
-  // be inlined.
-  absl::flat_hash_set<std::string> aliased_function_names;
-  absl::c_for_each(function_aliases_, [&](const auto& aliases) {
-    return aliased_function_names.insert(aliases.first);
-  });
-
-  // Freezing is required again since variables might have been produced
-  // during the pre-calibration step. `is_inliner_run = false` to prevent the
-  // functions lifted for quantization from being inlined.
-  TF_RETURN_IF_ERROR(PreprocessAndFreezeGraph(
-      /*mlir_dump_file_prefix=*/kName, /*is_inliner_run=*/false,
-      /*noinline_functions=*/aliased_function_names, module_op, ctx_,
-      imported_module.second == nullptr ? nullptr
-                                        : imported_module.second->GetSession(),
-      /*run_tf_to_stablehlo=*/false, /*deserialize_xla_call_module=*/true));
-  return module_op;
-}
-
 absl::StatusOr<ModuleOp> CalibrationComponent::Run(
     ModuleOp module_op, const QuantizationConfig& config) {
   // Exports the pre-calibrated model to SavedModel.
@@ -251,23 +138,14 @@ absl::StatusOr<ModuleOp> CalibrationComponent::Run(
       /*force_graph_mode_calibration=*/true, representative_dataset_file_map);
 
   if (absl::Status status = AddCalibrationStatistics(
-          *exported_model.mutable_graph_def(), config.calibration_options(),
-          *py_function_lib_);
+          module_op, config.calibration_options(), *py_function_lib_);
       !status.ok()) {
     LOG(WARNING) << "Some CustomAggregator ops do not have min or max "
                     "values. Parts of the graph are not quantized. "
                  << status;
   }
 
-  // Exports the calibrated model with statistics attached to the graph.
-  TF_ASSIGN_OR_RETURN(const std::string calibrated_saved_model_path,
-                      CreateTmpDir());
-  py_function_lib_->SaveExportedModel(calibrated_saved_model_path,
-                                      exported_model, src_saved_model_path_,
-                                      tags_, signature_def_map_);
-
-  // Imports the calibrated saved model back to `ModuleOp`.
-  return ImportCalibratedSavedModel(calibrated_saved_model_path);
+  return module_op;
 }
 
 }  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.cc
index 22160a8820dfcd..39f4ca8449ae05 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.cc
@@ -19,21 +19,19 @@ limitations under the License.
 
 #include "absl/status/status.h"
 #include "absl/strings/str_format.h"
-#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/graph_def.h"
+#include "mlir/IR/Builders.h"  // from @llvm-project
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h"
-#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/graph.pb.h"
 
 namespace stablehlo::quantization {
 namespace {
 
 using ::stablehlo::quantization::CalibrationOptions;
-using ::tensorflow::GraphDef;
-using ::tensorflow::NodeDef;
 using ::tensorflow::calibrator::CalibrationStatistics;
 using ::tensorflow::calibrator::CalibratorSingleton;
 using ::tensorflow::quantization::PyFunctionLibrary;
@@ -41,13 +39,12 @@ using ::tensorflow::quantization::PyFunctionLibrary;
 }  // namespace
 
 absl::Status AddCalibrationStatistics(
-    GraphDef& graph_def, const CalibrationOptions& calibration_options,
+    mlir::ModuleOp module_op, const CalibrationOptions& calibration_options,
     const PyFunctionLibrary& py_function_library) {
   absl::Status status = absl::OkStatus();
-  MutateNodeDefs(graph_def, [&py_function_library, &calibration_options,
-                             &status](NodeDef& node_def) {
-    if (node_def.op() != "CustomAggregator") return;
-    const std::string& id = node_def.attr().at("id").s();
+  module_op.walk([&py_function_library, &calibration_options,
+                  &status](mlir::TF::CustomAggregatorOp aggregator_op) {
+    mlir::StringRef id = aggregator_op.getId();
     std::optional<CalibrationStatistics> statistics =
         CalibratorSingleton::GetStatistics(id);
     if (statistics == std::nullopt) {
@@ -63,8 +60,9 @@ absl::Status AddCalibrationStatistics(
                                                       calibration_options);
     CalibratorSingleton::ClearData(id);
 
-    (*node_def.mutable_attr())["min"].set_f(min_value);
-    (*node_def.mutable_attr())["max"].set_f(max_value);
+    mlir::OpBuilder builder(aggregator_op);
+    aggregator_op->setAttr("min", builder.getF32FloatAttr(min_value));
+    aggregator_op->setAttr("max", builder.getF32FloatAttr(max_value));
   });
   return status;
 }
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h
index 0069692381b6d5..9b67f22a2dac72 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h
@@ -16,10 +16,9 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_STATISTICS_H_
 
 #include "absl/status/status.h"
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h"
-#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
-#include "tensorflow/core/framework/graph.pb.h"
 
 namespace stablehlo::quantization {
 
@@ -28,7 +27,7 @@ namespace stablehlo::quantization {
 // respectively. `calibration_options` provides the strategy to retrieve min and
 // max values.
 absl::Status AddCalibrationStatistics(
-    tensorflow::GraphDef& graph_def,
+    mlir::ModuleOp module_op,
     const stablehlo::quantization::CalibrationOptions& calibration_options,
     const tensorflow::quantization::PyFunctionLibrary& py_function_library);
 
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc
index 679e1f8754be9b..0f9932d053cb4d 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc
@@ -14,12 +14,190 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h"
 
+#include <utility>
+
+#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
+
 namespace stablehlo::quantization {
+namespace {
+
+// Populate `CalibrationOptions` with default fields.
+void PopulateDefaultCalibrationOptions(QuantizationConfig& quant_config) {
+  if (!quant_config.has_calibration_options() ||
+      quant_config.calibration_options().calibration_method() ==
+          CalibrationOptions::CALIBRATION_METHOD_UNSPECIFIED) {
+    quant_config.mutable_calibration_options()->set_calibration_method(
+        CalibrationOptions::CALIBRATION_METHOD_MIN_MAX);
+  }
+  switch (quant_config.calibration_options().calibration_method()) {
+    case CalibrationOptions::CALIBRATION_METHOD_MIN_MAX:
+      break;
+    case CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX:
+      break;
+    case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_PERCENTILE:
+      if (quant_config.calibration_options()
+              .calibration_parameters()
+              .initial_num_bins() == 0) {
+        quant_config.mutable_calibration_options()
+            ->mutable_calibration_parameters()
+            ->set_initial_num_bins(256);
+      }
+      if (quant_config.calibration_options()
+              .calibration_parameters()
+              .min_percentile() == 0) {
+        quant_config.mutable_calibration_options()
+            ->mutable_calibration_parameters()
+            ->set_min_percentile(0.001);
+      }
+      if (quant_config.calibration_options()
+              .calibration_parameters()
+              .max_percentile() == 0) {
+        quant_config.mutable_calibration_options()
+            ->mutable_calibration_parameters()
+            ->set_max_percentile(99.999);
+      }
+      break;
+    case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE:
+      if (quant_config.calibration_options()
+              .calibration_parameters()
+              .initial_num_bins() == 0) {
+        quant_config.mutable_calibration_options()
+            ->mutable_calibration_parameters()
+            ->set_initial_num_bins(256);
+      }
+      break;
+    case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY:
+      if (quant_config.calibration_options()
+              .calibration_parameters()
+              .initial_num_bins() == 0) {
+        quant_config.mutable_calibration_options()
+            ->mutable_calibration_parameters()
+            ->set_initial_num_bins(256);
+      }
+      break;
+    case CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC:
+      if (quant_config.calibration_options()
+              .calibration_parameters()
+              .initial_num_bins() == 0) {
+        quant_config.mutable_calibration_options()
+            ->mutable_calibration_parameters()
+            ->set_initial_num_bins(256);
+      }
+      break;
+    default:
+      break;
+  }
+}
+
+// Returns a default `QuantizationSpec` for performing static-range PTQ on all
+// ops.
+//
+// In textproto, the spec corresponds to:
+//
+// {
+//   {matcher {function_name {regex: ".*"}}
+//   {method {static_range_ptq {}}}
+// }
+QuantizationSpec GetDefaultStaticRangePtqSpec(StaticRangePtqPreset preset) {
+  QuantizationSpec spec{};
+  // Default for all ops.
+  spec.mutable_matcher()->mutable_function_name()->set_regex(
+      preset.enable_full_int_quantization() ? ".*" : "^.*(conv|dot|gather).*");
+  spec.mutable_method()->mutable_static_range_ptq();
+
+  return spec;
+}
+
+// Returns a `QuantizationSpec` for performing static-range PTQ on the
+// convolution quantizable unit family. Enables per-channel quantization for
+// weights, on the channel dimension.
+//
+// In textproto, the spec corresponds to:
+//
+// {
+//   {matcher {function_name {regex: "composite_conv.*"}}}
+//   {method {static_range_ptq
+//     {input_quantized_types {
+//       key: 1,
+//       value {dimension_specs {dimension: 3}}}}
+//   }}
+// }
+QuantizationSpec GetStaticRangePtqSpecForConvolution() {
+  QuantizationSpec spec{};
+
+  // Matches all convolution quantizable unit family.
+  spec.mutable_matcher()->mutable_function_name()->set_regex(
+      "composite_conv.*");
+  StaticRangePtq& static_range_ptq_spec =
+      *spec.mutable_method()->mutable_static_range_ptq();
+
+  // Enable per-channel quantization for convolution weights.
+  QuantizedType conv_weight_quantized_type{};
+
+  // Assumes NHWC format, specifying the channel dimension (3) as the
+  // quantized axis.
+  conv_weight_quantized_type.mutable_dimension_specs()->set_dimension(3);
+
+  // The index of weight operands passed to lifted functions for convolution
+  // is 1.
+  static_range_ptq_spec.mutable_input_quantized_types()->try_emplace(
+      1, std::move(conv_weight_quantized_type));
+
+  return spec;
+};
+
+void ExpandStaticRangePtqPreset(const StaticRangePtqPreset& preset,
+                                QuantizationConfig& config) {
+  // Populate with preset's representative dataset configs if the user didn't
+  // explicitly specify other representative dataset configs to the top-level
+  // `CalibrationOptions`.
+  if (config.calibration_options().representative_datasets().empty()) {
+    auto preset_datasets = preset.representative_datasets();
+    config.mutable_calibration_options()
+        ->mutable_representative_datasets()
+        ->Add(preset_datasets.begin(), preset_datasets.end());
+  }
+
+  // Create a new `QuantizationSpecs` to replace the existing one. The
+  // expansion from `StaticRangePtqPreset` gets populated first and then
+  // user-provided explicit `QuantizationSpec`s will be appended.
+  QuantizationSpecs new_specs{};
+  *new_specs.add_specs() =
+      GetDefaultStaticRangePtqSpec(/*preset=*/config.static_range_ptq_preset());
+  *new_specs.add_specs() = GetStaticRangePtqSpecForConvolution();
+
+  // Append user-provided specs to override existing specs.
+  const QuantizationSpecs& previous_specs = config.specs();
+  new_specs.mutable_specs()->Add(previous_specs.specs().begin(),
+                                 previous_specs.specs().end());
+
+  config.mutable_specs()->Swap(&new_specs);
+}
+
+}  // namespace
+
+QuantizationConfig ExpandPresets(const QuantizationConfig& config) {
+  QuantizationConfig new_config = config;
+
+  // Update the `new_config` with each preset's expansions.
+  switch (config.preset_case()) {
+    case QuantizationConfig::kStaticRangePtqPreset:
+      ExpandStaticRangePtqPreset(config.static_range_ptq_preset(), new_config);
+      break;
+    default:
+      // Preset has not been specified. The expansion is a no-op.
+      break;
+  }
+
+  return new_config;
+}
 
 QuantizationConfig PopulateDefaults(
     const QuantizationConfig& user_provided_config) {
   QuantizationConfig config = user_provided_config;
 
+  PopulateDefaultCalibrationOptions(config);
+
   PipelineConfig& pipeline_config = *config.mutable_pipeline_config();
   if (!pipeline_config.has_unpack_quantized_types()) {
     pipeline_config.set_unpack_quantized_types(true);
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h
index 20b9efa4a60fa0..5dc4554d784c92 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h
@@ -24,6 +24,23 @@ namespace stablehlo::quantization {
 QuantizationConfig PopulateDefaults(
     const QuantizationConfig& user_provided_config);
 
+// Returns a copy of `QuantizationConfig` where presets are expanded and
+// transformed into other fields in `QuantizationConfig`.
+//
+// The expansion rules are as follows:
+// * StaticRangePtqPreset
+//   - The preset's `representative_datasets` field will be transferred to
+//   `QuantizationConfig.calibration_options.representative_datasets`, unless
+//   the user explicitly provided representative dataset configs to
+//   `calibration_options`. In that case, the explicit configs take precedence
+//   and the preset's configs are ignored.
+//   - For `QuantizationSpecs`, the expanded `QuantizationSpec`s will be
+//   populated first and user-provided `QuantizationSpec`s, if any, will be
+//   appended. This expresses the fact that user-provided specs take precedence.
+// * Preset unspecified
+//   - No-op.
+QuantizationConfig ExpandPresets(const QuantizationConfig& config);
+
 }  // namespace stablehlo::quantization
 
 #endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CONFIG_H_
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc
index 5912788bddf96b..e3f2bfde3d10c3 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc
@@ -14,12 +14,17 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h"
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
 
 namespace stablehlo::quantization {
 namespace {
 
+using ::testing::Eq;
+using ::testing::SizeIs;
+using ::testing::StrEq;
+
 TEST(PopulateDefaultsTest, PopulateDefaultsForEmptyConfig) {
   QuantizationConfig config{};
 
@@ -37,5 +42,237 @@ TEST(PopulateDefaultsTest, PopulateDefaultsForConfigWithUnpackQuantizedTypes) {
   EXPECT_FALSE(new_config.pipeline_config().unpack_quantized_types());
 }
 
+TEST(PopulateDefaultsTest, DefaultCalibrationOptionsPopulated) {
+  QuantizationConfig config{};
+
+  const QuantizationConfig new_config = PopulateDefaults(config);
+  EXPECT_THAT(new_config.calibration_options().calibration_method(),
+              Eq(CalibrationOptions::CALIBRATION_METHOD_MIN_MAX));
+}
+
+TEST(PopulateDefaultsTest,
+     DefaultCalibrationOptionsPopulatedForUnspecifiedMethod) {
+  QuantizationConfig config{};
+  CalibrationOptions& calibration_options =
+      *config.mutable_calibration_options();
+  calibration_options.set_calibration_method(
+      CalibrationOptions::CALIBRATION_METHOD_UNSPECIFIED);
+
+  const QuantizationConfig new_config = PopulateDefaults(config);
+  EXPECT_THAT(new_config.calibration_options().calibration_method(),
+              Eq(CalibrationOptions::CALIBRATION_METHOD_MIN_MAX));
+}
+
+TEST(PopulateDefaultsTest, ExplicitCalibrationOptionsNotOverridden) {
+  QuantizationConfig config{};
+  CalibrationOptions& calibration_options =
+      *config.mutable_calibration_options();
+  calibration_options.set_calibration_method(
+      CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX);
+  calibration_options.mutable_calibration_parameters()->set_initial_num_bins(
+      512);
+
+  // Test that if the user explicitly provided `calibration_options`, it is not
+  // overridden.
+  const QuantizationConfig new_config = PopulateDefaults(config);
+  EXPECT_THAT(new_config.calibration_options().calibration_method(),
+              Eq(CalibrationOptions::CALIBRATION_METHOD_AVERAGE_MIN_MAX));
+  EXPECT_THAT(new_config.calibration_options()
+                  .calibration_parameters()
+                  .initial_num_bins(),
+              Eq(512));
+}
+
+TEST(PopulateDefaultsTest, DefaultNumbersPopulatedForPartOfCalibrationOptions) {
+  QuantizationConfig config{};
+  CalibrationOptions& calibration_options =
+      *config.mutable_calibration_options();
+  calibration_options.set_calibration_method(
+      CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_PERCENTILE);
+  calibration_options.mutable_calibration_parameters()->set_initial_num_bins(
+      512);
+
+  // Test that if the user explicitly provided part of the
+  // `calibration_options`, it is not overridden, rest of the data are default.
+  const QuantizationConfig new_config = PopulateDefaults(config);
+  EXPECT_THAT(new_config.calibration_options().calibration_method(),
+              Eq(CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_PERCENTILE));
+  EXPECT_THAT(new_config.calibration_options()
+                  .calibration_parameters()
+                  .initial_num_bins(),
+              Eq(512));
+  EXPECT_THAT(new_config.calibration_options()
+                  .calibration_parameters()
+                  .min_percentile(),
+              Eq(0.001f));
+  EXPECT_THAT(new_config.calibration_options()
+                  .calibration_parameters()
+                  .max_percentile(),
+              Eq(99.999f));
+}
+
+TEST(PopulateDefaultsTest,
+     DefaultNumbersPopulatedForCalibrationOptionsOfHistogramMseBruteforce) {
+  QuantizationConfig config{};
+  CalibrationOptions& calibration_options =
+      *config.mutable_calibration_options();
+  calibration_options.set_calibration_method(
+      CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE);
+
+  const QuantizationConfig new_config = PopulateDefaults(config);
+  EXPECT_THAT(
+      new_config.calibration_options().calibration_method(),
+      Eq(CalibrationOptions::CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE));
+  EXPECT_THAT(new_config.calibration_options()
+                  .calibration_parameters()
+                  .initial_num_bins(),
+              Eq(256));
+  EXPECT_THAT(new_config.calibration_options()
+                  .calibration_parameters()
+                  .min_percentile(),
+              Eq(0.0f));
+  EXPECT_THAT(new_config.calibration_options()
+                  .calibration_parameters()
+                  .max_percentile(),
+              Eq(0.0f));
+}
+
+TEST(ExpandPresetsTest, ExpandUnspecifiedPreset) {
+  QuantizationConfig config{};
+  const QuantizationConfig new_config = ExpandPresets(config);
+
+  // Test that nothing has been changed.
+  EXPECT_FALSE(new_config.has_specs());
+  EXPECT_FALSE(new_config.has_calibration_options());
+  EXPECT_FALSE(new_config.has_pipeline_config());
+}
+
+TEST(ExpandPresetsTest, ExpandStaticRangePtqEnableFullIntquantization) {
+  QuantizationConfig config{};
+  RepresentativeDatasetConfig& preset_dataset_config =
+      *config.mutable_static_range_ptq_preset()->add_representative_datasets();
+  config.mutable_static_range_ptq_preset()->set_enable_full_int_quantization(
+      true);
+  preset_dataset_config.mutable_tf_record()->set_path("/test/path");
+
+  const QuantizationConfig new_config = ExpandPresets(config);
+  ASSERT_THAT(new_config.specs().specs(), SizeIs(2));
+
+  const QuantizationSpec& default_spec = new_config.specs().specs(0);
+  EXPECT_THAT(default_spec.matcher().function_name().regex(), StrEq(".*"));
+  EXPECT_TRUE(default_spec.method().has_static_range_ptq());
+
+  // Test that the expansion for convolution ops is done.
+  const QuantizationSpec& conv_spec = new_config.specs().specs(1);
+  EXPECT_THAT(conv_spec.matcher().function_name().regex(),
+              StrEq("composite_conv.*"));
+  ASSERT_TRUE(conv_spec.method().has_static_range_ptq());
+
+  const StaticRangePtq& srq_spec = conv_spec.method().static_range_ptq();
+  ASSERT_THAT(srq_spec.input_quantized_types(), SizeIs(1));
+  ASSERT_TRUE(srq_spec.input_quantized_types().contains(1));
+
+  EXPECT_THAT(
+      srq_spec.input_quantized_types().at(1).dimension_specs().dimension(),
+      Eq(3));
+
+  // Test that representative dataset config has been transferred to the
+  // `CalibrationOptions`.
+  ASSERT_THAT(new_config.calibration_options().representative_datasets(),
+              SizeIs(1));
+  EXPECT_THAT(new_config.calibration_options()
+                  .representative_datasets(0)
+                  .tf_record()
+                  .path(),
+              StrEq("/test/path"));
+}
+
+TEST(ExpandPresetsTest, ExpandStaticRangePtqPresetDefault) {
+  QuantizationConfig config{};
+  RepresentativeDatasetConfig& preset_dataset_config =
+      *config.mutable_static_range_ptq_preset()->add_representative_datasets();
+  preset_dataset_config.mutable_tf_record()->set_path("/test/path");
+
+  const QuantizationConfig new_config = ExpandPresets(config);
+  ASSERT_THAT(new_config.specs().specs(), SizeIs(2));
+
+  const QuantizationSpec& spec = new_config.specs().specs(0);
+  EXPECT_THAT(spec.matcher().function_name().regex(),
+              StrEq("^.*(conv|dot|gather).*"));
+  EXPECT_TRUE(spec.method().has_static_range_ptq());
+}
+
+TEST(ExpandPresetsTest,
+     ExpandStaticRangePtqPresetWithTopLevelRepresentativeDataset) {
+  // Test the scenario where both
+  // `config.calibration_options.representative_datasets` and
+  // `config.static_range_ptq_preset.representative_datasets` are both
+  // specified. In this case, the one set to the `calibration_options` takes
+  // precedence.
+  QuantizationConfig config{};
+  RepresentativeDatasetConfig& top_level_dataset_config =
+      *config.mutable_calibration_options()->add_representative_datasets();
+  top_level_dataset_config.mutable_tf_record()->set_path("/test/path/1");
+
+  RepresentativeDatasetConfig& preset_dataset_config =
+      *config.mutable_static_range_ptq_preset()->add_representative_datasets();
+  preset_dataset_config.mutable_tf_record()->set_path("/test/path/2");
+
+  const QuantizationConfig new_config = ExpandPresets(config);
+
+  // Test that representative dataset config has not been transferred to the
+  // `CalibrationOptions`. Top-level config takes precedence.
+  ASSERT_THAT(new_config.calibration_options().representative_datasets(),
+              SizeIs(1));
+  EXPECT_THAT(new_config.calibration_options()
+                  .representative_datasets(0)
+                  .tf_record()
+                  .path(),
+              StrEq("/test/path/1"));
+}
+
+TEST(ExpandPresetsTest, ExpandStaticRangePtqPresetThenAppendExplicitSpecs) {
+  QuantizationConfig config{};
+  config.mutable_static_range_ptq_preset()->set_enable_full_int_quantization(
+      true);
+
+  QuantizationSpec& user_provided_spec = *config.mutable_specs()->add_specs();
+  user_provided_spec.mutable_matcher()->mutable_function_name()->set_regex(
+      "composite_dot_general_fn_1");
+  user_provided_spec.mutable_method()->mutable_no_quantization();
+
+  // Test that the expanded `QuantizationSpec`s are populated first and then
+  // user-provided specs are appended.
+  //
+  // It should look like:
+  //
+  // specs {matcher {function_name {regex: ".*"}} method {static_range_ptq {}}}
+  // specs {
+  //   matcher {function_name {regex: "composite_conv.*"}}
+  //   method {static_range_ptq {...}}}
+  // }
+  // specs {
+  //   matcher {function_name {regex: "composite_dot_general_fn_1"}}
+  //   method {no_quantization {}}
+  // }
+  const QuantizationConfig new_config = ExpandPresets(config);
+  ASSERT_THAT(new_config.specs().specs(), SizeIs(3));
+
+  const QuantizationSpec& first_spec = new_config.specs().specs(0);
+  EXPECT_THAT(first_spec.matcher().function_name().regex(), StrEq(".*"));
+  EXPECT_TRUE(first_spec.method().has_static_range_ptq());
+
+  const QuantizationSpec& second_spec = new_config.specs().specs(1);
+  EXPECT_THAT(second_spec.matcher().function_name().regex(),
+              StrEq("composite_conv.*"));
+  EXPECT_TRUE(second_spec.method().has_static_range_ptq());
+
+  // This corresponds to `user_provided_spec`.
+  const QuantizationSpec& third_spec = new_config.specs().specs(2);
+  EXPECT_THAT(third_spec.matcher().function_name().regex(),
+              StrEq("composite_dot_general_fn_1"));
+  EXPECT_TRUE(third_spec.method().has_no_quantization());
+}
+
 }  // namespace
 }  // namespace stablehlo::quantization
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.cc
index 1ba51790de0ac9..a06c7f8ed79fb4 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.cc
@@ -14,61 +14,34 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h"
 
-#include <string>
-#include <unordered_set>
-
-#include "absl/container/flat_hash_map.h"
-#include "absl/strings/string_view.h"
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/graph_def.h"
-#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h"
-#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/protobuf/meta_graph.pb.h"
 
 namespace stablehlo::quantization {
-namespace {
 
-using ::tensorflow::NodeDef;
-using ::tensorflow::SignatureDef;
-using ::tensorflow::quantization::DebuggerOptions;
-using ::tensorflow::quantization::ExportedModel;
-using ::tensorflow::quantization::PyFunctionLibrary;
+void DisableDebugging(mlir::ModuleOp module_op) {
+  module_op.walk(
+      [](mlir::TF::DumpTensorOp dump_op) { dump_op.setEnabled(false); });
+}
 
-}  // namespace
+void EnableDebugging(tensorflow::quantization::ExportedModel& exported_model) {
+  MutateNodeDefs(*exported_model.mutable_graph_def(),
+                 [](tensorflow::NodeDef& node_def) {
+                   if (node_def.op() == "DumpTensor") {
+                     (*node_def.mutable_attr())["enabled"].set_b(true);
+                   }
+                 });
+}
 
-void EnableDebugging(
-    ExportedModel& exported_model, const DebuggerOptions& debugger_options,
-    const PyFunctionLibrary& py_function_library,
-    const absl::string_view src_saved_model_path,
-    const std::unordered_set<std::string>& tags,
-    const absl::flat_hash_map<std::string, SignatureDef>& signature_def_map) {
-  // Enable `DumpTensor` nodes in `graph_def`. DumpTensor is disabled by
-  // default to avoid logging data during calibration.
-  MutateNodeDefs(*exported_model.mutable_graph_def(), [](NodeDef& node_def) {
-    if (node_def.op() == "DumpTensor") {
-      (*node_def.mutable_attr())["enabled"].set_b(true);
-    }
+void ChangeToQuantizedFilename(mlir::ModuleOp module_op) {
+  module_op.walk([](mlir::TF::DumpTensorOp dump_op) {
+    dump_op.setFileName("quantized_tensor_data.pb");
   });
-
-  if (debugger_options.debugger_type() ==
-      DebuggerConfig::DEBUGGER_TYPE_WHOLE_MODEL) {
-    // TODO: b/295139417 - Remove CustomAggregator op in unquantized dump model.
-    // TODO: b/296916287 - Create a separate function for saving unquantized
-    // dump model.
-    py_function_library.SaveExportedModel(
-        debugger_options.unquantized_dump_model_path(), exported_model,
-        src_saved_model_path, tags, signature_def_map);
-
-    // Update the `DumpTensor` ops' file name in `graph_def`.
-    MutateNodeDefs(*exported_model.mutable_graph_def(), [](NodeDef& node_def) {
-      if (node_def.op() == "DumpTensor") {
-        (*node_def.mutable_attr())["file_name"].set_s(
-            "quantized_tensor_data.pb");
-      }
-    });
-  }
 }
 
 }  // namespace stablehlo::quantization
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h
index 6bb427ecbdf1fd..f034e4d94ee4bf 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h
@@ -15,35 +15,20 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_DEBUGGER_H_
 #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_DEBUGGER_H_
 
-#include <string>
-#include <unordered_set>
-
-#include "absl/container/flat_hash_map.h"
-#include "absl/strings/string_view.h"
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h"
-#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h"
-#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
-#include "tensorflow/core/protobuf/meta_graph.pb.h"
 
 namespace stablehlo::quantization {
 
-// Enables debugging on `exported_model` by updating the `DumpTensor` ops.
-//
-// Saves the current model to `debugger_options.unquantized_dump_model_path()`
-// if the debugger type is `DEBUGGER_TYPE_WHOLE_MODEL`. This is required because
-// in whole-model debugging mode the `DumpTensor` ops for the unquantized
-// tensors are only inserted in the unquantized model whereas `DumpTensor` ops
-// for the quantized tensors are only inserted in the quantized model. Both
-// models are required to be able to dump both quantized and unquantized tensors
-// and compare them offline.
-void EnableDebugging(
-    tensorflow::quantization::ExportedModel& exported_model,
-    const tensorflow::quantization::DebuggerOptions& debugger_options,
-    const tensorflow::quantization::PyFunctionLibrary& py_function_library,
-    absl::string_view src_saved_model_path,
-    const std::unordered_set<std::string>& tags,
-    const absl::flat_hash_map<std::string, tensorflow::SignatureDef>&
-        signature_def_map);
+// Disables debugging on `DumpTensor` ops.
+void DisableDebugging(mlir::ModuleOp module_op);
+
+// Enables debugging on `DumpTensor` ops.
+void EnableDebugging(tensorflow::quantization::ExportedModel& exported_model);
+
+// Changes the filename from `unquantized_tensor_data.pb` to
+// `quantized_tensor_data.pb`.
+void ChangeToQuantizedFilename(mlir::ModuleOp module_op);
 
 }  // namespace stablehlo::quantization
 
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc
index 59e64d6d77d95e..ebe950c58142f6 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"  // from @llvm-project
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "mlir/Pass/PassRegistry.h"  // from @llvm-project
 #include "mlir/Transforms/Passes.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h"
@@ -37,14 +38,11 @@ void AddPreCalibrationPasses(OpPassManager& pm,
                              const CalibrationOptions& calibration_options,
                              const QuantizationSpecs& quantization_specs,
                              const DebuggerConfig& debugger_config) {
-  // For models with NCHW convolution format. This pass is required because
-  // downstream pipeline handles NHWC convolution better for most cases.
-  pm.addNestedPass<func::FuncOp>(createNchwConvolutionToNhwcPass());
+  // Convert NCHW tensors to NHWC at along with extra optimizations as
+  // downstream passes perform better optimizations when dealing with NHWC
+  // formatted tensors.
+  AddProcessNchwTensorPasses(pm);
 
-  // Folds `stablehlo.constant`->`stablehlo.transpose` patterns, which is often
-  // generated as by-products after optimizing dimension numbers (e.g.
-  // NCHW->NHWC convolution conversion).
-  pm.addNestedPass<func::FuncOp>(createFoldConstantTransposePass());
   pm.addPass(CreateLiftQuantizableSpotsAsFunctionsPass(quantization_specs));
   if (debugger_config.debugger_type() !=
       DebuggerConfig::DEBUGGER_TYPE_UNSPECIFIED) {
@@ -60,11 +58,16 @@ void AddPostCalibrationPasses(
     OpPassManager& pm, const PipelineConfig& pipeline_config,
     const StaticRangePtqPreset& static_range_ptq_preset) {
   QuantizeCompositeFunctionsPassOptions options;
+  // TODO: b/331120943 - Use QuantizationConfig instead of preset flags.
   options.enable_per_channel_quantized_weight_ =
       static_range_ptq_preset.enable_per_channel_quantized_weight();
+  options.enable_full_int_quantization_ =
+      static_range_ptq_preset.enable_full_int_quantization();
   // For debugging purposes.
   options.mlir_dump_file_name_ = "quantize_composite_functions";
   options.enable_weight_only_ = false;
+
+  AddShapeLegalizationPasses(pm);
   pm.addNestedPass<func::FuncOp>(
       CreateConvertCustomAggregationOpToQuantStatsPass());
   pm.addPass(createQuantizeCompositeFunctionsPass(options));
@@ -75,6 +78,38 @@ void AddPostCalibrationPasses(
   }
 }
 
+void AddWeightOnlyQuantizationPasses(
+    OpPassManager& pm, const QuantizationSpecs& quantization_specs,
+    const PipelineConfig& pipeline_config,
+    const DebuggerConfig& debugger_config) {
+  // For models with NCHW convolution format. This pass is required because
+  // downstream pipeline handles NHWC convolution better for most cases.
+  pm.addNestedPass<func::FuncOp>(createNchwConvolutionToNhwcPass());
+
+  // Folds `stablehlo.constant`->`stablehlo.transpose` patterns, which is often
+  // generated as by-products after optimizing dimension numbers (e.g.
+  // NCHW->NHWC convolution conversion).
+  pm.addNestedPass<func::FuncOp>(createFoldConstantTransposePass());
+  pm.addPass(CreateLiftQuantizableSpotsAsFunctionsPass(quantization_specs));
+  if (debugger_config.debugger_type() !=
+      DebuggerConfig::DEBUGGER_TYPE_UNSPECIFIED) {
+    pm.addPass(CreateAddDumpTensorOpPass(debugger_config.debugger_type(),
+                                         debugger_config.log_dir_path()));
+  }
+  AddShapeLegalizationPasses(pm);
+  QuantizeCompositeFunctionsPassOptions options;
+  // For debugging purposes.
+  options.mlir_dump_file_name_ = "quantize_composite_functions";
+  options.enable_weight_only_ = true;
+  pm.addPass(createQuantizeCompositeFunctionsPass(options));
+
+  // Add an inliner pass to inline quantized StableHLO functions.
+  pm.addPass(createInlinerPass());
+  if (pipeline_config.unpack_quantized_types()) {
+    AddStablehloQuantToIntPasses(pm);
+  }
+}
+
 void AddXlaCallModuleOpDeserializationPasses(OpPassManager& pm) {
   pm.addPass(TF::CreateXlaCallModuleDeserializationPass());
   pm.addPass(createRestoreFunctionNamePass());
@@ -119,4 +154,26 @@ void AddCallModuleSerializationPasses(OpPassManager& pm) {
   pm.addPass(TF::CreateXlaCallModuleSerializationPass());
 }
 
+void AddProcessNchwTensorPasses(OpPassManager& pm) {
+  // For models with NCHW convolution format. This pass is required because
+  // downstream pipeline handles NHWC convolution better for most cases.
+  pm.addNestedPass<func::FuncOp>(createNchwConvolutionToNhwcPass());
+
+  // Recursively push down the `stablehlo.transpose` ops for activations
+  // generated by the `NchwConvolutionToNhwc` pass.
+  pm.addNestedPass<func::FuncOp>(createDeferActivationTransposePass());
+
+  // Folds `stablehlo.constant`->`stablehlo.transpose` patterns, which is often
+  // generated as by-products after optimizing dimension numbers (e.g.
+  // NCHW->NHWC convolution conversion).
+  pm.addNestedPass<func::FuncOp>(createFoldConstantTransposePass());
+}
+
+void RegisterPassPipelines() {
+  static PassPipelineRegistration<> nchw_tensor_format_processing_pipeline(
+      /*arg=*/"stablehlo-process-nchw-tensor",
+      /*description=*/"Optimizes tensors with NCHW format.",
+      AddProcessNchwTensorPasses);
+}
+
 }  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h
index ef7b51aaf6096f..4f94506b6c184e 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h
@@ -37,6 +37,13 @@ void AddPostCalibrationPasses(
     const ::stablehlo::quantization::StaticRangePtqPreset&
         static_range_ptq_preset);
 
+// Adds passes for weight-only quantization.
+void AddWeightOnlyQuantizationPasses(
+    OpPassManager& pm,
+    const ::stablehlo::quantization::QuantizationSpecs& quantization_specs,
+    const ::stablehlo::quantization::PipelineConfig& pipeline_config,
+    const ::stablehlo::quantization::DebuggerConfig& debugger_config);
+
 // Deserializes StableHLO functions serialized and embedded in XlaCallModuleOps.
 void AddXlaCallModuleOpDeserializationPasses(OpPassManager& pm);
 
@@ -54,6 +61,16 @@ void AddCallModuleSerializationPasses(OpPassManager& pm);
 // through a StableHLO <-> MHLO roundtrip to utilize the MHLOQuantToInt pass.
 void AddStablehloQuantToIntPasses(OpPassManager& pm);
 
+// Processes tensors with NCHW format (== (batch, channel, height, weight)) by
+// converting them to NHWC formats along with extra optimizations such as
+// constant folding the transpose->convolution pattern. This is useful when
+// downstream pipeline (e.g. XLA) is more optimized when accepting NHWC formats.
+void AddProcessNchwTensorPasses(OpPassManager& pm);
+
+// Registers quantization pass pipelines. This is only required when running
+// MLIR opt binaries and not required when adding passes programmatically.
+void RegisterPassPipelines();
+
 }  // namespace mlir::quant::stablehlo
 
 #endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_PASS_PIPELINE_H_
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/permutation.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/permutation.h
new file mode 100644
index 00000000000000..35b1082b10dae9
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/permutation.h
@@ -0,0 +1,44 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_PERMUTATION_H_
+#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_PERMUTATION_H_
+
+#include <cstdint>
+#include <type_traits>
+
+#include "llvm/ADT/ArrayRef.h"  // IWYU pragma: keep; required to include the definition of ArrayRef
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"  // IWYU pragma: keep; required to include the definition of SmallVector
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+
+namespace mlir::quant {
+
+// Permutes `values` with `permutation`. Returns the permuted values. Sizes of
+// `values` and `permutation` must be equal, and the elements of `permutation`
+// should be less than `values.size()`.
+template <typename T,
+          typename = std::enable_if_t<std::is_default_constructible_v<T>, void>>
+SmallVector<T> Permute(const ArrayRef<T> values,
+                       const ArrayRef<int64_t> permutation) {
+  SmallVector<T> permuted_values(/*Size=*/values.size(), /*Value=*/T{});
+  for (auto [i, permutation_idx] : llvm::enumerate(permutation)) {
+    permuted_values[i] = std::move(values[permutation_idx]);
+  }
+  return permuted_values;
+}
+
+}  // namespace mlir::quant
+
+#endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_PERMUTATION_H_
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/permutation_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/permutation_test.cc
new file mode 100644
index 00000000000000..27a7886ba38466
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/permutation_test.cc
@@ -0,0 +1,64 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/permutation.h"
+
+#include <cstdint>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+
+namespace mlir::quant {
+namespace {
+
+using testing::ElementsAre;
+using testing::IsEmpty;
+
+TEST(PermutationTest, PermuteEmptyArray) {
+  const SmallVector<int> permutation_result =
+      Permute<int>(SmallVector<int>{}, SmallVector<int64_t>{});
+  EXPECT_THAT(permutation_result, IsEmpty());
+}
+
+TEST(PermutationTest, PermuteOneElement) {
+  const SmallVector<int> single_element_array = {8};
+  const SmallVector<int64_t> permutation = {0};
+
+  const SmallVector<int> permutation_result =
+      Permute<int>(single_element_array, permutation);
+  EXPECT_THAT(permutation_result, ElementsAre(8));
+}
+
+TEST(PermutationTest, PermuteFourElements) {
+  const SmallVector<int> arr = {0, 3, 1, 2};
+  // Permutation inverse of {0, 3, 1, 2}.
+  const SmallVector<int64_t> permutation = {0, 2, 3, 1};
+
+  const SmallVector<int> permutation_result = Permute<int>(arr, permutation);
+  EXPECT_THAT(permutation_result, ElementsAre(0, 1, 2, 3));
+}
+
+TEST(PermutationTest, PermuteFourStringElements) {
+  const SmallVector<std::string> arr = {"a", "b", "c", "d"};
+  const SmallVector<int64_t> permutation = {0, 2, 3, 1};
+
+  const SmallVector<std::string> permutation_result =
+      Permute<std::string>(arr, permutation);
+  EXPECT_THAT(permutation_result, ElementsAre("a", "c", "d", "b"));
+}
+
+}  // namespace
+}  // namespace mlir::quant
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration_test.cc
index c17c39d8783ba8..3d4d2295455a5c 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration_test.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "mlir/IR/OwningOpRef.h"  // from @llvm-project
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/quantization/common/test_base.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
@@ -34,6 +35,8 @@ limitations under the License.
 namespace mlir::quant::stablehlo {
 namespace {
 
+using ::stablehlo::quantization::ExpandPresets;
+using ::stablehlo::quantization::PopulateDefaults;
 using ::stablehlo::quantization::QuantizationConfig;
 using ::testing::Contains;
 using ::testing::SizeIs;
@@ -92,8 +95,11 @@ TEST_F(PreCalibrationComponentTest,
   )mlir");
   ASSERT_TRUE(module_op);
 
+  QuantizationConfig quantization_config{};
+  quantization_config.mutable_static_range_ptq_preset();
+  quantization_config = ExpandPresets(PopulateDefaults(quantization_config));
   absl::StatusOr<ModuleOp> pre_calibration_result =
-      component.Run(*module_op, QuantizationConfig());
+      component.Run(*module_op, quantization_config);
 
   EXPECT_THAT(pre_calibration_result, IsOk());
 
diff --git a/third_party/xla/xla/python/xla_extension.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.cc
similarity index 54%
rename from third_party/xla/xla/python/xla_extension.cc
rename to tensorflow/compiler/mlir/quantization/stablehlo/cc/report.cc
index 5adc194d65f054..ef24c16dbf4acc 100644
--- a/third_party/xla/xla/python/xla_extension.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.cc
@@ -1,4 +1,4 @@
-/* Copyright 2023 The OpenXLA Authors.
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,10 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h"
 
-#include "pybind11/pybind11.h"  // from @pybind11
-#include "xla/python/xla.h"
+#include <utility>
 
-extern "C" PYBIND11_EXPORT PyObject *PyInit_xla_extension() {
-  return xla::InitializeXlaExtension();
+#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
+
+namespace mlir::quant::stablehlo {
+
+using ::stablehlo::quantization::QuantizationResult;
+
+void QuantizationReport::AddQuantizationResult(QuantizationResult&& result) {
+  *quantization_results_.add_results() = std::move(result);
 }
+
+}  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h
new file mode 100644
index 00000000000000..94eb47463f16c1
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h
@@ -0,0 +1,48 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_REPORT_H_
+#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_REPORT_H_
+
+#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
+
+namespace mlir::quant::stablehlo {
+
+// A class that manages information about `QuantizableUnit`s post-quantization,
+// internally in the form of `QuantizationUnits`. It is used to collect
+// quantization summary from a quantized `ModuleOp` and emit it in a human- and
+// machine-readable format.
+class QuantizationReport {
+ public:
+  QuantizationReport() = default;
+
+  // Adds a `QuantizationResult` to the report.
+  void AddQuantizationResult(
+      ::stablehlo::quantization::QuantizationResult&& result);
+
+  // Returns `QuantizationResults` that are registered in this report.
+  const ::stablehlo::quantization::QuantizationResults& GetQuantizationResults()
+      const {
+    return quantization_results_;
+  }
+
+ private:
+  // Quantization results that are registered in this report. A quantization
+  // result may be added manually by calling `AddQuantizationResult`.
+  ::stablehlo::quantization::QuantizationResults quantization_results_;
+};
+
+}  // namespace mlir::quant::stablehlo
+
+#endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_REPORT_H_
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/report_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report_test.cc
new file mode 100644
index 00000000000000..f6897f7fde401d
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/report_test.cc
@@ -0,0 +1,64 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/report.h"
+
+#include <utility>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
+
+namespace mlir::quant::stablehlo {
+namespace {
+
+using ::stablehlo::quantization::Method;
+using ::stablehlo::quantization::QuantizableUnit;
+using ::stablehlo::quantization::QuantizationResult;
+using ::stablehlo::quantization::QuantizationResults;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
+using ::testing::StrEq;
+
+TEST(QuantizationReportTest, GetQuantizationResultsReturnsEmptyResults) {
+  QuantizationReport report{};
+
+  const QuantizationResults& results = report.GetQuantizationResults();
+  ASSERT_THAT(results.results(), IsEmpty());
+}
+
+TEST(QuantizationReportTest, AddQuantizationResult) {
+  // Construct a `QuantizationResult` to add, representing a unit named
+  // `quantized_my_function` that is not quantized.
+  QuantizationResult result{};
+  QuantizableUnit& quantizable_unit = *result.mutable_quantizable_unit();
+  quantizable_unit.set_name("quantized_my_function");
+
+  Method& method = *result.mutable_method();
+  method.mutable_no_quantization();
+
+  QuantizationReport report{};
+  report.AddQuantizationResult(std::move(result));
+
+  const QuantizationResults& results = report.GetQuantizationResults();
+  ASSERT_THAT(results.results(), SizeIs(1));
+
+  const QuantizationResult& first_result = results.results(0);
+  EXPECT_THAT(first_result.quantizable_unit().name(),
+              StrEq("quantized_my_function"));
+  EXPECT_TRUE(first_result.method().has_no_quantization());
+}
+
+}  // namespace
+}  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.cc
index 7945ddf712209a..fd85bceca6f9c2 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.cc
@@ -17,10 +17,12 @@ limitations under the License.
 #include <memory>
 #include <optional>
 #include <string>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/base/attributes.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/log/log.h"
@@ -32,11 +34,17 @@ limitations under the License.
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/constants.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
 #include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h"
@@ -56,6 +64,8 @@ namespace {
 using ::mlir::tf_saved_model::kTfSavedModelIndexPathAttr;
 using ::mlir::tf_saved_model::kTfSavedModelInitializerInitType;
 using ::mlir::tf_saved_model::kTfSavedModelInitializerRestoreType;
+using ::stablehlo::quantization::QuantizationConfig;
+using ::stablehlo::quantization::io::GetLocalTmpFileName;
 using ::tensorflow::AssetFileDef;
 using ::tensorflow::ConvertMlirToGraph;
 using ::tensorflow::FunctionDefLibrary;
@@ -67,6 +77,8 @@ using ::tensorflow::NodeDef;
 using ::tensorflow::OpRegistry;
 using ::tensorflow::SaverDef;
 using ::tensorflow::quantization::ExportedModel;
+using ::tensorflow::quantization::RunPasses;
+using ::tensorflow::quantization::UnfreezeConstantsAndSaveVariables;
 
 // Finds and returns the name of the node from a set of control output nodes.
 // The name should contain the string `contains`. Returns an empty string if no
@@ -114,7 +126,29 @@ std::string FindFilePrefixTensorName(const GraphDef& graph_def) {
 
 }  // namespace
 
-ExportedModel CreateExportedModel(
+absl::StatusOr<ExportedModel> CreateExportedModel(
+    const std::vector<std::string>& signature_keys,
+    const std::unordered_set<std::string>& tags,
+    const QuantizationConfig& quantization_config,
+    absl::string_view debug_name_prefix,
+    const absl::flat_hash_map<FunctionName, FunctionAlias>& function_aliases,
+    MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND, ModuleOp module_op) {
+  TF_ASSIGN_OR_RETURN(const std::string checkpoint_dir, GetLocalTmpFileName());
+  const ExportOptions export_opts = {
+      /*duplicate_shape_determining_constants=*/true,
+      /*unfreeze_constants=*/false, checkpoint_dir,
+      /*debug_name=*/
+      absl::StrCat(debug_name_prefix, kExportStepSuffix)};
+
+  TF_ASSIGN_OR_RETURN(const SmallVector<AssetFileDef> asset_file_defs,
+                      RunExportPasses(export_opts, ctx, module_op));
+
+  return ConvertMlirModuleToExportedModel(
+      module_op, checkpoint_dir, function_aliases,
+      {asset_file_defs.begin(), asset_file_defs.end()});
+}
+
+ExportedModel CreateExportedModelFromGraphDef(
     GraphDef&& graph_def, const absl::string_view init_node_name,
     const absl::string_view checkpoint_dir,
     const std::optional<SaverDef> saver_def,
@@ -222,9 +256,35 @@ absl::StatusOr<ExportedModel> ConvertMlirModuleToExportedModel(
   TF_ASSIGN_OR_RETURN(const std::optional<SaverDef> saver_def,
                       CreateSaverDef(control_ret_node_names, graph_def));
 
-  return CreateExportedModel(std::move(graph_def), init_node_name,
-                             checkpoint_dir, std::move(saver_def),
-                             function_aliases, asset_file_defs);
+  return CreateExportedModelFromGraphDef(std::move(graph_def), init_node_name,
+                                         checkpoint_dir, std::move(saver_def),
+                                         function_aliases, asset_file_defs);
+}
+
+absl::StatusOr<SmallVector<AssetFileDef>> RunExportPasses(
+    const ExportOptions& export_opts, MLIRContext& ctx, ModuleOp module_op) {
+  if (export_opts.unfreeze_constants) {
+    TF_RETURN_IF_ERROR(UnfreezeConstantsAndSaveVariables(
+        export_opts.checkpoint_dir, ctx, module_op));
+    LOG(INFO) << "Unfrozen constants and saved variables to checkpoint file: "
+              << export_opts.checkpoint_dir;
+  }
+
+  TF_RETURN_IF_ERROR(RunPasses(
+      /*name=*/
+      export_opts.debug_name,
+      /*add_passes_func=*/
+      [dup_constants = export_opts.duplicate_shape_determining_constants](
+          PassManager& pm) { AddExportPasses(pm, dup_constants); },
+      ctx, module_op));
+
+  FailureOr<SmallVector<AssetFileDef>> asset_file_defs =
+      quant::ConvertAssetArgs(module_op);
+  if (failed(asset_file_defs)) {
+    return absl::InternalError("Failed to convert asset args.");
+  }
+
+  return *asset_file_defs;
 }
 
 }  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.h
index 1bfd0d5113f955..357c5b0efe52d7 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.h
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.h
@@ -19,13 +19,18 @@ limitations under the License.
 
 #include <optional>
 #include <string>
+#include <unordered_set>
 #include <vector>
 
+#include "absl/base/attributes.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/graph/graph.h"
@@ -56,8 +61,20 @@ struct ExportOptions {
   std::string debug_name = "stablehlo_quant";
 };
 
+// Creates `ExportedModel` from `module_op`. `module_op` goes through post
+// process passes before an `ExportModel` is created.
+// TODO: b/329206105 - Add unit tests after decomposing post processing passes.
+absl::StatusOr<tensorflow::quantization::ExportedModel> CreateExportedModel(
+    const std::vector<std::string>& signature_keys,
+    const std::unordered_set<std::string>& tags,
+    const ::stablehlo::quantization::QuantizationConfig& quantization_config,
+    absl::string_view debug_name_prefix,
+    const absl::flat_hash_map<FunctionName, FunctionAlias>& function_aliases,
+    MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND, ModuleOp module_op);
+
 // Factory function for `ExportedModel`.
-[[nodiscard]] tensorflow::quantization::ExportedModel CreateExportedModel(
+[[nodiscard]] tensorflow::quantization::ExportedModel
+CreateExportedModelFromGraphDef(
     tensorflow::GraphDef&& graph_def, absl::string_view init_node_name,
     absl::string_view checkpoint_dir,
     std::optional<tensorflow::SaverDef> saver_def,
@@ -111,6 +128,15 @@ ConvertMlirModuleToExportedModel(
     const absl::flat_hash_map<std::string, std::string>& function_aliases,
     const std::vector<tensorflow::AssetFileDef>& asset_file_defs);
 
+// Sets up and runs the passes for exporting `module_op`. The behavior of the
+// exporting passes is controlled by `export_opts`. Returns `AssetFileDef`s that
+// associate the input arguments of @main and the asset file names. Asset file
+// names will be used to feed the corresponding tensors during initialization
+// upon model loading.
+// TODO: b/329206105 - Add unit tests after decomposing post processing passes.
+absl::StatusOr<SmallVector<::tensorflow::AssetFileDef>> RunExportPasses(
+    const ExportOptions& export_opts, MLIRContext& ctx, ModuleOp module_op);
+
 }  // namespace mlir::quant::stablehlo
 
 #endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_SAVED_MODEL_EXPORT_H_
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export_test.cc
index e250f5314726f7..7e55644c38f886 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export_test.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export_test.cc
@@ -57,10 +57,10 @@ TEST(CreateExportedModelTest, CreateExportedModelBasicFieldsSet) {
   ASSERT_TRUE(
       TextFormat::ParseFromString(R"pb(node { name: "foo" })pb", &graph_def));
 
-  const ExportedModel exported_model =
-      CreateExportedModel(std::move(graph_def), "init_node_name",
-                          "checkpoint_dir", /*saver_def=*/std::nullopt,
-                          /*function_aliases=*/{}, /*asset_file_defs=*/{});
+  const ExportedModel exported_model = CreateExportedModelFromGraphDef(
+      std::move(graph_def), "init_node_name", "checkpoint_dir",
+      /*saver_def=*/std::nullopt,
+      /*function_aliases=*/{}, /*asset_file_defs=*/{});
   ASSERT_THAT(exported_model.graph_def().node(), SizeIs(1));
   EXPECT_THAT(exported_model.graph_def().node()[0].name(), StrEq("foo"));
 
@@ -72,7 +72,7 @@ TEST(CreateExportedModelTest, CreateExportedModelBasicFieldsSet) {
 }
 
 TEST(CreateExportedModelTest, CreateExportedModelWithAddedFunctionAliases) {
-  const ExportedModel exported_model = CreateExportedModel(
+  const ExportedModel exported_model = CreateExportedModelFromGraphDef(
       GraphDef(), /*init_node_name=*/"", /*checkpoint_dir=*/"",
       /*saver_def=*/std::nullopt,
       /*function_aliases=*/{{"func1", "alias1"}, {"func2", "alias2"}},
@@ -93,7 +93,7 @@ TEST(CreateExportedModelTest, CreateExportedModelWithAddedAssetFileDefs) {
   ASSERT_TRUE(
       TextFormat::ParseFromString(R"pb(filename: "fname2")pb", &asset2));
 
-  const ExportedModel exported_model = CreateExportedModel(
+  const ExportedModel exported_model = CreateExportedModelFromGraphDef(
       GraphDef(), /*init_node_name=*/"", /*checkpoint_dir=*/"",
       /*saver_def=*/std::nullopt, /*function_aliases=*/{},
       /*asset_file_defs=*/{asset1, asset2});
@@ -107,7 +107,7 @@ TEST(CreateExportedModelTest, CreateExportedModelWithAddedSaverDef) {
   ASSERT_TRUE(TextFormat::ParseFromString(
       R"pb(filename_tensor_name: "my_file")pb", &saver_def));
 
-  const ExportedModel exported_model = CreateExportedModel(
+  const ExportedModel exported_model = CreateExportedModelFromGraphDef(
       GraphDef(), /*init_node_name=*/"", /*checkpoint_dir=*/"", saver_def,
       /*function_aliases=*/{}, /*asset_file_defs=*/{});
   EXPECT_THAT(exported_model.saver_def().filename_tensor_name(), "my_file");
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.cc
index 9c03ee6e21f4b5..a223a0b03f58a4 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.cc
@@ -14,23 +14,72 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h"
 
+#include <memory>
 #include <string>
 #include <unordered_set>
+#include <utility>
+#include <vector>
 
+#include "absl/algorithm/container.h"
+#include "absl/base/attributes.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
+#include "absl/status/status.h"
 #include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/OwningOpRef.h"  // from @llvm-project
+#include "tensorflow/cc/saved_model/loader.h"
 #include "tensorflow/cc/saved_model/reader.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h"
+#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_import_options.h"
+#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
 #include "tsl/platform/errors.h"
+#include "tsl/platform/statusor.h"
 
 namespace mlir::quant::stablehlo {
 
+using ::stablehlo::quantization::QuantizationConfig;
+using ::tensorflow::MLIRImportOptions;
+using ::tensorflow::SavedModelBundle;
+using ::tensorflow::SavedModelSignatureDefsToMlirImport;
+using ::tensorflow::quantization::PreprocessAndFreezeGraph;
+
+absl::StatusOr<ImportedMlirModuleOp> SavedModelToMlirModuleOp(
+    const absl::string_view saved_model_path,
+    const std::unordered_set<std::string>& tags,
+    const std::vector<std::string>& signature_keys,
+    MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND) {
+  MLIRImportOptions import_options;
+  import_options.upgrade_legacy = true;
+  import_options.lift_variables = false;
+  import_options.include_variables_in_initializers = true;
+
+  auto bundle = std::make_unique<SavedModelBundle>();
+
+  // Copy to eliminate the `const` qualifier so that `absl::MakeSpan` can be
+  // called on it.
+  std::vector<std::string> exported_names = signature_keys;
+  absl::StatusOr<OwningOpRef<ModuleOp>> module_op =
+      SavedModelSignatureDefsToMlirImport(saved_model_path, tags,
+                                          absl::MakeSpan(exported_names), &ctx,
+                                          import_options, &bundle);
+  if (!module_op.status().ok()) {
+    return absl::InternalError(absl::StrCat("Failed to import SavedModel: ",
+                                            module_op.status().ToString()));
+  }
+
+  return std::make_pair(module_op->release(), std::move(bundle));
+}
+
 absl::StatusOr<absl::flat_hash_map<FunctionName, FunctionAlias>>
 GetFunctionAliases(absl::string_view saved_model_path,
                    const std::unordered_set<std::string>& tags) {
@@ -70,4 +119,35 @@ void UpdateFunctionAliases(
   });
 }
 
+absl::StatusOr<ModuleOp> ImportSavedModel(
+    const absl::string_view saved_model_path,
+    const std::vector<std::string>& signature_keys,
+    const std::unordered_set<std::string>& tags,
+    const QuantizationConfig& quantization_config,
+    const absl::string_view mlir_dump_file_prefix,
+    absl::flat_hash_map<FunctionName, FunctionAlias>& function_aliases,
+    MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND) {
+  TF_ASSIGN_OR_RETURN(
+      ImportedMlirModuleOp imported_module,
+      SavedModelToMlirModuleOp(saved_model_path, tags, signature_keys, ctx));
+  auto [module_op, saved_model_bundle] = std::move(imported_module);
+
+  UpdateFunctionAliases(function_aliases, module_op);
+
+  // Collect the names of the functions that have aliases so that they may not
+  // be inlined.
+  absl::flat_hash_set<std::string> aliased_function_names;
+  absl::c_for_each(function_aliases, [&](const auto& aliases) {
+    return aliased_function_names.insert(aliases.first);
+  });
+
+  TF_RETURN_IF_ERROR(PreprocessAndFreezeGraph(
+      mlir_dump_file_prefix, /*is_inliner_run=*/true,
+      /*noinline_functions=*/aliased_function_names, module_op, &ctx,
+      saved_model_bundle == nullptr ? nullptr
+                                    : saved_model_bundle->GetSession(),
+      /*run_tf_to_stablehlo=*/true, /*deserialize_xla_call_module=*/false));
+  return module_op;
+}
+
 }  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h
index 2c20224cf24ed2..631d2e714900aa 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h
@@ -19,15 +19,40 @@ limitations under the License.
 
 #include <string>
 #include <unordered_set>
+#include <vector>
 
+#include "absl/base/attributes.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "tensorflow/cc/saved_model/loader.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
 
 namespace mlir::quant::stablehlo {
 
+// Represents a pair of `mlir::ModuleOp` and `tensorflow::SavedModelBundle`. The
+// SavedModelBundle complements the imported ModuleOp by providing access to
+// `tensorflow::Session` which may be useful when reading values from resources
+// (e.g. `TF::VarHandleOp`s).
+using ImportedMlirModuleOp =
+    std::pair<ModuleOp, std::unique_ptr<::tensorflow::SavedModelBundle>>;
+
+// Loads a SavedModel at `saved_model_path` and converts it to `mlir::ModuleOp`.
+//
+// `tags` identify the `tensorflow::MetaGraphDef` to load from the SavedModel.
+// Similarly, `signature_keys` identify the functions (`SignatureDef`s) to load
+// within the `MetaGraphDef`. `ctx` is the `MLIRContext`, which should outlive
+// the returned `ModuleOp`, thus marked with the lifetime bound attribute.
+// TODO: b/329206105 - Add unit tests after decomposing preprocessing passes.
+absl::StatusOr<ImportedMlirModuleOp> SavedModelToMlirModuleOp(
+    absl::string_view saved_model_path,
+    const std::unordered_set<std::string>& tags,
+    const std::vector<std::string>& signature_keys,
+    MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND);
+
 // Gets the function aliases from the SavedModel.
 absl::StatusOr<absl::flat_hash_map<FunctionName, FunctionAlias>>
 GetFunctionAliases(absl::string_view saved_model_path,
@@ -44,6 +69,18 @@ void UpdateFunctionAliases(
     absl::flat_hash_map<FunctionName, FunctionAlias>& function_aliases,
     ModuleOp module_op);
 
+// Loads a SavedModel to `mlir::ModuleOp` and performs preprocesses including
+// shape inference and graph freezing.
+// TODO: b/329206105 - Add unit tests after decomposing preprocessing passes.
+absl::StatusOr<ModuleOp> ImportSavedModel(
+    absl::string_view saved_model_path,
+    const std::vector<std::string>& signature_keys,
+    const std::unordered_set<std::string>& tags,
+    const ::stablehlo::quantization::QuantizationConfig& quantization_config,
+    absl::string_view mlir_dump_file_prefix,
+    absl::flat_hash_map<FunctionName, FunctionAlias>& function_aliases,
+    MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND);
+
 }  // namespace mlir::quant::stablehlo
 
 #endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_SAVED_MODEL_IMPORT_H_
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc
index eaafdf1770f7f9..015ab7605a05b7 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc
@@ -15,200 +15,44 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h"
 
 #include <memory>
-#include <optional>
 #include <string>
 #include <unordered_set>
 #include <utility>
 #include <vector>
 
-#include "absl/algorithm/container.h"
-#include "absl/base/attributes.h"
 #include "absl/base/nullability.h"
 #include "absl/container/flat_hash_map.h"
-#include "absl/container/flat_hash_set.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
-#include "absl/types/span.h"
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
-#include "mlir/IR/OwningOpRef.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
-#include "mlir/Support/LLVM.h"  // from @llvm-project
-#include "mlir/Support/LogicalResult.h"  // from @llvm-project
-#include "tensorflow/cc/saved_model/loader.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/context.h"
-#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
-#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.h"
-#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h"
-#include "tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
-#include "tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h"
-#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_import_options.h"
-#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
 #include "tensorflow/core/protobuf/saver.pb.h"
-#include "tsl/platform/errors.h"
 #include "tsl/platform/statusor.h"
 
 namespace mlir::quant::stablehlo {
-namespace {
 
 using ::stablehlo::quantization::QuantizationConfig;
-using ::stablehlo::quantization::io::GetLocalTmpFileName;
-using ::tensorflow::AssetFileDef;
-using ::tensorflow::MLIRImportOptions;
-using ::tensorflow::SavedModelBundle;
-using ::tensorflow::SavedModelSignatureDefsToMlirImport;
 using ::tensorflow::SignatureDef;
 using ::tensorflow::quantization::ExportedModel;
-using ::tensorflow::quantization::PreprocessAndFreezeGraph;
 using ::tensorflow::quantization::PyFunctionLibrary;
-using ::tensorflow::quantization::RunPasses;
-using ::tensorflow::quantization::UnfreezeConstantsAndSaveVariables;
-
-// Sets up and runs the passes for exporting `module_op`. The behavior of the
-// exporting passes is controlled by `export_opts`. Returns `AssetFileDef`s that
-// associate the input arguments of @main and the asset file names. Asset file
-// names will be used to feed the corresponding tensors during initialization
-// upon model loading.
-absl::StatusOr<SmallVector<AssetFileDef>> RunExportPasses(
-    const ExportOptions& export_opts, MLIRContext& ctx, ModuleOp module_op) {
-  if (export_opts.unfreeze_constants) {
-    TF_RETURN_IF_ERROR(UnfreezeConstantsAndSaveVariables(
-        export_opts.checkpoint_dir, ctx, module_op));
-    LOG(INFO) << "Unfrozen constants and saved variables to checkpoint file: "
-              << export_opts.checkpoint_dir;
-  }
-
-  if (absl::Status pass_run_status = RunPasses(
-          /*name=*/
-          export_opts.debug_name,
-          /*add_passes_func=*/
-          [dup_constants = export_opts.duplicate_shape_determining_constants](
-              PassManager& pm) { AddExportPasses(pm, dup_constants); },
-          ctx, module_op);
-      !pass_run_status.ok()) {
-    return pass_run_status;
-  }
-
-  FailureOr<SmallVector<AssetFileDef>> asset_file_defs =
-      quant::ConvertAssetArgs(module_op);
-  if (failed(asset_file_defs)) {
-    return absl::InternalError("Failed to convert asset args.");
-  }
-
-  return *asset_file_defs;
-}
-
-// Represents a pair of `mlir::ModuleOp` and `tensorflow::SavedModelBundle`. The
-// SavedModelBundle complements the imported ModuleOp by providing access to
-// `tensorflow::Session` which may be useful when reading values from resources
-// (e.g. `TF::VarHandleOp`s).
-using ImportedMlirModuleOp =
-    std::pair<ModuleOp, std::unique_ptr<SavedModelBundle>>;
-
-// Loads a SavedModel at `saved_model_path` and converts it to `mlir::ModuleOp`.
-//
-// `tags` identify the `tensorflow::MetaGraphDef` to load from the SavedModel.
-// Similarly, `signature_keys` identify the functions (`SignatureDef`s) to load
-// within the `MetaGraphDef`. `ctx` is the `MLIRContext`, which should outlive
-// the returned `ModuleOp`, thus marked with the lifetime bound attribute.
-absl::StatusOr<ImportedMlirModuleOp> SavedModelToMlirModuleOp(
-    const absl::string_view saved_model_path,
-    const std::unordered_set<std::string>& tags,
-    const std::vector<std::string>& signature_keys,
-    MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND) {
-  MLIRImportOptions import_options;
-  import_options.upgrade_legacy = true;
-  import_options.lift_variables = false;
-  import_options.include_variables_in_initializers = true;
-
-  auto bundle = std::make_unique<SavedModelBundle>();
-
-  // Copy to eliminate the `const` qualifier so that `absl::MakeSpan` can be
-  // called on it.
-  std::vector<std::string> exported_names = signature_keys;
-  absl::StatusOr<OwningOpRef<ModuleOp>> module_op =
-      SavedModelSignatureDefsToMlirImport(saved_model_path, tags,
-                                          absl::MakeSpan(exported_names), &ctx,
-                                          import_options, &bundle);
-  if (!module_op.status().ok()) {
-    return absl::InternalError(absl::StrCat("Failed to import SavedModel: ",
-                                            module_op.status().ToString()));
-  }
-
-  return std::make_pair(module_op->release(), std::move(bundle));
-}
-
-absl::StatusOr<ModuleOp> ImportSavedModel(
-    const absl::string_view saved_model_path,
-    const std::vector<std::string>& signature_keys,
-    const std::unordered_set<std::string>& tags,
-    const QuantizationConfig& quantization_config,
-    absl::flat_hash_map<FunctionName, FunctionAlias>& function_aliases,
-    MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND) {
-  TF_ASSIGN_OR_RETURN(
-      ImportedMlirModuleOp imported_module,
-      SavedModelToMlirModuleOp(saved_model_path, tags, signature_keys, ctx));
-  auto [module_op, saved_model_bundle] = std::move(imported_module);
-
-  UpdateFunctionAliases(function_aliases, module_op);
-
-  // Collect the names of the functions that have aliases so that they may not
-  // be inlined.
-  absl::flat_hash_set<std::string> aliased_function_names;
-  absl::c_for_each(function_aliases, [&](const auto& aliases) {
-    return aliased_function_names.insert(aliases.first);
-  });
-
-  TF_RETURN_IF_ERROR(PreprocessAndFreezeGraph(
-      /*mlir_dump_file_prefix=*/PreCalibrationComponent::kName,
-      /*is_inliner_run=*/true, /*noinline_functions=*/aliased_function_names,
-      module_op, &ctx,
-      saved_model_bundle == nullptr ? nullptr
-                                    : saved_model_bundle->GetSession(),
-      /*run_tf_to_stablehlo=*/true, /*deserialize_xla_call_module=*/false));
-  return module_op;
-}
-
-absl::StatusOr<ExportedModel> CreateExportedModel(
-    const std::vector<std::string>& signature_keys,
-    const std::unordered_set<std::string>& tags,
-    const QuantizationConfig& quantization_config,
-    absl::flat_hash_map<FunctionName, FunctionAlias>& function_aliases,
-    MLIRContext& ctx ABSL_ATTRIBUTE_LIFETIME_BOUND, ModuleOp module_op) {
-  TF_ASSIGN_OR_RETURN(const std::string checkpoint_dir, GetLocalTmpFileName());
-  const ExportOptions export_opts = {
-      /*duplicate_shape_determining_constants=*/true,
-      /*unfreeze_constants=*/false, checkpoint_dir,
-      /*debug_name=*/
-      absl::StrCat(PostCalibrationComponent::kName, kExportStepSuffix)};
-
-  TF_ASSIGN_OR_RETURN(const SmallVector<AssetFileDef> asset_file_defs,
-                      RunExportPasses(export_opts, ctx, module_op));
-
-  UpdateFunctionAliases(function_aliases, module_op);
-
-  return ConvertMlirModuleToExportedModel(
-      module_op, checkpoint_dir, function_aliases,
-      {asset_file_defs.begin(), asset_file_defs.end()});
-}
-
-}  // namespace
 
 StaticRangePtqComponent::StaticRangePtqComponent(
     absl::Nonnull<MLIRContext*> ctx,
@@ -243,17 +87,13 @@ absl::StatusOr<ModuleOp> StaticRangePtqComponent::Run(
 absl::Status QuantizeStaticRangePtq(
     const absl::string_view src_saved_model_path,
     const absl::string_view dst_saved_model_path,
-    QuantizationConfig quantization_config,
+    const QuantizationConfig& quantization_config,
     const std::vector<std::string>& signature_keys,
     const absl::flat_hash_map<std::string, SignatureDef>& signature_def_map,
     const PyFunctionLibrary& py_function_library) {
   std::unordered_set<std::string> tags;
   tags.insert(quantization_config.tf_saved_model().tags().begin(),
               quantization_config.tf_saved_model().tags().end());
-  if (!quantization_config.has_calibration_options()) {
-    *quantization_config.mutable_calibration_options() =
-        GetDefaultCalibrationOptions();
-  }
 
   std::unique_ptr<MLIRContext> ctx = CreateMlirContextForQuantization();
 
@@ -267,7 +107,8 @@ absl::Status QuantizeStaticRangePtq(
   TF_ASSIGN_OR_RETURN(
       ModuleOp module_op,
       ImportSavedModel(src_saved_model_path, signature_keys, tags,
-                       quantization_config, *function_aliases, *ctx));
+                       quantization_config, PreCalibrationComponent::kName,
+                       *function_aliases, *ctx));
 
   StaticRangePtqComponent static_range_ptq_component(
       ctx.get(), &py_function_library, src_saved_model_path, signature_keys,
@@ -278,7 +119,8 @@ absl::Status QuantizeStaticRangePtq(
   TF_ASSIGN_OR_RETURN(
       const ExportedModel post_calibrated_exported_model,
       CreateExportedModel(signature_keys, tags, quantization_config,
-                          *function_aliases, *ctx, module_op));
+                          PostCalibrationComponent::kName, *function_aliases,
+                          *ctx, module_op));
 
   // Remove the `tpu` tag for exporting because the output quantized model is
   // essentially a CPU model.
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h
index e5056418bbae55..69bd9da6733c0c 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h
@@ -37,17 +37,6 @@ limitations under the License.
 
 namespace mlir::quant::stablehlo {
 
-using ::stablehlo::quantization::CalibrationOptions;
-
-// Create default configuration for the calibration step, which is the min/max
-// calibration method.
-inline CalibrationOptions GetDefaultCalibrationOptions() {
-  CalibrationOptions options{};
-  options.set_calibration_method(
-      CalibrationOptions::CALIBRATION_METHOD_MIN_MAX);
-  return options;
-}
-
 // Component for static-range post-training quantization (PTQ).
 // TODO: b/320607042 - Add tests in python level.
 class StaticRangePtqComponent : public Component {
@@ -102,7 +91,7 @@ class StaticRangePtqComponent : public Component {
 absl::Status QuantizeStaticRangePtq(
     absl::string_view src_saved_model_path,
     absl::string_view dst_saved_model_path,
-    ::stablehlo::quantization::QuantizationConfig quantization_config,
+    const ::stablehlo::quantization::QuantizationConfig& quantization_config,
     const std::vector<std::string>& signature_keys,
     const absl::flat_hash_map<std::string, tensorflow::SignatureDef>&
         signature_def_map,
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.cc
new file mode 100644
index 00000000000000..bbd9a9c25620bd
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.cc
@@ -0,0 +1,114 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.h"
+
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "absl/base/nullability.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/log/die_if_null.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/context.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h"
+#include "xla/mlir_hlo/mhlo/transforms/passes.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/statusor.h"
+
+namespace mlir::quant::stablehlo {
+
+using ::stablehlo::quantization::QuantizationConfig;
+using ::tensorflow::SignatureDef;
+using ::tensorflow::quantization::ExportedModel;
+using ::tensorflow::quantization::PyFunctionLibrary;
+using ::tensorflow::quantization::RunPasses;
+
+WeightOnlyPtqComponent::WeightOnlyPtqComponent(absl::Nonnull<MLIRContext*> ctx)
+    : ctx_(ABSL_DIE_IF_NULL(ctx)) {}  // Crash OK
+
+absl::StatusOr<ModuleOp> WeightOnlyPtqComponent::Run(
+    ModuleOp module_op, const QuantizationConfig& config) {
+  TF_RETURN_IF_ERROR(RunPasses(
+      kName, /*add_passes_func=*/
+      [&config](PassManager& pm) {
+        AddWeightOnlyQuantizationPasses(pm, config.specs(),
+                                        config.pipeline_config(),
+                                        config.debugger_config());
+      },
+      *ctx_, module_op));
+  return module_op;
+}
+
+absl::Status QuantizeWeightOnlyPtq(
+    const absl::string_view src_saved_model_path,
+    const absl::string_view dst_saved_model_path,
+    QuantizationConfig quantization_config,
+    const std::vector<std::string>& signature_keys,
+    const absl::flat_hash_map<std::string, SignatureDef>& signature_def_map,
+    const PyFunctionLibrary& py_function_library) {
+  std::unordered_set<std::string> tags;
+  tags.insert(quantization_config.tf_saved_model().tags().begin(),
+              quantization_config.tf_saved_model().tags().end());
+
+  std::unique_ptr<MLIRContext> ctx = CreateMlirContextForQuantization();
+
+  absl::StatusOr<absl::flat_hash_map<FunctionName, FunctionAlias>>
+      function_aliases = GetFunctionAliases(src_saved_model_path, tags);
+  if (!function_aliases.ok()) {
+    return absl::InternalError(absl::StrCat(
+        "Failed to get function alias: ", function_aliases.status().message()));
+  }
+
+  TF_ASSIGN_OR_RETURN(
+      ModuleOp module_op,
+      ImportSavedModel(src_saved_model_path, signature_keys, tags,
+                       quantization_config, WeightOnlyPtqComponent::kName,
+                       *function_aliases, *ctx));
+
+  WeightOnlyPtqComponent weight_only_ptq_component(ctx.get());
+  TF_ASSIGN_OR_RETURN(
+      module_op, weight_only_ptq_component.Run(module_op, quantization_config));
+
+  TF_ASSIGN_OR_RETURN(
+      const ExportedModel post_calibrated_exported_model,
+      CreateExportedModel(signature_keys, tags, quantization_config,
+                          WeightOnlyPtqComponent::kName, *function_aliases,
+                          *ctx, module_op));
+
+  // Remove the `tpu` tag for exporting because the output quantized model is
+  // essentially a CPU model.
+  tags.erase("tpu");
+
+  py_function_library.SaveExportedModel(
+      dst_saved_model_path, post_calibrated_exported_model,
+      src_saved_model_path, tags, signature_def_map);
+
+  return absl::OkStatus();
+}
+
+}  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.h
new file mode 100644
index 00000000000000..bf23e93246c700
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.h
@@ -0,0 +1,80 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_WEIGHT_ONLY_PTQ_H_
+#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_WEIGHT_ONLY_PTQ_H_
+
+#include <string>
+#include <vector>
+
+#include "absl/base/nullability.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/component.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h"
+#include "tensorflow/core/protobuf/meta_graph.pb.h"
+
+namespace mlir::quant::stablehlo {
+
+// Performs int8 weight-only quantization on dot_general ops.
+//
+// The resulting `ModuleOp` contains quantized StableHLO ops serialized in
+// `TF::XlaCallModuleOp`s. They are quantized using the weight constants, not
+// relying on calibration.
+class WeightOnlyPtqComponent : public Component {
+ public:
+  // Used for debugging purposes.
+  static constexpr absl::string_view kName = "quant_ptq_weight_only";
+
+  explicit WeightOnlyPtqComponent(absl::Nonnull<MLIRContext*> ctx);
+
+  absl::StatusOr<ModuleOp> Run(
+      ModuleOp module_op,
+      const ::stablehlo::quantization::QuantizationConfig& config) override;
+
+ private:
+  absl::Nonnull<MLIRContext*> ctx_;
+};
+
+// Runs weight-only quantization on a SavedModel at
+// `src_saved_model_path` and saves the resulting model to
+// `dst_saved_model_path`.
+//
+// `quantization_config` configures the quantization behavior for the
+// weight-only quantization.
+//
+// `signature_keys` specify the signatures that correspond to functions to be
+// quantized. `signature_def_map` connects the signature keys to
+// `SignatureDef`s.
+//
+// Returns a non-OK status when the quantization is not successful.
+// LINT.IfChange
+absl::Status QuantizeWeightOnlyPtq(
+    absl::string_view src_saved_model_path,
+    absl::string_view dst_saved_model_path,
+    ::stablehlo::quantization::QuantizationConfig quantization_config,
+    const std::vector<std::string>& signature_keys,
+    const absl::flat_hash_map<std::string, tensorflow::SignatureDef>&
+        signature_def_map,
+    const tensorflow::quantization::PyFunctionLibrary& py_function_library);
+// LINT.ThenChange(../python/pywrap_quantization.cc:weight_only_ptq)
+
+}  // namespace mlir::quant::stablehlo
+
+#endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_WEIGHT_ONLY_PTQ_H_
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD
index 0f8bb04d796a6e..35584857f5761f 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD
@@ -41,12 +41,14 @@ tf_cc_test(
         ":stablehlo_op_quant_spec",
         "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps",
         "//tensorflow/compiler/mlir/quantization/common:test_base",
+        "//tensorflow/compiler/mlir/quantization/common/quantization_lib",
         "//tensorflow/compiler/mlir/tensorflow",
         "//tensorflow/core:test",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_googletest//:gtest_main",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Support",
         "@stablehlo//:stablehlo_ops",
     ],
 )
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc
index a78a1feec9077e..c78ee607993385 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc
@@ -47,6 +47,7 @@ namespace {
 
 using ::mlir::stablehlo::DotGeneralOp;
 using ::stablehlo::quantization::Method;
+using ::stablehlo::quantization::StaticRangePtq;
 
 // Whether it represents a lifted function (i.e. `op` is the corresponding
 // `XlaCallModuleOp`) that is explicitly marked `NoQuantization`.
@@ -61,6 +62,31 @@ bool IsDenylistedLiftedFunction(Operation* op) {
   return false;
 }
 
+// Populates `spec.coeff_op_quant_dim` according to `xla_call_module_op`'s
+// `_quantization_method` attribute. If there is an input `QuantizedType` with
+// `dimension_specs` set, which represents the quantization dimension for the
+// input, then the corresponding operand index -> quantization dimension mapping
+// is set for `spec`.
+// TODO: b/323478683 - Duplicate tracking of config will be eliminated.
+// `OpQuantSpec` will be deprecated and `Method` will be used instead.
+void PopulateCoeffOpQuantDimIfPerChannelQuantized(
+    TF::XlaCallModuleOp xla_call_module_op, OpQuantSpec& spec) {
+  absl::StatusOr<Method> method = GetQuantizationMethod(xla_call_module_op);
+  if (method.ok() && method->has_static_range_ptq()) {
+    // TODO: b/331145946 - Use `Method` accessors.
+    const StaticRangePtq& static_range_ptq_spec = method->static_range_ptq();
+    // Look for quantized dimension specs for each quantized type and
+    // populate `coeff_op_quant_dim`.
+    for (const auto& [operand_idx, quantized_type] :
+         static_range_ptq_spec.input_quantized_types()) {
+      if (quantized_type.has_dimension_specs()) {
+        spec.coeff_op_quant_dim[operand_idx] =
+            quantized_type.dimension_specs().dimension();
+      }
+    }
+  }
+}
+
 }  // namespace
 
 std::unique_ptr<OpQuantSpec> GetStableHloOpQuantSpec(Operation* op) {
@@ -72,8 +98,12 @@ std::unique_ptr<OpQuantSpec> GetStableHloOpQuantSpec(Operation* op) {
     if (!function_name.starts_with("composite_")) {
       return spec;
     }
+
     if (function_name.contains("conv")) {
-      spec->coeff_op_quant_dim[1] = 3;
+      // Looks up `Method` to see if it should be per-channel quantized and
+      // populates the spec accordingly.
+      PopulateCoeffOpQuantDimIfPerChannelQuantized(call_op, *spec);
+
       if (function_name.contains("with_bias")) {
         spec->biases_params[2] = {{0, 1},
                                   quant::GetUniformQuantizedTypeForBias};
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec_test.cc
index 39baea749992d1..b3ba4818284498 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec_test.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec_test.cc
@@ -15,14 +15,18 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h"
 
+#include <memory>
+
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/strings/string_view.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/OwningOpRef.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
+#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h"
 #include "tensorflow/compiler/mlir/quantization/common/test_base.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "tensorflow/core/platform/test.h"
@@ -30,7 +34,10 @@ limitations under the License.
 namespace mlir::quant::stablehlo {
 namespace {
 
+using ::testing::IsEmpty;
 using ::testing::NotNull;
+using ::testing::Pair;
+using ::testing::UnorderedElementsAre;
 
 using IsOpQuantizableStableHloTest = ::mlir::quant::QuantizationTestBase;
 
@@ -208,5 +215,74 @@ TEST_F(IsOpQuantizableStableHloTest, DenylistedXlaCallModuleOpNotQuantizable) {
   EXPECT_FALSE(IsOpQuantizableStableHlo(xla_call_module_op));
 }
 
+using GetStableHloOpQuantSpecTest = ::mlir::quant::QuantizationTestBase;
+
+TEST_F(GetStableHloOpQuantSpecTest,
+       EmptyCoeffOpQuantDimForPerTensorQuantizedConvolution) {
+  // A `TF::XlaCallModuleOp` with `_quantization_method = "static_range_ptq
+  // {}"`, representing a per-tensor static-range PTQ quantization.
+  constexpr absl::string_view
+      kXlaCallModuleOpWithPerTensorQuantizedConvolution = R"mlir(
+    func.func @main(%arg0: tensor<1x1x3xf32>, %arg1: tensor<3x4xf32>) -> tensor<1x1x4xf32> {
+      %0 = "tf.XlaCallModule"(%arg0, %arg1) <{Sout = [#tf_type.shape<1x1x4>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}>
+          {
+            _entry_function = @composite_conv_fn_1,
+            _original_entry_function = "composite_conv_fn_1",
+            _quantization_method = "static_range_ptq {}",
+            _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true},
+            _tfl_quant_trait = "fully_quantizable"
+          } : (tensor<1x1x3xf32>, tensor<3x4xf32>) -> tensor<1x1x4xf32>
+      return %0 : tensor<1x1x4xf32>
+    }
+  )mlir";
+
+  const OwningOpRef<ModuleOp> module_op =
+      ParseModuleOpString(kXlaCallModuleOpWithPerTensorQuantizedConvolution);
+  ASSERT_TRUE(module_op);
+
+  const FailureOr<TF::XlaCallModuleOp> xla_call_module_op =
+      FindFirstOpFromMainFunc<TF::XlaCallModuleOp>(*module_op);
+  ASSERT_TRUE(succeeded(xla_call_module_op));
+
+  const std::unique_ptr<OpQuantSpec> op_quant_spec =
+      GetStableHloOpQuantSpec(*xla_call_module_op);
+  ASSERT_THAT(op_quant_spec, NotNull());
+
+  EXPECT_THAT(op_quant_spec->coeff_op_quant_dim, IsEmpty());
+}
+
+TEST_F(GetStableHloOpQuantSpecTest,
+       EmptyCoeffOpQuantDimForPerChannelQuantizedConvolution) {
+  constexpr absl::string_view
+      kXlaCallModuleOpWithPerChannelQuantizedConvolution = R"mlir(
+    func.func @main(%arg0: tensor<1x1x3xf32>, %arg1: tensor<3x4xf32>) -> tensor<1x1x4xf32> {
+      %0 = "tf.XlaCallModule"(%arg0, %arg1) <{Sout = [#tf_type.shape<1x1x4>], dim_args_spec = [], disabled_checks = [], function_list = [], has_token_input_output = false, module = "", platforms = ["CPU"], version = 9 : i64}>
+          {
+            _entry_function = @composite_conv_fn_1,
+            _original_entry_function = "composite_conv_fn_1",
+            _quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}",
+            _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true},
+            _tfl_quant_trait = "fully_quantizable"
+          } : (tensor<1x1x3xf32>, tensor<3x4xf32>) -> tensor<1x1x4xf32>
+      return %0 : tensor<1x1x4xf32>
+    }
+  )mlir";
+
+  const OwningOpRef<ModuleOp> module_op =
+      ParseModuleOpString(kXlaCallModuleOpWithPerChannelQuantizedConvolution);
+  ASSERT_TRUE(module_op);
+
+  const FailureOr<TF::XlaCallModuleOp> xla_call_module_op =
+      FindFirstOpFromMainFunc<TF::XlaCallModuleOp>(*module_op);
+  ASSERT_TRUE(succeeded(xla_call_module_op));
+
+  const std::unique_ptr<OpQuantSpec> op_quant_spec =
+      GetStableHloOpQuantSpec(*xla_call_module_op);
+  ASSERT_THAT(op_quant_spec, NotNull());
+
+  EXPECT_THAT(op_quant_spec->coeff_op_quant_dim,
+              UnorderedElementsAre(Pair(1, 3)));
+}
+
 }  // namespace
 }  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.cc
index b07b833429f8b6..33d66316870798 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.cc
@@ -26,8 +26,7 @@ namespace mlir::quant::stablehlo {
 void AddQuantizationLoweringPasses(mlir::OpPassManager& pm) {
   // These passes are grouped together and must run in this specific order.
   pm.addNestedPass<mlir::func::FuncOp>(CreateConvertTFQuantOpsToMHLOPass());
-  pm.addNestedPass<mlir::func::FuncOp>(mhlo::createChloLegalizeToHloPass(
-      /*legalizeBroadcasts=*/true, /*expandCompositions=*/false));
+  pm.addNestedPass<mlir::func::FuncOp>(mhlo::createChloLegalizeToHloPass());
   pm.addNestedPass<mlir::func::FuncOp>(mlir::createCanonicalizerPass());
   pm.addNestedPass<mlir::func::FuncOp>(
       mhlo::createMhloQuantLegalizeToIntPass());
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc
new file mode 100644
index 00000000000000..5be09ce2ad47ef
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc
@@ -0,0 +1,288 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdint>
+#include <optional>
+#include <utility>
+
+#include "absl/base/nullability.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
+#include "mlir/IR/IRMapping.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/OpDefinition.h"  // from @llvm-project
+#include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/IR/Value.h"  // from @llvm-project
+#include "mlir/Pass/Pass.h"  // from @llvm-project  // IWYU pragma: keep
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
+#include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
+#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/permutation.h"
+
+namespace mlir::quant::stablehlo {
+
+#define GEN_PASS_DEF_DEFERACTIVATIONTRANSPOSEPASS
+#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h.inc"
+
+namespace {
+
+using ::mlir::stablehlo::AddOp;
+using ::mlir::stablehlo::BroadcastInDimOp;
+using ::mlir::stablehlo::MaxOp;
+using ::mlir::stablehlo::TransposeOp;
+
+// Returns `success()` if `op` is a `TransposeOp` with permutation attribute
+// equivalent to `permuation`.
+LogicalResult IsTransposeOpWithPermuation(absl::Nullable<Operation*> op,
+                                          const ArrayRef<int64_t> permutation) {
+  auto transpose_op = dyn_cast_or_null<TransposeOp>(op);
+  return success(transpose_op != nullptr && transpose_op.getPermutation() ==
+                                                ArrayRef<int64_t>(permutation));
+}
+
+// Convenience function to create a `TransposeOp` with a given `permutation`.
+// The Location is set as `input`'s loc.
+TransposeOp CreateTransposeOp(Value input, const ArrayRef<int64_t> permutation,
+                              PatternRewriter& rewriter) {
+  return rewriter.create<TransposeOp>(
+      input.getLoc(), input, rewriter.getDenseI64ArrayAttr(permutation));
+}
+
+// Defers the transpose of the left-hand side (LHS) to the right-hand side and
+// the result of a binary operation. In detail, this rewrites the
+// `op(transpose(%rhs), %lhs)` to `transpose(op(%rhs, transpose(%lhs)))`. The
+// LHS transpose permutation must be a NCHW->NHWC permutation.
+template <typename OpT>
+void DeferRhsTransposeForBinaryOp(OpT op, PatternRewriter& rewriter) {
+  auto transpose_op = cast<TransposeOp>(op.getOperand(0).getDefiningOp());
+  Value lhs_pre_transpose = transpose_op.getOperand();
+
+  // NCHW -> NHWC for the right-hand side, to match the operand's shape.
+  Value rhs = op.getOperand(1);
+  TransposeOp rhs_transpose_op = CreateTransposeOp(
+      /*input=*/rhs, kNchwToNhwcPermutation, rewriter);
+
+  auto new_binary_op =
+      rewriter.create<OpT>(op.getLoc(), lhs_pre_transpose, rhs_transpose_op);
+
+  // NHWC -> NCHW for the output, to match the shapes of `op`'s users.
+  TransposeOp output_transpose_op = CreateTransposeOp(
+      /*input=*/new_binary_op, kNhwcToNchwPermutation, rewriter);
+
+  rewriter.replaceAllUsesWith(op.getResult(), output_transpose_op);
+}
+
+// "Climbs up" the `op` if `op` is a `BraodcastInDimOp` and returns the defining
+// op of its operand. Returns `op` otherwise. May return `nullptr` when the
+// `BroadcastInDimOp`'s operand is a block argument.
+absl::Nullable<Operation*> SkipUpwardsOptionalBroadcastInDimOp(
+    absl::Nonnull<Operation*> op) {
+  if (auto broadcast_in_dim_op = dyn_cast_or_null<BroadcastInDimOp>(op);
+      broadcast_in_dim_op != nullptr) {
+    return broadcast_in_dim_op.getOperand().getDefiningOp();
+  }
+  return op;
+}
+
+class DeferActivationTransposeForAddOp : public OpRewritePattern<AddOp> {
+ public:
+  using OpRewritePattern<AddOp>::OpRewritePattern;
+
+  LogicalResult match(AddOp op) const override {
+    // Only supports the case for 2D convolution.
+    const Value lhs = op.getOperand(0);
+    if (!HasRankOf(lhs, /*rank=*/4)) return failure();
+
+    const Value rhs = op.getOperand(1);
+    Operation* rhs_op = rhs.getDefiningOp();
+    if (rhs_op == nullptr) return failure();
+
+    // Ignore the optional `BroadcastInDimOp` in between the constant and RHS.
+    rhs_op = SkipUpwardsOptionalBroadcastInDimOp(rhs_op);
+
+    if (rhs_op == nullptr || !rhs_op->hasTrait<OpTrait::ConstantLike>()) {
+      return failure();
+    }
+
+    // Match LHS permutation that converts: NHWC -> NCHW.
+    return IsTransposeOpWithPermuation(lhs.getDefiningOp(),
+                                       kNhwcToNchwPermutation);
+  }
+
+  void rewrite(AddOp op, PatternRewriter& rewriter) const override {
+    DeferRhsTransposeForBinaryOp(op, rewriter);
+  }
+};
+
+// Rewrites the `reduce_window(transpose(%activation), %init_value)` patterns to
+// `transpose(reduce_window(%activation), %init_value)`, deferring the transpose
+// to the result. The reduce function should be equivalent to
+// `stablehlo.maximum`, representing max pooling.
+class DeferActivationTransposeForMaxPoolReduceWindowOp
+    : public OpRewritePattern<mlir::stablehlo::ReduceWindowOp> {
+ public:
+  using OpRewritePattern<mlir::stablehlo::ReduceWindowOp>::OpRewritePattern;
+
+  LogicalResult match(mlir::stablehlo::ReduceWindowOp op) const override {
+    if (failed(MatchMaxPoolReduceWindowOp(op))) return failure();
+
+    // Match only when the lhs is connected to a transpose.
+    // Only supports the case commonly appearing for 2D convolutions.
+    Value lhs = op.getOperand(0);
+    if (!HasRankOf(lhs, /*rank=*/4)) return failure();
+
+    // Match input permutation that converts: NHWC -> NCHW.
+    return IsTransposeOpWithPermuation(lhs.getDefiningOp(),
+                                       kNhwcToNchwPermutation);
+  }
+
+  // Pushes the transpose op at the input to the result.
+  void rewrite(mlir::stablehlo::ReduceWindowOp op,
+               PatternRewriter& rewriter) const override {
+    auto transpose_op = cast<TransposeOp>(op.getOperand(0).getDefiningOp());
+
+    const auto result_type = op.getResult(0).getType().cast<TensorType>();
+    const SmallVector<int64_t> new_result_shape =
+        Permute<int64_t>(result_type.getShape(), kNchwToNhwcPermutation);
+
+    const TensorType new_result_type =
+        result_type.cloneWith(new_result_shape, result_type.getElementType());
+
+    // Create a new `stablehlo.reduce_window` with all relevant attributes
+    // permutated to match the new operand & result type.
+    auto new_reduce_window_op =
+        rewriter.create<mlir::stablehlo::ReduceWindowOp>(
+            op.getLoc(), new_result_type, transpose_op.getOperand(),
+            /*init_value=*/op.getOperand(1),
+            /*window_dimensions=*/
+            PermuteI64ArrayAttr(rewriter, op.getWindowDimensionsAttr(),
+                                kNchwToNhwcPermutation),
+            /*window_strides=*/
+            PermuteI64ArrayAttr(rewriter, op.getWindowStridesAttr(),
+                                kNchwToNhwcPermutation),
+            /*base_dilations=*/
+            PermuteI64ArrayAttr(rewriter, op.getBaseDilationsAttr(),
+                                kNchwToNhwcPermutation),
+            /*window_dilations=*/
+            PermuteI64ArrayAttr(rewriter, op.getWindowDilationsAttr(),
+                                kNchwToNhwcPermutation),
+            /*padding=*/DenseIntElementsAttr(nullptr));
+
+    // Clone the reduce body. It is not affected by the permutation.
+    IRMapping mapping;
+    op.getBody().cloneInto(&new_reduce_window_op.getBody(), mapping);
+
+    // Introduce a transpose to the result to match the shapes of `op`'s uses.
+    TransposeOp result_transpose_op = CreateTransposeOp(
+        /*input=*/new_reduce_window_op.getResult(0), kNhwcToNchwPermutation,
+        rewriter);
+
+    rewriter.replaceAllUsesWith(op.getResult(0), result_transpose_op);
+  }
+
+ private:
+  // Permutes `array_attr` with `permutation`. The number of elements in
+  // `array_attr` and `permutation` must be equal. Returns a null attribute
+  // if `array_attr` is null.
+  DenseI64ArrayAttr PermuteI64ArrayAttr(
+      PatternRewriter& rewriter, const DenseI64ArrayAttr array_attr,
+      const ArrayRef<int64_t> permutation) const {
+    if (array_attr == nullptr) return DenseI64ArrayAttr(nullptr);
+
+    return rewriter.getDenseI64ArrayAttr(
+        Permute<int64_t>(array_attr, permutation));
+  }
+
+  LogicalResult MatchMaxPoolReduceWindowOp(
+      mlir::stablehlo::ReduceWindowOp op) const {
+    // TODO: b/321099943 - Support explicit padding.
+    if (HasPadding(op)) return failure();
+
+    // Check that the reduce-window body is a max operation.
+    return success(IsMaxFunction(op.getBody().front()));
+  }
+
+  // Whether `block` semantically corresponds to a `stablehlo.maximum` op.
+  bool IsMaxFunction(Block& block) const {
+    if (block.getNumArguments() != 2) return false;
+
+    auto return_op = cast<mlir::stablehlo::ReturnOp>(block.getTerminator());
+    if (return_op.getNumOperands() != 1) return false;
+
+    auto max_op = dyn_cast_or_null<MaxOp>(
+        return_op.getOperands().front().getDefiningOp());
+    if (!max_op) return false;
+
+    return (max_op.getLhs() == block.getArgument(0)) &&
+           (max_op.getRhs() == block.getArgument(1));
+  }
+
+  // Whether `op` has the `padding` attribute (which is optional).
+  bool HasPadding(mlir::stablehlo::ReduceWindowOp op) const {
+    return op.getPadding() != std::nullopt;
+  }
+};
+
+// Rewrites `maximum(transpose(%rhs), %lhs)` patterns to
+// `transpose(maximum(%rhs, transpose(%lhs)))`.
+class DeferActivationTransposeForMaxOp : public OpRewritePattern<MaxOp> {
+ public:
+  using OpRewritePattern<MaxOp>::OpRewritePattern;
+
+  LogicalResult match(MaxOp op) const override {
+    Value input = op.getOperand(0);
+    if (!HasRankOf(input, /*rank=*/4)) return failure();
+
+    const Value max_value = op.getOperand(1);
+    Operation* max_value_op = max_value.getDefiningOp();
+    if (max_value_op == nullptr ||
+        !max_value_op->hasTrait<OpTrait::ConstantLike>()) {
+      return failure();
+    }
+
+    return IsTransposeOpWithPermuation(input.getDefiningOp(),
+                                       kNhwcToNchwPermutation);
+  }
+
+  void rewrite(MaxOp op, PatternRewriter& rewriter) const override {
+    DeferRhsTransposeForBinaryOp(op, rewriter);
+  }
+};
+
+}  // namespace
+
+class DeferActivationTransposePass
+    : public impl::DeferActivationTransposePassBase<
+          DeferActivationTransposePass> {
+ private:
+  void runOnOperation() override;
+};
+
+void DeferActivationTransposePass::runOnOperation() {
+  func::FuncOp func_op = getOperation();
+  MLIRContext& ctx = getContext();
+
+  RewritePatternSet patterns(&ctx);
+  patterns.add<DeferActivationTransposeForAddOp,
+               DeferActivationTransposeForMaxPoolReduceWindowOp,
+               DeferActivationTransposeForMaxOp>(&ctx);
+  if (failed(applyPatternsAndFoldGreedily(func_op, std::move(patterns)))) {
+    func_op->emitWarning() << "Failed to converge patterns: " << getArgument();
+  }
+}
+
+}  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose_pass.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc
similarity index 92%
rename from tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose_pass.cc
rename to tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc
index 52a101b997ad89..051745c0d6792b 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose_pass.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
 #include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo  // IWYU pragma: keep
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/permutation.h"
 
 namespace mlir::quant::stablehlo {
 
@@ -53,17 +54,6 @@ int64_t GetContiguousOffset(const ArrayRef<int64_t> indices,
   return contiguous_offset;
 }
 
-// Permutes `values` with `permutation`. Returns the permuted values. Sizes of
-// `values` and `permutation` must be equal.
-SmallVector<int64_t> Permute(const ArrayRef<int64_t> values,
-                             const ArrayRef<int64_t> permutation) {
-  SmallVector<int64_t> permuted_values(/*Size=*/values.size(), /*Value=*/0);
-  for (auto [i, permutation_idx] : llvm::enumerate(permutation)) {
-    permuted_values[i] = values[permutation_idx];
-  }
-  return permuted_values;
-}
-
 // Performs transposition of a tensor represented as a contiguous element array.
 // Assumes row-major order. The shape of the input tensor and the desired
 // permutation is registered during construction, and calling `TransposeValues`
@@ -74,7 +64,7 @@ class DenseElementsTransposer {
                           const ArrayRef<int64_t> permutation)
       : rank_(original_shape.size()),
         original_shape_(original_shape),
-        target_shape_(Permute(original_shape, permutation)),
+        target_shape_(Permute<int64_t>(original_shape, permutation)),
         permutation_(permutation) {}
 
   // Transposes `values` with the permutation. Returns the transposed values.
@@ -102,7 +92,7 @@ class DenseElementsTransposer {
           GetContiguousOffset(current_indices, original_shape_);
 
       const SmallVector<int64_t> target_indices =
-          Permute(current_indices, permutation_);
+          Permute<int64_t>(current_indices, permutation_);
       const int64_t target_index =
           GetContiguousOffset(target_indices, target_shape_);
 
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize_hybrid.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_weight_param.cc
similarity index 90%
rename from tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize_hybrid.cc
rename to tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_weight_param.cc
index 77a389398270a9..9fb1e9e985d15e 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize_hybrid.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_weight_param.cc
@@ -47,20 +47,21 @@ limitations under the License.
 
 namespace mlir::quant::stablehlo {
 
-#define GEN_PASS_DEF_PREPAREQUANTIZEHYBRIDPASS
+#define GEN_PASS_DEF_INSERTWEIGHTPARAMPASS
 #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h.inc"
 
 namespace {
 
-// Prepare hybrid quantization for weight-only quantization and dynamic range
-// quantization of `stablehlo.convolution` and `stablehlo.dot_general`.
-class PrepareQuantizeHybridPass
-    : public impl::PrepareQuantizeHybridPassBase<PrepareQuantizeHybridPass> {
+// Inserts quantization parameters of weights for weight-only quantization and
+// dynamic range quantization of `stablehlo.convolution` and
+// `stablehlo.dot_general`.
+class InsertWeightParamPass
+    : public impl::InsertWeightParamPassBase<InsertWeightParamPass> {
  public:
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PrepareQuantizeHybridPass)
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(InsertWeightParamPass)
 
-  using impl::PrepareQuantizeHybridPassBase<
-      PrepareQuantizeHybridPass>::PrepareQuantizeHybridPassBase;
+  using impl::InsertWeightParamPassBase<
+      InsertWeightParamPass>::InsertWeightParamPassBase;
 
  private:
   void runOnOperation() override;
@@ -96,7 +97,8 @@ class InsertWeightParamPattern
       return false;
     }
     Operation* user = operand.getOwner();
-    if (auto call_op = cast<TF::XlaCallModuleOp>(user)) {
+    if (isa<TF::XlaCallModuleOp>(user)) {
+      auto call_op = cast<TF::XlaCallModuleOp>(user);
       const StringRef function_name = GetEntryFunctionName(call_op);
       const bool is_conv_or_dot = function_name.contains("conv") ||
                                   function_name.contains("dot_general");
@@ -134,7 +136,7 @@ class InsertWeightParamPattern
   }
 };
 
-void PrepareQuantizeHybridPass::runOnOperation() {
+void InsertWeightParamPass::runOnOperation() {
   func::FuncOp func = getOperation();
   MLIRContext* context = func.getContext();
   RewritePatternSet patterns(context);
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc
index 13fe29fe787324..a4bf42ec6f8eba 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "absl/strings/str_replace.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/Attributes.h"  // from @llvm-project
 #include "mlir/IR/Builders.h"  // from @llvm-project
@@ -76,6 +77,12 @@ bool FloatValueEquals(const Attribute& attr, const double value) {
   });
 }
 
+inline void TrimTrailingWhitespaces(std::string& str) {
+  while (!str.empty() && str.back() == ' ') {
+    str.pop_back();
+  }
+}
+
 // Lifts quantizable units as separate functions, thereby identifying the
 // boundaries of quantizable subgraphs. `QuantizationSpecs` influences how
 // quantizable units are lifted.
@@ -146,16 +153,22 @@ class FunctionNameMatcher {
   std::unique_ptr<RE2> match_regex_;  // NOLINT
 };
 
-// Converts `Method` to text proto representation. All newline characters are
-// removed.
+// Converts `Method` to a single-line textproto representation. Returns
+// `failure()` when converting to textproto failed.
 FailureOr<std::string> QuantizationMethodToTextProto(const Method& method) {
+  TextFormat::Printer printer;
+  printer.SetSingleLineMode(true);
+
   std::string method_txtpb;
-  if (!TextFormat::PrintToString(method, &method_txtpb)) {
+  if (!printer.PrintToString(method, &method_txtpb)) {
+    LLVM_DEBUG(llvm::dbgs() << "Failed to convert Method to textproto\n.");
     return failure();
   }
 
-  // Remove newlines.
-  absl::StrReplaceAll({{"\n", ""}}, &method_txtpb);
+  // Single line mode might have an extra space at the end, due to the internal
+  // details of `Printer`.
+  TrimTrailingWhitespaces(method_txtpb);
+
   return method_txtpb;
 }
 
@@ -168,11 +181,6 @@ LogicalResult ApplyQuantizationSpec(const QuantizationSpec& spec,
   if (!main_func) return failure();
 
   const Method& quantization_method = spec.method();
-  if (!quantization_method.has_no_quantization()) {
-    module_op->emitError() << "Unsupported quantization method: "
-                           << quantization_method.DebugString() << "\n";
-    return failure();
-  }
 
   FailureOr<std::string> quantization_method_txtpb =
       QuantizationMethodToTextProto(quantization_method);
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_simple.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_simple.td
index 07598356cce7d3..eaa8a9092f41f2 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_simple.td
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_simple.td
@@ -67,3 +67,11 @@ def LiftGather : Pat<
       (NamedAttr<"slice_sizes"> $slice_sizes),
       (NamedAttr<"indices_are_sorted"> (DefaultOrNullAttr $indices_are_sorted)))),
   [(IsNotInLiftedFunc $res), (IsStableHLOConstantOp $operand)], [], (addBenefit 1)>;
+
+def LiftAdd : Pat<
+  (StableHLO_AddOp:$res
+      $lhs, $rhs),
+  (LiftAsTFXlaCallModule<"composite_add_fn">
+    (ArgumentList $lhs, $rhs),
+    (ResultList $res)),
+  [(IsNotInLiftedFunc $res), (IsNotInStableHloOpRegion $res)], [], (addBenefit 1)>;
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc
index 5ba80df30a9f2d..521f701598fb0a 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
 #include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h"
 #include "tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/permutation.h"
 
 namespace mlir::quant::stablehlo {
 
@@ -72,20 +73,20 @@ class RewriteNchwConvolutionToNhwc
     // Transpose the input tensor: [b, f, 0, 1] => [b, 0, 1, f]
     Value input = op->getOperand(0);
     const TensorType new_input_tensor_type = GetTransposedTensorType(
-        input.getType().cast<TensorType>(), kActivationPermutation);
+        input.getType().cast<TensorType>(), kNchwToNhwcPermutation);
 
     auto input_transpose_op = rewriter.create<mlir::stablehlo::TransposeOp>(
         op.getLoc(), /*resultType0=*/new_input_tensor_type, /*operand=*/input,
-        rewriter.getDenseI64ArrayAttr(kActivationPermutation));
+        rewriter.getDenseI64ArrayAttr(kNchwToNhwcPermutation));
 
     // Transpose the filter tensor: [o, i, 0, 1] => [0, 1, i, o]
     Value filter = op->getOperand(1);
     const TensorType new_filter_tensor_type = GetTransposedTensorType(
-        filter.getType().cast<TensorType>(), kFilterPermutation);
+        filter.getType().cast<TensorType>(), kOihwToHwioPermutation);
 
     auto filter_transpose_op = rewriter.create<mlir::stablehlo::TransposeOp>(
         op.getLoc(), /*resultType0=*/new_filter_tensor_type, /*operand=*/filter,
-        rewriter.getDenseI64ArrayAttr(kFilterPermutation));
+        rewriter.getDenseI64ArrayAttr(kOihwToHwioPermutation));
 
     // [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f]
     const auto new_dimension_nums = rewriter.getAttr<ConvDimensionNumbersAttr>(
@@ -99,7 +100,7 @@ class RewriteNchwConvolutionToNhwc
     // Determine the shape of the output tensor: [b, f, 0, 1] => [b, 0, 1, f]
     auto output_tensor_type = op->getResult(0).getType().cast<TensorType>();
     const TensorType new_conv_output_tensor_type =
-        GetTransposedTensorType(output_tensor_type, kOutputPermutation);
+        GetTransposedTensorType(output_tensor_type, kNchwToNhwcPermutation);
 
     // window_strides, padding, lhs_dilation, rhs_dilation, window_reversal are
     // reused without modification because the ordering of spatial dimensions
@@ -125,31 +126,12 @@ class RewriteNchwConvolutionToNhwc
     auto output_transpose_op = rewriter.create<mlir::stablehlo::TransposeOp>(
         new_convolution_op.getLoc(), /*resultType0=*/output_tensor_type,
         /*operand=*/new_convolution_op,
-        rewriter.getDenseI64ArrayAttr(kOutputReversePermutation));
+        rewriter.getDenseI64ArrayAttr(kNhwcToNchwPermutation));
 
     rewriter.replaceAllUsesWith(op, output_transpose_op);
   }
 
  private:
-  // Permutation to transpose the input tensor from [b, f, 0, 1] to
-  // [b, 0, 1, f].
-  static constexpr std::array<int64_t, 4> kActivationPermutation = {0, 2, 3, 1};
-
-  // Permutation to transpose the filter tensor from [o, i, 0, 1] to
-  // [0, 1, i, o].
-  static constexpr std::array<int64_t, 4> kFilterPermutation = {2, 3, 1, 0};
-
-  // Permutation to transpose the output tensor from [b, f, 0, 1] to
-  // [b, 0, 1, f]. This is used to determine the shape of the new
-  // `ConvolutionOp`'s output tensor.
-  static constexpr std::array<int64_t, 4> kOutputPermutation = {0, 2, 3, 1};
-
-  // Permutation to transpose the output tensor from [b, 0, 1, f] to
-  // [b, f, 0, 1]. This is used to revert the new output tensor of
-  // `ConvolutionOp` with a `TransposeOp`.
-  static constexpr std::array<int64_t, 4> kOutputReversePermutation = {0, 3, 1,
-                                                                       2};
-
   // Matches input dimensions corresponding to: [b, f, 0, 1].
   bool MatchInputDimensionNumbers(
       const ConvDimensionNumbersAttr dimension_numbers) const {
@@ -183,21 +165,9 @@ class RewriteNchwConvolutionToNhwc
   TensorType GetTransposedTensorType(
       const TensorType type, const ArrayRef<int64_t> permutation) const {
     const SmallVector<int64_t> after_shape =
-        PermuteShape(type.getShape(), permutation);
+        Permute<int64_t>(type.getShape(), permutation);
     return type.cloneWith(after_shape, type.getElementType());
   }
-
-  // Permutes the shape according to the permutation. The size of `shape` and
-  // `permutation` should be equal.
-  SmallVector<int64_t> PermuteShape(const ArrayRef<int64_t> shape,
-                                    const ArrayRef<int64_t> permutation) const {
-    const int64_t size = shape.size();
-    SmallVector<int64_t, 4> after_shape(size);
-    for (int i = 0; i < size; ++i) {
-      after_shape[i] = shape[permutation[i]];
-    }
-    return after_shape;
-  }
 };
 
 }  // namespace
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td
index fb3f5fcb0a21c3..63f6f822dbebdf 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td
@@ -60,13 +60,17 @@ def QuantizeCompositeFunctionsPass : Pass<"stablehlo-quantize-composite-function
         "enable-per-channel-quantized-weight",
         "bool", /*default=*/"true",
         "Whether to enable per-channel quantized weights.">,
+    Option<"enable_full_int_quantization_",
+        "enable-full-int-quantization",
+        "bool", /*default=*/"false",
+        "Whether to enable full int quantization, including non compute-heavy ops.">,
     Option<"mlir_dump_file_name_", "mlir-dump-file-name",
         "std::optional<std::string>", /*default=*/"std::nullopt",
         "MLIR dump file name.">,
     Option<"enable_weight_only_",
         "enable-weight-only",
         "bool", /*default=*/"false",
-        "Whether to produce weight-only quantized op for dot_general op.">,
+        "Whether to produce weight-only quantized op for convolution and dot_general op.">,
   ];
   let dependentDialects = [
     "mlir::arith::ArithDialect",
@@ -102,10 +106,14 @@ def QuantizePass : Pass<"stablehlo-quantize", "mlir::ModuleOp"> {
         "enable-per-channel-quantized-weight",
         "bool", /*default=*/"true",
         "Whether to enable per-channel quantized weights.">,
+    Option<"enable_full_int_quantization_",
+      "enable-full-int-quantization",
+      "bool", /*default=*/"false",
+      "Whether to apply full int quantization, including non compute-heavy ops.">,
     Option<"enable_weight_only_",
         "enable-weight-only",
         "bool", /*default=*/"false",
-        "Whether to produce weight-only quantized op for dot_general op.">,
+        "Whether to produce weight-only quantized op for convolution and dot_general op.">,
   ];
   let dependentDialects = [
     "mlir::stablehlo::StablehloDialect",
@@ -162,8 +170,22 @@ def NchwConvolutionToNhwcPass : Pass<"stablehlo-nchw-convolution-to-nhwc", "mlir
   let dependentDialects = ["mlir::stablehlo::StablehloDialect"];
 }
 
-def PrepareQuantizeHybridPass : Pass<"stablehlo-prepare-quantize-hybrid", "mlir::func::FuncOp"> {
-  let summary = "Prepare hybrid quantization for weight-only quantization and dynamic range quantization.";
+def DeferActivationTransposePass : Pass<"stablehlo-defer-activation-transpose", "mlir::func::FuncOp"> {
+  let summary = "Merges stablehlo.transpose for activations.";
+  let description = [{
+    Defers activation transposes (e.g. LHS of `stablehlo.add`) to the output and
+    optionally inserts `stablehlo.transpose`s to match the shape of operands.
+    This is useful when recursively pushing down the extra `stablehlo.transpose`
+    inserted to activation tensors after running `NchwConvolutionToNhwcPass`.
+
+    Currently only converts limited cases that appear in NCHW->NHWC 2D
+    convolution conversion, to avoid introducing unwanted pessimizations.
+  }];
+  let dependentDialects = ["mlir::stablehlo::StablehloDialect"];
+}
+
+def InsertWeightParamPass : Pass<"stablehlo-insert-weight-param", "mlir::func::FuncOp"> {
+  let summary = "Insert quantization parameters of weights for weight-only quantization and dynamic range quantization.";
   let dependentDialects = [
       "mlir::stablehlo::StablehloDialect",
       "TF::TensorFlowDialect",
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc
index 72702621f6e8b4..10b15f1132fe62 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "absl/algorithm/container.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
@@ -49,9 +48,11 @@ limitations under the License.
 #include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo  // IWYU pragma: keep
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
 #include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h"
+#include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h"
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h"
 #include "tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 
@@ -73,6 +74,9 @@ using ::mlir::stablehlo::GatherOp;
 using ::mlir::stablehlo::GetDimensionSizeOp;
 using ::mlir::stablehlo::ReshapeOp;
 using ::mlir::stablehlo::UniformQuantizeOp;
+using ::stablehlo::quantization::Method;
+using ::stablehlo::quantization::QuantizedType;
+using ::stablehlo::quantization::StaticRangePtq;
 
 constexpr StringRef kCompositeFuncPrefix = "composite_";
 constexpr StringRef kQuantizedFuncPrefix = "quantized_";
@@ -139,22 +143,6 @@ Operation* GetBroadcastedUserOp(Operation* op) {
   return target_op;
 }
 
-// Checks if one of the inputs and outputs are quantized.
-bool HasQuantizedOperandOrOutput(Operation* call_op) {
-  SmallVector<Type> arg_types;
-  for (const Value arg : call_op->getOperands()) {
-    arg_types.push_back(arg.getType());
-  }
-
-  SmallVector<Type> output_types;
-  for (const Value output : call_op->getResults()) {
-    output_types.push_back(output.getType());
-  }
-
-  return absl::c_any_of(arg_types, IsQuantizedTensorType) &&
-         absl::c_any_of(output_types, IsQuantizedTensorType);
-}
-
 // Gets the corresponding quantized function name from the given function name.
 // Example: "composite_dot_general_fn_1" => "quantized_dot_general_fn"
 std::string GetQuantizedFunctionName(const StringRef func_name) {
@@ -170,7 +158,7 @@ std::string GetQuantizedFunctionName(const StringRef func_name) {
 // 3. It should also have the `kEntryFuncAttrName` attribute, which points to
 //    the function that `xla_call_module_op` represents.
 bool IsQuantizedXlaCallModuleOp(TF::XlaCallModuleOp xla_call_module_op) {
-  return HasQuantizedOperandOrOutput(xla_call_module_op) &&
+  return !IsOpNotQuantized(xla_call_module_op) &&
          xla_call_module_op->hasAttr(kQuantTraitAttrName) &&
          xla_call_module_op->hasAttr(kEntryFuncAttrName);
 }
@@ -287,6 +275,7 @@ class EntryFuncBodyQuantizationPattern {
 
   // Rewrites the `entry_func_op`'s body.
   virtual void rewrite(func::FuncOp entry_func_op,
+                       const Method& quantization_method,
                        PatternRewriter& rewriter) const = 0;
 };
 
@@ -417,63 +406,23 @@ void RewriteGemmStyleOp(func::FuncOp entry_func_op, PatternRewriter& rewriter,
   }
 }
 
-template <typename SingularOp>
-// Match for tensor manipulation op.
-LogicalResult MatchSingularOp(func::FuncOp entry_func_op) {
-  const auto op_iterator_range = entry_func_op.getOps<SingularOp>();
-  if (op_iterator_range.empty()) {
-    LLVM_DEBUG(llvm::dbgs() << "Function does not have "
-                            << SingularOp::getOperationName() << " op.\n");
-    return failure();
-  }
-  if (!isa<RankedTensorType>(
-          (*op_iterator_range.begin()).getResult().getType())) {
-    LLVM_DEBUG(llvm::dbgs() << SingularOp::getOperationName()
-                            << " op must have ranked tensor type.\n");
-    return failure();
-  }
-  return success();
-}
-
-template <typename SingularOp>
-void RewriteSingularOp(func::FuncOp entry_func_op, PatternRewriter& rewriter) {
-  SingularOp singular_op = *entry_func_op.getOps<SingularOp>().begin();
-
-  const Type operand_type = entry_func_op.getArgumentTypes()[0];
-  const Type func_result_type = entry_func_op.getResultTypes()[0];
-
-  // Get the quantized tensor manipulation op's output type and update.
-  Value singular_op_result = singular_op.getResult();
-  const auto singular_op_result_type =
-      singular_op_result.getType().cast<RankedTensorType>();
-  const ArrayRef<int64_t> singular_op_shape =
-      singular_op_result_type.getShape();
-  const TensorType new_singular_op_result_type =
-      singular_op_result_type.cloneWith(
-          singular_op_shape,
-          getElementTypeOrSelf(operand_type).cast<UniformQuantizedType>());
-  singular_op_result.setType(new_singular_op_result_type);
-
-  // Create requantization op and return.
-  rewriter.setInsertionPointAfter(singular_op);
-  CreateAndReturnUniformQuantizeOp(rewriter, *singular_op, entry_func_op,
-                                   func_result_type);
-}
-
 // Quantizes the entry function's body containing a `DotGeneralOp`.
 class QuantizeDotGeneralOpPattern : public EntryFuncBodyQuantizationPattern {
  public:
   explicit QuantizeDotGeneralOpPattern(
-      const bool enable_per_channel_quantized_weight)
+      const bool enable_per_channel_quantized_weight,
+      const bool enable_weight_only)
       : enable_per_channel_quantized_weight_(
-            enable_per_channel_quantized_weight) {}
+            enable_per_channel_quantized_weight),
+        enable_weight_only_(enable_weight_only) {}
 
   LogicalResult match(func::FuncOp entry_func_op) const override {
     return MatchGemmStyleOp<DotGeneralOp>(entry_func_op);
   }
 
-  void rewrite(func::FuncOp entry_func_op,
+  void rewrite(func::FuncOp entry_func_op, const Method& quantization_method,
                PatternRewriter& rewriter) const override {
+    if (enable_weight_only_) return;
     DotGeneralOp dot_general_op = *entry_func_op.getOps<DotGeneralOp>().begin();
     const bool should_quantize_per_channel =
         enable_per_channel_quantized_weight_ &&
@@ -483,44 +432,92 @@ class QuantizeDotGeneralOpPattern : public EntryFuncBodyQuantizationPattern {
   }
 
  private:
-  const bool enable_per_channel_quantized_weight_;
+  [[deprecated(
+      "Do not rely on this field for per-channel quantization. Use `Method` "
+      "instead.")]] const bool enable_per_channel_quantized_weight_;
+  // TODO: b/331510853 - Deprecate boolean flag and use `Method` to perform
+  // weight-only quantization.
+  const bool enable_weight_only_;
 };
 
 // Quantizes the entry function's body containing a `ConvolutionOp`.
 class QuantizeConvolutionOpPattern : public EntryFuncBodyQuantizationPattern {
  public:
   explicit QuantizeConvolutionOpPattern(
-      const bool enable_per_channel_quantized_weight)
+      const bool enable_per_channel_quantized_weight,
+      const bool enable_weight_only)
       : enable_per_channel_quantized_weight_(
-            enable_per_channel_quantized_weight) {}
+            enable_per_channel_quantized_weight),
+        enable_weight_only_(enable_weight_only) {}
 
   LogicalResult match(func::FuncOp entry_func_op) const override {
     return MatchGemmStyleOp<ConvolutionOp>(entry_func_op);
   }
 
-  void rewrite(func::FuncOp entry_func_op,
+  void rewrite(func::FuncOp entry_func_op, const Method& quantization_method,
                PatternRewriter& rewriter) const override {
-    RewriteGemmStyleOp<ConvolutionOp>(entry_func_op, rewriter,
-                                      enable_per_channel_quantized_weight_);
+    if (enable_weight_only_) return;
+    RewriteGemmStyleOp<ConvolutionOp>(
+        entry_func_op, rewriter,
+        enable_per_channel_quantized_weight_ &&
+            IsWeightPerChannelQuantized(quantization_method));
+  }
+
+  // Returns true if the quantization method indicates per-channel quantization
+  // for convolution weights. This method specifically matches a quantization
+  // dimension of 3 for the input index 1.
+  bool IsWeightPerChannelQuantized(const Method& quantization_method) const {
+    if (quantization_method.has_static_range_ptq()) {
+      const StaticRangePtq& static_range_ptq_spec =
+          quantization_method.static_range_ptq();
+
+      if (static_range_ptq_spec.input_quantized_types().contains(1)) {
+        const QuantizedType& weight_quantized_type =
+            static_range_ptq_spec.input_quantized_types().at(1);
+        return weight_quantized_type.dimension_specs().dimension() == 3;
+      }
+    }
+    return false;
   }
 
  private:
-  const bool enable_per_channel_quantized_weight_;
+  [[deprecated(
+      "Do not rely on this field for per-channel quantization. Use `Method` "
+      "instead.")]] const bool enable_per_channel_quantized_weight_;
+  // TODO: b/331510853 - Deprecate boolean flag and use `Method` to perform
+  // weight-only quantization.
+  const bool enable_weight_only_;
 };
 
-// Quantizes the entry function's body containing a `GatherOp`.
-class QuantizeGatherOpPattern : public EntryFuncBodyQuantizationPattern {
+template <typename SingularOpT>
+class QuantizeSingularOpPattern : public EntryFuncBodyQuantizationPattern {
  public:
-  explicit QuantizeGatherOpPattern(
-      const bool enable_per_channel_quantized_weight) {}
+  explicit QuantizeSingularOpPattern(
+      const bool enable_per_channel_quantized_weight,
+      const bool enable_weight_only) {}
 
   LogicalResult match(func::FuncOp entry_func_op) const override {
-    return MatchSingularOp<GatherOp>(entry_func_op);
+    const auto op_iterator_range = entry_func_op.getOps<SingularOpT>();
+    if (op_iterator_range.empty()) {
+      LLVM_DEBUG(llvm::dbgs() << "Function does not have "
+                              << SingularOpT::getOperationName() << " op.\n");
+      return failure();
+    }
+    if (!isa<RankedTensorType>(
+            (*op_iterator_range.begin()).getResult().getType())) {
+      LLVM_DEBUG(llvm::dbgs() << SingularOpT::getOperationName()
+                              << " op must have ranked tensor type.\n");
+      return failure();
+    }
+    return success();
   }
 
-  void rewrite(func::FuncOp entry_func_op,
+  void rewrite(func::FuncOp entry_func_op, const Method& quantization_method,
                PatternRewriter& rewriter) const override {
-    RewriteSingularOp<GatherOp>(entry_func_op, rewriter);
+    auto singular_op = *entry_func_op.getOps<SingularOpT>().begin();
+
+    Value singular_op_result = singular_op.getResult();
+    singular_op_result.setType(entry_func_op.getResultTypes()[0]);
   }
 };
 
@@ -528,14 +525,17 @@ class QuantizeGatherOpPattern : public EntryFuncBodyQuantizationPattern {
 // inputs and outputs of `xla_call_module_op` that are possibly quantized. It
 // signature (type) is reset to match that of `xla_call_module_op`.
 // `entry_func_body_quantization_pattern` rewrites the function's body, based on
-// the new signature.
+// the new signature. `quantization_method` specifies the quantization method
+// applied to the quantizable unit `xla_call_module_op` and its corresponding
+// function `entry_func_op`.
 void QuantizeEntryFuncOp(
     const MLIRContext& ctx, PatternRewriter& rewriter,
     const TF::XlaCallModuleOp xla_call_module_op, func::FuncOp entry_func_op,
-    const EntryFuncBodyQuantizationPattern& body_rewrite_pattern) {
+    const EntryFuncBodyQuantizationPattern& body_rewrite_pattern,
+    const Method& quantization_method) {
   SetQuantizedFunctionType(rewriter, entry_func_op, xla_call_module_op);
 
-  body_rewrite_pattern.rewrite(entry_func_op, rewriter);
+  body_rewrite_pattern.rewrite(entry_func_op, quantization_method, rewriter);
 
   // Rename the function to be clear that the function has been quantized.
   const std::string quantized_function_name =
@@ -549,13 +549,14 @@ void QuantizeEntryFuncOp(
 void ReplaceQuantizedXlaCallModuleOpWithQuantizedCallOp(
     const MLIRContext& ctx, PatternRewriter& rewriter,
     TF::XlaCallModuleOp xla_call_module_op,
-    const EntryFuncBodyQuantizationPattern& body_rewrite_pattern) {
+    const EntryFuncBodyQuantizationPattern& body_rewrite_pattern,
+    const Method& quantization_method) {
   const ModuleOp module_op = xla_call_module_op->getParentOfType<ModuleOp>();
   const SymbolTable symbol_table(module_op);
 
   func::FuncOp entry_func_op = GetEntryFuncOp(xla_call_module_op, symbol_table);
   QuantizeEntryFuncOp(ctx, rewriter, xla_call_module_op, entry_func_op,
-                      body_rewrite_pattern);
+                      body_rewrite_pattern, quantization_method);
 
   // Replace the XlaCallModuleOp with a new CallOp.
   rewriter.setInsertionPoint(xla_call_module_op);
@@ -581,10 +582,12 @@ template <typename FuncBodyRewritePatternT,
 class XlaCallModuleOpToCallOp : public OpRewritePattern<TF::XlaCallModuleOp> {
  public:
   explicit XlaCallModuleOpToCallOp(
-      MLIRContext& ctx, const bool enable_per_channel_quantized_weight)
+      MLIRContext& ctx, const bool enable_per_channel_quantized_weight,
+      const bool enable_weight_only)
       : OpRewritePattern<TF::XlaCallModuleOp>(&ctx),
         enable_per_channel_quantized_weight_(
-            enable_per_channel_quantized_weight) {}
+            enable_per_channel_quantized_weight),
+        enable_weight_only_(enable_weight_only) {}
 
   LogicalResult match(TF::XlaCallModuleOp op) const override {
     ModuleOp module_op = op->getParentOfType<ModuleOp>();
@@ -593,24 +596,44 @@ class XlaCallModuleOpToCallOp : public OpRewritePattern<TF::XlaCallModuleOp> {
     // Ignore unquantized ops.
     if (!IsQuantizedXlaCallModuleOp(op)) return failure();
 
+    // For weight-only quantization, op should be hybrid quantized.
+    if (enable_weight_only_ && !IsHybridQuantizedOp(op)) {
+      return failure();
+    }
+
     func::FuncOp entry_func_op = GetEntryFuncOp(op, symbol_table);
     if (!entry_func_op) {
       op->emitError("Failed to find a valid entry function.");
       return failure();
     }
-    return FuncBodyRewritePatternT(enable_per_channel_quantized_weight_)
+
+    return FuncBodyRewritePatternT(enable_per_channel_quantized_weight_,
+                                   enable_weight_only_)
         .match(entry_func_op);
   }
 
   void rewrite(TF::XlaCallModuleOp xla_call_module_op,
                PatternRewriter& rewriter) const override {
+    // TODO: b/331145946 - Each quantization method should be valid
+    // (GetQuantizationMethodOrDefault swallows invalid method attribute). Check
+    // the validity in `match()`. Use accessors to achieve this.
+    const Method quantization_method =
+        GetQuantizationMethodOrDefault(xla_call_module_op);
+
     ReplaceQuantizedXlaCallModuleOpWithQuantizedCallOp(
         *rewriter.getContext(), rewriter, xla_call_module_op,
-        FuncBodyRewritePatternT(enable_per_channel_quantized_weight_));
+        FuncBodyRewritePatternT(enable_per_channel_quantized_weight_,
+                                enable_weight_only_),
+        quantization_method);
   }
 
  private:
-  const bool enable_per_channel_quantized_weight_;
+  [[deprecated(
+      "Do not rely on this field for per-channel quantization. Use `Method` "
+      "instead.")]] const bool enable_per_channel_quantized_weight_;
+  // TODO: b/331510853 - Deprecate boolean flag and use `Method` to perform
+  // weight-only quantization.
+  const bool enable_weight_only_;
 };
 
 // Quantizes op with regions such as stablehlo.reduce_window op.
@@ -620,7 +643,7 @@ class QuantizeOpWithRegionPattern
     : public OpRewritePattern<quantfork::DequantizeCastOp> {
  public:
   explicit QuantizeOpWithRegionPattern(MLIRContext& ctx)
-      : OpRewritePattern<quantfork::DequantizeCastOp>(&ctx){};
+      : OpRewritePattern<quantfork::DequantizeCastOp>(&ctx) {};
 
   LogicalResult match(quantfork::DequantizeCastOp op) const final {
     // Match only when there is one user of the dequantize op.
@@ -885,79 +908,50 @@ bool IsConnectedWithQuantizedCompsiteFunction(Operation* same_scale_op) {
   return false;
 }
 
-class QuantizeHybridDotGeneralPattern
-    : public EntryFuncBodyQuantizationPattern {
+template <typename OpT>
+class QuantizeWeightOnlyOpPattern : public EntryFuncBodyQuantizationPattern {
  public:
-  explicit QuantizeHybridDotGeneralPattern() = default;
+  explicit QuantizeWeightOnlyOpPattern(
+      const bool enable_per_channel_quantized_weight) {}
 
   LogicalResult match(func::FuncOp entry_func_op) const override {
-    return MatchGemmStyleOp<DotGeneralOp>(entry_func_op);
+    return MatchGemmStyleOp<OpT>(entry_func_op);
   }
 
-  void rewrite(func::FuncOp entry_func_op,
+  void rewrite(func::FuncOp entry_func_op, const Method& quantization_method,
                PatternRewriter& rewriter) const override {}
 };
 
-template <typename FuncBodyRewritePatternT,
-          typename = std::enable_if_t<std::is_base_of_v<
-              EntryFuncBodyQuantizationPattern, FuncBodyRewritePatternT>>>
-class HybridXlaCallModuleOpToCallOp
-    : public OpRewritePattern<TF::XlaCallModuleOp> {
- public:
-  explicit HybridXlaCallModuleOpToCallOp(
-      MLIRContext& ctx, bool enable_per_channel_quantized_weight)
-      : OpRewritePattern<TF::XlaCallModuleOp>(&ctx){};
-
-  LogicalResult match(TF::XlaCallModuleOp op) const override {
-    ModuleOp module_op = op->getParentOfType<ModuleOp>();
-    SymbolTable symbol_table(module_op);
-
-    // Ignore unquantized ops.
-    if (!IsHybridQuantizedOp(op) || !IsOpQuantizableStableHlo(op)) {
-      return failure();
-    }
-
-    func::FuncOp entry_func_op = GetEntryFuncOp(op, symbol_table);
-    if (!entry_func_op) {
-      op->emitError("Failed to find a valid entry function.");
-      return failure();
-    }
-    return FuncBodyRewritePatternT().match(entry_func_op);
-  }
-
-  void rewrite(TF::XlaCallModuleOp xla_call_module_op,
-               PatternRewriter& rewriter) const override {
-    ReplaceQuantizedXlaCallModuleOpWithQuantizedCallOp(
-        *rewriter.getContext(), rewriter, xla_call_module_op,
-        FuncBodyRewritePatternT());
-  }
-};
-
-// TODO: b/307620428 - Increase fused op coverage for static range quantization.
-void PopulateFusedGemmStylePatterns(
+// Compute heavy patterns should be quantized for both server and ODML targets.
+void PopulateComputeHeavyPatterns(
     MLIRContext& ctx, RewritePatternSet& patterns,
     const bool enable_per_channel_quantized_weight) {
   patterns.add<XlaCallModuleOpToCallOp<QuantizeConvolutionOpPattern>>(
-      ctx, enable_per_channel_quantized_weight);
+      ctx, enable_per_channel_quantized_weight, /*enable_weight_only=*/false);
   patterns.add<XlaCallModuleOpToCallOp<QuantizeDotGeneralOpPattern>>(
-      ctx, enable_per_channel_quantized_weight);
+      ctx, enable_per_channel_quantized_weight, /*enable_weight_only=*/false);
+  // TODO: b/307620772 - Per-channel quantization for gather.
+  patterns.add<XlaCallModuleOpToCallOp<QuantizeSingularOpPattern<GatherOp>>>(
+      ctx, /*enable_per_channel_quantized_weight=*/false,
+      /*enable_weight_only=*/false);
+  // Populate pattern for quantization of ops with regions such as
+  // `stablehlo.reduce_window` op.
+  patterns.add<QuantizeOpWithRegionPattern>(ctx);
 }
 
-void PopulateQuantizeHybridPatterns(MLIRContext& ctx,
+void PopulateAllQuantizablePatterns(MLIRContext& ctx,
                                     RewritePatternSet& patterns) {
-  patterns.add<HybridXlaCallModuleOpToCallOp<QuantizeHybridDotGeneralPattern>>(
-      ctx, false);
+  patterns.add<XlaCallModuleOpToCallOp<QuantizeSingularOpPattern<AddOp>>>(
+      ctx, /*enable_per_channel_quantized_weight=*/false,
+      /*enable_weight_only=*/false);
 }
 
-void PopulateQuantizeOpWithRegionPattern(MLIRContext& ctx,
-                                         RewritePatternSet& patterns) {
-  patterns.add<QuantizeOpWithRegionPattern>(ctx);
-}
-
-void PopulateQuantizeSingularOpPatterns(MLIRContext& ctx,
+void PopulateQuantizeWeightOnlyPatterns(MLIRContext& ctx,
                                         RewritePatternSet& patterns) {
-  // TODO: b/307620772 - Per-channel quantization for gather.
-  patterns.add<XlaCallModuleOpToCallOp<QuantizeGatherOpPattern>>(
-      ctx, /*enable_per_channel_quantized_weight=*/false);
+  patterns.add<XlaCallModuleOpToCallOp<QuantizeConvolutionOpPattern>,
+               XlaCallModuleOpToCallOp<QuantizeDotGeneralOpPattern>>(
+      ctx, /*enable_per_channel_quantized_weight*/ false,
+      /*enable_weight_only=*/true);
 }
+
 }  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h
index 7b681cc71f71e3..9aa33ee0316ee1 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h
@@ -60,14 +60,15 @@ bool IsConnectedWithQuantizedCompsiteFunction(Operation* same_scale_op);
 // Each matched pattern are rewritten by its quantized alternatives.
 //
 // The concrete pattern, extends from this base pattern, can specify whether it
-// allows hybrid quantization. If it is allowed, for operand/result that is not
-// adjacent to dequantize/quantize op, it remains as float. For operand/result
-// that is adjacent to dequantize/quantize, it is quantized. Hybrid quantization
-// can be used to generate both weight-only quantization and dynamic range
-// quantization. The condition for allowing hybrid quantization or not for an op
-// can be specified in the below function:
+// allows weight-only quantization. If it is allowed, for operand/result that is
+// not adjacent to dequantize/quantize op, it remains as float. For
+// operand/result that is adjacent to dequantize/quantize, it is quantized.
+// Weight-only quantization can be used to generate both weight-only
+// quantization and dynamic range quantization. The condition for allowing
+// weight-only quantization or not for an op can be specified in the below
+// function:
 //
-//    static bool AllowHybridQuantization(Operation& op)
+//    static bool AllowWeightOnlyQuantization(Operation& op)
 //
 // This is a templatized `OpRewritePattern<RootOpT>`.
 //
@@ -177,8 +178,8 @@ class StableHloQuantizationPattern : public OpRewritePattern<RootOpT> {
           // If the operand is an integer tensor, then it doesn't require the
           // DequantizeOp in the pattern.
           inputs.push_back(operand);
-        } else if (static_cast<const ConcreteT*>(this)->AllowHybridQuantization(
-                       *candidate_op)) {
+        } else if (static_cast<const ConcreteT*>(this)
+                       ->AllowWeightOnlyQuantization(*candidate_op)) {
           inputs.push_back(operand);
         } else {
           return failure();
@@ -214,8 +215,8 @@ class StableHloQuantizationPattern : public OpRewritePattern<RootOpT> {
           // D op in the pattern.
           outputs_replaced.insert({result, enumerated_result.index()});
           output_types.push_back(result.getType());
-        } else if (static_cast<const ConcreteT*>(this)->AllowHybridQuantization(
-                       *candidate_op)) {
+        } else if (static_cast<const ConcreteT*>(this)
+                       ->AllowWeightOnlyQuantization(*candidate_op)) {
           outputs_replaced.insert({result, enumerated_result.index()});
           output_types.push_back(result.getType());
         } else {
@@ -249,22 +250,17 @@ class StableHloQuantizationPattern : public OpRewritePattern<RootOpT> {
   }
 };
 
-// Gemm Style Op: glossary/gemm.
-void PopulateFusedGemmStylePatterns(MLIRContext& ctx,
-                                    RewritePatternSet& patterns,
-                                    bool enable_per_channel_quantized_weight);
+// Populates pattern for compute heavy operations.
+void PopulateComputeHeavyPatterns(MLIRContext& ctx, RewritePatternSet& patterns,
+                                  bool enable_per_channel_quantized_weight);
 
-// Populates pattern for hybrid quantization.
-void PopulateQuantizeHybridPatterns(MLIRContext& ctx,
+// Populates conversion patterns for all quantizable ops, including
+// ops that are not compute-heavy and data movement ops.
+void PopulateAllQuantizablePatterns(MLIRContext& ctx,
                                     RewritePatternSet& patterns);
 
-// Populates pattern for quantization of ops with regions such as
-// stablehlo.reduce_window op.
-void PopulateQuantizeOpWithRegionPattern(MLIRContext& ctx,
-                                         RewritePatternSet& patterns);
-
-// Populates conversion patterns for unary data movement ops.
-void PopulateQuantizeSingularOpPatterns(MLIRContext& ctx,
+// Populates pattern weight-only quantization.
+void PopulateQuantizeWeightOnlyPatterns(MLIRContext& ctx,
                                         RewritePatternSet& patterns);
 
 }  // namespace mlir::quant::stablehlo
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc
index 048f0f04cff789..8bb2bd33564481 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc
@@ -57,7 +57,7 @@ struct StableHloQuantizationBase
                                      quantfork::DequantizeCastOp,
                                      /*VerifierT=*/void, RootOpT>(ctx) {}
 
-  static bool AllowHybridQuantization(Operation& op) { return false; }
+  static bool AllowWeightOnlyQuantization(Operation& op) { return false; }
 };
 
 // Quantization rewrite pattern using DQ as the root op.
@@ -77,15 +77,22 @@ struct StableHloQuantizationReverse
                                   quantfork::QuantizeCastOp>(ctx) {}
 };
 
+bool IsHybridQuantizableOp(Operation& op) {
+  auto call_op = cast<TF::XlaCallModuleOp>(op);
+  if (call_op == nullptr) return false;
+  StringRef entry_function_name = GetEntryFunctionName(call_op);
+  return entry_function_name.contains("conv") ||
+         entry_function_name.contains("dot_general");
+}
+
 // Quantization rewrite pattern using DQ as the root op.
-struct StableHloQuantizationHybrid
-    : public StableHloQuantizationBase<StableHloQuantizationHybrid> {
-  explicit StableHloQuantizationHybrid(MLIRContext* ctx)
-      : StableHloQuantizationBase<StableHloQuantizationHybrid>(ctx) {}
-
-  static bool AllowHybridQuantization(Operation& op) {
-    auto call_op = cast<TF::XlaCallModuleOp>(op);
-    return call_op && GetEntryFunctionName(call_op).contains("dot_general");
+struct StableHloQuantizationWeightOnly
+    : public StableHloQuantizationBase<StableHloQuantizationWeightOnly> {
+  explicit StableHloQuantizationWeightOnly(MLIRContext* ctx)
+      : StableHloQuantizationBase<StableHloQuantizationWeightOnly>(ctx) {}
+
+  static bool AllowWeightOnlyQuantization(Operation& op) {
+    return IsHybridQuantizableOp(op);
   }
 };
 
@@ -96,9 +103,10 @@ class QuantizePass : public impl::QuantizePassBase<QuantizePass> {
   using impl::QuantizePassBase<QuantizePass>::QuantizePassBase;
 
   explicit QuantizePass(const bool enable_per_channel_quantized_weight,
-                        const bool enable_weight_only,
-                        const QuantizationSpecs& quant_specs) {
+                        const bool enable_full_int_quantization,
+                        const bool enable_weight_only) {
     enable_per_channel_quantized_weight_ = enable_per_channel_quantized_weight;
+    enable_full_int_quantization_ = enable_full_int_quantization;
     enable_weight_only_ = enable_weight_only;
   }
 
@@ -113,14 +121,17 @@ void QuantizePass::runOnOperation() {
   RewritePatternSet patterns(&ctx);
   patterns.add<StableHloQuantization, StableHloQuantizationReverse>(&ctx);
   if (enable_weight_only_) {
-    patterns.add<StableHloQuantizationHybrid>(&ctx);
-    PopulateQuantizeHybridPatterns(ctx, patterns);
+    patterns.add<StableHloQuantizationWeightOnly>(&ctx);
+    PopulateQuantizeWeightOnlyPatterns(ctx, patterns);
   }
 
-  PopulateQuantizeOpWithRegionPattern(ctx, patterns);
-  PopulateFusedGemmStylePatterns(ctx, patterns,
-                                 enable_per_channel_quantized_weight_);
-  PopulateQuantizeSingularOpPatterns(ctx, patterns);
+  PopulateComputeHeavyPatterns(ctx, patterns,
+                               enable_per_channel_quantized_weight_);
+
+  // Quantize all quantizable ops, including ops that are not compute-heavy.
+  if (enable_full_int_quantization_) {
+    PopulateAllQuantizablePatterns(ctx, patterns);
+  }
 
   if (failed(applyPatternsAndFoldGreedily(module_op, std::move(patterns)))) {
     // There are cases where no rewrites happen even if a pattern matches,
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc
index 9478cea46c8795..f3cf92dde359d1 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc
@@ -55,8 +55,9 @@ class QuantizeCompositeFunctionsPass
 
   explicit QuantizeCompositeFunctionsPass(
       const bool enable_per_channel_quantized_weight,
-      const bool enable_weight_only) {
+      const bool enable_weight_only, const bool enable_full_int_quantization) {
     enable_per_channel_quantized_weight_ = enable_per_channel_quantized_weight;
+    enable_full_int_quantization_ = enable_full_int_quantization;
     enable_weight_only_ = enable_weight_only;
   }
 
@@ -80,7 +81,7 @@ void QuantizeCompositeFunctionsPass::runOnOperation() {
   options.bit_width_ = 8;
 
   if (enable_weight_only_) {
-    pm.addNestedPass<func::FuncOp>(createPrepareQuantizeHybridPass());
+    pm.addNestedPass<func::FuncOp>(createInsertWeightParamPass());
   }
   // PrepareQuantizePass uses SymbolTable to fetch relevant GEMM ops for
   // determining quantization attributes. This requires module-level context.
@@ -89,6 +90,8 @@ void QuantizeCompositeFunctionsPass::runOnOperation() {
   QuantizePassOptions quantize_options;
   quantize_options.enable_per_channel_quantized_weight_ =
       enable_per_channel_quantized_weight_;
+  quantize_options.enable_full_int_quantization_ =
+      enable_full_int_quantization_;
   quantize_options.enable_weight_only_ = enable_weight_only_;
   // QuantizePass modifies FuncOps referenced outside of its given scope
   // and therefore requires a module-level context.
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.h b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.h
index aa9c2106789f27..a8a59d1cd3b46b 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.h
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.h
@@ -19,11 +19,22 @@ limitations under the License.
 
 namespace mlir::quant::stablehlo::testing {
 
+// Identifies predefined `QuantizationSpecs` for
+// `TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass`. The pass
+// option argument is specified in line comments for each enum value.
+enum class TestQuantizationSpecs {
+  kEmpty,                         // empty
+  kDisableAllDotGeneral,          // disable-all-dot-general
+  kStaticRangePtqToAll,           // static-range-ptq-to-all
+  kStaticRangePtqToComputeHeavy,  // static-range-ptq-to-compute-heavy
+};
+
 // Adds generated pass default constructors or options definitions.
 #define GEN_PASS_DECL
 // Adds generated pass registration functions.
 #define GEN_PASS_REGISTRATION
 #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.h.inc"
+
 }  // namespace mlir::quant::stablehlo::testing
 
 #endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_TESTING_PASSES_H_
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.td
index 38d60e94f97e9a..ee525f2deead04 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.td
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.td
@@ -69,6 +69,22 @@ def TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass :
     This test-only pass is the same as `LiftQuantizableSpotsAsFunctionsPass` but
     has predefined `QuantizationSpecs` to make FileCheck testing easier.
   }];
+  let options = [
+    Option<"quantization_specs_", "quantization-specs",
+      "mlir::quant::stablehlo::testing::TestQuantizationSpecs",
+      /*default=*/"mlir::quant::stablehlo::testing::TestQuantizationSpecs::kEmpty",
+      "Sets one of the predefined `QuantizationSpecs` for testing.",
+      [{llvm::cl::values(
+        clEnumValN(mlir::quant::stablehlo::testing::TestQuantizationSpecs::kEmpty,
+          "empty", "Uses empty (default) QuantizationSpecs."),
+        clEnumValN(mlir::quant::stablehlo::testing::TestQuantizationSpecs::kDisableAllDotGeneral,
+          "disable-all-dot-general", "Disables all dot_general ops by matching lifted function names"),
+        clEnumValN(mlir::quant::stablehlo::testing::TestQuantizationSpecs::kStaticRangePtqToAll,
+          "static-range-ptq-to-all", "Applies `StaticRangePtq` to all quantizable units."),
+        clEnumValN(mlir::quant::stablehlo::testing::TestQuantizationSpecs::kStaticRangePtqToComputeHeavy,
+          "static-range-ptq-to-compute-heavy", "Applies `StaticRangePtq` to only compute heavy units.")
+      )}]>
+  ];
   let dependentDialects = [
       "mlir::func::FuncDialect",
       "mlir::stablehlo::StablehloDialect",
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_lift_quantizable_spots_as_functions_with_quantization_specs.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_lift_quantizable_spots_as_functions_with_quantization_specs.cc
index e8cb185cb7b55d..062fbdddd4150d 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_lift_quantizable_spots_as_functions_with_quantization_specs.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_lift_quantizable_spots_as_functions_with_quantization_specs.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "mlir/Support/TypeID.h"  // from @llvm-project
 #include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo  // IWYU pragma: keep
 #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"  // IWYU pragma: keep
 #include "tsl/platform/protobuf.h"  // IWYU pragma: keep
@@ -39,15 +40,37 @@ using ::tsl::protobuf::TextFormat;
 // NOLINTNEXTLINE(misc-include-cleaner) - Required for OSS.
 using ::tsl::protobuf::io::ArrayInputStream;
 
+// Empty (default) `QuantizationSpecs` proto.
+constexpr absl::string_view kSpecsEmpty = R"pb(specs
+                                               [])pb";
+
 // Configure `QuantizationSpecs` to disable quantization for all dot_general
 // quantizable units.
-constexpr absl::string_view kSpecsDisableAllDotGeneralByFuncName =
+constexpr absl::string_view kSpecsDisableAllDotGeneral =
     R"pb(specs
          [ {
            matcher { function_name { regex: "composite_dot_general_.*" } }
            method { no_quantization {} }
          }])pb";
 
+// Configure `QuantizationSpecs` to apply `StaticRangePtq` to all quantizable
+// units.
+constexpr absl::string_view kSpecsStaticRangePtqToAll =
+    R"pb(specs
+         [ {
+           matcher { function_name { regex: ".*" } }
+           method { static_range_ptq {} }
+         }])pb";
+
+// Configure `QuantizationSpecs` to apply `StaticRangePtq` to compute heavy
+// units.
+constexpr absl::string_view kSpecsStaticRangePtqToComputeHeavy =
+    R"pb(specs
+         [ {
+           matcher { function_name { regex: "^.*(conv|dot|gather).*" } }
+           method { static_range_ptq {} }
+         }])pb";
+
 class TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass
     : public impl::
           TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPassBase<
@@ -64,9 +87,24 @@ class TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass
   void runOnOperation() override;
 };
 
+// `TestQuantizationSpecs` -> predefined `QuantizationSpecs` textproto.
+absl::string_view GetQuantizationSpecsTextProto(
+    const TestQuantizationSpecs test_specs) {
+  switch (test_specs) {
+    case TestQuantizationSpecs::kEmpty:
+      return kSpecsEmpty;
+    case TestQuantizationSpecs::kDisableAllDotGeneral:
+      return kSpecsDisableAllDotGeneral;
+    case TestQuantizationSpecs::kStaticRangePtqToAll:
+      return kSpecsStaticRangePtqToAll;
+    case TestQuantizationSpecs::kStaticRangePtqToComputeHeavy:
+      return kSpecsStaticRangePtqToComputeHeavy;
+  }
+}
+
 // Parses a text proto into a `QuantizationSpecs` proto. Returns
 // `InvalidArgumentError` if `text_proto` is invalid.
-absl::StatusOr<QuantizationSpecs> ParseQuantizationSpecsTextProto(
+absl::StatusOr<QuantizationSpecs> ParseTextProto(
     const absl::string_view text_proto) {
   QuantizationSpecs quantization_specs;
   TextFormat::Parser parser;
@@ -81,8 +119,9 @@ void TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass::
     runOnOperation() {
   PassManager pass_manager{&getContext()};
 
+  // Construct `QuantizationSpecs` from the pass option `quantization-specs`.
   const absl::StatusOr<QuantizationSpecs> quantization_specs =
-      ParseQuantizationSpecsTextProto(kSpecsDisableAllDotGeneralByFuncName);
+      ParseTextProto(GetQuantizationSpecsTextProto(quantization_specs_));
   if (!quantization_specs.ok()) {
     signalPassFailure();
     return;
@@ -93,7 +132,6 @@ void TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass::
 
   if (failed(pass_manager.run(getOperation()))) {
     signalPassFailure();
-    return;
   }
 }
 
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_pre_calibration_component.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_pre_calibration_component.cc
index 06b53035c80c7a..0c41771a5c43b0 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_pre_calibration_component.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_pre_calibration_component.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "mlir/Support/TypeID.h"  // from @llvm-project
 #include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo  // IWYU pragma: keep
 #include "stablehlo/dialect/VhloOps.h"  // from @stablehlo  // IWYU pragma: keep
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
@@ -34,6 +35,8 @@ namespace mlir::quant::stablehlo::testing {
 
 namespace {
 
+using ::stablehlo::quantization::ExpandPresets;
+using ::stablehlo::quantization::PopulateDefaults;
 using ::stablehlo::quantization::QuantizationConfig;
 
 class TestPreCalibrationComponentPass
@@ -52,7 +55,10 @@ void TestPreCalibrationComponentPass::runOnOperation() {
 
   // Simply runs the PreCalibrationComponent with a default configuration.
   PreCalibrationComponent component(&ctx);
-  if (!component.Run(module_op, QuantizationConfig::default_instance()).ok()) {
+  QuantizationConfig quantization_config{};
+  quantization_config.mutable_static_range_ptq_preset();
+  quantization_config = ExpandPresets(PopulateDefaults(quantization_config));
+  if (!component.Run(module_op, quantization_config).ok()) {
     signalPassFailure();
   }
 }
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD
index a9bd3a713ede7c..2b20cc48a89d69 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD
@@ -133,6 +133,7 @@ cc_library(
         "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc",
         "//tensorflow/compiler/mlir/quantization/stablehlo/cc:config",
         "//tensorflow/compiler/mlir/quantization/stablehlo/cc:static_range_ptq",
+        "//tensorflow/compiler/mlir/quantization/stablehlo/cc:weight_only_ptq",
         "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib",
         "//tensorflow/core/protobuf:for_core_protos_cc",
         "@com_google_absl//absl/container:flat_hash_map",
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py
index 80ccf81c33b9b9..80a2c560ef865b 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py
@@ -64,7 +64,6 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest):
               ([10, 1, 1024], [10, 1024, 3]),
               ([2, 3, 1, 1024], [2, 3, 1024, 3]),
           ),
-          'rng_seed': (1230, 1231, 1232, 1233),
       }])
   )
   @test_util.run_in_graph_and_eager_modes
@@ -73,7 +72,6 @@ def test_matmul_ptq_model(
       bias_fn: Optional[ops.Operation],
       activation_fn: Optional[ops.Operation],
       dim_sizes: Sequence[int],
-      rng_seed: int,
   ):
     lhs_dim_size, rhs_dim_size = dim_sizes
     input_shape = (*lhs_dim_size,)
@@ -87,7 +85,7 @@ def test_matmul_ptq_model(
         activation_fn,
     )
 
-    rng = np.random.default_rng(rng_seed)
+    rng = np.random.default_rng(seed=42)
     input_data = ops.convert_to_tensor(
         rng.uniform(low=0.0, high=1.0, size=static_input_shape).astype(
             np.float32
@@ -144,6 +142,14 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
     # values are arbitrary.
     self.assertAllClose(new_outputs, expected_outputs, rtol=0.03, atol=0.2)
 
+    # Due to other meta data, the compression is not exactly 1/4.
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        0.65,
+    )
+
   @parameterized.parameters(
       testing.parameter_combinations([{
           'same_scale_op': (
@@ -156,14 +162,12 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
               'slice',
               'transpose',
           ),
-          'rng_seed': (0, 11, 222, 3333),
       }])
   )
   @test_util.run_in_graph_and_eager_modes
   def test_matmul_and_same_scale_ptq_model(
       self,
       same_scale_op: str,
-      rng_seed: int,
   ):
     input_shape = (2, 3, 1, 1024)
     filter_shape = (2, 3, 1024, 3)
@@ -176,7 +180,7 @@ def test_matmul_and_same_scale_ptq_model(
         same_scale_op,
     )
 
-    rng = np.random.default_rng(rng_seed)
+    rng = np.random.default_rng(seed=42)
     input_data = ops.convert_to_tensor(
         rng.uniform(low=0.0, high=1.0, size=static_input_shape).astype(
             np.float32
@@ -225,6 +229,14 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
     # values are arbitrary.
     self.assertAllClose(new_outputs, expected_outputs, rtol=0.03, atol=0.2)
 
+    # Due to other meta data, the compression is not exactly 1/4.
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        0.65,
+    )
+
   @parameterized.parameters(
       testing.parameter_combinations([{
           'same_scale_op': (
@@ -233,7 +245,6 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
               # TODO: b/326242075 - Support other same-scale ops.
           ),
           'dim_sizes': (([None, 1024], [1024, 3]),),
-          'rng_seed': (0, 11, 222, 3333),
       }])
   )
   @test_util.run_in_graph_and_eager_modes
@@ -241,7 +252,6 @@ def test_matmul_and_same_scale_ptq_model_dynamic(
       self,
       same_scale_op: str,
       dim_sizes: Sequence[int],
-      rng_seed: int,
   ):
     input_dim_size, filter_dim_size = dim_sizes
     input_shape = (*input_dim_size,)
@@ -255,7 +265,7 @@ def test_matmul_and_same_scale_ptq_model_dynamic(
         same_scale_op,
     )
 
-    rng = np.random.default_rng(rng_seed)
+    rng = np.random.default_rng(seed=42)
     input_data = ops.convert_to_tensor(
         rng.uniform(low=0.0, high=1.0, size=static_input_shape).astype(
             np.float32
@@ -304,6 +314,14 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
     # values are arbitrary.
     self.assertAllClose(new_outputs, expected_outputs, rtol=0.03, atol=0.2)
 
+    # Due to other meta data, the compression is not exactly 1/4.
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        0.6,
+    )
+
   @parameterized.parameters(
       testing.parameter_combinations([{
           'bias_fn': (
@@ -315,7 +333,7 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
               nn_ops.relu,
               nn_ops.relu6,
           ),
-          'has_batch_norm': (False,),
+          'has_batch_norm': (False, True),
           'input_shape_dynamic': (
               False,
               True,
@@ -324,7 +342,6 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
               False,
               True,
           ),
-          'rng_seed': (10, 11, 12, 13),
       }])
   )
   @test_util.run_in_graph_and_eager_modes
@@ -335,7 +352,6 @@ def test_conv_ptq_model(
       has_batch_norm: bool,
       input_shape_dynamic: bool,
       enable_per_channel_quantized_weight: bool,
-      rng_seed: int,
       dilations: Sequence[int] = None,
   ):
     input_shape = (None, 3, 4, 3) if input_shape_dynamic else (1, 3, 4, 3)
@@ -351,9 +367,18 @@ def test_conv_ptq_model(
         strides,
         dilations,
     )
+    # TODO(b/331809306): investigate why these tests fail.
+    # skip these test cases.
+    if (
+        bias_fn is None
+        and has_batch_norm
+        and input_shape_dynamic
+        and enable_per_channel_quantized_weight
+    ):
+      return
 
     # Generate model input data.
-    rng = np.random.default_rng(rng_seed)
+    rng = np.random.default_rng(seed=42)
     static_input_shape = [dim if dim is not None else 2 for dim in input_shape]
     input_data = ops.convert_to_tensor(
         rng.uniform(low=0.0, high=1.0, size=static_input_shape).astype(
@@ -412,19 +437,25 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
     # values are arbitrary.
     self.assertAllClose(new_outputs, expected_outputs, rtol=0.02, atol=0.05)
 
+    # Due to other meta data, the compression is not exactly 1/4.
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        0.61,
+    )
+
   @parameterized.parameters(
       testing.parameter_combinations([{
           'equation': (
               'abc,cde->abde',
               'abc,dce->abde',
           ),
-          'rng_seed': (82, 82732, 4444, 14),
       }])
   )
   def test_einsum_ptq_model(
       self,
       equation: str,
-      rng_seed: int,
   ):
     _, y_shape, bias_shape, x_signature, y_signature = (
         self._prepare_sample_einsum_datashapes(equation, use_bias=True)
@@ -440,7 +471,7 @@ def test_einsum_ptq_model(
     )
 
     # Generate model input data.
-    rng = np.random.default_rng(rng_seed)
+    rng = np.random.default_rng(seed=42)
     input_data = ops.convert_to_tensor(
         rng.uniform(low=0.0, high=1.0, size=x_signature).astype('f4')
     )
@@ -489,6 +520,14 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
     # values are arbitrary.
     self.assertAllClose(new_outputs, expected_outputs, rtol=0.02, atol=0.04)
 
+    # Due to other meta data, the compression is not exactly 1/4.
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        0.65,
+    )
+
   def test_when_preset_not_srq_raises_error(self):
     self._create_matmul_model(
         input_shape=(1, 1024),
@@ -573,6 +612,14 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
     # are negligible numeric difference.
     self.assertAllClose(new_outputs, expected_outputs, rtol=0.000001)
 
+    # Due to other meta data, the compression is not exactly 1/4.
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        0.4,
+    )
+
   @test_util.run_in_graph_and_eager_modes
   def test_ptq_selective_denylist(self):
     """Tests that the op is not quantized when no quantization is enabled."""
@@ -667,6 +714,14 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
     # Indirectly tests that the model is only partially quantized.
     self.assertAllClose(new_outputs, expected_outputs, rtol=0.011)
 
+    # Due to other meta data, the compression is not exactly 1/4.
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        0.55,
+    )
+
   @test_util.run_in_graph_and_eager_modes
   def test_ptq_quantization_method_not_applied_when_matcher_mismatch(self):
     """Tests that quantization method is not applied to unmatched units."""
@@ -737,6 +792,14 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
     self.assertAllClose(new_outputs, expected_outputs, rtol=0.04)
     self.assertNotAllClose(new_outputs, expected_outputs, rtol=0.00001)
 
+    # Due to other meta data, the compression is not exactly 1/4.
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        0.4,
+    )
+
 
 @test_util.run_all_in_graph_and_eager_modes
 class CalibrationOptionsTest(quantize_model_test_base.QuantizedModelTest):
@@ -746,47 +809,49 @@ class CalibrationOptionsTest(quantize_model_test_base.QuantizedModelTest):
   (default in TF2) to ensure support for when TF2 is disabled.
   """
 
-  # TODO(b/307621353): add CALIBRATION_METHOD_HISTOGRAM_PERCENTILE.
   @parameterized.parameters(
       {
-          'calibration_options':
-              qc.CalibrationOptions(
-                  calibration_method=_CalibrationMethod.CALIBRATION_METHOD_MIN_MAX  # pylint: disable=line-too-long
-              )
+          'calibration_options': qc.CalibrationOptions(
+              calibration_method=_CalibrationMethod.CALIBRATION_METHOD_MIN_MAX
+          )
       },
       {
-          'calibration_options':
-              qc.CalibrationOptions(
-                  calibration_method=_CalibrationMethod.CALIBRATION_METHOD_AVERAGE_MIN_MAX  # pylint: disable=line-too-long
+          'calibration_options': qc.CalibrationOptions(
+              calibration_method=_CalibrationMethod.CALIBRATION_METHOD_AVERAGE_MIN_MAX
+          ),
+      },
+      {
+          'calibration_options': qc.CalibrationOptions(
+              calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_PERCENTILE,
+              calibration_parameters=qc.CalibrationOptions.CalibrationParameters(
+                  initial_num_bins=10,
               ),
+          ),
       },
       {
-          'calibration_options':
-              qc.CalibrationOptions(
-                  calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE,  # pylint: disable=line-too-long
-                  calibration_parameters=qc.CalibrationOptions.CalibrationParameters(  # pylint: disable=line-too-long
-                      initial_num_bins=10,
-                  ),
+          'calibration_options': qc.CalibrationOptions(
+              calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE,
+              calibration_parameters=qc.CalibrationOptions.CalibrationParameters(
+                  initial_num_bins=10,
               ),
+          ),
       },
       {
-          'calibration_options':
-              qc.CalibrationOptions(
-                  calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY,  # pylint: disable=line-too-long
-                  calibration_parameters=qc.CalibrationOptions.CalibrationParameters(  # pylint: disable=line-too-long
-                      initial_num_bins=10,
-                  ),
+          'calibration_options': qc.CalibrationOptions(
+              calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY,
+              calibration_parameters=qc.CalibrationOptions.CalibrationParameters(
+                  initial_num_bins=10,
               ),
+          ),
       },
       {
-          'calibration_options':
-              qc.CalibrationOptions(
-                  calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC,  # pylint: disable=line-too-long
-                  calibration_parameters=qc.CalibrationOptions.CalibrationParameters(  # pylint: disable=line-too-long
-                      initial_num_bins=10,
-                  ),
+          'calibration_options': qc.CalibrationOptions(
+              calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC,
+              calibration_parameters=qc.CalibrationOptions.CalibrationParameters(
+                  initial_num_bins=10,
               ),
-      }
+          ),
+      },
   )
   @test_util.run_in_graph_and_eager_modes
   def test_conv_ptq_model_by_calibration_options(
@@ -796,7 +861,7 @@ def test_conv_ptq_model_by_calibration_options(
     bias_fn = nn_ops.bias_add
     activation_fn = nn_ops.relu6
     enable_per_channel_quantized_weight = False
-    has_batch_norm = False
+    has_batch_norm = True
     dilations = None
     input_shape = (1, 3, 4, 3)
     filter_shape = (2, 3, 3, 2)
@@ -814,18 +879,14 @@ def test_conv_ptq_model_by_calibration_options(
 
     # Generate model input data.
     input_data = ops.convert_to_tensor(
-        np.random.uniform(low=0.0, high=10, size=input_shape).astype(
-            'f4'
-        )
+        np.random.uniform(low=0.0, high=10, size=input_shape).astype('f4')
     )
 
     def data_gen() -> repr_dataset.RepresentativeDataset:
       for _ in range(100):
         yield {
             'input_tensor': ops.convert_to_tensor(
-                np.random.uniform(low=0, high=10, size=input_shape).astype(
-                    'f4'
-                )
+                np.random.uniform(low=0, high=10, size=input_shape).astype('f4')
             ),
         }
 
@@ -865,6 +926,199 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
     # values are arbitrary.
     self.assertAllClose(new_outputs, expected_outputs, rtol=0.02, atol=0.5)
 
+    # Due to other meta data, the compression is not exactly 1/4.
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        0.46,
+    )
+
+
+class WeightOnlyQuantizationTest(quantize_model_test_base.QuantizedModelTest):
+
+  @parameterized.parameters(
+      testing.parameter_combinations([{
+          'bias_fn': (
+              None,
+              nn_ops.bias_add,
+          ),
+          'activation_fn': (
+              None,
+              nn_ops.relu,
+              nn_ops.relu6,
+          ),
+          'dim_sizes': (
+              # tf.MatMul cases.
+              ([None, 1024], [1024, 3]),  # dynamic batch dim.
+              ([1, 1024], [1024, 3]),
+              # tf.BatchMatMul cases.
+              ([10, 1, 1024], [10, 1024, 3]),
+              ([2, 3, 1, 1024], [2, 3, 1024, 3]),
+          ),
+      }])
+  )
+  @test_util.run_in_graph_and_eager_modes
+  def test_matmul_weight_only_model(
+      self,
+      bias_fn: Optional[ops.Operation],
+      activation_fn: Optional[ops.Operation],
+      dim_sizes: Sequence[int],
+  ):
+    lhs_dim_size, rhs_dim_size = dim_sizes
+    input_shape = (*lhs_dim_size,)
+    filter_shape = (*rhs_dim_size,)
+    static_input_shape = [dim if dim is not None else 2 for dim in input_shape]
+    model = self._create_matmul_model(
+        input_shape,
+        filter_shape,
+        self._input_saved_model_path,
+        bias_fn,
+        activation_fn,
+    )
+
+    rng = np.random.default_rng(1234)
+    input_data = ops.convert_to_tensor(
+        rng.uniform(low=0.0, high=1.0, size=static_input_shape).astype(
+            np.float32
+        )
+    )
+
+    config = qc.QuantizationConfig(
+        weight_only_preset=qc.WeightOnlyPreset(),
+        tf_saved_model=qc.TfSavedModelConfig(tags=[tag_constants.SERVING]),
+    )
+    quantization.quantize_saved_model(
+        self._input_saved_model_path,
+        self._output_saved_model_path,
+        config,
+    )
+
+    expected_outputs = model.matmul(input_data)
+
+    root = load.load(self._output_saved_model_path)
+    self.assertCountEqual(root.signatures.keys(), {'serving_default'})
+
+    new_outputs = root.signatures['serving_default'](
+        input_tensor=ops.convert_to_tensor(input_data)
+    )
+    # Tests that the quantized graph outputs similar values. The rtol and atol
+    # values are arbitrary.
+    self.assertAllClose(new_outputs, expected_outputs, rtol=0.03, atol=0.2)
+
+    module_str = self._extract_first_xla_call_module_op(
+        self._output_saved_model_path
+    )
+
+    # Tests that the output graph contains subtract and multiply for
+    # dequantization.
+    self.assertTrue(re.search('stablehlo.subtract', module_str))
+    self.assertTrue(re.search('stablehlo.multiply', module_str))
+    # Tests that the output graph contains float dot_general.
+    self.assertTrue(
+        re.search('stablehlo.dot_general.*xf32>.*xf32>.*xf32>', module_str)
+    )
+
+    # Due to other meta data, the compression is not exactly 1/4.
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        0.3,
+    )
+
+  @parameterized.parameters(
+      testing.parameter_combinations([{
+          'bias_fn': (
+              None,
+              nn_ops.bias_add,
+          ),
+          'activation_fn': (
+              None,
+              nn_ops.relu,
+              nn_ops.relu6,
+          ),
+          'has_batch_norm': (False,),
+          'input_shape_dynamic': (
+              False,
+              True,
+          ),
+      }])
+  )
+  @test_util.run_in_graph_and_eager_modes
+  def test_conv_weight_only_model(
+      self,
+      bias_fn: Optional[ops.Operation],
+      activation_fn: Optional[ops.Operation],
+      has_batch_norm: bool,
+      input_shape_dynamic: bool,
+      dilations: Sequence[int] = None,
+  ):
+    input_shape = (None, 3, 4, 3) if input_shape_dynamic else (1, 3, 4, 3)
+    filter_shape = (2, 3, 3, 2)
+    strides = (1, 1, 1, 1)
+    model = self._create_conv2d_model(
+        input_shape,
+        filter_shape,
+        self._input_saved_model_path,
+        bias_fn,
+        activation_fn,
+        has_batch_norm,
+        strides,
+        dilations,
+    )
+
+    rng = np.random.default_rng(1234)
+    static_input_shape = [dim if dim is not None else 2 for dim in input_shape]
+    input_data = ops.convert_to_tensor(
+        rng.uniform(low=0.0, high=1.0, size=static_input_shape).astype(
+            np.float32
+        )
+    )
+
+    config = qc.QuantizationConfig(
+        weight_only_preset=qc.WeightOnlyPreset(),
+        tf_saved_model=qc.TfSavedModelConfig(tags=[tag_constants.SERVING]),
+    )
+    quantization.quantize_saved_model(
+        self._input_saved_model_path,
+        self._output_saved_model_path,
+        config,
+    )
+
+    expected_outputs = model.conv2d(input_data)
+
+    root = load.load(self._output_saved_model_path)
+    self.assertCountEqual(root.signatures.keys(), {'serving_default'})
+
+    new_outputs = root.signatures['serving_default'](
+        input_tensor=ops.convert_to_tensor(input_data)
+    )
+    # Tests that the quantized graph outputs similar values. The rtol and atol
+    # values are arbitrary.
+    self.assertAllClose(new_outputs, expected_outputs, rtol=0.03, atol=0.2)
+
+    module_str = self._extract_first_xla_call_module_op(
+        self._output_saved_model_path
+    )
+
+    # Tests that the output graph contains subtract and multiply for
+    # dequantization.
+    self.assertTrue(re.search('stablehlo.subtract', module_str))
+    self.assertTrue(re.search('stablehlo.multiply', module_str))
+    # Tests that the output graph contains float dot_general.
+    self.assertTrue(
+        re.search('stablehlo.convolution.*xf32>.*xf32>.*xf32>', module_str)
+    )
+
+    # Due to other meta data, the compression is not exactly 1/4.
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        0.35,
+    )
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py
index 8a5f2529c56e22..d71c89e15d313f 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py
@@ -284,13 +284,13 @@ def conv2d(self, input_tensor: core.Tensor) -> Mapping[str, core.Tensor]:
         )
         if bias_fn is not None:
           out = nn_ops.bias_add(out, self.bias)
-        if activation_fn is not None:
-          out = activation_fn(out)
         if has_batch_norm:
           # Fusing is supported for non-training case.
           out, _, _, _, _, _ = nn_ops.fused_batch_norm_v3(
               out, scale, offset, mean, variance, is_training=False
           )
+        if activation_fn is not None:
+          out = activation_fn(out)
         return {'output': out}
 
     model = ConvModel()
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.cc b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.cc
index 6ee6f9ac317ce0..3269006ec06dbb 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.cc
@@ -27,8 +27,10 @@ namespace py = pybind11;
 
 namespace {
 
+using ::stablehlo::quantization::pywrap::PywrapExpandPresets;
 using ::stablehlo::quantization::pywrap::PywrapPopulateDefaults;
 using ::stablehlo::quantization::pywrap::PywrapQuantizeStaticRangePtq;
+using ::stablehlo::quantization::pywrap::PywrapQuantizeWeightOnlyPtq;
 
 }  // namespace
 
@@ -60,6 +62,27 @@ PYBIND11_MODULE(pywrap_quantization, m) {
         py::arg("py_function_library"));
   // LINT.ThenChange(pywrap_quantization.pyi:static_range_ptq)
 
+  // If the function signature changes, likely its corresponding .pyi type
+  // hinting should also change.
+  // LINT.IfChange(weight_only_ptq)
+  m.def("weight_only_ptq", &PywrapQuantizeWeightOnlyPtq,
+        R"pbdoc(
+        Runs weight-only Quantization on a SavedModel at `src_saved_model_path`
+        and saves the resulting model to `dst_saved_model_path`.
+
+        The user should pass a serialized `QuantizationConfig` for the
+        `quantization_config_serialized` argument, and a signature key ->
+        serialized `SignatureDef` mapping for the `signature_def_map_serialized`
+        argument.
+
+        Raises `StatusNotOk` exception if when the run was unsuccessful.
+        )pbdoc",
+        py::arg("src_saved_model_path"), py::arg("dst_saved_model_path"),
+        py::arg("quantization_config_serialized"), py::kw_only(),
+        py::arg("signature_keys"), py::arg("signature_def_map_serialized"),
+        py::arg("py_function_library"));
+  // LINT.ThenChange(pywrap_quantization.pyi:weight_only_ptq)
+
   // If the function signature changes, likely its corresponding .pyi type
   // hinting should also change.
   // LINT.IfChange(populate_default_configs)
@@ -71,5 +94,19 @@ PYBIND11_MODULE(pywrap_quantization, m) {
         default values to fields that the user did not explicitly specify.
         )pbdoc",
         py::arg("user_provided_config_serialized"));
-  // LINT.ThenChange(pywrap_quantization.pyi:static_range_ptq)
+  // LINT.ThenChange(pywrap_quantization.pyi:populate_default_configs)
+
+  // If the function signature changes, likely its corresponding .pyi type
+  // hinting should also change.
+  // LINT.IfChange(expand_preset_configs)
+  m.def("expand_preset_configs", &PywrapExpandPresets, R"pbdoc(
+        Expands presets to other fields in `QuantizationConfig`.
+
+        Each preset is expressible by other fields in `QuantizationConfig`.
+        Returns a copy of `QuantizationConfig` (serialized) where the fields are
+        expanded from presets. If no preset has been set, it is a no-op and
+        returns the same copy of the input.
+        )pbdoc",
+        py::arg("quantization_config_serialized"));
+  // LINT.ThenChange(pywrap_quantization.pyi:expand_preset_configs)
 }
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.pyi b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.pyi
index f46f44b218ee84..e79e2db2c2ac8f 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.pyi
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.pyi
@@ -17,7 +17,6 @@ from typing import Any
 from tensorflow.compiler.mlir.quantization.tensorflow.python import py_function_lib
 from tensorflow.compiler.mlir.quantization.tensorflow.python import representative_dataset as rd
 
-
 # LINT.IfChange(static_range_ptq)
 def static_range_ptq(
     src_saved_model_path: str,
@@ -31,6 +30,18 @@ def static_range_ptq(
 
 # LINT.ThenChange()
 
+# LINT.IfChange(weight_only_ptq)
+def weight_only_ptq(
+    src_saved_model_path: str,
+    dst_saved_model_path: str,
+    quantization_config_serialized: bytes,
+    *,
+    signature_keys: list[str],
+    signature_def_map_serialized: dict[str, bytes],
+    py_function_library: py_function_lib.PyFunctionLibrary,
+) -> Any: ...  # Status
+
+# LINT.ThenChange()
 
 # LINT.IfChange(populate_default_configs)
 def populate_default_configs(
@@ -38,3 +49,10 @@ def populate_default_configs(
 ) -> bytes: ...  # QuantizationConfig
 
 # LINT.ThenChange()
+
+# LINT.IfChange(expand_preset_configs)
+def expand_preset_configs(
+    quantization_config_serialized: bytes,
+) -> bytes: ...  # QuantizationConfig
+
+# LINT.ThenChange()
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization_lib.cc b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization_lib.cc
index 4fe33c60147df7..3b5ece120bdeb0 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization_lib.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization_lib.cc
@@ -22,12 +22,14 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/weight_only_ptq.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h"
 
 namespace stablehlo::quantization::pywrap {
 
 using ::mlir::quant::stablehlo::QuantizeStaticRangePtq;
+using ::mlir::quant::stablehlo::QuantizeWeightOnlyPtq;
 using ::tensorflow::SignatureDef;
 using ::tensorflow::quantization::PyFunctionLibrary;
 
@@ -46,9 +48,24 @@ absl::Status PywrapQuantizeStaticRangePtq(
                                 py_function_library);
 }
 
+absl::Status PywrapQuantizeWeightOnlyPtq(
+    absl::string_view src_saved_model_path,
+    absl::string_view dst_saved_model_path, const QuantizationConfig& config,
+    const std::vector<std::string>& signature_keys,
+    const absl::flat_hash_map<std::string, SignatureDef>& signature_def_map,
+    const PyFunctionLibrary& py_function_library) {
+  return QuantizeWeightOnlyPtq(src_saved_model_path, dst_saved_model_path,
+                               config, signature_keys, signature_def_map,
+                               py_function_library);
+}
+
 QuantizationConfig PywrapPopulateDefaults(
     const QuantizationConfig& user_provided_config) {
   return PopulateDefaults(user_provided_config);
 }
 
+QuantizationConfig PywrapExpandPresets(const QuantizationConfig& config) {
+  return ExpandPresets(config);
+}
+
 }  // namespace stablehlo::quantization::pywrap
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization_lib.h b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization_lib.h
index 0f1af29424e79d..ff724abaac5dee 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization_lib.h
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization_lib.h
@@ -40,11 +40,25 @@ absl::Status PywrapQuantizeStaticRangePtq(
         signature_def_map,
     const tensorflow::quantization::PyFunctionLibrary& py_function_library);
 
+// Function used by the pywrap_quantization module to mirror
+// `::mlir::quant::stablehlo::QuantizeWeightOnlyPtq`.
+absl::Status PywrapQuantizeWeightOnlyPtq(
+    absl::string_view src_saved_model_path,
+    absl::string_view dst_saved_model_path, const QuantizationConfig& config,
+    const std::vector<std::string>& signature_keys,
+    const absl::flat_hash_map<std::string, tensorflow::SignatureDef>&
+        signature_def_map,
+    const tensorflow::quantization::PyFunctionLibrary& py_function_library);
+
 // Function used by the pywrap_quantization module to mirror
 // `::stablehlo::quantization::PopulateDefaults`.
 QuantizationConfig PywrapPopulateDefaults(
     const QuantizationConfig& user_provided_config);
 
+// Function used by the pywrap_quantization module to mirror
+// `::stablehlo::quantization::ExpandPresets`.
+QuantizationConfig PywrapExpandPresets(const QuantizationConfig& config);
+
 }  // namespace stablehlo::quantization::pywrap
 
 #endif  // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PYTHON_PYWRAP_QUANTIZATION_LIB_H_
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/quantization.py b/tensorflow/compiler/mlir/quantization/stablehlo/python/quantization.py
index 6938000deaae0e..aa3745a3fdd453 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/python/quantization.py
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/quantization.py
@@ -63,15 +63,24 @@ def quantize_saved_model(
   if not (
       config.HasField('static_range_ptq_preset')
       and len(config.static_range_ptq_preset.representative_datasets) == 1
-  ):
+  ) and not config.HasField('weight_only_preset'):
     raise ValueError(
         '`quantize_saved_model` currently only supports static-range PTQ with a'
-        ' single signature.'
+        ' single signature or weight-only quantization.'
     )
 
+  # Updates user-provided `QuantizationConfig`s for the internal quantization
+  # pipeline to work with.
+  print('=== User-provided QuantizationConfig ===')
+  print(config)
   config = qc.QuantizationConfig.FromString(
       pywrap_quantization.populate_default_configs(config.SerializeToString())
   )
+  config = qc.QuantizationConfig.FromString(
+      pywrap_quantization.expand_preset_configs(config.SerializeToString())
+  )
+  print('=== Updated QuantizationConfig ===')
+  print(config)
 
   signature_def_map = save_model.get_signatures_from_saved_model(
       src_saved_model_path,
@@ -80,11 +89,21 @@ def quantize_saved_model(
   )
 
   signature_def_map_serialized = _serialize_signature_def_map(signature_def_map)
-  pywrap_quantization.static_range_ptq(
-      src_saved_model_path,
-      dst_saved_model_path,
-      quantization_config_serialized=config.SerializeToString(),
-      signature_keys=list(signature_def_map.keys()),
-      signature_def_map_serialized=signature_def_map_serialized,
-      py_function_library=py_function_lib.PyFunctionLibrary(),
-  )
+  if config.HasField('static_range_ptq_preset'):
+    pywrap_quantization.static_range_ptq(
+        src_saved_model_path,
+        dst_saved_model_path,
+        quantization_config_serialized=config.SerializeToString(),
+        signature_keys=list(signature_def_map.keys()),
+        signature_def_map_serialized=signature_def_map_serialized,
+        py_function_library=py_function_lib.PyFunctionLibrary(),
+    )
+  elif config.HasField('weight_only_preset'):
+    pywrap_quantization.weight_only_ptq(
+        src_saved_model_path,
+        dst_saved_model_path,
+        quantization_config_serialized=config.SerializeToString(),
+        signature_keys=list(signature_def_map.keys()),
+        signature_def_map_serialized=signature_def_map_serialized,
+        py_function_library=py_function_lib.PyFunctionLibrary(),
+    )
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto
index 81aff6e46d5850..efdceebd6c2008 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto
@@ -28,23 +28,58 @@ message RepresentativeDatasetConfig {
 }
 
 // Preset config for static-range post-training quantization (PTQ).
+//
 // Minimal user input about representative datasets is required. Representative
 // datasets are required for static-range PTQ to retrieve quantization
 // statistics via calibration.
-// Next ID: 3
+//
+// This preset is equivalent to the following `QuantizationSpecs`:
+//
+// ```
+// specs {matcher {function_name {regex: ".*"}} method {static_range_ptq {}}}
+// specs {
+//   matcher {function_name {regex: "composite_conv.*"}}
+//   method {static_range_ptq {
+//     input_quantized_types {
+//       key: 1
+//       value {dimension_specs {dimension: 3}}}
+//   }}
+// }
+// ```
+//
+// This preset:
+//   * Applies per-channel quantization for weights (input index 1) of
+//     convolution quantizable unit family. The quantization dimension is 3, the
+//     channel dimension, which assumes the weight tensor is in NHWC format.
+//   * Applies static-range PTQ for all other ops.
+//
+// Next ID: 4
 message StaticRangePtqPreset {
   // Configures representative dataset. Each item corresponds to a
   // representative dataset used to calibrate a function.
+  // If `QuantizationConfig.calibration_options.representative_datasets` is also
+  // provided then this field will be ignored.
   repeated RepresentativeDatasetConfig representative_datasets = 1;
 
   // NOTE: This field will be deprecated.
-  // Granularity should be controlled in custom configuration, deprecating
-  // this field once available.
-  // If set true, enable channel-wise quantization for all supported ops.
-  // This value is true by default.
-  bool enable_per_channel_quantized_weight = 2;
+  // Granularity should be controlled using `Method`, deprecating this field
+  // once available.
+  //
+  // If set to true, enable channel-wise quantization for:
+  //   * Convolution ops: When the attached `Method` also specifies per-channel
+  //                      quantization.
+  //   * Non-convolution ops: All
+  //
+  // Default value: true
+  bool enable_per_channel_quantized_weight = 2 [deprecated = true];
+
+  // Whether to quantize all quantizable ops or only compute-heavy ops.
+  bool enable_full_int_quantization = 3;
 }
 
+// Applies int8 per-tensor weight-only quantization for all dot_general op.
+message WeightOnlyPreset {}
+
 // Metadata specific to the input TensorFlow SavedModel, which may be required
 // to identify the specific MetaGraphDef to quantize, for example.
 // Next ID: 2
@@ -63,10 +98,66 @@ message PipelineConfig {
   optional bool unpack_quantized_types = 1;
 }
 
+// Represents a single quantizable unit, a (nearly) minimum unit of work when
+// applying quantization. It may correspond to a single or multiple ops.
+// Next ID: 2
+message QuantizableUnit {
+  // Name of the `FuncOp` symbol corresponding to the "lifted function",
+  // representing a single quantizable unit. This value is guaranteed to be
+  // unique across a single `ModuleOp`.
+  string name = 1;
+}
+
+// Represents a quantization result of a single `QuantizableUnit`. It is
+// essentially a `(QuantizableUnit, Method)` pair, where the `Method`
+// corresponds to the quantization method eventually applied to the
+// `QuantizableUnit`.
+// Next ID: 3
+message QuantizationResult {
+  QuantizableUnit quantizable_unit = 1;
+  Method method = 2;
+}
+
+// A series of `QuantizationResult`s. See `QuantizationResult` for details.
+// Next ID: 2
+message QuantizationResults {
+  repeated QuantizationResult results = 1;
+}
+
+message QuantizedDimension {
+  int32 dimension = 1;  // Should be less than the rank of the quantized tensor.
+}
+
+// Corresponds to StableHLO's `QuantizedTensorElementType`. Type parameters such
+// as `QuantizationParameters` is omitted because they are determined during
+// quantization.
+// See https://github.com/openxla/stablehlo/blob/main/docs/spec.md#types for
+// details.
+//
+// Currently only supports specifying quantization granularity (e.g. for
+// per-channel quantization).
+// TODO: b/331144430 - Support specifying storage types.
+message QuantizedType {
+  // Specifies the granularity of quantization parameters for each dimension of
+  // a quantized tensor. If specified, per-channel quantization is applied. If
+  // not specified, per-tensor quantization is applied.
+  // TODO: Make it a `repeated` field to be able to express multi-channel /
+  // sub-channel quantization.
+  QuantizedDimension dimension_specs = 1;
+}
+
 // A quantization method representing "do not quantize". Mostly used for
 // denylisting quantizable units from quantization.
 message NoQuantization {}
 
+// Configurations for static-range post-training quantization method on a
+// quantizable unit.
+message StaticRangePtq {
+  // Operand index -> QuantizedType mapping. Operands that are not specified
+  // here will be quantized with best effort.
+  map<int32, QuantizedType> input_quantized_types = 1;
+}
+
 // Represents a matching method that matches quantizable units by lifted
 // functions' names.
 message FunctionNameMatcherSpec {
@@ -84,7 +175,10 @@ message MatcherSpec {
 
 // Specifies how to quantize matched quantizable units.
 message Method {
-  NoQuantization no_quantization = 1;
+  oneof method {
+    NoQuantization no_quantization = 1;
+    StaticRangePtq static_range_ptq = 2;
+  }
 }
 
 // A QuantizationSpec is essentially a (matcher spec, quantization method) pair,
@@ -158,9 +252,10 @@ message DebuggerConfig {
 }
 
 // Defines various calibration options.
+// Next ID: 4
 message CalibrationOptions {
   // Configurations for calibration methods.
-  // NEXT ID: 7
+  // Next ID: 7
   enum CalibrationMethod {
     CALIBRATION_METHOD_UNSPECIFIED = 0;
     // Use the min, max values of all sample datasets.
@@ -185,7 +280,7 @@ message CalibrationOptions {
   }
 
   // Parameters required for calibration.
-  // NEXT ID: 4
+  // Next ID: 4
   message CalibrationParameters {
     // The number of bins when histogram is initialized. It can be increased
     // because histogram is dynamically expanded by sample inputs.
@@ -200,7 +295,7 @@ message CalibrationOptions {
   }
 
   // Determines how to calibrate.
-  // The default calibration method is MIN_MAX.
+  // Default value: CALIBRATION_METHOD_MIN_MAX
   CalibrationMethod calibration_method = 1;
 
   // Defines the parameters required for calibration. Parameters such as the
@@ -208,21 +303,26 @@ message CalibrationOptions {
   // MIN_MAX and AVERAGE_MIN_MAX don't require this parameter and methods
   // starting with HISTOGRAM require this parameter.
   CalibrationParameters calibration_parameters = 2;
+
+  // Configures representative dataset. Each item corresponds to a
+  // representative dataset used to calibrate a function.
+  repeated RepresentativeDatasetConfig representative_datasets = 3;
 }
 
 // Quantization configuration for StableHLO Quantizer. This is the primary
 // message containing all configurable options.
-// Next ID: 7
+// Next ID: 8
 message QuantizationConfig {
   // Config presets provide predefined popular or common quantization specs.
   // Lightweight users may choose one of the presets for quick experiments. Each
-  // preset is completely represented by `QuantizationSpecs`. When extra entries
-  // in `QuantizationSpecs` are provided along with a preset, then the preset
-  // will be overridden for the quantizable units matched by those additional
-  // `QuantizationSpec`s.
+  // preset is completely represented by other fields in `QuantizationConfig`.
+  //
+  // When extra entries in `QuantizationSpecs` are provided along with a preset,
+  // then those entries will take precedence.
   oneof preset {
     // Performs best-effort static-range post-training quantization (PTQ).
     StaticRangePtqPreset static_range_ptq_preset = 1;
+    WeightOnlyPreset weight_only_preset = 7;
   }
 
   // TF SavedModel specific information for the input model.
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD
index db4bc1a92483c1..55a41d4ce76072 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD
@@ -1,6 +1,6 @@
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.default.bzl", "filegroup")
+load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/pre_calibration_component.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/pre_calibration_component.mlir
index 6a5b58a7ba7b64..1fe56cde49601d 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/pre_calibration_component.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/pre_calibration_component.mlir
@@ -8,10 +8,10 @@ func.func @main(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> {
 }
 // CHECK: @main(%[[ARG_0:.+]]: tensor<1x4xf32>) -> tensor<1x3xf32>
 // CHECK-DAG: %[[CST:.+]] = stablehlo.constant dense<1.000000e+00> : tensor<4x3xf32>
-// CHECK: %[[CUSTOM_AGGREGATOR_0:.+]] = "tf.CustomAggregator"(%[[ARG_0]]) <{id = "0"}> {calibration_method = 0 : i32, {{.*}}} : (tensor<1x4xf32>) -> tensor<1x4xf32>
+// CHECK: %[[CUSTOM_AGGREGATOR_0:.+]] = "tf.CustomAggregator"(%[[ARG_0]]) <{id = "0"}> {{.*}}  : (tensor<1x4xf32>) -> tensor<1x4xf32>
 // CHECK: %[[XLA_CALL_MODULE:.+]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[CST]])
 // CHECK-SAME: _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1"
-// CHECK: %[[CUSTOM_AGGREGATOR_1:.+]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE]]) <{id = "1"}> {calibration_method = 0 : i32, {{.*}}} : (tensor<1x3xf32>) -> tensor<1x3xf32>
+// CHECK: %[[CUSTOM_AGGREGATOR_1:.+]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE]]) <{id = "1"}> {{.*}} : (tensor<1x3xf32>) -> tensor<1x3xf32>
 // CHECK: return %[[CUSTOM_AGGREGATOR_1]] : tensor<1x3xf32>
 // CHECK: }
 // CHECK: }
@@ -28,10 +28,10 @@ func.func @serving_default(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> {
 }
 // CHECK: @serving_default(%[[ARG_0:.+]]: tensor<1x4xf32>) -> tensor<1x3xf32>
 // CHECK-DAG: %[[CST:.+]] = stablehlo.constant dense<1.000000e+00> : tensor<4x3xf32>
-// CHECK: %[[CUSTOM_AGGREGATOR_0:.+]] = "tf.CustomAggregator"(%[[ARG_0]]) <{id = "0"}> {calibration_method = 0 : i32, {{.*}}} : (tensor<1x4xf32>) -> tensor<1x4xf32>
+// CHECK: %[[CUSTOM_AGGREGATOR_0:.+]] = "tf.CustomAggregator"(%[[ARG_0]]) <{id = "0"}> {{.*}} : (tensor<1x4xf32>) -> tensor<1x4xf32>
 // CHECK: %[[XLA_CALL_MODULE:.+]] = "tf.XlaCallModule"(%[[CUSTOM_AGGREGATOR_0]], %[[CST]])
 // CHECK-SAME: _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1"
-// CHECK: %[[CUSTOM_AGGREGATOR_1:.+]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE]]) <{id = "1"}> {calibration_method = 0 : i32, {{.*}}} : (tensor<1x3xf32>) -> tensor<1x3xf32>
+// CHECK: %[[CUSTOM_AGGREGATOR_1:.+]] = "tf.CustomAggregator"(%[[XLA_CALL_MODULE]]) <{id = "1"}> {{.*}} : (tensor<1x3xf32>) -> tensor<1x3xf32>
 // CHECK: return %[[CUSTOM_AGGREGATOR_1]] : tensor<1x3xf32>
 // CHECK: }
 // CHECK: }
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/tf_to_stablehlo.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/tf_to_stablehlo.mlir
index 55ff087240a5e0..240b10d8438431 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/tf_to_stablehlo.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/tf_to_stablehlo.mlir
@@ -1,17 +1,16 @@
 // RUN: stablehlo-quant-opt %s -split-input-file -verify-diagnostics -stablehlo-test-tf-to-stablehlo | FileCheck %s
 
-func.func @fused_batchnorm_no_training() -> (tensor<1x1x2x8xf32>) {
-  %cst_0 = "tf.Const"() {value = dense<[[[[0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2], [0.3, 0.4, 0.3, 0.4, 0.3, 0.4, 0.3, 0.4]]]]> : tensor<1x1x2x8xf32>} : () -> tensor<1x1x2x8xf32>
-  %cst_1 = "tf.Const"() {value = dense<[0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2]> : tensor<8xf32>} : () -> tensor<8xf32>
-  %cst_2 = "tf.Const"() {value = dense<[0.3, 0.4, 0.3, 0.4, 0.3, 0.4, 0.3, 0.4]> : tensor<8xf32>} : () -> tensor<8xf32>
-  %0:6 = "tf.FusedBatchNormV3"(%cst_0, %cst_1, %cst_2, %cst_1, %cst_2) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<1x1x2x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<1x1x2x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>)
-  func.return %0#0 : tensor<1x1x2x8xf32>
-}
-// CHECK: func.func @main() -> tensor<1x1x2x8xf32>
-// CHECK-DAG: %[[CONST:.*]] = stablehlo.constant dense<{{.*}}> : tensor<1x1x2x8xf32>
-// CHECK: return %[[CONST]] : tensor<1x1x2x8xf32>
-
-// -----
+// TODO(b/330759552): Fix the msan issue and enable this test.
+// func.func @fused_batchnorm_no_training() -> tensor<1x1x2x8xf32> {
+//   %cst_0 = "tf.Const"() {value = dense<[[[[0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2], [0.3, 0.4, 0.3, 0.4, 0.3, 0.4, 0.3, 0.4]]]]> : tensor<1x1x2x8xf32>} : () -> tensor<1x1x2x8xf32>
+//   %cst_1 = "tf.Const"() {value = dense<[0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2]> : tensor<8xf32>} : () -> tensor<8xf32>
+//   %cst_2 = "tf.Const"() {value = dense<[0.3, 0.4, 0.3, 0.4, 0.3, 0.4, 0.3, 0.4]> : tensor<8xf32>} : () -> tensor<8xf32>
+//   %0:6 = "tf.FusedBatchNormV3"(%cst_0, %cst_1, %cst_2, %cst_1, %cst_2) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<1x1x2x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<1x1x2x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>)
+//   func.return %0#0 : tensor<1x1x2x8xf32>
+// }
+// COM: CHECK: func.func @main() -> tensor<1x1x2x8xf32>
+// COM: CHECK-DAG: %[[CONST:.*]] = stablehlo.constant dense<{{.*}}> : tensor<1x1x2x8xf32>
+// COM: CHECK: return %[[CONST]] : tensor<1x1x2x8xf32>
 
 func.func @fused_batchnorm_no_training_arg_input(%arg_0: tensor<1x1x2x8xf32>) -> (tensor<1x1x2x8xf32>) {
   %cst_0 = "tf.Const"() {value = dense<[0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2]> : tensor<8xf32>} : () -> tensor<8xf32>
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/defer_activation_transpose.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/defer_activation_transpose.mlir
new file mode 100644
index 00000000000000..96b270f8b888f9
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/defer_activation_transpose.mlir
@@ -0,0 +1,307 @@
+// RUN: stablehlo-quant-opt %s -stablehlo-defer-activation-transpose \
+// RUN:   -split-input-file -verify-diagnostics | FileCheck %s
+
+// Tests that an `add(transpose(arg0), arg1)` pattern is converted to
+// `transpose(add(arg0, transpose(arg1)))`. The transpose in the activation is
+// deferred to the output of `stablehlo.add` and an extra transpose op is
+// inserted to the RHS to match the shape of the operand.
+
+// CHECK-LABEL: add_with_activation_transpose
+func.func @add_with_activation_transpose(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32> {
+  %0 = stablehlo.constant dense<2.000000e+00> : tensor<1x4x3x3xf32>
+  %1 = stablehlo.transpose %arg0, dims = [0, 3, 1, 2] : (tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32>
+  %2 = stablehlo.add %1, %0 : tensor<1x4x3x3xf32>
+  return %2 : tensor<1x4x3x3xf32>
+}
+// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32>
+// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant
+// CHECK-DAG: %[[TRANSPOSE_0:.+]] = stablehlo.transpose %[[CONST_0]], dims = [0, 2, 3, 1] : (tensor<1x4x3x3xf32>) -> tensor<1x3x3x4xf32>
+
+// Check that the shape of the add is changed to reflect the deferred transpose.
+// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[ARG_0]], %[[TRANSPOSE_0]] : tensor<1x3x3x4xf32>
+// CHECK: %[[TRANSPOSE_1:.+]] = stablehlo.transpose
+// CHECK: return %[[TRANSPOSE_1]]
+
+// -----
+
+// Tests that an `add(transpose(arg0), broadcast_in_dim(arg1))` pattern is
+// converted to `transpose(add(arg0, transpose(broadcast_in_dim(arg1))))`.
+// The transpose in the activation is deferred to the output of `stablehlo.add`
+// and an extra transpose op is inserted to the RHS to match the shape of the
+// operand.
+
+// CHECK-LABEL: add_with_activation_transpose_broadcasted_rhs
+func.func @add_with_activation_transpose_broadcasted_rhs(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32> {
+  %0 = stablehlo.constant dense<2.000000e+00> : tensor<4xf32>
+  %1 = stablehlo.broadcast_in_dim %0, dims = [1] : (tensor<4xf32>) -> tensor<1x4x3x3xf32>
+  %2 = stablehlo.transpose %arg0, dims = [0, 3, 1, 2] : (tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32>
+  %3 = stablehlo.add %2, %1 : tensor<1x4x3x3xf32>
+  return %3 : tensor<1x4x3x3xf32>
+}
+// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32>
+// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant
+// CHECK-DAG: %[[BROADCAST:.+]] = stablehlo.broadcast_in_dim %[[CONST_0]], dims = [1] : (tensor<4xf32>) -> tensor<1x4x3x3xf32>
+// CHECK-DAG: %[[TRANSPOSE_0:.+]] = stablehlo.transpose %[[BROADCAST]], dims = [0, 2, 3, 1] : (tensor<1x4x3x3xf32>) -> tensor<1x3x3x4xf32>
+
+// Check that the shape of the add is changed to reflect the deferred transpose.
+// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[ARG_0]], %[[TRANSPOSE_0]] : tensor<1x3x3x4xf32>
+// CHECK: %[[TRANSPOSE_1:.+]] = stablehlo.transpose
+// CHECK: return %[[TRANSPOSE_1]]
+
+// -----
+
+// [No change] Tests that the activation transpose whose permutation is not
+// `[0, 3, 1, 2]` is not deferred.
+
+// CHECK-LABEL: add_with_activation_transpose_permutation_mismatch
+func.func @add_with_activation_transpose_permutation_mismatch(
+      %arg0: tensor<1x2x3x4xf32>) -> tensor<1x3x2x4xf32> {
+  %0 = stablehlo.constant dense<2.000000e+00> : tensor<1x3x2x4xf32>
+  %1 = stablehlo.transpose %arg0, dims = [0, 2, 1, 3] : (tensor<1x2x3x4xf32>) -> tensor<1x3x2x4xf32>
+  %2 = stablehlo.add %1, %0 : tensor<1x3x2x4xf32>
+  return %2 : tensor<1x3x2x4xf32>
+}
+// CHECK: %[[TRANSPOSE_0:.+]] = stablehlo.transpose
+// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[TRANSPOSE_0]], {{.*}}
+// CHECK: return %[[ADD_0]]
+
+// -----
+
+// [No change] Tests that the activation transpose whose rank is not 4 is not
+// deferred.
+
+// CHECK-LABEL: add_with_activation_transpose_rank_two
+func.func @add_with_activation_transpose_rank_two(%arg0: tensor<1x2xf32>) -> tensor<2x1xf32> {
+  %0 = stablehlo.constant dense<2.000000e+00> : tensor<2x1xf32>
+  %1 = stablehlo.transpose %arg0, dims = [1, 0] : (tensor<1x2xf32>) -> tensor<2x1xf32>
+  %2 = stablehlo.add %1, %0 : tensor<2x1xf32>
+  return %2 : tensor<2x1xf32>
+}
+// CHECK: %[[TRANSPOSE_0:.+]] = stablehlo.transpose
+// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[TRANSPOSE_0]], {{.*}}
+// CHECK: return %[[ADD_0]]
+
+// -----
+
+// [No change] Tests that the right-hand side that is not a constant is not
+// deferred.
+
+// CHECK-LABEL: add_with_activation_transpose_nonconst_rhs
+func.func @add_with_activation_transpose_nonconst_rhs(%arg0: tensor<1x3x3x4xf32>, %arg1: tensor<1x4x3x3xf32>) -> tensor<1x4x3x3xf32> {
+  %0 = stablehlo.transpose %arg0, dims = [0, 3, 1, 2] : (tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32>
+  %1 = stablehlo.add %0, %arg1 : tensor<1x4x3x3xf32>
+  return %1 : tensor<1x4x3x3xf32>
+}
+// CHECK: %[[TRANSPOSE_0:.+]] = stablehlo.transpose
+// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[TRANSPOSE_0]], {{.*}}
+// CHECK: return %[[ADD_0]]
+
+// -----
+
+// Tests that the transpose of the input of `stablehlo.reduce_window` is
+// deferred to the result. The attributes are permutated according to the new
+// input shape.
+
+// CHECK-LABEL: reduce_window_max_activation_transpose
+func.func @reduce_window_max_activation_transpose(%arg0: tensor<1x16x16x4xf32>) -> tensor<1x4x8x8xf32> {
+  %0 = stablehlo.constant dense<0xFF800000> : tensor<f32>  // -inf
+  %1 = stablehlo.transpose %arg0, dims = [0, 3, 1, 2] : (tensor<1x16x16x4xf32>) -> tensor<1x4x16x16xf32>
+  %2 = "stablehlo.reduce_window"(%1, %0) ({
+  ^bb0(%arg1: tensor<f32>, %arg2: tensor<f32>):
+      %3 = stablehlo.maximum %arg1, %arg2 : tensor<f32>
+      stablehlo.return %3 : tensor<f32>
+  }) {window_dimensions = array<i64: 1, 1, 2, 2>, window_strides = array<i64: 1, 1, 2, 2>} : (tensor<1x4x16x16xf32>, tensor<f32>) -> tensor<1x4x8x8xf32>
+  return %2 : tensor<1x4x8x8xf32>
+}
+// CHECK-SAME: %[[ARG:.+]]: tensor<1x16x16x4xf32>
+// CHECK-DAG: %[[INIT_VALUE_CONST:.+]] = stablehlo.constant dense<0xFF800000>
+
+// Check that the body is not modified.
+// CHECK: %[[REDUCE_WINDOW:.+]] = "stablehlo.reduce_window"(%[[ARG]], %[[INIT_VALUE_CONST]])
+// CHECK: ^bb0(%[[REDUCE_ARG_0:.+]]: tensor<f32>, %[[REDUCE_ARG_1:.+]]: tensor<f32>):
+// CHECK: %[[MAX:.+]] = stablehlo.maximum %[[REDUCE_ARG_0]], %[[REDUCE_ARG_1]]
+// CHECK: stablehlo.return %[[MAX]]
+
+// Check that the attributes window_dimensions & window_strides are also
+// permutated to match the new input shape.
+// CHECK: {window_dimensions = array<i64: 1, 2, 2, 1>, window_strides = array<i64: 1, 2, 2, 1>}
+// CHECK-SAME: (tensor<1x16x16x4xf32>, tensor<f32>) -> tensor<1x8x8x4xf32>
+
+// Check that a `stablehlo.transpose` is added to the result to match the shape
+// of the users.
+// CHECK: %[[TRANSPOSE:.+]] = stablehlo.transpose %[[REDUCE_WINDOW]], dims = [0, 3, 1, 2] : (tensor<1x8x8x4xf32>) -> tensor<1x4x8x8xf32>
+// CHECK: return %[[TRANSPOSE]]
+
+// -----
+
+// Tests that the transpose of the input of `stablehlo.reduce_window` is
+// deferred to the result. The attributes are permutated according to the new
+// input shape. This test is similar to the test above with the difference that
+// the `stablehlo.reduce_window` has explicit optional attributes:
+// `base_dilations` and `window_dilations`.
+
+// CHECK-LABEL: reduce_window_max_activation_transpose_explicit_optional_attrs
+func.func @reduce_window_max_activation_transpose_explicit_optional_attrs(
+      %arg0: tensor<1x16x16x4xf32>) -> tensor<1x4x15x15xf32> {
+  %0 = stablehlo.constant dense<0xFF800000> : tensor<f32>  // -inf
+  %1 = stablehlo.transpose %arg0, dims = [0, 3, 1, 2] : (tensor<1x16x16x4xf32>) -> tensor<1x4x16x16xf32>
+  %2 = "stablehlo.reduce_window"(%1, %0) ({
+  ^bb0(%arg1: tensor<f32>, %arg2: tensor<f32>):
+      %3 = stablehlo.maximum %arg1, %arg2 : tensor<f32>
+      stablehlo.return %3 : tensor<f32>
+  }) {
+    window_dimensions = array<i64: 1, 1, 2, 2>,
+    window_strides = array<i64: 1, 1, 2, 2>,
+    base_dilations = array<i64: 1, 1, 2, 2>,
+    window_dilations = array<i64: 1, 1, 2, 2>
+  } : (tensor<1x4x16x16xf32>, tensor<f32>) -> tensor<1x4x15x15xf32>
+  return %2 : tensor<1x4x15x15xf32>
+}
+// CHECK-SAME: %[[ARG:.+]]: tensor<1x16x16x4xf32>
+// CHECK-DAG: %[[INIT_VALUE_CONST:.+]] = stablehlo.constant dense<0xFF800000>
+
+// Check that the body is not modified.
+// CHECK: %[[REDUCE_WINDOW:.+]] = "stablehlo.reduce_window"(%[[ARG]], %[[INIT_VALUE_CONST]])
+// CHECK: ^bb0(%[[REDUCE_ARG_0:.+]]: tensor<f32>, %[[REDUCE_ARG_1:.+]]: tensor<f32>):
+// CHECK: %[[MAX:.+]] = stablehlo.maximum %[[REDUCE_ARG_0]], %[[REDUCE_ARG_1]]
+// CHECK: stablehlo.return %[[MAX]]
+
+// Check that the attributes window_dimensions & window_strides along with
+// optional attributes base_dilations and window_dilations are also permutated
+// to match the new input shape.
+// CHECK: {base_dilations = array<i64: 1, 2, 2, 1>, window_dilations = array<i64: 1, 2, 2, 1>, window_dimensions = array<i64: 1, 2, 2, 1>, window_strides = array<i64: 1, 2, 2, 1>}
+// CHECK-SAME: (tensor<1x16x16x4xf32>, tensor<f32>) -> tensor<1x15x15x4xf32>
+
+// Check that a `stablehlo.transpose` is added to the result to match the shape
+// of the users.
+// CHECK: %[[TRANSPOSE:.+]] = stablehlo.transpose %[[REDUCE_WINDOW]], dims = [0, 3, 1, 2] : (tensor<1x15x15x4xf32>) -> tensor<1x4x15x15xf32>
+// CHECK: return %[[TRANSPOSE]]
+
+// -----
+
+// [No change] Tests that the transpose of the input of
+// `stablehlo.reduce_window` is NOT deferred to the result, when the input
+// tensor does not have rank 4.
+
+// CHECK-LABEL: reduce_window_max_activation_transpose
+// CHECK-SAME: (%[[ARG:.+]]: tensor<16x8xf32>) -> tensor<4x8xf32>
+func.func @reduce_window_max_activation_transpose_rank2(%arg0: tensor<16x8xf32>) -> tensor<4x8xf32> {
+  %0 = stablehlo.constant dense<0xFF800000> : tensor<f32>  // -inf
+  %1 = stablehlo.transpose %arg0, dims = [1, 0] : (tensor<16x8xf32>) -> tensor<8x16xf32>
+  %2 = "stablehlo.reduce_window"(%1, %0) ({
+  ^bb0(%arg1: tensor<f32>, %arg2: tensor<f32>):
+      %3 = stablehlo.maximum %arg1, %arg2 : tensor<f32>
+      stablehlo.return %3 : tensor<f32>
+  }) {window_dimensions = array<i64: 2, 2>, window_strides = array<i64: 2, 2>} : (tensor<8x16xf32>, tensor<f32>) -> tensor<4x8xf32>
+  return %2 : tensor<4x8xf32>
+}
+// CHECK-DAG: stablehlo.constant
+// CHECK: stablehlo.transpose %[[ARG]]
+// CHECK: stablehlo.reduce_window
+
+// -----
+
+// [No change] Tests that the transpose of the input of
+// `stablehlo.reduce_window` is NOT deferred to the result, when it has an
+// explicit `padding` attribute.
+
+// CHECK-LABEL: reduce_window_max_activation_transpose_with_padding
+func.func @reduce_window_max_activation_transpose_with_padding(%arg0: tensor<1x16x16x4xf32>) -> tensor<1x4x9x9xf32> {
+  %0 = stablehlo.constant dense<0xFF800000> : tensor<f32>  // -inf
+  %1 = stablehlo.transpose %arg0, dims = [0, 3, 1, 2] : (tensor<1x16x16x4xf32>) -> tensor<1x4x16x16xf32>
+  %2 = "stablehlo.reduce_window"(%1, %0) ({
+  ^bb0(%arg1: tensor<f32>, %arg2: tensor<f32>):
+      %3 = stablehlo.maximum %arg1, %arg2 : tensor<f32>
+      stablehlo.return %3 : tensor<f32>
+  }) {
+    window_dimensions = array<i64: 1, 1, 2, 2>,
+    window_strides = array<i64: 1, 1, 2, 2>,
+    padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>
+  } : (tensor<1x4x16x16xf32>, tensor<f32>) -> tensor<1x4x9x9xf32>
+  return %2 : tensor<1x4x9x9xf32>
+}
+// CHECK-SAME: %[[ARG:.+]]: tensor<1x16x16x4xf32>
+// CHECK-DAG: stablehlo.constant
+// CHECK: stablehlo.transpose %[[ARG]]
+// CHECK: stablehlo.reduce_window
+
+// -----
+
+// [No change] Tests that the transpose of the input of
+// `stablehlo.reduce_window` is NOT deferred to the result, when the transpose
+// isn't `[0, 3, 1, 2]` (i.e. NCHW->NHWC).
+
+// CHECK-LABEL: reduce_window_max_activation_transpose_with_padding
+func.func @reduce_window_max_activation_transpose_with_padding(%arg0: tensor<16x16x4x1xf32>) -> tensor<1x4x8x8xf32> {
+  %0 = stablehlo.constant dense<0xFF800000> : tensor<f32>  // -inf
+  %1 = stablehlo.transpose %arg0, dims = [3, 2, 1, 0] : (tensor<16x16x4x1xf32>) -> tensor<1x4x16x16xf32>
+  %2 = "stablehlo.reduce_window"(%1, %0) ({
+  ^bb0(%arg1: tensor<f32>, %arg2: tensor<f32>):
+      %3 = stablehlo.maximum %arg1, %arg2 : tensor<f32>
+      stablehlo.return %3 : tensor<f32>
+  }) {
+    window_dimensions = array<i64: 1, 1, 2, 2>,
+    window_strides = array<i64: 1, 1, 2, 2>
+  } : (tensor<1x4x16x16xf32>, tensor<f32>) -> tensor<1x4x8x8xf32>
+  return %2 : tensor<1x4x8x8xf32>
+}
+// CHECK-SAME: %[[ARG:.+]]: tensor<16x16x4x1xf32>
+// CHECK-DAG: stablehlo.constant
+// CHECK: stablehlo.transpose %[[ARG]]
+// CHECK: stablehlo.reduce_window
+
+// -----
+
+// Tests that an `max(transpose(arg0), arg1)` pattern is converted to
+// `transpose(max(arg0, transpose(arg1)))`. The transpose in the activation is
+// deferred to the output of `stablehlo.max` and an extra transpose op is
+// inserted to the RHS to match the shape of the operand.
+
+// CHECK-LABEL: max_with_activation_transpose
+func.func @max_with_activation_transpose(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32> {
+  %0 = stablehlo.constant dense<2.000000e+00> : tensor<1x4x3x3xf32>
+  %1 = stablehlo.transpose %arg0, dims = [0, 3, 1, 2] : (tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32>
+  %2 = stablehlo.maximum %1, %0 : tensor<1x4x3x3xf32>
+  return %2 : tensor<1x4x3x3xf32>
+}
+// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x3x3x4xf32>) -> tensor<1x4x3x3xf32>
+// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant
+// CHECK-DAG: %[[TRANSPOSE_0:.+]] = stablehlo.transpose %[[CONST_0]], dims = [0, 2, 3, 1] : (tensor<1x4x3x3xf32>) -> tensor<1x3x3x4xf32>
+
+// Check that the shape of the add is changed to reflect the deferred transpose.
+// CHECK: %[[MAX_0:.+]] = stablehlo.maximum %[[ARG_0]], %[[TRANSPOSE_0]] : tensor<1x3x3x4xf32>
+// CHECK: %[[TRANSPOSE_1:.+]] = stablehlo.transpose
+// CHECK: return %[[TRANSPOSE_1]]
+
+// -----
+
+// [No change] Tests that the activation transpose of `stablehlo.maximum` whose
+// permutation is not `[0, 3, 1, 2]` is not deferred.
+
+// CHECK-LABEL: max_with_activation_transpose_permutation_mismatch
+func.func @max_with_activation_transpose_permutation_mismatch(
+      %arg0: tensor<1x2x3x4xf32>) -> tensor<1x3x2x4xf32> {
+  %0 = stablehlo.constant dense<2.000000e+00> : tensor<1x3x2x4xf32>
+  %1 = stablehlo.transpose %arg0, dims = [0, 2, 1, 3] : (tensor<1x2x3x4xf32>) -> tensor<1x3x2x4xf32>
+  %2 = stablehlo.maximum %1, %0 : tensor<1x3x2x4xf32>
+  return %2 : tensor<1x3x2x4xf32>
+}
+// CHECK: %[[TRANSPOSE_0:.+]] = stablehlo.transpose
+// CHECK: %[[MAX_0:.+]] = stablehlo.maximum %[[TRANSPOSE_0]], {{.*}}
+// CHECK: return %[[MAX_0]]
+
+// -----
+
+// [No change] Tests that the activation transpose of `stablehlo.maximum` whose
+// rank is not 4 is not deferred.
+
+// CHECK-LABEL: max_with_activation_transpose_rank_two
+func.func @max_with_activation_transpose_rank_two(%arg0: tensor<1x2xf32>) -> tensor<2x1xf32> {
+  %0 = stablehlo.constant dense<2.000000e+00> : tensor<2x1xf32>
+  %1 = stablehlo.transpose %arg0, dims = [1, 0] : (tensor<1x2xf32>) -> tensor<2x1xf32>
+  %2 = stablehlo.maximum %1, %0 : tensor<2x1xf32>
+  return %2 : tensor<2x1xf32>
+}
+// CHECK: %[[TRANSPOSE_0:.+]] = stablehlo.transpose
+// CHECK: %[[MAX_0:.+]] = stablehlo.maximum %[[TRANSPOSE_0]], {{.*}}
+// CHECK: return %[[MAX_0]]
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize_hybrid.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/insert_weight_param.mlir
similarity index 98%
rename from tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize_hybrid.mlir
rename to tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/insert_weight_param.mlir
index 9f68899873f0b0..89ff96efecf471 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize_hybrid.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/insert_weight_param.mlir
@@ -1,4 +1,4 @@
-// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-prepare-quantize-hybrid | FileCheck %s
+// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-insert-weight-param | FileCheck %s
 
 // Test that q/dq pair is inserted between constant and XlaCallModule op
 // with quantizable trait and function name containing conv.
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions_with_quantization_specs.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions_with_quantization_specs.mlir
index a0d797cfee4fa2..69bf09104c814d 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions_with_quantization_specs.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions_with_quantization_specs.mlir
@@ -1,7 +1,78 @@
-// RUN: stablehlo-quant-opt %s -stablehlo-test-lift-quantizable-spots-as-functions-with-quantization-specs \
-// RUN:   -split-input-file | FileCheck %s
+// RUN: stablehlo-quant-opt %s -stablehlo-test-lift-quantizable-spots-as-functions-with-quantization-specs="quantization-specs=disable-all-dot-general" \
+// RUN:   -split-input-file | FileCheck %s --check-prefix=DISABLE-ALL-DOT-GENERAL
 
-// CHECK: @main
+// Tests that `composite_dot_general_fn_1` and its corresponding XlaCallModuleOp
+// contains attributes required for quantization, including the
+// `_quantization_method` attribute that contains textpb of `Method`.
+
+// DISABLE-ALL-DOT-GENERAL: @main
+func.func @main(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> {
+  %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32>
+  %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32>
+  return %1 : tensor<1x1x64xf32>
+}
+
+// DISABLE-ALL-DOT-GENERAL: %[[CONST:.+]] = stablehlo.constant dense<2.000000e+00>
+// DISABLE-ALL-DOT-GENERAL: %[[XLA_CALL_MODULE:.+]] = "tf.XlaCallModule"(%arg0, %[[CONST]])
+
+// Check that the `_quantization_method` attribute contains the quantization
+// method in textproto format. The dot_general op quantization is explicitly
+// disabled by having `_quantization_method = "no_quantization { }"`.
+// DISABLE-ALL-DOT-GENERAL-SAME: _entry_function = @composite_dot_general_fn_1
+// DISABLE-ALL-DOT-GENERAL-SAME: _original_entry_function
+// DISABLE-ALL-DOT-GENERAL-SAME: _quantization_method = "no_quantization { }"
+// DISABLE-ALL-DOT-GENERAL-SAME: _tfl_quant_trait = "fully_quantizable"
+
+// DISABLE-ALL-DOT-GENERAL: return %[[XLA_CALL_MODULE:.+]] : tensor<1x1x64xf32>
+// DISABLE-ALL-DOT-GENERAL: }
+
+// DISABLE-ALL-DOT-GENERAL-LABEL: private @composite_dot_general_fn_1
+// DISABLE-ALL-DOT-GENERAL-SAME: tf_quant.composite_function
+// DISABLE-ALL-DOT-GENERAL: %[[DOT_GENERAL:.+]] = stablehlo.dot_general %arg0, %arg1
+// DISABLE-ALL-DOT-GENERAL: return %[[DOT_GENERAL:.+]] : tensor<1x1x64xf32>
+// DISABLE-ALL-DOT-GENERAL: }
+
+// -----
+
+// RUN: stablehlo-quant-opt %s -stablehlo-test-lift-quantizable-spots-as-functions-with-quantization-specs="quantization-specs=empty" \
+// RUN:   -split-input-file | FileCheck %s --check-prefix=EMPTY
+
+// Tests that `composite_dot_general_fn_1` and its corresponding XlaCallModuleOp
+// contains attributes required for quantization. `_quantization_method` is not
+// set, as it is implicitly disabled.
+
+// EMPTY: @main
+func.func @main(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> {
+  %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32>
+  %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32>
+  return %1 : tensor<1x1x64xf32>
+}
+
+// EMPTY: %[[CONST:.+]] = stablehlo.constant dense<2.000000e+00>
+// EMPTY: %[[XLA_CALL_MODULE:.+]] = "tf.XlaCallModule"(%arg0, %[[CONST]])
+
+// Check that the `_quantization_method` attribute doesn't contain the
+// quantization method, implying "no_quantization".
+// EMPTY-SAME: _entry_function = @composite_dot_general_fn_1
+// EMPTY-SAME: _original_entry_function
+// EMPTY-NOT: _quantization_method
+// EMPTY-SAME: _tfl_quant_trait = "fully_quantizable"
+
+// EMPTY: return %[[XLA_CALL_MODULE:.+]] : tensor<1x1x64xf32>
+// EMPTY: }
+
+// EMPTY-LABEL: private @composite_dot_general_fn_1
+// EMPTY-SAME: tf_quant.composite_function
+// EMPTY: %[[DOT_GENERAL:.+]] = stablehlo.dot_general %arg0, %arg1
+// EMPTY: return %[[DOT_GENERAL:.+]] : tensor<1x1x64xf32>
+// EMPTY: }
+
+// -----
+
+// RUN: stablehlo-quant-opt %s -stablehlo-test-lift-quantizable-spots-as-functions-with-quantization-specs="quantization-specs=static-range-ptq-to-all" \
+// RUN:   -split-input-file | FileCheck %s --check-prefix=STATIC-RANGE-PTQ-TO-ALL
+
+// STATIC-RANGE-PTQ-TO-ALL: @main
 func.func @main(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> {
   %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32>
   %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32>
@@ -11,21 +82,44 @@ func.func @main(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> {
 // contains attributes required for quantization, including the
 // `_quantization_method` attribute that contains textpb of `Method`.
 
-// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00>
-// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]])
+// STATIC-RANGE-PTQ-TO-ALL: %[[CONST:.+]] = stablehlo.constant dense<2.000000e+00>
+// STATIC-RANGE-PTQ-TO-ALL: %[[XLA_CALL_MODULE:.+]] = "tf.XlaCallModule"(%arg0, %[[CONST]])
+
+// Check that the `_quantization_method` attribute contains the quantization
+// method in textproto format, enabling static-range PTQ.
+// STATIC-RANGE-PTQ-TO-ALL-SAME: _entry_function = @composite_dot_general_fn_1
+// STATIC-RANGE-PTQ-TO-ALL-SAME: _original_entry_function
+// STATIC-RANGE-PTQ-TO-ALL-SAME: _quantization_method = "static_range_ptq { }"
+// STATIC-RANGE-PTQ-TO-ALL-SAME: _tfl_quant_trait = "fully_quantizable"
+
+// STATIC-RANGE-PTQ-TO-ALL: return %[[XLA_CALL_MODULE:.+]] : tensor<1x1x64xf32>
+// STATIC-RANGE-PTQ-TO-ALL: }
+
+// STATIC-RANGE-PTQ-TO-ALL-LABEL: private @composite_dot_general_fn_1
+// STATIC-RANGE-PTQ-TO-ALL-SAME: tf_quant.composite_function
+// STATIC-RANGE-PTQ-TO-ALL: %[[DOT_GENERAL:.+]] = stablehlo.dot_general %arg0, %arg1
+// STATIC-RANGE-PTQ-TO-ALL: return %[[DOT_GENERAL:.+]] : tensor<1x1x64xf32>
+// STATIC-RANGE-PTQ-TO-ALL: }
+
+// -----
+
+// RUN: stablehlo-quant-opt %s -stablehlo-test-lift-quantizable-spots-as-functions-with-quantization-specs="quantization-specs=static-range-ptq-to-compute-heavy" \
+// RUN:   -split-input-file | FileCheck %s --check-prefix=STATIC-RANGE-PTQ-TO-COMPUTE-HEAVY
+
+// STATIC-RANGE-PTQ-TO-COMPUTE-HEAVY: @main
+func.func @main(%arg0: tensor<1x2xf32>) -> tensor<1x2xf32> {
+  %0 = stablehlo.add %arg0, %arg0 : tensor<1x2xf32>
+  return %0 : tensor<1x2xf32>
+}
+// Tests that `composite_add_fn_1` does not quantize when quantizing
+// only compute-heavy ops.
+
+// STATIC-RANGE-PTQ-TO-COMPUTE-HEAVY: %[[CONST:.+]] = stablehlo.constant dense<2.000000e+00>
+// STATIC-RANGE-PTQ-TO-COMPUTE-HEAVY: %[[XLA_CALL_MODULE:.+]] = "tf.XlaCallModule"(%arg0, %arg0)
 
 // Check that the `_quantization_method` attribute contains the quantization
-// method in textproto format.
-// CHECK-SAME: _entry_function = @composite_dot_general_fn_1
-// CHECK-SAME: _original_entry_function
-// CHECK-SAME: _quantization_method = "no_quantization {}"
-// CHECK-SAME: _tfl_quant_trait = "fully_quantizable"
-
-// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32>
-// CHECK: }
-
-// CHECK-LABEL: private @composite_dot_general_fn_1
-// CHECK-SAME: tf_quant.composite_function
-// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1
-// CHECK: return %[[DOT_GENERAL:.*]] : tensor<1x1x64xf32>
-// CHECK: }
+// method in textproto format, enabling static-range PTQ.
+// STATIC-RANGE-PTQ-TO-COMPUTE-HEAVY: _entry_function = @composite_add_fn_1
+// STATIC-RANGE-PTQ-TO-COMPUTE-HEAVY: _original_entry_function
+// STATIC-RANGE-PTQ-TO-COMPUTE-HEAVY-NOT: _quantization_method
+// STATIC-RANGE-PTQ-TO-COMPUTE-HEAVY: _tfl_quant_trait = "fully_quantizable"
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/nchw_convolution_to_nhwc.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/nchw_convolution_to_nhwc.mlir
index 6cdf9fdbf46b91..bdfce8cad3f5a8 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/nchw_convolution_to_nhwc.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/nchw_convolution_to_nhwc.mlir
@@ -75,8 +75,8 @@ func.func @conv_output_dim_numbers_mismatch(%arg0: tensor<1x8x4x4xf32>) -> tenso
 // Tests that a quantized convolution does not match. No conversion occurs.
 
 // CHECK-LABEL: quantized_convolution
-func.func @quantized_convolution(%arg0: tensor<1x4x3x3x!quant.uniform<i8:f32, 1.000000e+0:-100>>, %arg1: tensor<2x4x3x3x!quant.uniform<i8:f32:1, {2.000000e+2, 3.000000e+3}>>) -> tensor<1x2x3x3x!quant.uniform<i8:f32, 4.000000e+0>> {
-  %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x4x3x3x!quant.uniform<i8:f32, 1.000000e+0:-100>>, tensor<2x4x3x3x!quant.uniform<i8:f32:1, {2.000000e+2, 3.000000e+3}>>) -> tensor<1x2x3x3x!quant.uniform<i8:f32, 4.000000e+0>>
+func.func @quantized_convolution(%arg0: tensor<1x4x3x3x!quant.uniform<i8:f32, 1.000000e+0:-100>>, %arg1: tensor<2x4x3x3x!quant.uniform<i8:f32:0, {2.000000e+2, 3.000000e+3}>>) -> tensor<1x2x3x3x!quant.uniform<i8:f32, 4.000000e+0>> {
+  %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x4x3x3x!quant.uniform<i8:f32, 1.000000e+0:-100>>, tensor<2x4x3x3x!quant.uniform<i8:f32:0, {2.000000e+2, 3.000000e+3}>>) -> tensor<1x2x3x3x!quant.uniform<i8:f32, 4.000000e+0>>
   return %0 : tensor<1x2x3x3x!quant.uniform<i8:f32, 4.000000e+0>>
 }
 
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/prepare_quantize_per_channel.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/prepare_quantize_per_channel.mlir
index 9b3c6f0f0ae04f..1ff62b1170a6f5 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/prepare_quantize_per_channel.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/prepare_quantize_per_channel.mlir
@@ -17,8 +17,11 @@ module {
     // CHECK: "tf.XlaCallModule"(%[[dq_act]], %[[dq_weight]]
     %1 = "tf.XlaCallModule"(%0, %cst_0, %cst) {
       Sout = [#tf_type.shape<1x2x2x2>], config = "",
-      _entry_function = @composite_conv2d_with_bias_and_relu6_fn_10,
       module = "composite_conv2d_with_bias_and_relu6_fn_10",
+      _entry_function = @composite_conv2d_with_bias_and_relu6_fn_10,
+      // Represents a per-channel quantization for the operand index 1 with
+      // quantization dimension of 3
+      _quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}",
       platforms = [], version = 4 : i64
     } : (tensor<1x3x2x3xf32>, tensor<2x3x3x2xf32>, tensor<2xf32>) -> tensor<1x2x2x2xf32>
     %2 = "quantfork.stats"(%1) {layerStats = dense<[0.000000e+00, 6.000000e+00]> : tensor<2xf32>} : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
@@ -90,3 +93,38 @@ module {
     return %0 : tensor<2x2xf32>
   }
 }
+
+// -----
+
+// Tests that the `PrepareQuantizePass` prepares for per-tensor quantization for
+// the weight of convolution. This is based on the `_quantization_method` that
+// does not have a `input_quantized_types` with a specified `dimension_specs`.
+
+// CHECK-LABEL: conv_per_tensor_quantized_method
+func.func private @conv_per_tensor_quantized_method(%arg0: tensor<1x3x2x3xf32>) -> tensor<1x2x2x2xf32> {
+  %cst = "tf.Const"() {device = "", value = dense<[7.11401462, 7.05456924]> : tensor<2xf32>} : () -> tensor<2xf32>
+  %cst_0 = "tf.Const"() {device = "", value = dense<[[[[-6.30731344, 5.4962182], [1.80364347, -7.64542675], [-2.11145878, -7.08605719]], [[-9.54062747, -6.14013147], [6.12640238, -4.18223286], [5.05738974, 8.99269962]], [[3.3535192, 0.84816426], [-6.64676809, -7.95477629], [5.81315517, 9.21566581]]], [[[1.38622558, 4.63866329], [4.54742622, -1.43770897], [-3.96835279, 2.99996852]], [[0.989735424, -4.83384752], [-7.27702999, 1.17216611], [1.33735656, 0.728900194]], [[5.1286211, 8.98645591], [1.55008793, -3.85491467], [3.7003777, 9.26594448]]]]> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32>
+  %0 = "quantfork.stats"(%arg0) {layerStats = dense<[1.27501142, 4.824783]> : tensor<2xf32>} : (tensor<1x3x2x3xf32>) -> tensor<1x3x2x3xf32>
+  %1 = "tf.XlaCallModule"(%0, %cst_0, %cst) {
+    Sout = [#tf_type.shape<1x2x2x2>], config = "",
+    module = "composite_conv_fn_1",
+    _entry_function = @composite_conv_fn_1,
+    _quantization_method = "static_range_ptq {}",
+    platforms = [], version = 4 : i64
+  } : (tensor<1x3x2x3xf32>, tensor<2x3x3x2xf32>, tensor<2xf32>) -> tensor<1x2x2x2xf32>
+  %2 = "quantfork.stats"(%1) {layerStats = dense<[0.000000e+00, 6.000000e+00]> : tensor<2xf32>} : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
+  return %2 : tensor<1x2x2x2xf32>
+}
+// CHECK-SAME: %[[ARG_0:.+]]: tensor<1x3x2x3xf32>
+
+// Test that the weight is prepared for per-tensor quantization, based on the
+// `_quantization_method` attribute without a `dimension_specs` field in
+// `QuantizedType`.
+// CHECK-DAG: %[[WEIGHT_CONST:.+]] = stablehlo.constant {{.*}} tensor<2x3x3x2xf32>
+// CHECK: %[[Q_WEIGHT_PER_TENSOR:.*]] = "quantfork.qcast"(%[[WEIGHT_CONST]]) {{.*}} (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform<i8:f32, {{.*}}>>
+// CHECK: %[[DQ_WEIGHT:.*]] = "quantfork.dcast"(%[[Q_WEIGHT_PER_TENSOR]])
+
+// CHECK: %[[Q_ACTIVATION:.*]] = "quantfork.qcast"(%[[ARG_0]])
+// CHECK-SAME: -> tensor<1x3x2x3x!quant.uniform<i8:f32, 0.018920717052384919:-128>>
+// CHECK: %[[DQ_ACTIVATION:.*]] = "quantfork.dcast"(%[[Q_ACTIVATION]])
+// CHECK: "tf.XlaCallModule"(%[[DQ_ACTIVATION]], %[[DQ_WEIGHT]]
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_hybrid.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_hybrid.mlir
deleted file mode 100644
index f9a6aaea3a500f..00000000000000
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_hybrid.mlir
+++ /dev/null
@@ -1,31 +0,0 @@
-// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-quantize=enable-weight-only=true | FileCheck %s
-
-// Test that hybrid quantized op is produced when q/dq pair only exists for weight.
-
-module attributes {tf_saved_model.semantics} {
-  func.func private @quantize_dot_general_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} {
-    %cst = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32>
-    %0 = "quantfork.qcast"(%cst) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform<i8:f32, 6.000000e-03:-128>>
-    %1 = "quantfork.dcast"(%0) : (tensor<2x3x!quant.uniform<i8:f32, 6.000000e-03:-128>>) -> tensor<2x3xf32>
-    %2 = "tf.XlaCallModule"(%arg0, %1) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
-    return %2 : tensor<1x3xf32>
-  }
-
-  func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} {
-    %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
-    return %0 : tensor<1x3xf32>
-  }
-}
-
-// CHECK-LABEL: quantize_dot_general_fn
-// CHECK-SAME: %[[ARG0:.+]]: tensor<1x2xf32>
-// CHECK: %[[CST:.+]] = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32>
-// CHECK: %[[Q:.+]] = "quantfork.qcast"(%[[CST]]) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform<i8:f32, 6.000000e-03:-128>>
-// CHECK: %[[CALL:.+]] = call @quantized_dot_general_fn(%[[ARG0]], %[[Q]]) : (tensor<1x2xf32>, tensor<2x3x!quant.uniform<i8:f32, 6.000000e-03:-128>>) -> tensor<1x3xf32>
-// CHECK: return %[[CALL]]
-
-// CHECK: quantized_dot_general_fn
-// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x2xf32>,  %[[ARG2:.+]]: tensor<2x3x!quant.uniform<i8:f32, 6.000000e-03:-128>>) -> tensor<1x3xf32>
-// CHECK: %[[DOT:.+]] = stablehlo.dot_general %[[ARG1]], %[[ARG2]]
-// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3x!quant.uniform<i8:f32, 6.000000e-03:-128>>) -> tensor<1x3xf32>
-// CHECK: return %[[DOT]]
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_weight_only.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_weight_only.mlir
new file mode 100644
index 00000000000000..6db474de676ccc
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_weight_only.mlir
@@ -0,0 +1,65 @@
+// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-quantize=enable-weight-only=true | FileCheck %s
+
+// Test that hybrid quantized dot_general is produced when q/dq pair only exists
+// for weight.
+
+module attributes {tf_saved_model.semantics} {
+  func.func private @quantize_dot_general_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} {
+    %cst = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32>
+    %0 = "quantfork.qcast"(%cst) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform<i8:f32, 6.000000e-03:-128>>
+    %1 = "quantfork.dcast"(%0) : (tensor<2x3x!quant.uniform<i8:f32, 6.000000e-03:-128>>) -> tensor<2x3xf32>
+    %2 = "tf.XlaCallModule"(%arg0, %1) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
+    return %2 : tensor<1x3xf32>
+  }
+
+  func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} {
+    %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
+    return %0 : tensor<1x3xf32>
+  }
+}
+
+// CHECK-LABEL: quantize_dot_general_fn
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x2xf32>
+// CHECK: %[[CST:.+]] = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32>
+// CHECK: %[[Q:.+]] = "quantfork.qcast"(%[[CST]]) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform<i8:f32, 6.000000e-03:-128>>
+// CHECK: %[[CALL:.+]] = call @quantized_dot_general_fn(%[[ARG0]], %[[Q]]) : (tensor<1x2xf32>, tensor<2x3x!quant.uniform<i8:f32, 6.000000e-03:-128>>) -> tensor<1x3xf32>
+// CHECK: return %[[CALL]]
+
+// CHECK: quantized_dot_general_fn
+// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x2xf32>,  %[[ARG2:.+]]: tensor<2x3x!quant.uniform<i8:f32, 6.000000e-03:-128>>) -> tensor<1x3xf32>
+// CHECK: %[[DOT:.+]] = stablehlo.dot_general %[[ARG1]], %[[ARG2]]
+// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3x!quant.uniform<i8:f32, 6.000000e-03:-128>>) -> tensor<1x3xf32>
+// CHECK: return %[[DOT]]
+
+// -----
+
+// Test that hybrid quantized convolution is produced when q/dq pair only exists
+// for weight.
+
+module attributes {tf_saved_model.semantics} {
+  func.func private @quantize_conv_fn(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} {
+    %cst = stablehlo.constant dense<3.000000e-01> : tensor<2x3x3x2xf32>
+    %0 = "quantfork.qcast"(%cst) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform<i8:f32, 6.000000e-03:-128>>
+    %1 = "quantfork.dcast"(%0) : (tensor<2x3x3x2x!quant.uniform<i8:f32, 6.000000e-03:-128>>) -> tensor<2x3x3x2xf32>
+    %2 = "tf.XlaCallModule"(%arg0, %1) <{Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32>
+    return %2 : tensor<1x3x4x2xf32>
+  }
+
+  func.func private @composite_conv_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} {
+    %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32>
+    return %0 : tensor<1x3x4x2xf32>
+  }
+}
+
+// CHECK-LABEL: quantize_conv_fn
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x3x4x3xf32>
+// CHECK: %[[CST:.+]] = stablehlo.constant dense<3.000000e-01> : tensor<2x3x3x2xf32>
+// CHECK: %[[Q:.+]] = "quantfork.qcast"(%[[CST]]) : (tensor<2x3x3x2xf32>) -> tensor<2x3x3x2x!quant.uniform<i8:f32, 6.000000e-03:-128>>
+// CHECK: %[[CALL:.+]] = call @quantized_conv_fn(%[[ARG0]], %[[Q]]) : (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform<i8:f32, 6.000000e-03:-128>>) -> tensor<1x3x4x2xf32>
+// CHECK: return %[[CALL]]
+
+// CHECK: quantized_conv_fn
+// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x3x4x3xf32>,  %[[ARG2:.+]]: tensor<2x3x3x2x!quant.uniform<i8:f32, 6.000000e-03:-128>>) -> tensor<1x3x4x2xf32>
+// CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[ARG1]], %[[ARG2]])
+// CHECK-SAME: (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform<i8:f32, 6.000000e-03:-128>>) -> tensor<1x3x4x2xf32>
+// CHECK: return %[[CONV]]
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir
index 13570eb583110e..f9fa9ce5f60b87 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir
@@ -187,13 +187,31 @@ module attributes {tf_saved_model.semantics} {
 
 // -----
 
-// Tests that basic convolution is properly quantized.
+// Tests that basic convolution is properly quantized. It is per-channel
+// quantized unless `enable-per-channel-quantized-weight=false`, according to
+// `_quantization_method` with an `input_quantized_types` and explicit
+// `dimension_specs`.
 
 module attributes {tf_saved_model.semantics} {
   func.func private @quantize_conv_fn(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} {
     %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32>
     %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32>
-    %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64, _entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32>
+    %1 = "tf.XlaCallModule"(%0, %cst) {
+        Sout = [#tf_type.shape<1x3x4x2>],
+        dim_args_spec = [],
+        disabled_checks = [],
+        has_token_input_output = false,
+        module = "",
+        platforms = [],
+        version = 5 : i64,
+        _entry_function = @composite_conv_fn,
+        _original_entry_function = "composite_conv_fn",
+        // Per-channel quantization at dimension 3 for input index 1.
+        _quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}",
+        _stablehlo_module_attrs = {},
+        _tfl_quant_trait = "fully_quantizable",
+        device = ""
+      } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32>
     %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32>
     return %2 : tensor<1x3x4x2xf32>
   }
@@ -235,6 +253,58 @@ module attributes {tf_saved_model.semantics} {
 
 // -----
 
+// Tests that basic convolution is properly quantized. In this example, the
+// convolution is always per-tensor quantized (even if
+// enable-per-channel-quantized-weights=true), according to
+// `_quantization_method`.
+
+// CHECK-LABEL: quantize_conv_fn_per_tensor
+func.func @quantize_conv_fn_per_tensor(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> {
+  %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32>
+  %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32>
+  %1 = "tf.XlaCallModule"(%0, %cst) {
+      Sout = [#tf_type.shape<1x3x4x2>],
+      dim_args_spec = [],
+      disabled_checks = [],
+      has_token_input_output = false,
+      module = "",
+      platforms = [],
+      version = 5 : i64,
+      _entry_function = @composite_conv_fn,
+      _original_entry_function = "composite_conv_fn",
+      _quantization_method = "static_range_ptq {}",
+      _stablehlo_module_attrs = {},
+      _tfl_quant_trait = "fully_quantizable",
+      device = ""
+    } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32>
+  %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32>
+  return %2 : tensor<1x3x4x2xf32>
+}
+// Check that the quantized XlaCallModule has been replaced by a CallOp, which
+// calls the quantized entry function.
+
+// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32>
+// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform<i8:f32, {{.*}}>
+// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform<i8:f32, {{.*}}>>
+// CHECK: %[[CALL_0:.+]] = call @quantized_conv_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<1x3x4x3x!quant.uniform<i8:f32, {{.*}}>>, tensor<2x3x3x2x!quant.uniform<i8:f32, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform<i8:f32, {{.*}}>>
+// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform<i8:f32, {{.*}}>) -> tensor<1x3x4x2xf32>
+// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32>
+
+func.func private @composite_conv_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} {
+  %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32>
+  return %0 : tensor<1x3x4x2xf32>
+}
+// Checks that the entry function is quantized for convolution. Quantized
+// convolution outputs an i32 quantized tensor, followed by requantization to
+// i8 quantized tensor.
+
+// CHECK: func.func private @quantized_conv_fn(%[[ARG_1:.+]]: tensor<1x3x4x3x!quant.uniform<i8:f32, {{.*}}>>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform<i8:f32, {{.*}}>>) -> tensor<1x3x4x2x!quant.uniform<i8:f32, {{.*}}>> attributes {_from_xla_call_module}
+// CHECK: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform<i8:f32, {{.*}}>>, tensor<2x3x3x2x!quant.uniform<i8:f32, {{.*}}>>) -> tensor<1x3x4x2x!quant.uniform<i32:f32, {{.*}}>>
+// CHECK: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[CONVOLUTION_0]] : (tensor<1x3x4x2x!quant.uniform<i32:f32, {{.*}}>>) -> tensor<1x3x4x2x!quant.uniform<i8:f32, {{.*}}>>
+// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform<i8:f32, {{.*}}>>
+
+// -----
+
 // Tests that fused pattern for convolution + bias is properly quantized.
 
 // Checks that fused functions with 1D bias is properly quantized.
@@ -246,7 +316,22 @@ module attributes {tf_saved_model.semantics} {
     %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32>
     %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<2xf32>} : () -> tensor<2xf32>
     %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32>
-    %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3x4x2>], _entry_function = @composite_conv_with_bias_1d_fn, _original_entry_function = "composite_conv_with_bias_1d_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<2xf32>) -> tensor<1x3x4x2xf32>
+    %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {
+        Sout = [#tf_type.shape<1x3x4x2>],
+        _entry_function = @composite_conv_with_bias_1d_fn,
+        _original_entry_function = "composite_conv_with_bias_1d_fn",
+        _stablehlo_module_attrs = {},
+        // Per-channel quantization at dimension 3 for input index 1.
+        _quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}",
+        _tfl_quant_trait = "fully_quantizable",
+        device = "",
+        dim_args_spec = [],
+        disabled_checks = [],
+        has_token_input_output = false,
+        module = "",
+        platforms = [],
+        version = 5 : i64
+      } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<2xf32>) -> tensor<1x3x4x2xf32>
     %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32>
     return %2 : tensor<1x3x4x2xf32>
   }
@@ -298,7 +383,22 @@ module attributes {tf_saved_model.semantics} {
     %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32>
     %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32>
     %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32>
-    %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3x4x2>], _entry_function = @composite_conv_with_bias_fn, _original_entry_function = "composite_conv_with_bias_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<1x1x1x2xf32>) -> tensor<1x3x4x2xf32>
+    %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {
+        Sout = [#tf_type.shape<1x3x4x2>],
+        _entry_function = @composite_conv_with_bias_fn,
+        _original_entry_function = "composite_conv_with_bias_fn",
+        _stablehlo_module_attrs = {},
+        // Per-channel quantization at dimension 3 for input index 1.
+        _quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}",
+        _tfl_quant_trait = "fully_quantizable",
+        device = "",
+        dim_args_spec = [],
+        disabled_checks = [],
+        has_token_input_output = false,
+        module = "",
+        platforms = [],
+        version = 5 : i64
+      } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<1x1x1x2xf32>) -> tensor<1x3x4x2xf32>
     %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32>
     return %2 : tensor<1x3x4x2xf32>
   }
@@ -349,7 +449,22 @@ module attributes {tf_saved_model.semantics} {
     %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32>
     %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32>
     %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<?x3x4x3xf32>) -> tensor<?x3x4x3xf32>
-    %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3x4x2>], _entry_function = @composite_conv_with_bias_dynamic_fn, _original_entry_function = "composite_conv_with_bias_dynamic_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable",   device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<?x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<1x1x1x2xf32>) -> tensor<?x3x4x2xf32>
+    %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {
+        Sout = [#tf_type.shape<1x3x4x2>],
+        _entry_function = @composite_conv_with_bias_dynamic_fn,
+        _original_entry_function = "composite_conv_with_bias_dynamic_fn",
+        _stablehlo_module_attrs = {},
+        // Per-channel quantization at dimension 3 for input index 1.
+        _quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}",
+        _tfl_quant_trait = "fully_quantizable",
+        device = "",
+        dim_args_spec = [],
+        disabled_checks = [],
+        has_token_input_output = false,
+        module = "",
+        platforms = [],
+        version = 5 : i64
+      } : (tensor<?x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<1x1x1x2xf32>) -> tensor<?x3x4x2xf32>
     %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<?x3x4x2xf32>) -> tensor<?x3x4x2xf32>
     return %2 : tensor<?x3x4x2xf32>
   }
@@ -426,7 +541,22 @@ module attributes {tf_saved_model.semantics} {
     %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32>
     %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32>
     %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<?x3x4x3xf32>) -> tensor<?x3x4x3xf32>
-    %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3x4x2>], _entry_function = @composite_conv_with_bias_and_relu_dynamic_fn, _original_entry_function = "composite_conv_with_bias_and_relu_dynamic_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable",   device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<?x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<1x1x1x2xf32>) -> tensor<?x3x4x2xf32>
+    %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {
+        Sout = [#tf_type.shape<1x3x4x2>],
+        _entry_function = @composite_conv_with_bias_and_relu_dynamic_fn,
+        _original_entry_function = "composite_conv_with_bias_and_relu_dynamic_fn",
+        _stablehlo_module_attrs = {},
+        // Per-channel quantization at dimension 3 for input index 1.
+        _quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}",
+        _tfl_quant_trait = "fully_quantizable",
+        device = "",
+        dim_args_spec = [],
+        disabled_checks = [],
+        has_token_input_output = false,
+        module = "",
+        platforms = [],
+        version = 5 : i64
+      } : (tensor<?x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<1x1x1x2xf32>) -> tensor<?x3x4x2xf32>
     %2 = "quantfork.stats"(%1) {layerStats = dense<[0.00000000e-6, 8.00000000e-1]> : tensor<2xf32>} : (tensor<?x3x4x2xf32>) -> tensor<?x3x4x2xf32>
     return %2 : tensor<?x3x4x2xf32>
   }
@@ -506,7 +636,22 @@ module attributes {tf_saved_model.semantics} {
     %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32>
     %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32>
     %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<?x3x4x3xf32>) -> tensor<?x3x4x3xf32>
-    %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3x4x2>], _entry_function = @composite_conv_with_bias_and_relu6_dynamic_fn, _original_entry_function = "composite_conv_with_bias_and_relu6_dynamic_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable",   device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<?x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<1x1x1x2xf32>) -> tensor<?x3x4x2xf32>
+    %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {
+        Sout = [#tf_type.shape<1x3x4x2>],
+        _entry_function = @composite_conv_with_bias_and_relu6_dynamic_fn,
+        _original_entry_function = "composite_conv_with_bias_and_relu6_dynamic_fn",
+        _stablehlo_module_attrs = {},
+        // Per-channel quantization at dimension 3 for input index 1.
+        _quantization_method = "static_range_ptq {input_quantized_types {key: 1, value {dimension_specs {dimension: 3}}}}",
+        _tfl_quant_trait = "fully_quantizable",
+        device = "",
+        dim_args_spec = [],
+        disabled_checks = [],
+        has_token_input_output = false,
+        module = "",
+        platforms = [],
+        version = 5 : i64
+      } : (tensor<?x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<1x1x1x2xf32>) -> tensor<?x3x4x2xf32>
     %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 6.00000000e-1]> : tensor<2xf32>} : (tensor<?x3x4x2xf32>) -> tensor<?x3x4x2xf32>
     return %2 : tensor<?x3x4x2xf32>
   }
@@ -598,7 +743,7 @@ module attributes {tf_saved_model.semantics} {
 
 // -----
 
-// Tests that basic gather is properly quantized.
+// Tests that basic `stablehlo.gather` is properly quantized.
 
 module attributes {tf_saved_model.semantics} {
 // CHECK: func.func private @quantize_gather_fn(%[[ARG:.+]]: tensor<3x4x2xf32>) -> tensor<2x3x2x2xf32> attributes {tf._original_func_name = "main_0"}
@@ -631,6 +776,5 @@ module attributes {tf_saved_model.semantics} {
     return %0 : tensor<2x3x2x2xf32>
   }
 // CHECK: %[[GATHER:.+]] = "stablehlo.gather"(%[[ARG_0]], %[[ARG_1]]) {{.*}} : (tensor<3x4x2x!quant.uniform<i8:f32, {{.*}}>>, tensor<2x3x2xi32>) -> tensor<2x3x2x2x!quant.uniform<i8:f32, {{.*}}>>
-// CHECK: %[[UNIFORM_QUANTIZE:.+]] = stablehlo.uniform_quantize %[[GATHER]] : tensor<2x3x2x2x!quant.uniform<i8:f32, {{.*}}>>
-// CHECK: return %[[UNIFORM_QUANTIZE]] : tensor<2x3x2x2x!quant.uniform<i8:f32, {{.*}}>>
+// CHECK: return %[[GATHER]] : tensor<2x3x2x2x!quant.uniform<i8:f32, {{.*}}>>
 }
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_all_ops.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_all_ops.mlir
new file mode 100644
index 00000000000000..72851d92b64b75
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_all_ops.mlir
@@ -0,0 +1,46 @@
+// RUN: stablehlo-quant-opt %s -split-input-file -verify-diagnostics \
+// RUN:     -stablehlo-quantize-composite-functions=enable-full-int-quantization=true | FileCheck --check-prefix=CHECK-FULL-INT %s
+
+// Tests that a basic `stablehlo.add` and a fused `stablehlo.dot_general`
+// are properly quantized.
+
+module attributes {tf_saved_model.semantics} {
+// CHECK-FULL-INT: func.func private @quantize_add_fn(%[[ARG:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"}
+  func.func private @quantize_add_fn(%arg: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} {
+    %cst_0 = "tf.Const"() {value = dense<1.00000000e-1> : tensor<1x2xf32>} : () -> tensor<1x2xf32>
+    %cst_1 = "tf.Const"() {value = dense<1.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32>
+    %0 = "quantfork.stats"(%arg) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32>
+    %1 = "tf.XlaCallModule"(%0, %cst_0) {Sout = [#tf_type.shape<1x2>], _entry_function = @composite_add_fn, _original_entry_function = "composite_add_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x2xf32>
+    %2 = "quantfork.stats"(%1) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32>
+    %3 = "quantfork.stats"(%2) {layerStats = dense<[5.00000000e-6, 6.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32>
+    %4 = "tf.XlaCallModule"(%3, %cst_1) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable",   device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
+    %5 = "quantfork.stats"(%4) {layerStats = dense<[5.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32>
+    return %5 : tensor<1x3xf32>
+  }
+// CHECK-FULL-INT: %[[CONST:.+]] = stablehlo.constant() {value = dense<127> : tensor<1x2xi8>} : () -> tensor<1x2x!quant.uniform<i8:f32, {{.*}}>>
+// CHECK-FULL-INT: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<127> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform<i8<-127:127>:f32:1, {{.*}}>> 
+// CHECK-FULL-INT: %[[UNIFORM_QUANTIZE:.+]] = stablehlo.uniform_quantize %[[ARG]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform<i8:f32, {{.*}}>>
+// CHECK-FULL-INT: %[[CALL:.+]] = call @quantized_add_fn(%[[UNIFORM_QUANTIZE]], %[[CONST]]) : (tensor<1x2x!quant.uniform<i8:f32, {{.*}}>>, tensor<1x2x!quant.uniform<i8:f32, {{.*}}>>) -> tensor<1x2x!quant.uniform<i8:f32, {{.*}}>>
+// CHECK-FULL-INT: %[[UNIFORM_DEQUANTIZE:.+]] = stablehlo.uniform_dequantize %[[CALL]] : (tensor<1x2x!quant.uniform<i8:f32, {{.*}}>>) -> tensor<1x2xf32>
+// CHECK-FULL-INT: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[UNIFORM_DEQUANTIZE]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform<i8:f32, {{.*}}>>
+// CHECK-FULL-INT: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<1x2x!quant.uniform<i8:f32, {{.*}}>>, tensor<2x3x!quant.uniform<i8<-127:127>:f32:1, {{.*}}>>) -> tensor<1x3x!quant.uniform<i8:f32, {{.*}}>>
+// CHECK-FULL-INT: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform<i8:f32, {{.*}}>>) -> tensor<1x3xf32>
+// CHECK-FULL-INT: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32>
+
+// CHECK-FULL-INT: func.func private @quantized_add_fn(%[[ARG_0:.+]]: tensor<1x2x!quant.uniform<i8:f32, {{.*}}>>, %[[ARG_1:.+]]: tensor<1x2x!quant.uniform<i8:f32, {{.*}}>>) -> tensor<1x2x!quant.uniform<i8:f32, {{.*}}>> attributes {_from_xla_call_module}
+  func.func private @composite_add_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<1x2xf32>) -> tensor<1x2xf32> attributes {_from_xla_call_module} {
+    %0 = stablehlo.add %arg0, %arg1 : tensor<1x2xf32>
+    return %0 : tensor<1x2xf32>
+  }
+// CHECK-FULL-INT: %[[ADD:.+]] = stablehlo.add %arg0, %arg1 : (tensor<1x2x!quant.uniform<i8:f32, {{.*}}>>, tensor<1x2x!quant.uniform<i8:f32, {{.*}}>>) -> tensor<1x2x!quant.uniform<i8:f32, {{.*}}>>
+// CHECK-FULL-INT: return %[[ADD]] : tensor<1x2x!quant.uniform<i8:f32, {{.*}}>>
+
+// CHECK-FULL-INT: func.func private @quantized_dot_general_fn(%[[ARG_0:.+]]: tensor<1x2x!quant.uniform<i8:f32, {{.*}}>>, %[[ARG_1:.+]]: tensor<2x3x!quant.uniform<i8<-127:127>:f32:1, {{.*}}>>) -> tensor<1x3x!quant.uniform<i8:f32, {{.*}}>> attributes {_from_xla_call_module}
+  func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} {
+    %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
+    return %0 : tensor<1x3xf32>
+  }
+// CHECK-FULL-INT: %[[DOT_GENERAL:.+]] = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform<i8:f32, {{.*}}>>, tensor<2x3x!quant.uniform<i8<-127:127>:f32:1,{{.*}}>>) -> tensor<1x3x!quant.uniform<i32:f32:1, {{.*}}>>
+// CHECK-FULL-INT: %[[UNIFORM_QUANTIZE:.+]] = stablehlo.uniform_quantize %[[DOT_GENERAL]] : (tensor<1x3x!quant.uniform<i32:f32:1, {{.*}}>>) -> tensor<1x3x!quant.uniform<i8:f32, {{.*}}>>
+// CHECK-FULL-INT: return %[[UNIFORM_QUANTIZE]] : tensor<1x3x!quant.uniform<i8:f32, {{.*}}>>
+}
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_hybrid.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_hybrid.mlir
deleted file mode 100644
index aa42045251778c..00000000000000
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_hybrid.mlir
+++ /dev/null
@@ -1,30 +0,0 @@
-// RUN: stablehlo-quant-opt %s -split-input-file -verify-diagnostics \
-// RUN:     -stablehlo-quantize-composite-functions=enable-weight-only=true | FileCheck --check-prefix=CHECK %s
-
-// Test that hybrid quantized dot_general op is produced when hybrid-quantize
-// is set to true.
-
-module attributes {tf_saved_model.semantics} {
-  func.func private @quantize_dot_general_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} {
-    %0 = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32>
-    %1 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
-    return %1 : tensor<1x3xf32>
-  }
-
-  func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} {
-    %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
-    return %0 : tensor<1x3xf32>
-  }
-}
-
-// CHECK-LABEL: quantize_dot_general_fn
-// CHECK-SAME: %[[ARG0:.+]]: tensor<1x2xf32>
-// CHECK: %[[CST:.+]] = stablehlo.constant() {value = dense<127> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform<i8:f32, 0.0011764706349840352:-128>>
-// CHECK: %[[CALL:.+]] = call @quantized_dot_general_fn(%[[ARG0]], %[[CST]]) : (tensor<1x2xf32>, tensor<2x3x!quant.uniform<i8:f32, 0.0011764706349840352:-128>>) -> tensor<1x3xf32>
-// CHECK: return %[[CALL]]
-
-// CHECK: quantized_dot_general_fn
-// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x2xf32>,  %[[ARG2:.+]]: tensor<2x3x!quant.uniform<i8:f32, 0.0011764706349840352:-128>>) -> tensor<1x3xf32>
-// CHECK: %[[DOT:.+]] = stablehlo.dot_general %[[ARG1]], %[[ARG2]]
-// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3x!quant.uniform<i8:f32, 0.0011764706349840352:-128>>) -> tensor<1x3xf32>
-// CHECK: return %[[DOT]]
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_weight_only.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_weight_only.mlir
new file mode 100644
index 00000000000000..dce15fe07760e2
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions_weight_only.mlir
@@ -0,0 +1,60 @@
+// RUN: stablehlo-quant-opt %s -split-input-file -verify-diagnostics \
+// RUN:     -stablehlo-quantize-composite-functions=enable-weight-only=true | FileCheck --check-prefix=CHECK %s
+
+// Test that weight-only quantized dot_general op is produced when
+// enable-weight-only is set to true.
+
+module attributes {tf_saved_model.semantics} {
+  func.func private @quantize_dot_general_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} {
+    %0 = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32>
+    %1 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
+    return %1 : tensor<1x3xf32>
+  }
+
+  func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} {
+    %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
+    return %0 : tensor<1x3xf32>
+  }
+}
+
+// CHECK-LABEL: quantize_dot_general_fn
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x2xf32>
+// CHECK: %[[CST:.+]] = stablehlo.constant() {value = dense<127> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform<i8:f32, 0.0011764706349840352:-128>>
+// CHECK: %[[CALL:.+]] = call @quantized_dot_general_fn(%[[ARG0]], %[[CST]]) : (tensor<1x2xf32>, tensor<2x3x!quant.uniform<i8:f32, 0.0011764706349840352:-128>>) -> tensor<1x3xf32>
+// CHECK: return %[[CALL]]
+
+// CHECK: quantized_dot_general_fn
+// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x2xf32>,  %[[ARG2:.+]]: tensor<2x3x!quant.uniform<i8:f32, 0.0011764706349840352:-128>>) -> tensor<1x3xf32>
+// CHECK: %[[DOT:.+]] = stablehlo.dot_general %[[ARG1]], %[[ARG2]]
+// CHECK-SAME: (tensor<1x2xf32>, tensor<2x3x!quant.uniform<i8:f32, 0.0011764706349840352:-128>>) -> tensor<1x3xf32>
+// CHECK: return %[[DOT]]
+
+// -----
+
+// Test that hybrid quantized convolution op is produced when enable-weight-only
+// is set to true.
+
+module attributes {tf_saved_model.semantics} {
+  func.func private @quantize_conv_fn(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} {
+    %0 = stablehlo.constant dense<3.000000e-01> : tensor<2x3x3x2xf32>
+    %1 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32>
+    return %1 : tensor<1x3x4x2xf32>
+  }
+
+  func.func private @composite_conv_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} {
+    %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32>
+    return %0 : tensor<1x3x4x2xf32>
+  }
+}
+
+// CHECK-LABEL: quantize_conv_fn
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x3x4x3xf32>
+// CHECK: %[[CST:.+]] = stablehlo.constant() {value = dense<127> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform<i8:f32, 0.0011764706349840352:-128>>
+// CHECK: %[[CALL:.+]] = call @quantized_conv_fn(%[[ARG0]], %[[CST]]) : (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform<i8:f32, 0.0011764706349840352:-128>>) -> tensor<1x3x4x2xf32>
+// CHECK: return %[[CALL]]
+
+// CHECK: quantized_conv_fn
+// CHECK-SAME: (%[[ARG1:.+]]: tensor<1x3x4x3xf32>,  %[[ARG2:.+]]: tensor<2x3x3x2x!quant.uniform<i8:f32, 0.0011764706349840352:-128>>) -> tensor<1x3x4x2xf32>
+// CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[ARG1]], %[[ARG2]])
+// CHECK-SAME: (tensor<1x3x4x3xf32>, tensor<2x3x3x2x!quant.uniform<i8:f32, 0.0011764706349840352:-128>>) -> tensor<1x3x4x2xf32>
+// CHECK: return %[[CONV]]
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/pipelines/process_nchw_tensor.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/pipelines/process_nchw_tensor.mlir
new file mode 100644
index 00000000000000..831131a4c64555
--- /dev/null
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/pipelines/process_nchw_tensor.mlir
@@ -0,0 +1,171 @@
+// RUN: stablehlo-quant-opt %s -stablehlo-process-nchw-tensor \
+// RUN:   -split-input-file -verify-diagnostics | FileCheck %s
+
+// Tests that a `convolution(%activation, %weight)` with the activation tensor
+// NCHW format is converted to NHWC convolution. Transpose ops are inserted to
+// the activation and output to match the function signature. The weight
+// constant is transposed.
+
+// CHECK-LABEL: nchw_conv
+// CHECK-SAME: %[[ARG:.+]]: tensor<1x8x4x4xf32>
+func.func @nchw_conv(%arg0: tensor<1x8x4x4xf32>) -> tensor<1x8x4x4xf32> {
+  %0 = stablehlo.constant() {value = dense<7.000000e+00> : tensor<8x8x3x3xf32>} : () -> tensor<8x8x3x3xf32>
+  %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x8x4x4xf32>, tensor<8x8x3x3xf32>) -> tensor<1x8x4x4xf32>
+  return %2 : tensor<1x8x4x4xf32>
+}
+// CHECK-DAG: %[[CONST:.+]] = stablehlo.constant {{.*}} : tensor<3x3x8x8xf32>
+// CHECK-DAG: %[[TRANSPOSE_0:.+]] = stablehlo.transpose %[[ARG]], dims = [0, 2, 3, 1] : (tensor<1x8x4x4xf32>) -> tensor<1x4x4x8xf32>
+// CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[TRANSPOSE_0]], %[[CONST]]) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = {{\[\[}}1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x4x4x8xf32>, tensor<3x3x8x8xf32>) -> tensor<1x4x4x8xf32>
+// CHECK: %[[TRANSPOSE_1:.+]] = stablehlo.transpose %[[CONV]], dims = [0, 3, 1, 2] : (tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32>
+// CHECK: return %[[TRANSPOSE_1]]
+
+// -----
+
+// Tests that a `add(convolution(%activation, %weight), %bias)` with the
+// activation tensor of NCHW format is converted to NHWC convolution + add
+// operation. Transpose ops are inserted to activations and outputs to match the
+// function signature. Constants are also transposed accordingly.
+
+// CHECK-LABEL: nchw_conv_with_bias_add
+// CHECK-SAME: %[[ARG:.+]]: tensor<1x2x5x5xf32>
+func.func @nchw_conv_with_bias_add(%arg0: tensor<1x2x5x5xf32>) -> tensor<1x4x5x5xf32> {
+  %0 = stablehlo.constant dense<2.000000e+00> : tensor<4x2x3x3xf32>
+  %1 = stablehlo.constant dense<3.000000e+00> : tensor<1x4x5x5xf32>
+  %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x2x5x5xf32>, tensor<4x2x3x3xf32>) -> tensor<1x4x5x5xf32>
+  %3 = stablehlo.add %2, %1 : tensor<1x4x5x5xf32>
+  return %3 : tensor<1x4x5x5xf32>
+}
+// CHECK-DAG: %[[WEIGHT_CONST:.+]] = stablehlo.constant {{.*}} : tensor<3x3x2x4xf32>
+// CHECK-DAG: %[[BIAS_CONST:.+]] = stablehlo.constant {{.*}} : tensor<1x5x5x4xf32>
+// CHECK-DAG: %[[TRANSPOSE_0:.+]] = stablehlo.transpose %[[ARG]], dims = [0, 2, 3, 1] : (tensor<1x2x5x5xf32>) -> tensor<1x5x5x2xf32>
+// CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[TRANSPOSE_0]], %[[WEIGHT_CONST]]) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = {{\[\[}}1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x5x5x2xf32>, tensor<3x3x2x4xf32>) -> tensor<1x5x5x4xf32>
+// CHECK: %[[ADD:.+]] = stablehlo.add %[[CONV]], %[[BIAS_CONST]] : tensor<1x5x5x4xf32>
+// CHECK: %[[TRANSPOSE_1:.+]] = stablehlo.transpose %[[ADD]], dims = [0, 3, 1, 2] : (tensor<1x5x5x4xf32>) -> tensor<1x4x5x5xf32>
+// CHECK: return %[[TRANSPOSE_1]]
+
+// -----
+
+// Tests that a `add(convolution(%activation, %weight), %bias)` pattern with the
+// activation tensor of NCHW format and non-constant bias is converted to NHWC
+// convolution, but without the deferred transpose for `stablehlo.add`.
+// Transpose ops are inserted to the activation and output of
+// `stablehlo.convolution`. The weight constants is transposed.
+
+// CHECK-LABEL: nchw_conv_with_nonconst_bias_add
+// CHECK-SAME: %[[ARG_0:.+]]: tensor<1x2x5x5xf32>
+// CHECK-SAME: %[[ARG_1:.+]]: tensor<1x4x5x5xf32>
+func.func @nchw_conv_with_nonconst_bias_add(%arg0: tensor<1x2x5x5xf32>, %arg1: tensor<1x4x5x5xf32>) -> tensor<1x4x5x5xf32> {
+  %0 = stablehlo.constant dense<2.000000e+00> : tensor<4x2x3x3xf32>
+  %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x2x5x5xf32>, tensor<4x2x3x3xf32>) -> tensor<1x4x5x5xf32>
+  %2 = stablehlo.add %1, %arg1 : tensor<1x4x5x5xf32>
+  return %2 : tensor<1x4x5x5xf32>
+}
+// CHECK-DAG: %[[WEIGHT_CONST:.+]] = stablehlo.constant {{.*}} : tensor<3x3x2x4xf32>
+// CHECK-DAG: %[[TRANSPOSE_0:.+]] = stablehlo.transpose %[[ARG_0]], dims = [0, 2, 3, 1] : (tensor<1x2x5x5xf32>) -> tensor<1x5x5x2xf32>
+// CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[TRANSPOSE_0]], %[[WEIGHT_CONST]]) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = {{\[\[}}1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x5x5x2xf32>, tensor<3x3x2x4xf32>) -> tensor<1x5x5x4xf32>
+// CHECK: %[[TRANSPOSE_1:.+]] = stablehlo.transpose %[[CONV]], dims = [0, 3, 1, 2] : (tensor<1x5x5x4xf32>) -> tensor<1x4x5x5xf32>
+// CHECK: %[[ADD:.+]] = stablehlo.add %[[TRANSPOSE_1]], %[[ARG_1]] : tensor<1x4x5x5xf32>
+// CHECK: return %[[ADD]]
+
+// -----
+
+// Tests that a `reduce_window{max}(add(convolution(%activation, %weight), %bias), %init_value)`
+// with the activation tensor of NCHW format is converted to NHWC convolution +
+// add + reduce_window (with max) operation. Transpose ops are inserted to
+// activation and the final result to match the function signature. Constants
+// are also transposed accordingly.
+
+// CHECK-LABEL: nchw_conv_with_bias_add_max_pool
+// CHECK-SAME: %[[ARG:.+]]: tensor<1x2x5x5xf32>
+func.func @nchw_conv_with_bias_add_max_pool(%arg0: tensor<1x2x5x5xf32>) -> tensor<1x4x2x2xf32> {
+  %0 = stablehlo.constant dense<2.000000e+00> : tensor<4x2x3x3xf32>
+  %1 = stablehlo.constant dense<3.000000e+00> : tensor<1x4x5x5xf32>
+  %5 = stablehlo.constant dense<0xFF800000> : tensor<f32>  // -inf
+  %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x2x5x5xf32>, tensor<4x2x3x3xf32>) -> tensor<1x4x5x5xf32>
+  %3 = stablehlo.add %2, %1 : tensor<1x4x5x5xf32>
+  %4 = "stablehlo.reduce_window"(%3, %5) ({  // max pool
+  ^bb0(%arg1: tensor<f32>, %arg2: tensor<f32>):
+      %6 = stablehlo.maximum %arg1, %arg2 : tensor<f32>
+      stablehlo.return %6 : tensor<f32>
+  }) {
+    window_dimensions = array<i64: 1, 1, 2, 2>,
+    window_strides = array<i64: 1, 1, 2, 2>
+  } : (tensor<1x4x5x5xf32>, tensor<f32>) -> tensor<1x4x2x2xf32>
+  return %4 : tensor<1x4x2x2xf32>
+}
+// CHECK-DAG: %[[WEIGHT_CONST:.+]] = stablehlo.constant {{.*}} : tensor<3x3x2x4xf32>
+// CHECK-DAG: %[[BIAS_CONST:.+]] = stablehlo.constant {{.*}} : tensor<1x5x5x4xf32>
+// CHECK-DAG: %[[INIT_VALUE_CONST:.+]] = stablehlo.constant dense<0xFF800000> : tensor<f32>
+// CHECK-DAG: %[[TRANSPOSE_0:.+]] = stablehlo.transpose %[[ARG]], dims = [0, 2, 3, 1] : (tensor<1x2x5x5xf32>) -> tensor<1x5x5x2xf32>
+// CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[TRANSPOSE_0]], %[[WEIGHT_CONST]]) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = {{\[\[}}1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x5x5x2xf32>, tensor<3x3x2x4xf32>) -> tensor<1x5x5x4xf32>
+// CHECK: %[[ADD:.+]] = stablehlo.add %[[CONV]], %[[BIAS_CONST]] : tensor<1x5x5x4xf32>
+// CHECK: %[[REDUCE_WINDOW_MAX:.+]] = "stablehlo.reduce_window"(%[[ADD]], %[[INIT_VALUE_CONST:.+]])
+// CHECK: stablehlo.maximum
+// CHECK: {window_dimensions = array<i64: 1, 2, 2, 1>, window_strides = array<i64: 1, 2, 2, 1>} : (tensor<1x5x5x4xf32>, tensor<f32>) -> tensor<1x2x2x4xf32>
+// CHECK: %[[TRANSPOSE_1:.+]] = stablehlo.transpose %[[REDUCE_WINDOW_MAX]], dims = [0, 3, 1, 2] : (tensor<1x2x2x4xf32>) -> tensor<1x4x2x2xf32>
+// CHECK: return %[[TRANSPOSE_1]]
+
+// -----
+
+// Tests that a `maximum(add(convolution(%activation, %weight), %bias), %zero)`
+// with the activation tensor of NCHW format is converted to NHWC convolution +
+// add + maximum operation. Transpose ops are inserted to the activation and the
+// final output to match the function signature. Constants are also transpose-
+// folded accordingly.
+
+// CHECK-LABEL: nchw_conv_with_bias_add_relu
+// CHECK-SAME: %[[ARG:.+]]: tensor<1x2x5x5xf32>
+func.func @nchw_conv_with_bias_add_relu(%arg0: tensor<1x2x5x5xf32>) -> tensor<1x4x5x5xf32> {
+  %0 = stablehlo.constant dense<2.000000e+00> : tensor<4x2x3x3xf32>
+  %5 = stablehlo.constant dense<0.000000e+00> : tensor<1x4x5x5xf32>
+  %1 = stablehlo.constant dense<3.000000e+00> : tensor<1x4x5x5xf32>
+  %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x2x5x5xf32>, tensor<4x2x3x3xf32>) -> tensor<1x4x5x5xf32>
+  %3 = stablehlo.add %2, %1 : tensor<1x4x5x5xf32>
+  %4 = stablehlo.maximum %3, %5 : tensor<1x4x5x5xf32>
+  return %4 : tensor<1x4x5x5xf32>
+}
+// CHECK-DAG: %[[WEIGHT_CONST:.+]] = stablehlo.constant {{.*}} : tensor<3x3x2x4xf32>
+// CHECK-DAG: %[[ZERO_CONST:.+]] = stablehlo.constant {{.*}} : tensor<1x5x5x4xf32>
+// CHECK-DAG: %[[BIAS_CONST:.+]] = stablehlo.constant {{.*}} : tensor<1x5x5x4xf32>
+// CHECK-DAG: %[[TRANSPOSE_0:.+]] = stablehlo.transpose %[[ARG]], dims = [0, 2, 3, 1] : (tensor<1x2x5x5xf32>) -> tensor<1x5x5x2xf32>
+// CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[TRANSPOSE_0]], %[[WEIGHT_CONST]]) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = {{\[\[}}1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x5x5x2xf32>, tensor<3x3x2x4xf32>) -> tensor<1x5x5x4xf32>
+// CHECK: %[[ADD:.+]] = stablehlo.add %[[CONV]], %[[BIAS_CONST]] : tensor<1x5x5x4xf32>
+// CHECK: %[[MAX:.+]] = stablehlo.maximum %[[ADD]], %[[ZERO_CONST]] : tensor<1x5x5x4xf32>
+// CHECK: %[[TRANSPOSE_1:.+]] = stablehlo.transpose %[[MAX]], dims = [0, 3, 1, 2] : (tensor<1x5x5x4xf32>) -> tensor<1x4x5x5xf32>
+// CHECK: return %[[TRANSPOSE_1]]
+
+// -----
+
+// Tests that a `maximum(add(convolution(%activation, %weight), broadcast(%bias)
+// ), %zero)` with the activation tensor of NCHW format is converted to NHWC
+// convolution + add + maximum operation. Transpose ops are inserted to the
+// first activation, final output, and the bias constant (after the broadcast),
+// to match the function signature. Constants are also transpose-folded
+// accordingly.
+//
+// Note that the `transpose` after the `broadcast_in_dim` is not folded by the
+// `FoldConstantTransposePass`.
+
+// CHECK-LABEL: nchw_conv_with_broadcasted_bias_add_relu
+// CHECK-SAME: %[[ARG:.+]]: tensor<1x2x5x5xf32>
+func.func @nchw_conv_with_broadcasted_bias_add_relu(%arg0: tensor<1x2x5x5xf32>) -> tensor<1x4x5x5xf32> {
+  %0 = stablehlo.constant dense<2.000000e+00> : tensor<4x2x3x3xf32>  // weight
+  %1 = stablehlo.constant dense<3.000000e+00> : tensor<4xf32>  // bias
+  %2 = stablehlo.constant dense<0.000000e+00> : tensor<1x4x5x5xf32>  // relu
+  %3 = stablehlo.broadcast_in_dim %1, dims = [1] : (tensor<4xf32>) -> tensor<1x4x5x5xf32>
+  %4 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x2x5x5xf32>, tensor<4x2x3x3xf32>) -> tensor<1x4x5x5xf32>
+  %5 = stablehlo.add %4, %3 : tensor<1x4x5x5xf32>
+  %6 = stablehlo.maximum %5, %2 : tensor<1x4x5x5xf32>
+  return %6 : tensor<1x4x5x5xf32>
+}
+// CHECK-DAG: %[[WEIGHT_CONST:.+]] = stablehlo.constant {{.*}} : tensor<3x3x2x4xf32>
+// CHECK-DAG: %[[ZERO_CONST:.+]] = stablehlo.constant {{.*}} : tensor<1x5x5x4xf32>
+// CHECK-DAG: %[[BIAS_CONST:.+]] = stablehlo.constant {{.*}} : tensor<4xf32>
+// CHECK-DAG: %[[BROADCAST_IN_DIM:.+]] = stablehlo.broadcast_in_dim %[[BIAS_CONST]], dims = [1] : (tensor<4xf32>) -> tensor<1x4x5x5xf32>
+// CHECK-DAG: %[[TRANSPOSE_0:.+]] = stablehlo.transpose %[[ARG]], dims = [0, 2, 3, 1] : (tensor<1x2x5x5xf32>) -> tensor<1x5x5x2xf32>
+// CHECK: %[[CONV:.+]] = stablehlo.convolution(%[[TRANSPOSE_0]], %[[WEIGHT_CONST]]) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = {{\[\[}}1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x5x5x2xf32>, tensor<3x3x2x4xf32>) -> tensor<1x5x5x4xf32>
+// CHECK: %[[TRANSPOSE_1:.+]] = stablehlo.transpose %[[BROADCAST_IN_DIM]], dims = [0, 2, 3, 1] : (tensor<1x4x5x5xf32>) -> tensor<1x5x5x4xf32>
+// CHECK: %[[ADD:.+]] = stablehlo.add %[[CONV]], %[[TRANSPOSE_1]] : tensor<1x5x5x4xf32>
+// CHECK: %[[MAX:.+]] = stablehlo.maximum %[[ADD]], %[[ZERO_CONST]] : tensor<1x5x5x4xf32>
+// CHECK: %[[TRANSPOSE_1:.+]] = stablehlo.transpose %[[MAX]], dims = [0, 3, 1, 2] : (tensor<1x5x5x4xf32>) -> tensor<1x4x5x5xf32>
+// CHECK: return %[[TRANSPOSE_1]]
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tools/stablehlo_quant_opt.cc b/tensorflow/compiler/mlir/quantization/stablehlo/tools/stablehlo_quant_opt.cc
index 69d9a725a37ebe..9b587e4273965f 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/tools/stablehlo_quant_opt.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/tools/stablehlo_quant_opt.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "stablehlo/transforms/Passes.h"  // from @stablehlo
 #include "tensorflow/compiler/mlir/init_mlir.h"
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.h"
@@ -53,6 +54,9 @@ int main(int argc, char** argv) {
   // These passes are only used for testing purposes.
   mlir::quant::stablehlo::testing::registerTestPasses();
 
+  // Register StableHLO Quantizer pass pipelines.
+  mlir::quant::stablehlo::RegisterPassPipelines();
+
   mlir::DialectRegistry registry;
   registry.insert<mlir::scf::SCFDialect, mlir::TF::TensorFlowDialect,
                   mlir::tf_saved_model::TensorFlowSavedModelDialect,
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/utils/stablehlo_type_utils_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/utils/stablehlo_type_utils_test.cc
index a864ee556ff5af..fe6d62cc9731d1 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/utils/stablehlo_type_utils_test.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/utils/stablehlo_type_utils_test.cc
@@ -16,9 +16,11 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/quantization/stablehlo/utils/stablehlo_type_utils.h"
 
 #include <gtest/gtest.h>
+#include "mlir/Dialect/Arith/IR/Arith.h"  // from @llvm-project
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/Builders.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/OwningOpRef.h"  // from @llvm-project
 #include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
 
 namespace mlir::quant::stablehlo {
@@ -30,19 +32,25 @@ class StablehloTypeUtilsTest : public Test {
  protected:
   StablehloTypeUtilsTest() {
     ctx_.loadDialect<mlir::stablehlo::StablehloDialect,
-                     mlir::func::FuncDialect>();
+                     mlir::arith::ArithDialect, mlir::func::FuncDialect>();
   }
 
   MLIRContext ctx_;
   OpBuilder builder_{&ctx_};
 };
 
-TEST_F(StablehloTypeUtilsTest, ValidStablehloOpSucceeds) {
-  mlir::stablehlo::ConstantOp constant_op =
+TEST_F(StablehloTypeUtilsTest, IsStablehloOpSucceedsWithStablehloOp) {
+  const OwningOpRef<mlir::stablehlo::ConstantOp> constant_op =
       builder_.create<mlir::stablehlo::ConstantOp>(
           builder_.getUnknownLoc(), builder_.getI32IntegerAttr(0));
-  EXPECT_TRUE(IsStablehloOp(constant_op));
-  constant_op->erase();
+  EXPECT_TRUE(IsStablehloOp(*constant_op));
+}
+
+TEST_F(StablehloTypeUtilsTest, IsStablehloOpFailsWithArithOp) {
+  const OwningOpRef<mlir::arith::ConstantOp> constant_op =
+      builder_.create<mlir::arith::ConstantOp>(builder_.getUnknownLoc(),
+                                               builder_.getI32IntegerAttr(0));
+  EXPECT_FALSE(IsStablehloOp(*constant_op));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD
index 6ef72d68c8ea83..be0792ab76aff3 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD
@@ -1,7 +1,7 @@
+load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
 load("//tensorflow:strict.default.bzl", "py_strict_binary")
 
 # Placeholder: load py_proto_library
-load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow/compiler/mlir/quantization/tensorflow:internal_visibility_allowlist.bzl", "internal_visibility_allowlist")
@@ -301,6 +301,7 @@ cc_library(
         "//tensorflow/compiler/mlir/tensorflow:tensorflow_traits",
         "//tensorflow/compiler/mlir/tensorflow:tensorflow_types",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:CallOpInterfaces",
         "@llvm-project//mlir:ControlFlowInterfaces",
         "@llvm-project//mlir:DerivedAttributeOpInterface",
         "@llvm-project//mlir:Dialect",
@@ -406,6 +407,7 @@ cc_library(
         "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints",
         "//tensorflow/compiler/mlir/quantization/common:func",
         "//tensorflow/compiler/mlir/quantization/common:lift_as_function_call",
+        "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps",
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib",
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib:quantization_config",
         "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc",
@@ -442,9 +444,11 @@ cc_library(
         "//tensorflow/lite/kernels/internal:quantization_util",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/cleanup",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/random",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_googlesource_code_re2//:re2",
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD
index 62a6f27c8ad5f1..23ce2105634854 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD
@@ -183,6 +183,7 @@ tf_cc_test(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
 )
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/ops/BUILD
index 1734fa03aefe3e..de23418e1af031 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/ops/BUILD
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/BUILD
@@ -51,7 +51,7 @@ cc_library(
         "//tensorflow/compiler/mlir/tensorflow",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
-        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:Dialect",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:QuantOps",
@@ -85,8 +85,8 @@ cc_library(
     compatible_with = get_compatible_with_portable(),
     deps = [
         "//tensorflow/compiler/mlir/quantization/common/quantization_lib",
-        "//tensorflow/compiler/mlir/quantization/tensorflow:tf_quant_ops",
         "//tensorflow/compiler/mlir/tensorflow",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Support",
     ],
 )
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc
index 8a9dc4eb3d4989..52ca3722a12bd5 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc
@@ -20,10 +20,12 @@ limitations under the License.
 
 #include "absl/container/flat_hash_set.h"
 #include "llvm/Support/Casting.h"
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
 #include "mlir/IR/BuiltinTypeInterfaces.h"  // from @llvm-project
 #include "mlir/IR/Operation.h"  // from @llvm-project
 #include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op.h b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op.h
index 5d25779826e81c..bc6031eea7d85b 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op.h
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_quantize_op.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include <optional>
 
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/Dialect/Traits.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/uniform_op_quant_spec.cc b/tensorflow/compiler/mlir/quantization/tensorflow/ops/uniform_op_quant_spec.cc
index c86968b319c6dd..afeb8905855837 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/ops/uniform_op_quant_spec.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/uniform_op_quant_spec.cc
@@ -16,7 +16,9 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h"
+#include "mlir/IR/Operation.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 
 namespace mlir::quant {
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_dump_tensor_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_dump_tensor_op.cc
index 00ee53b84647eb..239fe32946ab87 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_dump_tensor_op.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_dump_tensor_op.cc
@@ -182,7 +182,7 @@ class AddDumpTensorOp : public OpRewritePattern<LiftedOpT> {
         rewriter.getNamedAttr("file_name", rewriter.getStringAttr(file_name)),
         // The op is disabled by default. Otherwise, values will be saved
         // during calibration.
-        rewriter.getNamedAttr("enabled", rewriter.getBoolAttr(false)),
+        rewriter.getNamedAttr("enabled", rewriter.getBoolAttr(enabled)),
         rewriter.getNamedAttr("func_name", rewriter.getStringAttr(func_name)),
         rewriter.getNamedAttr("node_name", rewriter.getStringAttr(node_name)),
     };
@@ -246,7 +246,7 @@ class AddDumpTensorOp : public OpRewritePattern<LiftedOpT> {
     // Attach DumpTensorOp to its output layer.
     SmallVector<NamedAttribute> dump_attributes =
         CreateDumpAttributes(rewriter, folder_name, file_name,
-                             /*enabled=*/false, func_name, node_name);
+                             /*enabled=*/true, func_name, node_name);
     rewriter.create<TF::DumpTensorOp>(op->getLoc(), TypeRange{}, result,
                                       dump_attributes);
 
@@ -261,7 +261,7 @@ class AddDumpTensorOp : public OpRewritePattern<LiftedOpT> {
       // Attach second DumpTensorOp to its output unquantized layer.
       SmallVector<NamedAttribute> dump_attributes = CreateDumpAttributes(
           rewriter, folder_name, /*file_name=*/"unquantized_tensor_data.pb",
-          /*enabled=*/false, func_name, node_name);
+          /*enabled=*/true, func_name, node_name);
       rewriter.create<TF::DumpTensorOp>(op.getLoc(), TypeRange{},
                                         new_op->getResult(0), dump_attributes);
 
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/duplicate_shape_determining_constants.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/duplicate_shape_determining_constants.cc
index 5237102335e5df..8590a00775cdf0 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/duplicate_shape_determining_constants.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/duplicate_shape_determining_constants.cc
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <array>
 #include <iterator>
 #include <memory>
 
 #include "absl/algorithm/container.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
@@ -26,9 +26,11 @@ limitations under the License.
 #include "mlir/IR/Operation.h"  // from @llvm-project
 #include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "mlir/Pass/PassRegistry.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/TypeID.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h"
-#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 
 // Required to use LLVM_DEBUG macro.
 #define DEBUG_TYPE "quant-duplicate-shape-determining-constants"
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc
index e518826d7e6d12..56b9d7393aacfd 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
+#include "absl/status/statusor.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/CommandLine.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
@@ -32,6 +33,7 @@ limitations under the License.
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Support/TypeID.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h"
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h"
@@ -45,6 +47,7 @@ namespace quant {
 namespace {
 
 using ::stablehlo::quantization::CalibrationOptions;
+using ::stablehlo::quantization::Method;
 
 constexpr StringRef kQuantTraitAttrName = "_tfl_quant_trait";
 
@@ -199,7 +202,7 @@ class AddCustomAggregationOp : public RewritePattern {
 
     // The CustomAggregatorOp is only added after quantizable values.
     SmallVector<Value> quantizable_values;
-    if (isCallToLiftedFunction(op)) {
+    if (IsCallToQuantizableLiftedFunction(op)) {
       // Quantize inputs of quantizable composite functions.
       for (Value input : op->getOperands()) {
         Type element_type = getElementTypeOrSelf(input.getType());
@@ -226,7 +229,7 @@ class AddCustomAggregationOp : public RewritePattern {
       // Quantize output of fully quantizable composite functions.
       for (Value input : op->getOperands()) {
         auto defining_op = input.getDefiningOp();
-        if (!isCallToLiftedFunction(defining_op)) {
+        if (!IsCallToQuantizableLiftedFunction(defining_op)) {
           continue;
         }
 
@@ -282,9 +285,13 @@ class AddCustomAggregationOp : public RewritePattern {
   CalibrationOptions calib_opts_;
 
   // Whether the op is a call op to lifted composite function.
-  bool isCallToLiftedFunction(Operation *op) const {
+  bool IsCallToQuantizableLiftedFunction(Operation *op) const {
     if (!op) return false;
-    if (isa<TF::XlaCallModuleOp>(op)) return true;
+    if (auto xla_call_module_op = dyn_cast_or_null<TF::XlaCallModuleOp>(op);
+        xla_call_module_op != nullptr) {
+      absl::StatusOr<Method> method = GetQuantizationMethod(xla_call_module_op);
+      if (method.ok() && method->has_static_range_ptq()) return true;
+    }
 
     TF::PartitionedCallOp call_op = dyn_cast_or_null<TF::PartitionedCallOp>(op);
     return call_op && call_op->hasAttrOfType<StringAttr>(kQuantTraitAttrName) &&
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_quantized_functions.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_quantized_functions.cc
index 20ffa5aa9b793c..47ab3b82fc2f24 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_quantized_functions.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_quantized_functions.cc
@@ -19,13 +19,17 @@ limitations under the License.
 #include <tuple>
 #include <utility>
 
+#include "absl/container/flat_hash_map.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/SourceMgr.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/DialectRegistry.h"  // from @llvm-project
 #include "mlir/Parser/Parser.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Transforms/Passes.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/quantized_function_library.h"
@@ -38,6 +42,7 @@ namespace quant {
 namespace {
 
 using QuantMethod = tensorflow::quantization::QuantizationMethod::PresetMethod;
+using ::tensorflow::quantization::OpSet;
 
 class InsertQuantizedFunctionsPass
     : public PassWrapper<InsertQuantizedFunctionsPass,
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_save_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_save_op.cc
index 7c702957385bdf..f0274d26d942f0 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_save_op.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_save_op.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/constants.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h"
 
 namespace mlir {
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.cc
index 883d171fafca9f..63fb3bd94005ee 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions.cc
@@ -54,6 +54,7 @@ namespace {
 
 using QuantizationUnit =
     ::tensorflow::quantization::UnitWiseQuantizationSpec::QuantizationUnit;
+using ::tensorflow::quantization::OpSet;
 using ::tensorflow::quantization::QuantizationComponentSpec;
 using ::tensorflow::quantization::QuantizationMethod;
 using ::tensorflow::quantization::QuantizationOptions;
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions_drq.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions_drq.cc
index f28399d7d2c0f8..0acb2e56ea617e 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions_drq.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_quantizable_spots_as_functions_drq.cc
@@ -16,16 +16,16 @@ limitations under the License.
 #include <utility>
 
 #include "llvm/ADT/StringRef.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"  // from @llvm-project
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/Diagnostics.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/IR/PatternMatch.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "mlir/Support/TypeID.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
-#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h"
 #include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h"  // IWYU pragma: keep
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h"
@@ -39,6 +39,7 @@ namespace {
 
 using QuantMethod =
     ::tensorflow::quantization::QuantizationMethod::PresetMethod;
+using ::tensorflow::quantization::OpSet;
 
 class LiftQuantizableSpotsAsFunctionsDRQPass
     : public PassWrapper<LiftQuantizableSpotsAsFunctionsDRQPass,
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_save_function_ops_to_main.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_save_function_ops_to_main.cc
index caef5c034f4b17..e092352dc52c29 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_save_function_ops_to_main.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_save_function_ops_to_main.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h"
 #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h"
 
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/optimize.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/optimize.cc
index b459bbcd901125..9ff070e1a44e0d 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/optimize.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/optimize.cc
@@ -13,13 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <memory>
-#include <string>
 #include <utility>
 
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
-#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "mlir/Support/TypeID.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h"  // IWYU pragma: keep - required to use `IsSplatValueEqual`.
 #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h b/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h
index 97a383631e70db..5ea5a058cc94d3 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h
@@ -20,11 +20,11 @@ limitations under the License.
 #include <optional>
 #include <string>
 
+#include "absl/strings/string_view.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
 #include "mlir/Support/LLVM.h"  // from @llvm-project
-#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h"
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
@@ -49,14 +49,14 @@ CreateLiftQuantizableSpotsAsFunctionsPass(
 // Apply graph optimizations such as fusing and constant folding to prepare
 // lifting.
 std::unique_ptr<OperationPass<func::FuncOp>> CreatePrepareLiftingPass(
-    OpSet target_opset);
+    tensorflow::quantization::OpSet target_opset);
 
 // Lifts the dynamic range quantizable spots as composite functions.
 std::unique_ptr<OperationPass<ModuleOp>>
 CreateLiftQuantizableSpotsAsFunctionsDRQPass(
     tensorflow::quantization::QuantizationMethod::PresetMethod
         quantization_method,
-    OpSet op_set, int min_num_elements_for_weights);
+    tensorflow::quantization::OpSet op_set, int min_num_elements_for_weights);
 
 // Replaces tf.CustomAggregator ops with quant.Stats ops for finalizing the
 // calibration procedure.
@@ -71,7 +71,7 @@ CreateIssueIDsOfCustomAggregationOpsPass();
 std::unique_ptr<OperationPass<ModuleOp>> CreateInsertQuantizedFunctionsPass(
     tensorflow::quantization::QuantizationMethod::PresetMethod
         quantization_method,
-    OpSet target_opset);
+    tensorflow::quantization::OpSet target_opset);
 
 // Inserts custom aggregation operators for the calibration procedure.
 std::unique_ptr<OperationPass<func::FuncOp>>
@@ -86,8 +86,9 @@ CreateInsertCustomAggregationOpsPass(
 std::unique_ptr<OperationPass<ModuleOp>> CreateQuantizeCompositeFunctionsPass(
     tensorflow::quantization::QuantizationMethod::PresetMethod
         quantization_method,
-    OpSet target_opset, bool enable_per_channel_quantization,
-    int min_num_elements_for_weight, bool enable_legacy_weight_only = false,
+    tensorflow::quantization::OpSet target_opset,
+    bool enable_per_channel_quantization, int min_num_elements_for_weights,
+    bool enable_legacy_weight_only = false,
     std::optional<const absl::string_view> mlir_dump_file_prefix =
         std::nullopt);
 
@@ -100,7 +101,8 @@ std::unique_ptr<OperationPass<func::FuncOp>> CreateQuantizePass();
 
 // Overloading of CreateQuantizePass which takes QuantizationSpecs.
 std::unique_ptr<OperationPass<func::FuncOp>> CreateQuantizePass(
-    QuantizationSpecs quant_specs, OpSet target_opset);
+    QuantizationSpecs quant_specs,
+    tensorflow::quantization::OpSet target_opset);
 
 // Creates an instance of the PrepareQuantize pass, which will perform similar
 // transformations as TFL::PrepareQuantizePass.
@@ -112,12 +114,13 @@ std::unique_ptr<OperationPass<func::FuncOp>> CreatePrepareQuantizePass(
 // Creates an instance of the PrepareQuantizeDRQ pass, which will
 // perform similar transformations as TFL::PrepareQuantizeDynamicRangePass.
 std::unique_ptr<OperationPass<ModuleOp>> CreatePrepareQuantizeDRQPass(
-    const QuantizationSpecs& quant_specs, OpSet op_set);
+    const QuantizationSpecs& quant_specs,
+    tensorflow::quantization::OpSet op_set);
 
 // Creates an instance of the PreprocessOp pass, which will perform op
 // preprocessing to allow multi-axis quantization, prior to quantization.
 std::unique_ptr<OperationPass<ModuleOp>> CreatePreprocessOpPass(
-    OpSet op_set,
+    tensorflow::quantization::OpSet op_set,
     tensorflow::quantization::QuantizationMethod::PresetMethod
         quantization_method,
     bool enable_per_channel_quantization);
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc
index ebdd374288a065..38075bb67b7010 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc
@@ -49,6 +49,8 @@ namespace mlir {
 namespace quant {
 namespace {
 
+using ::tensorflow::quantization::OpSet;
+
 class PrepareLiftingPass
     : public PassWrapper<PrepareLiftingPass, OperationPass<func::FuncOp>> {
  public:
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc
index 3a42967e6ada1b..fe38ed8dc0f634 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc
@@ -34,7 +34,6 @@ limitations under the License.
 #include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
-#include "tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.h"
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
 #include "tensorflow/compiler/mlir/lite/transforms/passes.h"
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h"
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc
index af02c3694fc16d..71587390580406 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc
@@ -45,7 +45,7 @@ namespace {
 
 using QuantizationUnit = std::pair<Operation*, int>;
 using QuantizationUnits = llvm::SetVector<QuantizationUnit>;
-using ::mlir::quant::OpSet;
+using ::tensorflow::quantization::OpSet;
 
 // Applies prepare quantization on the model in TF dialect for dynamic range
 // quantization case.
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.cc
index 765929a75043aa..3f54fe580fe1c4 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.cc
@@ -14,25 +14,36 @@ limitations under the License.
 ==============================================================================*/
 // This transformation pass applies quantization propagation on TF dialect.
 
-#include <algorithm>
+#include <cstdint>
 #include <memory>
-#include <string>
 #include <utility>
-#include <vector>
 
+#include "llvm/Support/CommandLine.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"  // from @llvm-project
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/Dialect/Quant/QuantOps.h"  // from @llvm-project
-#include "mlir/Dialect/Quant/QuantTypes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
+#include "mlir/IR/DialectRegistry.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/Matchers.h"  // from @llvm-project
 #include "mlir/IR/Operation.h"  // from @llvm-project
+#include "mlir/IR/PatternMatch.h"  // from @llvm-project
+#include "mlir/IR/SymbolTable.h"  // from @llvm-project
+#include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "mlir/Pass/PassRegistry.h"  // from @llvm-project
+#include "mlir/Rewrite/FrozenRewritePatternSet.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "mlir/Support/TypeID.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
-#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h"
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 
 //===----------------------------------------------------------------------===//
 // The preprocess-op Pass.
@@ -46,6 +57,7 @@ using QuantMethod =
     ::tensorflow::quantization::QuantizationMethod::PresetMethod;
 using QuantizationUnit = std::pair<Operation*, int>;
 using QuantizationUnits = llvm::SetVector<QuantizationUnit>;
+using ::tensorflow::quantization::OpSet;
 
 // Preprocesses ops to allow multi-axis quantization, prior to quantization
 // passes. Currently, per-channel quantization only supports 1D results.
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize.cc
index ca088c5d318cf4..26e468556a36ab 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize.cc
@@ -12,9 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-// Copied and modified from
-// //third_party/tensorflow/compiler/mlir/lite/transforms/quantize.cc
-// This transformation pass applies quantization on TF dialect.
 #include <memory>
 #include <string>
 #include <utility>
@@ -44,7 +41,6 @@ limitations under the License.
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
 #include "tensorflow/compiler/mlir/lite/transforms/passes.h"
-#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h"
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_config.h"
 #include "tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h"
@@ -60,6 +56,8 @@ namespace quant {
 //===----------------------------------------------------------------------===//
 namespace {
 
+using ::tensorflow::quantization::OpSet;
+
 enum QuantizationTrait { kFullQuantization, kDynamicRangeQuantization };
 
 // Base struct for quantization.
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc
index 2ddb9f50eedee4..0b3c89c56f60bb 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc
@@ -62,6 +62,7 @@ namespace quant {
 namespace {
 
 using QuantMethod = tensorflow::quantization::QuantizationMethod::PresetMethod;
+using ::tensorflow::quantization::OpSet;
 
 constexpr absl::string_view kQuantizeCompositeFunctionsStepName =
     "_quantize_composite_functions";
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD
index eb91ce68063308..a7a56a610bec41 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD
@@ -32,15 +32,20 @@ cc_library(
         "//tensorflow/python:__pkg__",
     ],
     deps = [
+        ":py_function_lib",
         ":unfreeze_constants",
         "//tensorflow/cc/saved_model:loader",
         "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc",
+        "//tensorflow/compiler/mlir/quantization/stablehlo/cc:config",
         "//tensorflow/compiler/mlir/quantization/stablehlo/cc:context",
+        "//tensorflow/compiler/mlir/quantization/stablehlo/cc:debugger",
         "//tensorflow/compiler/mlir/quantization/stablehlo/cc:io",
         "//tensorflow/compiler/mlir/quantization/stablehlo/cc:post_calibration",
         "//tensorflow/compiler/mlir/quantization/stablehlo/cc:pre_calibration",
         "//tensorflow/compiler/mlir/quantization/stablehlo/cc:saved_model_export",
         "//tensorflow/compiler/mlir/quantization/stablehlo/cc:saved_model_import",
+        "//tensorflow/compiler/mlir/quantization/stablehlo/cc:types",
+        "//tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration:statistics",
         "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc",
         "//tensorflow/compiler/mlir/quantization/tensorflow:passes",
         "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc",
@@ -55,7 +60,6 @@ cc_library(
         "//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/platform:path",
-        "//tensorflow/core/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -83,6 +87,7 @@ cc_library(
     hdrs = ["quantize_model.h"],
     compatible_with = get_compatible_with_portable(),
     deps = [
+        ":py_function_lib",
         "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc",
         "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -169,11 +174,9 @@ cc_library(
         "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc",
         "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc",
         "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibration_statistics_proto_cc",
-        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/protobuf:for_core_protos_cc",
         "@com_google_absl//absl/container:flat_hash_map",
-        #"@com_google_absl//absl/strings:string_view",
-        "@pybind11",
+        "@com_google_absl//absl/strings:string_view",
     ],
 )
 
@@ -232,9 +235,6 @@ tf_python_pybind_extension(
         ":py_function_lib",
         ":quantize_model_cc",
         ":type_casters",
-        "//tensorflow/compiler/mlir/quantization/stablehlo/cc:debugger",
-        "//tensorflow/compiler/mlir/quantization/stablehlo/cc:io",
-        "//tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration:statistics",
         "//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_cc",
         "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
index a28d7ebe4bf7f3..ec86deac1b497d 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py
@@ -83,7 +83,6 @@
     'UniformQuantizedDotHybrid',
 )
 
-_DebuggerOptions = quant_opts_pb2.DebuggerOptions
 _DebuggerConfig = stablehlo_quant_config_pb2.DebuggerConfig
 
 # Lists of ops whose channel dimension should be changed if per_channel
@@ -1179,8 +1178,12 @@ def test_qat_gather_and_conv_model(
         quantization_options,
     )
     self.assertIsNotNone(converted_model)
-    self.assertSizeRatioLessThan(
-        self._output_saved_model_path, self._input_saved_model_path, 0.5
+    # Due to other meta data, the compression is not exactly 1/4.
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        0.5,
     )
 
   def test_qat_vocab_table_lookup_model(self):
@@ -2017,15 +2020,22 @@ def test_gather_and_conv_model(
     output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def
 
     if target_opset == quant_opts_pb2.UNIFORM_QUANTIZED:
-      self.assertSizeRatioGreaterThan(
-          self._output_saved_model_path, self._input_saved_model_path, 0.68
+      self.assertGreater(
+          testing.get_size_ratio(
+              self._output_saved_model_path, self._input_saved_model_path
+          ),
+          0.68,
       )
       self.assertTrue(
           self._contains_op(output_graphdef, 'UniformQuantizedConvolution')
       )
     else:
-      self.assertSizeRatioLessThan(
-          self._output_saved_model_path, self._input_saved_model_path, 1 / 3
+      # Due to other meta data, the compression is not exactly 1/4.
+      self.assertLess(
+          testing.get_size_ratio(
+              self._output_saved_model_path, self._input_saved_model_path
+          ),
+          1 / 3,
       )
       if target_opset == quant_opts_pb2.XLA:
         self.assertTrue(self._contains_op(output_graphdef, 'XlaConvV2'))
@@ -2976,12 +2986,19 @@ def test_gather_model(
     )
 
     if expect_quantized_gather:
-      self.assertSizeRatioLessThan(
-          self._output_saved_model_path, self._input_saved_model_path, 1 / 3
+      # Due to other meta data, the compression is not exactly 1/4.
+      self.assertLess(
+          testing.get_size_ratio(
+              self._output_saved_model_path, self._input_saved_model_path
+          ),
+          1 / 3,
       )
     else:
-      self.assertSizeRatioGreaterThan(
-          self._output_saved_model_path, self._input_saved_model_path, 2 / 3
+      self.assertGreater(
+          testing.get_size_ratio(
+              self._output_saved_model_path, self._input_saved_model_path
+          ),
+          2 / 3,
       )
 
   @test_util.run_in_graph_and_eager_modes
@@ -3578,7 +3595,9 @@ def test_ptq_multiple_signatures_invalid_dataset_raises_value_error(self):
         for _ in range(8)
     ]
 
-    with self.assertRaisesRegex(ValueError, 'Invalid representative dataset.'):
+    with self.assertRaisesRegex(
+        Exception, 'Representative dataset is not a mapping'
+    ):
       quantize_model.quantize(
           self._input_saved_model_path,
           output_directory=self._output_saved_model_path,
@@ -3933,8 +3952,8 @@ def test_ptq_model_with_tf1_saved_model_invalid_input_key_raises_value_error(
     )
 
     with self.assertRaisesRegex(
-        ValueError,
-        'Failed to run graph for post-training quantization calibration',
+        Exception,
+        'Invalid input keys for representative sample.',
     ):
       quantize_model.quantize(
           self._input_saved_model_path,
@@ -4877,12 +4896,19 @@ def test_gather_model(
     )
 
     if target_opset == quant_opts_pb2.UNIFORM_QUANTIZED:
-      self.assertSizeRatioGreaterThan(
-          self._output_saved_model_path, self._input_saved_model_path, 0.65
+      self.assertGreater(
+          testing.get_size_ratio(
+              self._output_saved_model_path, self._input_saved_model_path
+          ),
+          0.65,
       )
     else:
-      self.assertSizeRatioLessThan(
-          self._output_saved_model_path, self._input_saved_model_path, 1 / 3
+      # Due to other meta data, the compression is not exactly 1/4.
+      self.assertLess(
+          testing.get_size_ratio(
+              self._output_saved_model_path, self._input_saved_model_path
+          ),
+          1 / 3,
       )
 
   @parameterized.named_parameters(
@@ -4931,8 +4957,11 @@ def test_gather_and_conv_model(
     output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def
 
     if target_opset == quant_opts_pb2.UNIFORM_QUANTIZED:
-      self.assertSizeRatioGreaterThan(
-          self._output_saved_model_path, self._input_saved_model_path, 0.65
+      self.assertGreater(
+          testing.get_size_ratio(
+              self._output_saved_model_path, self._input_saved_model_path
+          ),
+          0.65,
       )
       self.assertTrue(
           self._contains_op(
@@ -4940,8 +4969,12 @@ def test_gather_and_conv_model(
           )
       )
     else:
-      self.assertSizeRatioLessThan(
-          self._output_saved_model_path, self._input_saved_model_path, 1 / 3
+      # Due to other meta data, the compression is not exactly 1/4.
+      self.assertLess(
+          testing.get_size_ratio(
+              self._output_saved_model_path, self._input_saved_model_path
+          ),
+          1 / 3,
       )
       if target_opset == quant_opts_pb2.XLA:
         self.assertTrue(self._contains_op(output_graphdef, 'XlaConvV2'))
@@ -5097,14 +5130,20 @@ def test_gather_model_tf1(
 
     if target_opset == quant_opts_pb2.UNIFORM_QUANTIZED:
       threshold = 0.45 if use_variable else 0.7
-      self.assertSizeRatioGreaterThan(
-          self._output_saved_model_path, self._input_saved_model_path, threshold
+      self.assertGreater(
+          testing.get_size_ratio(
+              self._output_saved_model_path, self._input_saved_model_path
+          ),
+          threshold,
       )
 
     else:
       threshold = 0.19 if use_variable else 0.42
-      self.assertSizeRatioLessThan(
-          self._output_saved_model_path, self._input_saved_model_path, threshold
+      self.assertLess(
+          testing.get_size_ratio(
+              self._output_saved_model_path, self._input_saved_model_path
+          ),
+          threshold,
       )
 
   @test_util.run_in_graph_and_eager_modes
@@ -5358,10 +5397,11 @@ def test_einsum_model(
         )
     )
     # Due to other meta data, the compression is not exactly 1/4.
-    self.assertSizeRatioLessThan(
-        self._output_saved_model_path,
-        self._input_saved_model_path,
-        threshold=0.5,
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        0.5,
     )
 
   @parameterized.named_parameters(
@@ -5409,10 +5449,11 @@ def test_matmul_model(
 
     # Due to other meta data, the compression is not exactly 1/4.
     self.assertTrue(self._contains_op(output_graphdef, 'XlaDotV2'))
-    self.assertSizeRatioLessThan(
-        self._output_saved_model_path,
-        self._input_saved_model_path,
-        threshold=0.3,
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        0.3,
     )
 
   @parameterized.named_parameters(
@@ -5469,10 +5510,11 @@ def test_conv_model(
     output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def
 
     # Due to other meta data, the compression is not exactly 1/4.
-    self.assertSizeRatioLessThan(
-        self._output_saved_model_path,
-        self._input_saved_model_path,
-        threshold=0.3,
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        0.3,
     )
 
     if enable_per_channel_quantization:
@@ -5561,10 +5603,11 @@ def test_depthwise_conv2d_model(
 
     # Due to other meta data, the compression is not exactly 1/4.
     size_threshold = 0.5 if enable_per_channel_quantization else 0.32
-    self.assertSizeRatioLessThan(
-        self._output_saved_model_path,
-        self._input_saved_model_path,
-        threshold=size_threshold,
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        size_threshold,
     )
 
     if enable_per_channel_quantization:
@@ -5659,8 +5702,12 @@ def test_gather_model(
     self.assertCountEqual(
         converted_model.signatures._signatures.keys(), {'serving_default'}
     )
-    self.assertSizeRatioLessThan(
-        self._output_saved_model_path, self._input_saved_model_path, 0.3
+    # Due to other meta data, the compression is not exactly 1/4.
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        0.3,
     )
 
   @parameterized.named_parameters(
@@ -5720,8 +5767,12 @@ def test_gather_and_conv_model(
     )
     output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def
     self.assertTrue(self._contains_op(output_graphdef, 'XlaConvV2'))
-    self.assertSizeRatioLessThan(
-        self._output_saved_model_path, self._input_saved_model_path, 1 / 3
+    # Due to other meta data, the compression is not exactly 1/4.
+    self.assertLess(
+        testing.get_size_ratio(
+            self._output_saved_model_path, self._input_saved_model_path
+        ),
+        1 / 3,
     )
 
   @test_util.run_in_graph_and_eager_modes
@@ -5926,7 +5977,7 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
             preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8
         ),
         op_set=quant_opts_pb2.XLA,
-        debugger_options=_DebuggerOptions(
+        debugger_config=_DebuggerConfig(
             debugger_type=_DebuggerConfig.DebuggerType.DEBUGGER_TYPE_WHOLE_MODEL,
             unquantized_dump_model_path=unquantized_dump_model_path,
             log_dir_path=log_dir_path,
@@ -6039,7 +6090,7 @@ def data_gen() -> repr_dataset.RepresentativeDataset:
             preset_method=_PresetMethod.METHOD_STATIC_RANGE_INT8
         ),
         op_set=target_opset,
-        debugger_options=_DebuggerOptions(
+        debugger_config=_DebuggerConfig(
             debugger_type=debugger_type,
             log_dir_path=log_dir_path,
         ),
@@ -6880,8 +6931,11 @@ def test_selective_quantization_on_gather(
     # The Conv2D op shouldn't be quantized as it has no FakeQuant on input.
     self.assertTrue(self._contains_op(graphdef, 'Conv2D'))
     # If the Gather op is quantized, input_model_size / output_model_size > 2.
-    self.assertSizeRatioLessThan(
-        self._input_saved_model_path, self._output_saved_model_path, 1.15
+    self.assertLess(
+        testing.get_size_ratio(
+            self._input_saved_model_path, self._output_saved_model_path
+        ),
+        1.15,
     )
 
 
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test_base.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test_base.py
index 1aad9b619b61d6..245240e5ebb1be 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test_base.py
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test_base.py
@@ -108,40 +108,6 @@ def _any_log_contains(
         )
     )
 
-  def assertSizeRatioGreaterThan(
-      self, path_a: str, path_b: str, threshold: float
-  ):
-    """Check if the size ratio of the given paths is greater than the threshold.
-
-    Args:
-      path_a: Path of a directory or a file to be the nominator of the ratio.
-      path_b: Path of a directory or a file to be the denominator of the ratio.
-      threshold: a number to compare with.
-
-    Returns:
-      True if the size ratio of path_a / path_b is greater than threshold.
-    """
-    size_a = self._get_dir_size(path_a)
-    size_b = self._get_dir_size(path_b)
-    size_ratio = size_a / size_b
-    return self.assertGreater(size_ratio, threshold)
-
-  def assertSizeRatioLessThan(self, path_a: str, path_b: str, threshold: float):
-    """Check if the size ratio of the given paths is less than the threshold.
-
-    Args:
-      path_a: Path of a directory or a file to be the nominator of the ratio.
-      path_b: Path of a directory or a file to be the denominator of the ratio.
-      threshold: a number to compare with.
-
-    Returns:
-      True if the size ratio of path_a / path_b is less than threshold.
-    """
-    size_a = self._get_dir_size(path_a)
-    size_b = self._get_dir_size(path_b)
-    size_ratio = size_a / size_b
-    return self.assertLess(size_ratio, threshold)
-
   def _is_quantized_function(self, func: function_pb2.FunctionDef) -> bool:
     """Determine whether a FunctionDef is quantized.
 
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc
index 8273279df67787..a0865c44664290 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.cc
@@ -30,9 +30,6 @@ limitations under the License.
 #include "pybind11_abseil/import_status_module.h"  // from @pybind11_abseil
 #include "pybind11_abseil/status_casters.h"  // from @pybind11_abseil  // IWYU pragma: keep
 #include "pybind11_protobuf/native_proto_caster.h"  // from @pybind11_protobuf
-#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h"
-#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h"
-#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h"
@@ -45,17 +42,13 @@ namespace py = pybind11;
 
 namespace {
 
-using ::stablehlo::quantization::AddCalibrationStatistics;
-using ::stablehlo::quantization::EnableDebugging;
-using ::stablehlo::quantization::io::CreateTmpDir;
 using ::tensorflow::SignatureDef;
 using ::tensorflow::quantization::ExportedModel;
 using ::tensorflow::quantization::PyFunctionLibrary;
 using ::tensorflow::quantization::QuantizationOptions;
-using ::tensorflow::quantization::QuantizePtqDynamicRange;
-using ::tensorflow::quantization::QuantizePtqModelPostCalibration;
-using ::tensorflow::quantization::QuantizePtqModelPreCalibration;
+using ::tensorflow::quantization::QuantizeDynamicRangePtq;
 using ::tensorflow::quantization::QuantizeQatModel;
+using ::tensorflow::quantization::QuantizeStaticRangePtq;
 using ::tensorflow::quantization::QuantizeWeightOnly;
 using ::tensorflow::quantization::RepresentativeDatasetFile;
 
@@ -89,7 +82,7 @@ PYBIND11_MODULE(pywrap_quantize_model, m) {
         // Remove the `tpu` tag from the debug quantized saved model as it is
         // for CPU. Note the 'tpu' value should be the same as `TPU` defined in
         // tensorflow/python/saved_model/tag_constants.py.
-        if (quantization_options.has_debugger_options()) {
+        if (quantization_options.has_debugger_config()) {
           tags.erase("tpu");
         }
         py_function_library.SaveExportedModel(
@@ -132,13 +125,13 @@ PYBIND11_MODULE(pywrap_quantize_model, m) {
                     quantization_options.tags().end());
 
         const absl::StatusOr<ExportedModel> exported_model =
-            QuantizePtqDynamicRange(src_saved_model_path, signature_keys, tags,
+            QuantizeDynamicRangePtq(src_saved_model_path, signature_keys, tags,
                                     quantization_options);
 
         // Remove the `tpu` tag from the debug quantized saved model as it is
         // for CPU. Note the 'tpu' value should be the same as `TPU` defined in
         // tensorflow/python/saved_model/tag_constants.py.
-        if (quantization_options.has_debugger_options()) {
+        if (quantization_options.has_debugger_config()) {
           tags.erase("tpu");
         }
         py_function_library.SaveExportedModel(
@@ -222,73 +215,22 @@ PYBIND11_MODULE(pywrap_quantize_model, m) {
         std::unordered_set<std::string> tags;
         tags.insert(quantization_options.tags().begin(),
                     quantization_options.tags().end());
-
-        absl::StatusOr<ExportedModel> exported_model =
-            QuantizePtqModelPreCalibration(src_saved_model_path, signature_keys,
-                                           tags, quantization_options);
+        const absl::StatusOr<ExportedModel> exported_model =
+            QuantizeStaticRangePtq(src_saved_model_path, signature_keys, tags,
+                                   quantization_options, signature_def_map,
+                                   py_function_library,
+                                   representative_dataset_file_map_serialized);
         if (!exported_model.ok()) return exported_model.status();
 
-        const absl::StatusOr<std::string> precalibrated_saved_model_dir =
-            CreateTmpDir();
-        if (!precalibrated_saved_model_dir.ok()) {
-          throw py::value_error(
-              precalibrated_saved_model_dir.status().ToString());
-        }
-
-        py_function_library.SaveExportedModel(
-            *precalibrated_saved_model_dir, *exported_model,
-            src_saved_model_path, tags, signature_def_map);
-
-        py_function_library.RunCalibration(
-            *precalibrated_saved_model_dir, signature_keys, tags,
-            quantization_options.calibration_options(),
-            quantization_options.force_graph_mode_calibration(),
-            representative_dataset_file_map_serialized);
-
-        if (absl::Status status = AddCalibrationStatistics(
-                *exported_model->mutable_graph_def(),
-                quantization_options.calibration_options(),
-                py_function_library);
-            !status.ok()) {
-          LOG(WARNING) << "Some CustomAggregator ops do not have min or max "
-                          "values. Parts of the graph are not quantized. "
-                       << status;
-        }
-
-        if (quantization_options.has_debugger_options()) {
-          EnableDebugging(*exported_model,
-                          quantization_options.debugger_options(),
-                          py_function_library, src_saved_model_path, tags,
-                          signature_def_map);
-        }
-
-        const absl::StatusOr<std::string> calibrated_saved_model_path =
-            CreateTmpDir();
-        if (!calibrated_saved_model_path.ok()) {
-          throw py::value_error(
-              calibrated_saved_model_path.status().ToString());
-        }
-
-        py_function_library.SaveExportedModel(
-            *calibrated_saved_model_path, *exported_model, src_saved_model_path,
-            tags, signature_def_map);
-
-        const absl::StatusOr<ExportedModel> post_calibrated_exported_model =
-            QuantizePtqModelPostCalibration(*calibrated_saved_model_path,
-                                            signature_keys, tags,
-                                            quantization_options);
-        if (!post_calibrated_exported_model.ok())
-          return post_calibrated_exported_model.status();
-
         // Remove the `tpu` tag from the debug quantized saved model as it is
         // for CPU. Note the 'tpu' value should be the same as `TPU` defined in
         // tensorflow/python/saved_model/tag_constants.py.
-        if (quantization_options.has_debugger_options()) {
+        if (quantization_options.has_debugger_config()) {
           tags.erase("tpu");
         }
         py_function_library.SaveExportedModel(
-            dst_saved_model_path, *post_calibrated_exported_model,
-            *calibrated_saved_model_path, tags, signature_def_map);
+            dst_saved_model_path, *exported_model, src_saved_model_path, tags,
+            signature_def_map);
 
         return absl::OkStatus();
       },
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc
index 08b71190bbb5b5..89467d30944ca9 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc
@@ -34,18 +34,23 @@ limitations under the License.
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/OwningOpRef.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
-#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "tensorflow/cc/saved_model/loader.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/context.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/debugger.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h"
+#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/python/unfreeze_constants.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.h"
@@ -54,7 +59,6 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/platform/statusor.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
 #include "tensorflow/core/protobuf/saver.pb.h"
 #include "tsl/platform/errors.h"
@@ -64,7 +68,6 @@ namespace tensorflow {
 namespace quantization {
 namespace {
 
-using ::mlir::quant::stablehlo::AddExportPasses;
 using ::mlir::quant::stablehlo::ConvertMlirModuleToExportedModel;
 using ::mlir::quant::stablehlo::CreateMlirContextForQuantization;
 using ::mlir::quant::stablehlo::ExportOptions;
@@ -75,21 +78,17 @@ using ::mlir::quant::stablehlo::kExportStepSuffix;
 using ::mlir::quant::stablehlo::PostCalibrationComponent;
 using ::mlir::quant::stablehlo::PreCalibrationComponent;
 using ::mlir::quant::stablehlo::UpdateFunctionAliases;
+using ::stablehlo::quantization::AddCalibrationStatistics;
+using ::stablehlo::quantization::ChangeToQuantizedFilename;
 using ::stablehlo::quantization::DebuggerConfig;
+using ::stablehlo::quantization::DisableDebugging;
+using ::stablehlo::quantization::EnableDebugging;
+using ::stablehlo::quantization::ExpandPresets;
+using ::stablehlo::quantization::PopulateDefaults;
 using ::stablehlo::quantization::QuantizationConfig;
+using ::stablehlo::quantization::io::CreateTmpDir;
 using ::stablehlo::quantization::io::GetLocalTmpFileName;
-
-// TODO: b/326355110 - Removes `ConvertDebuggerOptionToDebuggerConfig` when
-// merging `DebuggingOption` to `DebuggingConfig`.
-DebuggerConfig ConvertDebuggerOptionToDebuggerConfig(
-    const DebuggerOptions &debugger_options) {
-  DebuggerConfig debugger_config;
-  debugger_config.set_debugger_type(debugger_options.debugger_type());
-  debugger_config.set_unquantized_dump_model_path(
-      debugger_options.unquantized_dump_model_path());
-  debugger_config.set_log_dir_path(debugger_options.log_dir_path());
-  return debugger_config;
-}
+using ::tensorflow::quantization::PyFunctionLibrary;
 
 absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> ImportAndPreprocessSavedModel(
     absl::string_view saved_model_path,
@@ -135,41 +134,6 @@ absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> ImportAndPreprocessSavedModel(
   return module_ref;
 }
 
-// Sets up and runs the passes for exporting `module_op`. The behavior of the
-// exporting passes is controlled by `export_opts`. Returns `AssetFileDef`s that
-// associate the input arguments of @main and the asset file names. Asset file
-// names will be used to feed the corresponding tensors during initialization
-// upon model loading.
-absl::StatusOr<llvm::SmallVector<AssetFileDef>> RunExportPasses(
-    const ExportOptions &export_opts, mlir::MLIRContext &ctx,
-    mlir::ModuleOp module_op) {
-  if (export_opts.unfreeze_constants) {
-    TF_RETURN_IF_ERROR(UnfreezeConstantsAndSaveVariables(
-        export_opts.checkpoint_dir, ctx, module_op));
-    LOG(INFO) << "Unfrozen constants and saved variables to checkpoint file: "
-              << export_opts.checkpoint_dir;
-  }
-
-  if (absl::Status pass_run_status = RunPasses(
-          /*name=*/
-          export_opts.debug_name,
-          /*add_passes_func=*/
-          [dup_constants = export_opts.duplicate_shape_determining_constants](
-              mlir::PassManager &pm) { AddExportPasses(pm, dup_constants); },
-          ctx, module_op);
-      !pass_run_status.ok()) {
-    return pass_run_status;
-  }
-
-  mlir::FailureOr<llvm::SmallVector<AssetFileDef>> asset_file_defs =
-      mlir::quant::ConvertAssetArgs(module_op);
-  if (failed(asset_file_defs)) {
-    return absl::InternalError("Failed to convert asset args.");
-  }
-
-  return *asset_file_defs;
-}
-
 absl::StatusOr<ExportedModel> ModuleOpToExportedModel(
     mlir::ModuleOp module_op, mlir::MLIRContext *ctx,
     absl::string_view step_name, const bool unfreeze_constants,
@@ -189,90 +153,63 @@ absl::StatusOr<ExportedModel> ModuleOpToExportedModel(
       {asset_file_defs.begin(), asset_file_defs.end()});
 }
 
-}  // namespace
-
-absl::StatusOr<ExportedModel> QuantizeQatModel(
-    const absl::string_view saved_model_path,
-    const std::vector<std::string> &signature_keys,
-    const std::unordered_set<std::string> &tags,
-    const QuantizationOptions &quantization_options) {
-  // Convert the SavedModelBundle to an MLIR module.
-  std::unique_ptr<mlir::MLIRContext> context =
-      CreateMlirContextForQuantization();
+absl::StatusOr<ExportedModel> ExportCalibrationModel(
+    mlir::ModuleOp module_op, mlir::MLIRContext *context,
+    const QuantizationOptions &quantization_options,
+    const absl::flat_hash_map<std::string, std::string> &function_aliases) {
+  // Clone ModuleOp and function aliases so changes in this pipeline won't
+  // be reflected in the original values.
+  mlir::OwningOpRef<mlir::ModuleOp> cloned_module_ref(module_op.clone());
 
-  absl::StatusOr<absl::flat_hash_map<FunctionName, FunctionAlias>>
-      function_aliases = GetFunctionAliases(saved_model_path, tags);
-  if (!function_aliases.ok()) {
-    return absl::InternalError(absl::StrCat(
-        "Failed to get function alias: ", function_aliases.status().message()));
-  }
+  // Disable DumpTensor ops when running calibration.
+  DisableDebugging(*cloned_module_ref);
 
-  absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> module =
-      ImportAndPreprocessSavedModel(
-          saved_model_path, signature_keys, tags, context.get(),
-          /*is_inliner_run=*/true,
-          /*run_tf_to_stablehlo=*/false,
-          /*deserialize_xla_call_module=*/false, *function_aliases);
-  if (!module.status().ok()) {
+  absl::StatusOr<ExportedModel> exported_model = ModuleOpToExportedModel(
+      *cloned_module_ref, context, kTfQuantPtqPreCalibrationStepName,
+      /*unfreeze_constants=*/!quantization_options.freeze_all_variables(),
+      function_aliases);
+  if (!exported_model.status().ok()) {
     return absl::InternalError(
-        absl::StrCat("Failed to import and preprocess SavedModel: ",
-                     module.status().message()));
+        absl::StrCat("Failed to export calibration model: ",
+                     exported_model.status().message()));
   }
-  mlir::OwningOpRef<mlir::ModuleOp> module_ref = std::move(module).value();
-
-  TF_RETURN_IF_ERROR(RunPasses(
-      /*name=*/
-      kTfQuantQatStepName, /*add_passes_func=*/
-      [&quantization_options](mlir::PassManager &pm) {
-        AddQuantizeQatPasses(pm, quantization_options, kTfQuantQatStepName);
-      },
-      *context, *module_ref));
 
-  return ModuleOpToExportedModel(
-      *module_ref, context.get(), kTfQuantQatStepName,
-      /*unfreeze_constants=*/!quantization_options.freeze_all_variables(),
-      *function_aliases);
+  return *exported_model;
 }
 
-absl::StatusOr<ExportedModel> QuantizePtqModelPreCalibration(
-    const absl::string_view saved_model_path,
-    const std::vector<std::string> &signature_keys,
-    const std::unordered_set<std::string> &tags,
+QuantizationConfig GetQuantizationConfigForStaticRangePtq(
     const QuantizationOptions &quantization_options) {
-  // Convert the SavedModelBundle to an MLIR module.
-  std::unique_ptr<mlir::MLIRContext> context =
-      CreateMlirContextForQuantization();
-
-  absl::StatusOr<absl::flat_hash_map<FunctionName, FunctionAlias>>
-      function_aliases = GetFunctionAliases(saved_model_path, tags);
-  if (!function_aliases.ok()) {
-    return absl::InternalError(absl::StrCat(
-        "Failed to get function alias: ", function_aliases.status().message()));
-  }
+  QuantizationConfig quantization_config{};
+  // TODO: b/331302857 - Remove `enable_per_channel_quantized_weight` usage.
+  quantization_config.mutable_static_range_ptq_preset()
+      ->set_enable_per_channel_quantized_weight(
+          quantization_options.enable_per_channel_quantization());
+  // When targeting server TPUs quantized types should be unpacked into
+  // integer ops.
+  quantization_config.mutable_pipeline_config()->set_unpack_quantized_types(
+      true);
+  *quantization_config.mutable_debugger_config() =
+      quantization_options.debugger_config();
+  quantization_config.mutable_static_range_ptq_preset();
+  *quantization_config.mutable_calibration_options() =
+      quantization_options.calibration_options();
+
+  return ExpandPresets(PopulateDefaults(quantization_config));
+}
 
+absl::StatusOr<ExportedModel> QuantizePtqModelPreCalibrationImpl(
+    mlir::ModuleOp module_op, mlir::MLIRContext *context,
+    const QuantizationOptions &quantization_options,
+    const absl::flat_hash_map<std::string, std::string> &function_aliases) {
   const bool is_stablehlo = quantization_options.op_set() == OpSet::STABLEHLO;
-  absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> module =
-      ImportAndPreprocessSavedModel(
-          saved_model_path, signature_keys, tags, context.get(),
-          /*is_inliner_run=*/true,
-          /*run_tf_to_stablehlo=*/is_stablehlo,
-          /*deserialize_xla_call_module=*/false, *function_aliases);
-  if (!module.status().ok()) {
-    return absl::InternalError(
-        absl::StrCat("Failed to import and preprocess SavedModel: ",
-                     module.status().message()));
-  }
-  mlir::OwningOpRef<mlir::ModuleOp> module_ref = std::move(module).value();
-
   // Use StableHLO Quantizer option if opset is specified.
   if (is_stablehlo) {
-    QuantizationConfig quantization_config;
-    *quantization_config.mutable_debugger_config() =
-        ConvertDebuggerOptionToDebuggerConfig(
-            quantization_options.debugger_options());
-    PreCalibrationComponent pre_calibration_component(context.get());
-    TF_ASSIGN_OR_RETURN(*module_ref, pre_calibration_component.Run(
-                                         *module_ref, quantization_config));
+    const QuantizationConfig quantization_config =
+        GetQuantizationConfigForStaticRangePtq(quantization_options);
+
+    PreCalibrationComponent pre_calibration_component(context);
+    TF_ASSIGN_OR_RETURN(module_op, pre_calibration_component.Run(
+                                       module_op, quantization_config));
   } else {
     TF_RETURN_IF_ERROR(RunPasses(
         /*name=*/
@@ -280,17 +217,47 @@ absl::StatusOr<ExportedModel> QuantizePtqModelPreCalibration(
         [&quantization_options](mlir::PassManager &pm) {
           AddQuantizePtqPreCalibrationPasses(pm, quantization_options);
         },
-        *context, *module_ref));
+        *context, module_op));
+  }
+
+  return ExportCalibrationModel(module_op, context, quantization_options,
+                                function_aliases);
+}
+
+absl::StatusOr<ExportedModel> QuantizePtqModelPostCalibrationImpl(
+    mlir::ModuleOp module_op, mlir::MLIRContext *context,
+    const QuantizationOptions &quantization_options,
+    const absl::flat_hash_map<std::string, std::string> &function_aliases) {
+  const bool is_stablehlo = quantization_options.op_set() == OpSet::STABLEHLO;
+  // Use StableHLO Quantizer option if opset is specified.
+  if (is_stablehlo) {
+    const QuantizationConfig quantization_config =
+        GetQuantizationConfigForStaticRangePtq(quantization_options);
+
+    PostCalibrationComponent post_calibration_component(context);
+    TF_ASSIGN_OR_RETURN(module_op, post_calibration_component.Run(
+                                       module_op, quantization_config));
+  } else {
+    TF_RETURN_IF_ERROR(RunPasses(
+        /*name=*/
+        kTfQuantPtqPostCalibrationStepName, /*add_passes_func=*/
+        [&quantization_options](mlir::PassManager &pm) {
+          AddQuantizePtqPostCalibrationPasses(
+              pm, quantization_options, kTfQuantPtqPostCalibrationStepName);
+        },
+        *context, module_op));
   }
 
   return ModuleOpToExportedModel(
-      *module_ref, context.get(), kTfQuantPtqPreCalibrationStepName,
+      module_op, context, kTfQuantPtqPostCalibrationStepName,
       /*unfreeze_constants=*/!quantization_options.freeze_all_variables(),
-      *function_aliases);
+      function_aliases);
 }
 
-absl::StatusOr<ExportedModel> QuantizePtqModelPostCalibration(
-    const absl::string_view saved_model_path,
+}  // namespace
+
+absl::StatusOr<ExportedModel> QuantizeQatModel(
+    absl::string_view saved_model_path,
     const std::vector<std::string> &signature_keys,
     const std::unordered_set<std::string> &tags,
     const QuantizationOptions &quantization_options) {
@@ -304,16 +271,12 @@ absl::StatusOr<ExportedModel> QuantizePtqModelPostCalibration(
         "Failed to get function alias: ", function_aliases.status().message()));
   }
 
-  const bool is_stablehlo = quantization_options.op_set() == OpSet::STABLEHLO;
-  // Freezing is required again since variables might have been produced during
-  // the pre-calibration step. `is_inliner_run = false` to prevent the functions
-  // lifted for quantization from being inlined.
   absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> module =
       ImportAndPreprocessSavedModel(
           saved_model_path, signature_keys, tags, context.get(),
-          /*is_inliner_run=*/false,
+          /*is_inliner_run=*/true,
           /*run_tf_to_stablehlo=*/false,
-          /*deserialize_xla_call_module=*/is_stablehlo, *function_aliases);
+          /*deserialize_xla_call_module=*/false, *function_aliases);
   if (!module.status().ok()) {
     return absl::InternalError(
         absl::StrCat("Failed to import and preprocess SavedModel: ",
@@ -321,39 +284,22 @@ absl::StatusOr<ExportedModel> QuantizePtqModelPostCalibration(
   }
   mlir::OwningOpRef<mlir::ModuleOp> module_ref = std::move(module).value();
 
-  // Use StableHLO Quantizer option if opset is specified.
-  if (is_stablehlo) {
-    QuantizationConfig quantization_config{};
-    quantization_config.mutable_static_range_ptq_preset()
-        ->set_enable_per_channel_quantized_weight(
-            quantization_options.enable_per_channel_quantization());
-    // When targeting server TPUs quantized types should be unpacked into
-    // integer ops.
-    quantization_config.mutable_pipeline_config()->set_unpack_quantized_types(
-        true);
-
-    PostCalibrationComponent post_calibration_component(context.get());
-    TF_ASSIGN_OR_RETURN(*module_ref, post_calibration_component.Run(
-                                         *module_ref, quantization_config));
-  } else {
-    TF_RETURN_IF_ERROR(RunPasses(
-        /*name=*/
-        kTfQuantPtqPostCalibrationStepName, /*add_passes_func=*/
-        [&quantization_options](mlir::PassManager &pm) {
-          AddQuantizePtqPostCalibrationPasses(
-              pm, quantization_options, kTfQuantPtqPostCalibrationStepName);
-        },
-        *context, *module_ref));
-  }
+  TF_RETURN_IF_ERROR(RunPasses(
+      /*name=*/
+      kTfQuantQatStepName, /*add_passes_func=*/
+      [&quantization_options](mlir::PassManager &pm) {
+        AddQuantizeQatPasses(pm, quantization_options, kTfQuantQatStepName);
+      },
+      *context, *module_ref));
 
   return ModuleOpToExportedModel(
-      *module_ref, context.get(), kTfQuantPtqPostCalibrationStepName,
+      *module_ref, context.get(), kTfQuantQatStepName,
       /*unfreeze_constants=*/!quantization_options.freeze_all_variables(),
       *function_aliases);
 }
 
-absl::StatusOr<ExportedModel> QuantizePtqDynamicRange(
-    const absl::string_view saved_model_path,
+absl::StatusOr<ExportedModel> QuantizeDynamicRangePtq(
+    absl::string_view saved_model_path,
     const std::vector<std::string> &signature_keys,
     const std::unordered_set<std::string> &tags,
     const QuantizationOptions &quantization_options) {
@@ -373,13 +319,11 @@ absl::StatusOr<ExportedModel> QuantizePtqDynamicRange(
           /*is_inliner_run=*/true,
           /*run_tf_to_stablehlo=*/false, /*deserialize_xla_call_module=*/false,
           *function_aliases);
-
   if (!module.status().ok()) {
     return absl::InternalError(
         absl::StrCat("Failed to import and preprocess SavedModel: ",
                      module.status().message()));
   }
-
   mlir::OwningOpRef<mlir::ModuleOp> module_ref = std::move(module).value();
 
   TF_RETURN_IF_ERROR(RunPasses(
@@ -400,7 +344,7 @@ absl::StatusOr<ExportedModel> QuantizePtqDynamicRange(
 // TODO: b/297626257 - [Converter Component][TF-Quantizer] Clean up
 // quantize_model.cc by factoring out repeated codes
 absl::StatusOr<ExportedModel> QuantizeWeightOnly(
-    const absl::string_view saved_model_path,
+    absl::string_view saved_model_path,
     const QuantizationOptions &quantization_options) {
   std::unique_ptr<mlir::MLIRContext> context =
       CreateMlirContextForQuantization();
@@ -423,13 +367,11 @@ absl::StatusOr<ExportedModel> QuantizeWeightOnly(
            quantization_options.tags().end()},
           context.get(), /*is_inliner_run=*/true, /*run_tf_to_stablehlo=*/false,
           /*deserialize_xla_call_module=*/false, *function_aliases);
-
   if (!module.status().ok()) {
     return absl::InternalError(
         absl::StrCat("Failed to import and preprocess SavedModel: ",
                      module.status().message()));
   }
-
   mlir::OwningOpRef<mlir::ModuleOp> module_ref = std::move(module).value();
 
   TF_RETURN_IF_ERROR(RunPasses(
@@ -447,5 +389,90 @@ absl::StatusOr<ExportedModel> QuantizeWeightOnly(
       *function_aliases);
 }
 
+absl::StatusOr<ExportedModel> QuantizeStaticRangePtq(
+    absl::string_view saved_model_path,
+    const std::vector<std::string> &signature_keys,
+    const std::unordered_set<std::string> &tags,
+    const QuantizationOptions &quantization_options,
+    const absl::flat_hash_map<std::string, SignatureDef> &signature_def_map,
+    const PyFunctionLibrary &py_function_library,
+    const absl::flat_hash_map<std::string, RepresentativeDatasetFile>
+        &representative_dataset_file_map_serialized) {
+  std::unique_ptr<mlir::MLIRContext> context =
+      CreateMlirContextForQuantization();
+
+  absl::StatusOr<absl::flat_hash_map<FunctionName, FunctionAlias>>
+      function_aliases = GetFunctionAliases(saved_model_path, tags);
+  if (!function_aliases.ok()) {
+    return absl::InternalError(absl::StrCat(
+        "Failed to get function alias: ", function_aliases.status().message()));
+  }
+
+  const bool is_stablehlo = quantization_options.op_set() == OpSet::STABLEHLO;
+  absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> module =
+      ImportAndPreprocessSavedModel(
+          saved_model_path, signature_keys, tags, context.get(),
+          /*is_inliner_run=*/true,
+          /*run_tf_to_stablehlo=*/is_stablehlo,
+          /*deserialize_xla_call_module=*/false, *function_aliases);
+  if (!module.status().ok()) {
+    return absl::InternalError(
+        absl::StrCat("Failed to import and preprocess SavedModel: ",
+                     module.status().message()));
+  }
+  mlir::OwningOpRef<mlir::ModuleOp> module_ref = std::move(module).value();
+
+  TF_ASSIGN_OR_RETURN(
+      absl::StatusOr<ExportedModel> pre_calibration_exported_model,
+      QuantizePtqModelPreCalibrationImpl(
+          *module_ref, context.get(), quantization_options, *function_aliases));
+
+  TF_ASSIGN_OR_RETURN(
+      const absl::StatusOr<std::string> precalibrated_saved_model_dir,
+      CreateTmpDir());
+
+  py_function_library.SaveExportedModel(
+      *precalibrated_saved_model_dir, *pre_calibration_exported_model,
+      saved_model_path, tags, signature_def_map);
+
+  py_function_library.RunCalibration(
+      *precalibrated_saved_model_dir, signature_keys, tags,
+      quantization_options.calibration_options(),
+      quantization_options.force_graph_mode_calibration(),
+      representative_dataset_file_map_serialized);
+
+  if (absl::Status status = AddCalibrationStatistics(
+          *module_ref, quantization_options.calibration_options(),
+          py_function_library);
+      !status.ok()) {
+    LOG(WARNING) << "Some CustomAggregator ops do not have min or max "
+                    "values. Parts of the graph are not quantized. "
+                 << status;
+  }
+
+  // Saves the current model to the `unquantized_dump_model_path` if the
+  // debugger type is `DEBUGGER_TYPE_WHOLE_MODEL`. This is required
+  // because in whole-model debugging mode the `DumpTensor` ops for the
+  // unquantized tensors are only inserted in the unquantized model
+  // whereas `DumpTensor` ops for the quantized tensors are only inserted
+  // in the quantized model. Both models are required to be able to dump
+  // both quantized and unquantized tensors and compare them offline.
+  if (quantization_options.has_debugger_config() &&
+      quantization_options.debugger_config().debugger_type() ==
+          DebuggerConfig::DEBUGGER_TYPE_WHOLE_MODEL) {
+    EnableDebugging(*pre_calibration_exported_model);
+    ChangeToQuantizedFilename(*module_ref);
+
+    absl::string_view unquantized_dump_model_path =
+        quantization_options.debugger_config().unquantized_dump_model_path();
+    py_function_library.SaveExportedModel(
+        unquantized_dump_model_path, *pre_calibration_exported_model,
+        saved_model_path, tags, signature_def_map);
+  }
+
+  return QuantizePtqModelPostCalibrationImpl(
+      *module_ref, context.get(), quantization_options, *function_aliases);
+}
+
 }  // namespace quantization
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h
index 556086ce018123..a54e988c043aa3 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/exported_model.pb.h"
+#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h"
 
 namespace tensorflow {
@@ -46,28 +47,28 @@ absl::StatusOr<ExportedModel> QuantizeQatModel(
     const std::unordered_set<std::string>& tags,
     const QuantizationOptions& quantization_options);
 
-// Apply post-training dynamic range quantization to the model.
-absl::StatusOr<ExportedModel> QuantizePtqDynamicRange(
+// Applies post-training dynamic-range quantization to the model.
+absl::StatusOr<ExportedModel> QuantizeDynamicRangePtq(
     absl::string_view saved_model_path,
     const std::vector<std::string>& signature_keys,
     const std::unordered_set<std::string>& tags,
     const QuantizationOptions& quantization_options);
 
+// Applies post-training static-range weight-only quantization to the model.
 absl::StatusOr<ExportedModel> QuantizeWeightOnly(
     absl::string_view saved_model_path,
     const QuantizationOptions& quantization_options);
 
-absl::StatusOr<ExportedModel> QuantizePtqModelPreCalibration(
+// Applies post-training static-range quantization to the model.
+absl::StatusOr<ExportedModel> QuantizeStaticRangePtq(
     absl::string_view saved_model_path,
     const std::vector<std::string>& signature_keys,
     const std::unordered_set<std::string>& tags,
-    const QuantizationOptions& quantization_options);
-
-absl::StatusOr<ExportedModel> QuantizePtqModelPostCalibration(
-    absl::string_view saved_model_path,
-    const std::vector<std::string>& signature_keys,
-    const std::unordered_set<std::string>& tags,
-    const QuantizationOptions& quantization_options);
+    const QuantizationOptions& quantization_options,
+    const absl::flat_hash_map<std::string, SignatureDef>& signature_def_map,
+    const PyFunctionLibrary& py_function_library,
+    const absl::flat_hash_map<std::string, RepresentativeDatasetFile>&
+        representative_dataset_file_map_serialized);
 
 }  // namespace quantization
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py
index 1bf3fe81c7d8ba..e0eeca13d92f20 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py
@@ -59,6 +59,8 @@
 _QuantizationComponent = _QuantizationComponentSpec.QuantizationComponent
 _TensorType = _QuantizationComponentSpec.TensorType
 
+_RepresentativeDatasetFile = quant_opts_pb2.RepresentativeDatasetFile
+
 # Mapping of signature def key -> SignatureDef.
 _SignatureDefMap = Mapping[str, meta_graph_pb2.SignatureDef]
 
@@ -99,6 +101,57 @@ def _serialize_signature_def_map(
   return signature_def_map_serialized
 
 
+def _save_representative_dataset(
+    representative_dataset: repr_dataset.RepresentativeDatasetOrMapping,
+    signature_def_map: _SignatureDefMap,
+) -> Mapping[str, _RepresentativeDatasetFile]:
+  """Saves the representative dataset to temporary TFRecord files.
+
+  Args:
+    representative_dataset: Representative dataset used for the calibration
+      step. Representative datasets should exist for each signature def key in
+      `signature_def_keys`.
+    signature_def_map: Signature def key -> SignatureDef mapping.
+
+  Returns:
+    A map from signature key to the saved representative dataset file.
+  """
+  if isinstance(representative_dataset, Mapping):
+    if set(signature_def_map.keys()) != set(representative_dataset.keys()):
+      raise ValueError(
+          'The signature keys and the keys of representative dataset map '
+          f'do not match. Signature keys: {set(signature_def_map.keys())}, '
+          f'representative dataset map: {set(representative_dataset.keys())}.'
+      )
+    representative_dataset_map = representative_dataset
+  elif len(signature_def_map.keys()) > 1:
+    raise ValueError(
+        'Representative dataset is not a mapping (got: '
+        f'{type(representative_dataset)}), but there is more than one '
+        'signature key provided. Please provide a map of '
+        '{signature_key -> dataset} with more than one signature key.'
+    )
+  else:
+    representative_dataset_map = {
+        list(signature_def_map.keys())[0]: representative_dataset,
+    }
+
+  # Save the representative dataset to temporary TFRecord files.
+  path_map = {}
+  expected_input_key_map = {}
+  for signature_key, signature_def in signature_def_map.items():
+    # Filepath is the second return value of mkstemp.
+    _, path_map[signature_key] = tempfile.mkstemp(
+        suffix='.tfrecord', prefix=signature_key
+    )
+    expected_input_key_map[signature_key] = signature_def.inputs.keys()
+
+  return repr_dataset.TfRecordRepresentativeDatasetSaver(
+      path_map=path_map,
+      expected_input_key_map=expected_input_key_map,
+  ).save(representative_dataset_map)
+
+
 def _run_static_range_qat(
     src_saved_model_path: str,
     dst_saved_model_path: str,
@@ -133,7 +186,7 @@ def _run_static_range_ptq(
     src_saved_model_path: str,
     dst_saved_model_path: str,
     quant_opts: _QuantizationOptions,
-    representative_dataset: repr_dataset.RepresentativeDatasetOrMapping,
+    representative_dataset: Mapping[str, _RepresentativeDatasetFile],
     signature_def_map: _SignatureDefMap,
 ) -> None:
   """Runs static-range Post-Training Quantization.
@@ -147,9 +200,8 @@ def _run_static_range_ptq(
     src_saved_model_path: Path to the source SavedModel directory.
     dst_saved_model_path: Path to the destination SavedModel directory.
     quant_opts: Quantization options.
-    representative_dataset: Representative dataset used for the calibration
-      step. Representative datasets should exist for each signature def key in
-      `signature_def_keys`.
+    representative_dataset: A map from signature key to the saved representative
+      dataset file.
     signature_def_map: Signature def key -> SignatureDef mapping.
 
   Raises:
@@ -159,29 +211,11 @@ def _run_static_range_ptq(
 
   signature_def_map_serialized = _serialize_signature_def_map(signature_def_map)
 
-  if isinstance(representative_dataset, Mapping):
-    representative_dataset_map = representative_dataset
-  else:
-    representative_dataset_map = {
-        list(signature_def_map.keys())[0]: representative_dataset,
-    }
-
-  # Save the representative dataset to temporary TFRecord files.
-  path_map = {}
-  for signature_key in representative_dataset_map.keys():
-    path_map[signature_key] = tempfile.mkstemp(
-        suffix='.tfrecord', prefix=signature_key
-    )[1]  # Filepath.
-
-  dataset_file_map = repr_dataset.TfRecordRepresentativeDatasetSaver(
-      path_map
-  ).save(representative_dataset_map)
-
   # `quantize_ptq_static_range` requires `RepresentativeDatasetFile`s to be
   # serialized. Serialize the values to match the type.
   dataset_file_map_serialized = {
       signature_key: dataset_file.SerializeToString()
-      for signature_key, dataset_file in dataset_file_map.items()
+      for signature_key, dataset_file in representative_dataset.items()
   }
   pywrap_quantize_model.quantize_ptq_static_range(
       src_saved_model_path,
@@ -246,9 +280,24 @@ def _static_range_quantize(
       set(quantization_options.tags),
   )
 
+  if (
+      representative_dataset is not None
+      and quantization_options.representative_datasets
+  ):
+    raise ValueError(
+        'Do not specify both the `representative_dataset` argument and'
+        ' the `representative_datasets` field in `QuantizationOptions`.'
+    )
+
+  saved_representative_dataset = quantization_options.representative_datasets
+  if representative_dataset is not None:
+    saved_representative_dataset = _save_representative_dataset(
+        representative_dataset, signature_def_map
+    )
+
   # Checks if the model is from QAT or method is METHOD_NO_QUANTIZE.
   if (
-      representative_dataset is None
+      not saved_representative_dataset
       and not is_qat_saved_model_or_method_no_quantize
   ):
     raise ValueError(
@@ -274,7 +323,7 @@ def _static_range_quantize(
         src_saved_model_path,
         dst_saved_model_path,
         quantization_options,
-        representative_dataset,
+        saved_representative_dataset,
         signature_def_map,
     )
 
@@ -692,7 +741,7 @@ def _populate_quantization_options_default_values(
         ' quantization via TF Quantizer.'
     )
 
-  if quantization_options.HasField('debugger_options'):
+  if quantization_options.HasField('debugger_config'):
     # Set `force_graph_mode_calibration` to True to avoid skipping op execution,
     # which are not connected to return ops, during calibration execution.
     # Setting `force_graph_mode_calibration` to True enables execution of the
@@ -704,11 +753,11 @@ def _populate_quantization_options_default_values(
     )
     quantization_options.force_graph_mode_calibration = True
 
-    if not quantization_options.debugger_options.log_dir_path:
-      quantization_options.debugger_options.log_dir_path = '/tmp/dumps'
+    if not quantization_options.debugger_config.log_dir_path:
+      quantization_options.debugger_config.log_dir_path = '/tmp/dumps'
 
     if (
-        quantization_options.debugger_options.debugger_type
+        quantization_options.debugger_config.debugger_type
         == stablehlo_quant_config_pb2.DebuggerConfig.DebuggerType.DEBUGGER_TYPE_UNSPECIFIED
     ):
       raise ValueError(
@@ -716,9 +765,9 @@ def _populate_quantization_options_default_values(
       )
 
     if (
-        quantization_options.debugger_options.debugger_type
+        quantization_options.debugger_config.debugger_type
         == stablehlo_quant_config_pb2.DebuggerConfig.DebuggerType.DEBUGGER_TYPE_WHOLE_MODEL
-        and not quantization_options.debugger_options.unquantized_dump_model_path
+        and not quantization_options.debugger_config.unquantized_dump_model_path
     ):
       raise ValueError(
           'Debugger type whole model verify was used but'
@@ -840,20 +889,6 @@ def quantize(
 
   _populate_quantization_options_default_values(quantization_options)
 
-  if (
-      representative_dataset is not None
-      and quantization_options.representative_datasets
-  ):
-    raise ValueError(
-        'Do not specify both the `representative_dataset` argument and'
-        ' the `representative_datasets` field in `QuantizationOptions`.'
-    )
-
-  if quantization_options.representative_datasets:
-    representative_dataset = repr_dataset.TfRecordRepresentativeDatasetLoader(
-        quantization_options.representative_datasets
-    ).load()
-
   method: _QuantizationMethod = quantization_options.quantization_method
   if (
       method.preset_method == _PresetMethod.METHOD_STATIC_RANGE_INT8
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/representative_dataset.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/representative_dataset.py
index fabda2ebad3397..c18358745866b4 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/python/representative_dataset.py
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/representative_dataset.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Defines types required for representative datasets for quantization."""
 
-import collections.abc
+from collections.abc import Collection, Sized
 import os
 from typing import Iterable, Mapping, Optional, Union
 
@@ -117,7 +117,11 @@ class TfRecordRepresentativeDatasetSaver(RepresentativeDatasetSaver):
   ```
   """
 
-  def __init__(self, path_map: Mapping[str, os.PathLike[str]]):
+  def __init__(
+      self,
+      path_map: Mapping[str, os.PathLike[str]],
+      expected_input_key_map: Optional[Mapping[str, Collection[str]]] = None,
+  ):
     """Initializes TFRecord represenatative dataset saver.
 
     Args:
@@ -125,8 +129,22 @@ def __init__(self, path_map: Mapping[str, os.PathLike[str]]):
         to which a `RepresentativeDataset` is saved. The signature def keys
         should be a subset of the `SignatureDef` keys of the
         `representative_dataset` argument of the `save()` call.
+      expected_input_key_map: Signature def key -> expected input keys. If set,
+        validate that the sample has same set of input keys before saving.
+
+    Raises:
+      KeyError: If path_map and expected_input_key_map have different keys.
     """
     self.path_map: Mapping[str, os.PathLike[str]] = path_map
+    self.expected_input_key_map: Mapping[str, Collection[str]] = {}
+    if expected_input_key_map is not None:
+      if set(path_map.keys()) != set(expected_input_key_map.keys()):
+        raise KeyError(
+            'The `path_map` and `expected_input_key_map` should have the same'
+            ' set of keys.'
+        )
+
+      self.expected_input_key_map = expected_input_key_map
 
   def _save_tf_record_dataset(
       self,
@@ -143,6 +161,10 @@ def _save_tf_record_dataset(
 
     Returns:
       a RepresentativeDatasetFile instance contains the path to the saved file.
+
+    Raises:
+      KeyError: If the set of input keys in the dataset samples doesn't match
+      the set of expected input keys.
     """
     # When running in graph mode (TF1), tf.Tensor types should be converted to
     # numpy ndarray types to be compatible with `make_tensor_proto`.
@@ -150,9 +172,23 @@ def _save_tf_record_dataset(
       with session.Session() as sess:
         repr_ds = replace_tensors_by_numpy_ndarrays(repr_ds, sess)
 
+    expected_input_keys = self.expected_input_key_map.get(
+        signature_def_key, None
+    )
     tfrecord_file_path = self.path_map[signature_def_key]
     with python_io.TFRecordWriter(tfrecord_file_path) as writer:
       for repr_sample in repr_ds:
+        if (
+            expected_input_keys is not None
+            and set(repr_sample.keys()) != expected_input_keys
+        ):
+          raise KeyError(
+              'Invalid input keys for representative sample. The function'
+              f' expects input keys of: {set(expected_input_keys)}. Got:'
+              f' {set(repr_sample.keys())}. Please provide correct input keys'
+              ' for representative samples.'
+          )
+
         sample = _RepresentativeDataSample()
         for input_name, input_value in repr_sample.items():
           sample.tensor_proto_inputs[input_name].CopyFrom(
@@ -317,7 +353,7 @@ def get_num_samples(repr_ds: RepresentativeDataset) -> Optional[int]:
     is malformed; it simply means the size cannot be determined without
     iterating the whole dataset.
   """
-  if isinstance(repr_ds, collections.abc.Sized):
+  if isinstance(repr_ds, Sized):
     try:
       return len(repr_ds)
     except Exception as ex:  # pylint: disable=broad-except
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto b/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto
index 13d3876500fe0d..d2c79b6ce4c668 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto
@@ -145,21 +145,6 @@ message RepresentativeDatasetFile {
   }
 }
 
-// Configuration for quantization debugger.
-// NEXT ID: 4
-message DebuggerOptions {
-  // Type of quantization debugger. Depending on the type, inputs and outputs
-  // are wired differently.
-  stablehlo.quantization.DebuggerConfig.DebuggerType debugger_type = 1;
-
-  // Path to save unquantized model with dump tensor ops attached.
-  // Used when debugger_type is WHOLE_MODEL.
-  string unquantized_dump_model_path = 2;
-
-  // Path to save debugger related logs. Defaults to '/tmp/dumps'.
-  string log_dir_path = 3;
-}
-
 // Defines various options to specify and control the behavior of the quantizer.
 // It consists of
 // 1) Model-wise quantization configuration as a default configuration. If it is
@@ -251,7 +236,7 @@ message QuantizationOptions {
   stablehlo.quantization.CalibrationOptions calibration_options = 15;
 
   // Configuration related to quantization debugger.
-  DebuggerOptions debugger_options = 16;
+  stablehlo.quantization.DebuggerConfig debugger_config = 16;
 
   reserved 3;
 }
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc
index 0d5e43cd6f334e..0e756021844a5c 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.cc
@@ -149,10 +149,10 @@ void AddQuantizePtqPreCalibrationPasses(
   pm.addPass(mlir::quant::CreateLiftQuantizableSpotsAsFunctionsPass(
       quantization_options));
   // TODO: b/295140328 - Add debugger support for weight only
-  if (quantization_options.has_debugger_options()) {
+  if (quantization_options.has_debugger_config()) {
     pm.addPass(mlir::quant::CreateAddDumpTensorOpPass(
-        quantization_options.debugger_options().debugger_type(),
-        quantization_options.debugger_options().log_dir_path()));
+        quantization_options.debugger_config().debugger_type(),
+        quantization_options.debugger_config().log_dir_path()));
   }
   pm.addNestedPass<mlir::func::FuncOp>(
       mlir::quant::CreateInsertCustomAggregationOpsPass(
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op.mlir
index fe12e5935a8791..d50f28941f4269 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op.mlir
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op.mlir
@@ -29,7 +29,7 @@ module {
 // WholeModel-DAG: %[[b:.*]] = "tf.Const"() <{value = dense<[-2.000000e+00, 3.000000e+00
 // WholeModel-DAG: %[[output0:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}>
 // WholeModel-DAG: %[[output1:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"}
-// WholeModel-DAG: "tf.DumpTensor"(%[[output1]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> ()
+// WholeModel-DAG: "tf.DumpTensor"(%[[output1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> ()
 // WholeModel-DAG: return %[[output0]], %[[output1]]
 
 // IntPerLayer-LABEL: func @conv
@@ -38,8 +38,8 @@ module {
 // IntPerLayer-DAG: %[[output0:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}
 // IntPerLayer-DAG: %[[output1_quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"}
 // IntPerLayer-DAG: %[[output1_unquantized:.*]] = "tf.PartitionedCall"(%arg0, %cst, %cst_0) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_0}
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> ()
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> ()
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> ()
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> ()
 // IntPerLayer-DAG: return %[[output0]], %[[output1_quantized]]
 
 // FloatPerLayer-LABEL: func @conv
@@ -48,8 +48,8 @@ module {
 // FloatPerLayer-DAG: %[[output0:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}
 // FloatPerLayer-DAG: %[[output1_quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"}
 // FloatPerLayer-DAG: %[[output1_unquantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w]], %[[b]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_0}
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> ()
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> ()
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> ()
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "conv", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}> : (tensor<*xf32>) -> ()
 // FloatPerLayer-DAG: return %[[output0]], %[[output1_unquantized]]
 }
 
@@ -86,9 +86,9 @@ module {
 // WholeModel-DAG: %[[w0:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}0.193340182, 0.285152316
 // WholeModel-DAG: %[[w1:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}-0.174680978, -0.367524445
 // WholeModel-DAG: %[[output0:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}> {_tfl_quant_trait = "fully_quantizable"}
-// WholeModel-DAG: "tf.DumpTensor"(%[[output0]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}>
+// WholeModel-DAG: "tf.DumpTensor"(%[[output0]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}>
 // WholeModel-DAG: %[[output1:.*]] = "tf.PartitionedCall"(%[[output0]], %[[w1]], %[[b1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"}
-// WholeModel-DAG: "tf.DumpTensor"(%[[output1]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}>
+// WholeModel-DAG: "tf.DumpTensor"(%[[output1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}>
 // WholeModel-DAG: return %[[output1]]
 
 // IntPerLayer-LABEL: func @multiple_conv2d
@@ -98,12 +98,12 @@ module {
 // IntPerLayer-DAG: %[[w1:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}-0.174680978, -0.367524445
 // IntPerLayer-DAG: %[[output0_quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}> {_tfl_quant_trait = "fully_quantizable"}
 // IntPerLayer-DAG: %[[output0_unquantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2_0}>
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[output0_quantized]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}>
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[output0_unquantized]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}>
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[output0_quantized]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}>
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[output0_unquantized]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}>
 // IntPerLayer-DAG: %[[output1_quantized:.*]] = "tf.PartitionedCall"(%[[output0_quantized]], %[[w1]], %[[b1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"}
 // IntPerLayer-DAG: %[[output1_unquantized:.*]] = "tf.PartitionedCall"(%[[output0_quantized]], %[[w1]], %[[b1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_0}>
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}>
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}>
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}>
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}>
 // IntPerLayer-DAG: return %[[output1_quantized]]
 
 // FloatPerLayer-LABEL: func @multiple_conv2d
@@ -113,12 +113,12 @@ module {
 // FloatPerLayer-DAG: %[[w1:.*]] = "tf.Const"() <{value = dense<{{\[\[\[\[}}-0.174680978, -0.367524445
 // FloatPerLayer-DAG: %[[output0_quantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2}> {_tfl_quant_trait = "fully_quantizable"}
 // FloatPerLayer-DAG: %[[output0_unquantized:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]], %[[b0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_2_0}
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output0_quantized]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output0_unquantized]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output0_quantized]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output0_unquantized]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_2", node_name = "Conv2D"}
 // FloatPerLayer-DAG: %[[output1_quantized:.*]] = "tf.PartitionedCall"(%[[output0_unquantized]], %[[w1]], %[[b1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1}> {_tfl_quant_trait = "fully_quantizable"}
 // FloatPerLayer-DAG: %[[output1_unquantized:.*]] = "tf.PartitionedCall"(%[[output0_unquantized]], %[[w1]], %[[b1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_conv2d_with_bias_and_relu6_fn_1_0}
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output1_quantized]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[output1_unquantized]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "multiple_conv2d", log_dir_path = "/tmp/dumps/composite_conv2d_with_bias_and_relu6_fn_1", node_name = "Conv2D_1"}
 // FloatPerLayer-DAG: return %[[output1_unquantized]]
 }
 
@@ -146,8 +146,8 @@ module {
 // WholeModel-DAG: %[[w1:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.211145893
 // WholeModel-DAG: %[[m0:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"}
 // WholeModel-DAG: %[[m1:.*]] = "tf.PartitionedCall"(%[[m0]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"}
-// WholeModel-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}
-// WholeModel-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}
+// WholeModel-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}
+// WholeModel-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}
 // WholeModel-DAG: return %[[m1]]
 
 // IntPerLayer-LABEL: func @matmul2
@@ -155,12 +155,12 @@ module {
 // IntPerLayer-DAG: %[[w1:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.211145893
 // IntPerLayer-DAG: %[[m0:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
 // IntPerLayer-DAG: %[[m0_1:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2_0}> : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> ()
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[m0_1]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> ()
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> ()
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[m0_1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> ()
 // IntPerLayer-DAG: %[[m1:.*]] = "tf.PartitionedCall"(%[[m0]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
 // IntPerLayer-DAG: %[[m1_0:.*]] = "tf.PartitionedCall"(%[[m0]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1_0}> : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> ()
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[m1_0]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> ()
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> ()
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[m1_0]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> ()
 // IntPerLayer-DAG: return %[[m1]] : tensor<2x2xf32>
 
 // FloatPerLayer-LABEL: func @matmul2
@@ -168,12 +168,12 @@ module {
 // FloatPerLayer-DAG: %[[w1:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.211145893
 // FloatPerLayer-DAG: %[[m0:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
 // FloatPerLayer-DAG: %[[m0_1:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2_0}> : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> ()
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m0_1]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> ()
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> ()
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m0_1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> ()
 // FloatPerLayer-DAG: %[[m1:.*]] = "tf.PartitionedCall"(%[[m0_1]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
 // FloatPerLayer-DAG: %[[m1_0:.*]] = "tf.PartitionedCall"(%[[m0_1]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1_0}> : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> ()
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m1_0]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> ()
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> ()
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m1_0]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> ()
 // FloatPerLayer-DAG: return %[[m1_0]] : tensor<2x2xf32>
 }
 
@@ -203,8 +203,8 @@ module {
 // WholeModel-DAG: %[[pc_0:.*]] = "tf.PartitionedCall"(%arg0, %[[cst_0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"}
 // WholeModel-DAG: %[[sm_0:.*]] = "tf.Softmax"(%[[pc_0]]) {T = "tfdtype$DT_FLOAT"}
 // WholeModel-DAG: %[[pc_1:.*]] = "tf.PartitionedCall"(%[[sm_0]], %[[cst_1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"}
-// WholeModel-DAG: "tf.DumpTensor"(%[[pc_0]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}
-// WholeModel-DAG: "tf.DumpTensor"(%[[pc_1]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}
+// WholeModel-DAG: "tf.DumpTensor"(%[[pc_0]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}
+// WholeModel-DAG: "tf.DumpTensor"(%[[pc_1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}
 // WholeModel-DAG: return %[[pc_1]]
 
 // IntPerLayer-LABEL: func @matmul2_softmax
@@ -212,13 +212,13 @@ module {
 // IntPerLayer-DAG: %[[cst_1:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.211145893, -0.708605706
 // IntPerLayer-DAG: %[[pc_0:.*]] = "tf.PartitionedCall"(%arg0, %[[cst_0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"}
 // IntPerLayer-DAG: %[[pc_1:.*]] = "tf.PartitionedCall"(%arg0, %[[cst_0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2_0}
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[pc_0]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[pc_1]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[pc_0]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[pc_1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}
 // IntPerLayer-DAG: %[[sm_0:.*]] = "tf.Softmax"(%[[pc_0]]) {T = "tfdtype$DT_FLOAT"}
 // IntPerLayer-DAG: %[[pc_2:.*]] = "tf.PartitionedCall"(%[[sm_0]], %[[cst_1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"}
 // IntPerLayer-DAG: %[[pc_3:.*]] = "tf.PartitionedCall"(%[[sm_0]], %[[cst_1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1_0}
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[pc_2]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[pc_3]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[pc_2]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[pc_3]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}
 // IntPerLayer-DAG: return %[[pc_2]]
 
 // FloatPerLayer-LABEL: func @matmul2_softmax
@@ -226,13 +226,13 @@ module {
 // FloatPerLayer-DAG: %[[cst_1:.*]] = "tf.Const"() <{value = dense<{{\[\[}}-0.211145893, -0.708605706
 // FloatPerLayer-DAG: %[[pc_0:.*]] = "tf.PartitionedCall"(%arg0, %[[cst_0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"}
 // FloatPerLayer-DAG: %[[pc_1:.*]] = "tf.PartitionedCall"(%arg0, %[[cst_0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2_0}
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[pc_0]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[pc_1]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[pc_0]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[pc_1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}
 // FloatPerLayer-DAG: %[[sm_0:.*]] = "tf.Softmax"(%[[pc_1]]) {T = "tfdtype$DT_FLOAT"}
 // FloatPerLayer-DAG: %[[pc_2:.*]] = "tf.PartitionedCall"(%[[sm_0]], %[[cst_1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"}
 // FloatPerLayer-DAG: %[[pc_3:.*]] = "tf.PartitionedCall"(%[[sm_0]], %[[cst_1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1_0}
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[pc_2]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[pc_3]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[pc_2]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[pc_3]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_softmax", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}
 // FloatPerLayer-DAG: return %[[pc_3]]
 }
 
@@ -263,8 +263,8 @@ module {
 // WholeModel-DAG: %[[axis:.*]] = "tf.Const"() <{value = dense<-1> : tensor<i32>}
 // WholeModel-DAG: %[[m0:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"}
 // WholeModel-DAG: %[[m1:.*]] = "tf.PartitionedCall"(%[[m0]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"}
-// WholeModel-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}
-// WholeModel-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}
+// WholeModel-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}
+// WholeModel-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}
 // WholeModel-DAG: %[[c:.*]] = "tf.ConcatV2"(%[[m0]], %[[m1]], %[[axis]])
 // WholeModel-DAG: return %[[c]]
 
@@ -274,12 +274,12 @@ module {
 // IntPerLayer-DAG: %[[axis:.*]] = "tf.Const"() <{value = dense<-1> : tensor<i32>}> : () -> tensor<i32>
 // IntPerLayer-DAG: %[[m0:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
 // IntPerLayer-DAG: %[[m0_1:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2_0}> : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> ()
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[m0_1]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> ()
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> ()
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[m0_1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> ()
 // IntPerLayer-DAG: %[[m1:.*]] = "tf.PartitionedCall"(%[[m0]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
 // IntPerLayer-DAG: %[[m1_0:.*]] = "tf.PartitionedCall"(%[[m0]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1_0}> : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> ()
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[m1_0]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> ()
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> ()
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[m1_0]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> ()
 // IntPerLayer-DAG: %4 = "tf.ConcatV2"(%[[m0]], %[[m1]], %[[axis]]) : (tensor<2x2xf32>, tensor<2x2xf32>, tensor<i32>) -> tensor<2x4xf32>
 // IntPerLayer-DAG: return %4 : tensor<2x4xf32>
 
@@ -289,12 +289,12 @@ module {
 // FloatPerLayer-DAG: %[[axis:.*]] = "tf.Const"() <{value = dense<-1> : tensor<i32>}> : () -> tensor<i32>
 // FloatPerLayer-DAG: %[[m0:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
 // FloatPerLayer-DAG: %[[m0_1:.*]] = "tf.PartitionedCall"(%arg0, %[[w0]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_2_0}> : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> ()
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m0_1]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> ()
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m0]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> ()
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m0_1]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_2", node_name = "MatMul"}> : (tensor<2x2xf32>) -> ()
 // FloatPerLayer-DAG: %[[m1:.*]] = "tf.PartitionedCall"(%[[m0_1]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1}> {_tfl_quant_trait = "fully_quantizable"} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
 // FloatPerLayer-DAG: %[[m1_0:.*]] = "tf.PartitionedCall"(%[[m0_1]], %[[w1]]) <{config = "", config_proto = "", executor_type = "", f = @composite_matmul_fn_1_0}> : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32>
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> ()
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m1_0]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> ()
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m1]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> ()
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[m1_0]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "matmul2_concat", log_dir_path = "/tmp/dumps/composite_matmul_fn_1", node_name = "MatMul_1"}> : (tensor<2x2xf32>) -> ()
 // FloatPerLayer-DAG: %4 = "tf.ConcatV2"(%1, %[[m1_0]], %[[axis]]) : (tensor<2x2xf32>, tensor<2x2xf32>, tensor<i32>) -> tensor<2x4xf32>
 // FloatPerLayer-DAG: return %4 : tensor<2x4xf32>
 }
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op_stablehlo.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op_stablehlo.mlir
index 357a6119fa8b0f..bdb9b320109597 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op_stablehlo.mlir
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op_stablehlo.mlir
@@ -35,9 +35,9 @@ module {
 // WholeModel-DAG: %[[b0:.*]] = stablehlo.constant dense<[-0.211145893
 // WholeModel-DAG: %[[w0:.*]] = stablehlo.constant dense<{{\[\[}}-0.630731344, 0.54962182], [0.180364341, -0.764542698]]> : tensor<2x2xf32>
 // WholeModel-DAG: %[[matmul0_q:.*]] = "tf.XlaCallModule"(%arg0, %[[w0]], %[[b0]]) <{Sout = [#tf_type.shape<?x2>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_and_relu6_dynamic_fn_2, _original_entry_function = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2", _tfl_quant_trait = "fully_quantizable"} : (tensor<?x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<?x2xf32>
-// WholeModel-DAG: "tf.DumpTensor"(%[[matmul0_q]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_2", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
+// WholeModel-DAG: "tf.DumpTensor"(%[[matmul0_q]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_2", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
 // WholeModel-DAG: %[[matmul1_q:.*]] = "tf.XlaCallModule"(%[[matmul0_q]], %[[w0]], %[[b0]]) <{Sout = [#tf_type.shape<?x2>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_and_relu6_dynamic_fn_1, _original_entry_function = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1", _tfl_quant_trait = "fully_quantizable"} : (tensor<?x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<?x2xf32>
-// WholeModel-DAG: "tf.DumpTensor"(%[[matmul1_q]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_1", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
+// WholeModel-DAG: "tf.DumpTensor"(%[[matmul1_q]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_1", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
 // WholeModel-DAG: return %[[matmul1_q]] : tensor<?x2xf32>
 // WholeModel-DAG: func.func private @composite_dot_general_with_bias_and_relu6_dynamic_fn_2
 // WholeModel-DAG: func.func private @composite_dot_general_with_bias_and_relu6_dynamic_fn_1
@@ -46,13 +46,13 @@ module {
 // IntPerLayer-DAG: %[[b0:.*]] = stablehlo.constant dense<[-0.211145893
 // IntPerLayer-DAG: %[[w0:.*]] = stablehlo.constant dense<{{\[\[}}-0.630731344
 // IntPerLayer-DAG: %[[matmul0_q:.*]] = "tf.XlaCallModule"(%arg0, %[[w0]], %[[b0]]) <{Sout = [#tf_type.shape<?x2>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_and_relu6_dynamic_fn_2, _original_entry_function = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2", _tfl_quant_trait = "fully_quantizable"} : (tensor<?x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<?x2xf32>
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_q]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_2", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_q]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_2", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
 // IntPerLayer-DAG: %[[matmul0_uq:.*]] = "tf.XlaCallModule"(%arg0, %[[w0]], %[[b0]]) <{Sout = [#tf_type.shape<?x2>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_and_relu6_dynamic_fn_2_0, _original_entry_function = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2_0"} : (tensor<?x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<?x2xf32>
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_uq]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_2", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_uq]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_2", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
 // IntPerLayer-DAG: %[[matmul1_q:.*]] = "tf.XlaCallModule"(%[[matmul0_q]], %[[w0]], %[[b0]]) <{Sout = [#tf_type.shape<?x2>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_and_relu6_dynamic_fn_1, _original_entry_function = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1", _tfl_quant_trait = "fully_quantizable"} : (tensor<?x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<?x2xf32>
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul1_q]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_1", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul1_q]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_1", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
 // IntPerLayer-DAG: %[[matmul1_uq:.*]] = "tf.XlaCallModule"(%[[matmul0_q]], %[[w0]], %[[b0]]) <{Sout = [#tf_type.shape<?x2>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_and_relu6_dynamic_fn_1_0, _original_entry_function = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1_0"} : (tensor<?x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<?x2xf32>
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul1_uq]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_1", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul1_uq]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_1", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
 // IntPerLayer-DAG: return %[[matmul1_q]] : tensor<?x2xf32>
 // IntPerLayer-DAG: func.func private @composite_dot_general_with_bias_and_relu6_dynamic_fn_2
 // IntPerLayer-DAG: func.func private @composite_dot_general_with_bias_and_relu6_dynamic_fn_1
@@ -63,13 +63,13 @@ module {
 // FloatPerLayer-DAG: %[[b0:.*]] = stablehlo.constant dense<[-0.211145893
 // FloatPerLayer-DAG: %[[w0:.*]] = stablehlo.constant dense<{{\[\[}}-0.630731344
 // FloatPerLayer-DAG: %[[matmul0_q:.*]] = "tf.XlaCallModule"(%arg0, %[[w0]], %[[b0]]) <{Sout = [#tf_type.shape<?x2>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_and_relu6_dynamic_fn_2, _original_entry_function = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2", _tfl_quant_trait = "fully_quantizable"} : (tensor<?x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<?x2xf32>
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_q]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_2", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_q]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_2", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
 // FloatPerLayer-DAG: %[[matmul0_uq:.*]] = "tf.XlaCallModule"(%arg0, %[[w0]], %[[b0]]) <{Sout = [#tf_type.shape<?x2>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_and_relu6_dynamic_fn_2_0, _original_entry_function = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2_0"} : (tensor<?x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<?x2xf32>
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_uq]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_2", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_uq]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_2", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
 // FloatPerLayer-DAG: %[[matmul1_q:.*]] = "tf.XlaCallModule"(%[[matmul0_uq]], %[[w0]], %[[b0]]) <{Sout = [#tf_type.shape<?x2>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_and_relu6_dynamic_fn_1, _original_entry_function = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1", _tfl_quant_trait = "fully_quantizable"} : (tensor<?x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<?x2xf32>
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[matmul1_q]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_1", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[matmul1_q]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_1", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
 // FloatPerLayer-DAG: %[[matmul1_uq:.*]] = "tf.XlaCallModule"(%[[matmul0_uq]], %[[w0]], %[[b0]]) <{Sout = [#tf_type.shape<?x2>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_and_relu6_dynamic_fn_1_0, _original_entry_function = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1_0"} : (tensor<?x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<?x2xf32>
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[matmul1_uq]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_1", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[matmul1_uq]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_1", node_name = "_empty_node"}> : (tensor<?x2xf32>) -> ()
 // FloatPerLayer-DAG: return %[[matmul1_uq]] : tensor<?x2xf32>
 // FloatPerLayer-DAG: func.func private @composite_dot_general_with_bias_and_relu6_dynamic_fn_2
 // FloatPerLayer-DAG: func.func private @composite_dot_general_with_bias_and_relu6_dynamic_fn_1
@@ -96,7 +96,7 @@ module {
 // WholeModel-DAG: %[[w0:.*]] = stablehlo.constant dense<{{\[\[}}-0.630731344
 // WholeModel-DAG: %[[c0:.*]] = stablehlo.constant dense<1.000000e+00
 // WholeModel-DAG: %[[matmul0_q:.*]] = "tf.XlaCallModule"(%arg0, %[[w0]]) <{Sout = [#tf_type.shape<1x3>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
-// WholeModel-DAG: "tf.DumpTensor"(%[[matmul0_q]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_fn_1", node_name = "_empty_node"}> : (tensor<1x3xf32>) -> ()
+// WholeModel-DAG: "tf.DumpTensor"(%[[matmul0_q]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_fn_1", node_name = "_empty_node"}> : (tensor<1x3xf32>) -> ()
 // WholeModel-DAG: %[[concat:.*]] = stablehlo.concatenate %[[matmul0_q]], %[[c0]], dim = 0 : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<2x3xf32>
 // WholeModel-DAG: return %[[concat]] : tensor<2x3xf32>
 // WholeModel-DAG: func.func private @composite_dot_general_fn_1
@@ -105,9 +105,9 @@ module {
 // IntPerLayer-DAG: %[[w0:.*]] = stablehlo.constant dense<{{\[\[}}-0.630731344
 // IntPerLayer-DAG: %[[c0:.*]] = stablehlo.constant dense<1.000000e+00
 // IntPerLayer-DAG: %[[matmul0_q:.*]] = "tf.XlaCallModule"(%arg0, %[[w0]]) <{Sout = [#tf_type.shape<1x3>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_q]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "composite_dot_general_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_fn_1", node_name = "_empty_node"}> : (tensor<1x3xf32>) -> ()
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_q]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "composite_dot_general_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_fn_1", node_name = "_empty_node"}> : (tensor<1x3xf32>) -> ()
 // IntPerLayer-DAG: %[[matmul0_uq:.*]] = "tf.XlaCallModule"(%arg0, %[[w0]]) <{Sout = [#tf_type.shape<1x3>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1_0, _original_entry_function = "composite_dot_general_fn_1_0", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
-// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_uq]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_fn_1", node_name = "_empty_node"}> : (tensor<1x3xf32>) -> ()
+// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_uq]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_fn_1", node_name = "_empty_node"}> : (tensor<1x3xf32>) -> ()
 // IntPerLayer-DAG: %[[concat:.*]] = stablehlo.concatenate %[[matmul0_q]], %[[c0]], dim = 0 : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<2x3xf32>
 // IntPerLayer-DAG: return %[[concat]] : tensor<2x3xf32>
 // IntPerLayer-DAG: func.func private @composite_dot_general_fn_1
@@ -117,9 +117,9 @@ module {
 // FloatPerLayer-DAG: %[[w0:.*]] = stablehlo.constant dense<{{\[\[}}-0.630731344
 // FloatPerLayer-DAG: %[[c0:.*]] = stablehlo.constant dense<1.000000e+00
 // FloatPerLayer-DAG: %[[matmul0_q:.*]] = "tf.XlaCallModule"(%arg0, %[[w0]]) <{Sout = [#tf_type.shape<1x3>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_q]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "composite_dot_general_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_fn_1", node_name = "_empty_node"}> : (tensor<1x3xf32>) -> ()
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_q]]) <{enabled = true, file_name = "quantized_tensor_data.pb", func_name = "composite_dot_general_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_fn_1", node_name = "_empty_node"}> : (tensor<1x3xf32>) -> ()
 // FloatPerLayer-DAG: %[[matmul0_uq:.*]] = "tf.XlaCallModule"(%arg0, %[[w0]]) <{Sout = [#tf_type.shape<1x3>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1_0, _original_entry_function = "composite_dot_general_fn_1_0", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32>
-// FloatPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_uq]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_fn_1", node_name = "_empty_node"}> : (tensor<1x3xf32>) -> ()
+// FloatPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_uq]]) <{enabled = true, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_fn_1", node_name = "_empty_node"}> : (tensor<1x3xf32>) -> ()
 // FloatPerLayer-DAG: %[[concat:.*]] = stablehlo.concatenate %[[matmul0_uq]], %[[c0]], dim = 0 : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<2x3xf32>
 // FloatPerLayer-DAG: return %[[concat]] : tensor<2x3xf32>
 // FloatPerLayer-DAG: func.func private @composite_dot_general_fn_1
diff --git a/tensorflow/compiler/mlir/stablehlo/BUILD b/tensorflow/compiler/mlir/stablehlo/BUILD
index 81cdce725460fb..5d5342e8a264c4 100644
--- a/tensorflow/compiler/mlir/stablehlo/BUILD
+++ b/tensorflow/compiler/mlir/stablehlo/BUILD
@@ -1,6 +1,6 @@
-load("//tensorflow:strict.default.bzl", "py_strict_test")
-load("//tensorflow:pytype.default.bzl", "pytype_strict_library")
 load("@local_tsl//tsl:tsl.default.bzl", "tsl_pybind_extension")
+load("//tensorflow:pytype.default.bzl", "pytype_strict_library")
+load("//tensorflow:strict.default.bzl", "py_strict_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD
index c1c8966849e4b9..26d5e4d52b41d7 100644
--- a/tensorflow/compiler/mlir/tensorflow/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/BUILD
@@ -1,3 +1,5 @@
+# buildifier: disable=out-of-order-load
+
 load("//tensorflow:strict.default.bzl", "py_strict_library")
 
 # copybara:uncomment_begin(google-only)
@@ -354,12 +356,15 @@ cc_library(
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:BytecodeOpInterface",
+        "@llvm-project//mlir:CallOpInterfaces",
         "@llvm-project//mlir:ControlFlowInterfaces",
         "@llvm-project//mlir:DerivedAttributeOpInterface",
         "@llvm-project//mlir:Dialect",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:InferTypeOpInterface",
+        "@llvm-project//mlir:InliningUtils",
         "@llvm-project//mlir:LoopLikeInterface",
         "@llvm-project//mlir:Parser",
         "@llvm-project//mlir:SideEffectInterfaces",
@@ -400,12 +405,15 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:BytecodeOpInterface",
+        "@llvm-project//mlir:CallOpInterfaces",
         "@llvm-project//mlir:ControlFlowInterfaces",
         "@llvm-project//mlir:DerivedAttributeOpInterface",
         "@llvm-project//mlir:Dialect",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:InferTypeOpInterface",
+        "@llvm-project//mlir:InliningUtils",
         "@llvm-project//mlir:LoopLikeInterface",
         "@llvm-project//mlir:Parser",
         "@llvm-project//mlir:SideEffectInterfaces",
@@ -447,12 +455,15 @@ cc_library(
         "//tensorflow/core/common_runtime:lower_function_call_inline_policy",
         "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:BytecodeOpInterface",
+        "@llvm-project//mlir:CallOpInterfaces",
         "@llvm-project//mlir:ControlFlowInterfaces",
         "@llvm-project//mlir:DerivedAttributeOpInterface",
         "@llvm-project//mlir:Dialect",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:InferTypeOpInterface",
+        "@llvm-project//mlir:InliningUtils",
         "@llvm-project//mlir:LoopLikeInterface",
         "@llvm-project//mlir:Parser",
         "@llvm-project//mlir:SideEffectInterfaces",
@@ -521,7 +532,6 @@ cc_library(
         "ir/tf_saved_model.h",
         "ir/tf_structs.h",
         "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.h",
-        "@llvm-project//mlir:include/mlir/Transforms/InliningUtils.h",
     ],
     includes = ["include"],
     visibility = ["//visibility:public"],
@@ -549,6 +559,7 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:Analysis",
         "@llvm-project//mlir:ArithDialect",
+        "@llvm-project//mlir:BytecodeOpInterface",
         "@llvm-project//mlir:CallOpInterfacesIncGen",
         "@llvm-project//mlir:ControlFlowDialect",
         "@llvm-project//mlir:ControlFlowInterfaces",
@@ -558,6 +569,7 @@ cc_library(
         "@llvm-project//mlir:FuncExtensions",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:InferTypeOpInterface",
+        "@llvm-project//mlir:InliningUtils",
         "@llvm-project//mlir:LoopLikeInterface",
         "@llvm-project//mlir:MLProgramDialect",
         "@llvm-project//mlir:Parser",
@@ -672,6 +684,7 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:Support",
     ],
 )
@@ -1366,6 +1379,9 @@ cc_library(
     deps = [
         ":tensorflow",
         "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
@@ -1598,6 +1614,7 @@ cc_library(
         "tensorflow_side_effects",
         "tensorflow_types",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:Support",
     ],
 )
diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.cc b/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.cc
index d0a05e45617cf6..5ceda80490f688 100644
--- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.cc
+++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.h"
 
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/Debug.h"
 #include "mlir/Analysis/DataFlow/SparseAnalysis.h"  // from @llvm-project
 #include "mlir/Analysis/DataFlowFramework.h"  // from @llvm-project
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
@@ -55,9 +54,6 @@ ResourceConstructingOps ResourceConstructingOps::EntryState(Value value) {
           tf_saved_model::GlobalTensorOp>(func, barg.getArgNumber(),
                                           symbol_table);
       ResourceConstructingOps result(global_tensor);
-      if (func.getArgAttr(barg.getArgNumber(), kCompositeDevice)) {
-        result.is_on_composite_device = true;
-      }
       return result;
     }
   } else if (auto vh = dyn_cast<TF::VarHandleOp>(value.getDefiningOp())) {
@@ -75,17 +71,47 @@ ResourceConstructingOps ResourceConstructingOps::join(
   ResourceConstructingOps ret;
   ret.ops.insert(lhs.ops.begin(), lhs.ops.end());
   ret.ops.insert(rhs.ops.begin(), rhs.ops.end());
-  ret.is_on_composite_device =
-      lhs.is_on_composite_device || rhs.is_on_composite_device;
   return ret;
 }
 
 void ResourceConstructingOps::print(raw_ostream &os) const {
   llvm::interleaveComma(ops, os << "[");
+  os << "]";
+}
+
+IsComposite::IsComposite(Operation *op) {}
+
+IsComposite IsComposite::EntryState(MLIRContext *context) {
+  return IsComposite();
+}
+
+IsComposite IsComposite::EntryState(Value value) {
+  IsComposite result;
+  if (auto barg = value.dyn_cast<BlockArgument>()) {
+    if (func::FuncOp func =
+            dyn_cast<func::FuncOp>(barg.getOwner()->getParentOp())) {
+      if (func.getArgAttr(barg.getArgNumber(), kCompositeDevice)) {
+        result.is_on_composite_device = true;
+      }
+      return result;
+    }
+  }
+  return result;
+}
+
+IsComposite IsComposite::join(const IsComposite &lhs, const IsComposite &rhs) {
+  IsComposite ret;
+  ret.is_on_composite_device =
+      lhs.is_on_composite_device || rhs.is_on_composite_device;
+  return ret;
+}
+
+void IsComposite::print(raw_ostream &os) const {
   if (is_on_composite_device) {
-    os << " COMPOSITE";
+    os << "COMPOSITE";
+  } else {
+    os << "NOT_COMPOSITE";
   }
-  os << "]";
 }
 
 class ResourceDataflowAnalysis
@@ -94,23 +120,32 @@ class ResourceDataflowAnalysis
   using TensorflowDataflowAnalysis<
       ResourceConstructingOps>::TensorflowDataflowAnalysis;
   void visitOperation(Operation *op, ArrayRef<const StateT *> operands,
-                      ArrayRef<StateT *> results) override;
+                      ArrayRef<StateT *> results) override {
+    if (ForwardThroughTFOperation(op, operands, results)) return;
+    setAllToEntryStates(results);
+  }
   ~ResourceDataflowAnalysis() override = default;
 };
 
-void ResourceDataflowAnalysis::visitOperation(Operation *op,
-                                              ArrayRef<const StateT *> operands,
-                                              ArrayRef<StateT *> results) {
-  LLVM_DEBUG(llvm::dbgs() << "ResAn: Visiting operation: " << *op << "\n");
-
-  if (ForwardThroughTFOperation(op, operands, results)) return;
-
-  setAllToEntryStates(results);
-}
+class IsCompositeDataflowAnalysis
+    : public TensorflowDataflowAnalysis<IsComposite> {
+ public:
+  using TensorflowDataflowAnalysis<IsComposite>::TensorflowDataflowAnalysis;
+  void visitOperation(Operation *op, ArrayRef<const StateT *> operands,
+                      ArrayRef<StateT *> results) override {
+    if (ForwardThroughTFOperation(op, operands, results)) return;
+    setAllToEntryStates(results);
+  }
+  ~IsCompositeDataflowAnalysis() override = default;
+};
 
 void LoadResourceDataflowAnalysis(DataFlowSolver &solver) {
   solver.load<ResourceDataflowAnalysis>();
 }
 
+void LoadIsCompositeDataflowAnalysis(DataFlowSolver &solver) {
+  solver.load<IsCompositeDataflowAnalysis>();
+}
+
 }  // namespace TF
 }  // namespace mlir
diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.h b/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.h
index 9015b9dc739634..0cf3611af1d20c 100644
--- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.h
+++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.h
@@ -46,8 +46,7 @@ struct ResourceConstructingOps {
   static ResourceConstructingOps EntryState(MLIRContext *context);
   static ResourceConstructingOps EntryState(Value value);
   bool operator==(const ResourceConstructingOps &rhs) const {
-    return ops == rhs.ops &&
-           is_on_composite_device == rhs.is_on_composite_device;
+    return ops == rhs.ops;
   }
 
   static ResourceConstructingOps join(const ResourceConstructingOps &lhs,
@@ -57,13 +56,27 @@ struct ResourceConstructingOps {
   // The operation(s) which created the resource value.
   // IR constructs (i.e., GlobalTensorOp) are not const-correct.
   mutable DenseSet<Operation *> ops;
+};
+
+struct IsComposite {
+  explicit IsComposite(Operation *op = nullptr);
+  static IsComposite EntryState(MLIRContext *context);
+  static IsComposite EntryState(Value value);
+  bool operator==(const IsComposite &rhs) const {
+    return is_on_composite_device == rhs.is_on_composite_device;
+  }
+
+  static IsComposite join(const IsComposite &lhs, const IsComposite &rhs);
+  void print(raw_ostream &os) const;
 
   bool is_on_composite_device = false;
 };
 
 typedef dataflow::Lattice<ResourceConstructingOps> ResourceDataflowState;
+typedef dataflow::Lattice<IsComposite> IsCompositeDataflowState;
 
 void LoadResourceDataflowAnalysis(DataFlowSolver &solver);
+void LoadIsCompositeDataflowAnalysis(DataFlowSolver &solver);
 
 }  // namespace TF
 }  // namespace mlir
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/BUILD b/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/BUILD
index ddef04d4185e1d..ccf7b0b547ab90 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/ir/host_runtime/BUILD
@@ -73,6 +73,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core/framework:resource_handle",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:BytecodeOpInterface",
         "@llvm-project//mlir:ControlFlowInterfaces",
         "@llvm-project//mlir:DerivedAttributeOpInterface",
         "@llvm-project//mlir:Dialect",
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/BUILD
index a6ecd2d9fbe91f..35336a005eba0c 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/tests/BUILD
@@ -1,6 +1,6 @@
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.default.bzl", "filegroup")
+load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir
index 55b68e5de2fb5f..db28242944434e 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir
@@ -2196,6 +2196,73 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc
 
 // -----
 
+// Tests inputs to TPUComputation that are tiled in multiple dimensions with
+// replicate_on_last_tile_dim set.
+
+// The following OpSharding is used for TPU computation inputs in below test:
+// Proto debug string:
+//  input 0
+//   type: OTHER
+//   tile_assignment_dimensions: 2
+//   tile_assignment_dimensions: 1
+//   tile_assignment_dimensions: 2
+//   tile_assignment_devices: 0
+//   tile_assignment_devices: 1
+//   tile_assignment_devices: 2
+//   tile_assignment_devices: 3
+//   replicate_on_last_tile_dim: true
+// Serialized string:
+//  "\08\03\1A\03\02\01\02\22\04\00\01\02\030\01"
+//
+// input 1
+//  type: MAXIMAL
+//  tile_assignment_dimensions: 1
+//  tile_assignment_devices: 1
+// Serialized string:
+//  "\08\01\1A\01\01\22\01\01"
+
+module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0"]} {
+  // CHECK-LABEL: func @multi_dimension_tiled_input_replicate_last_dim
+  // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor<128x10xf32>, %[[ARG_1:[a-z0-9]*]]: tensor<128x10xf32>, %[[ARG_2:[a-z0-9]*]]: tensor<*xi32>, %[[ARG_3:[a-z0-9]*]]: tensor<*xi32>)
+  func.func @multi_dimension_tiled_input_replicate_last_dim(%arg0: tensor<128x10xf32>, %arg1: tensor<128x10xf32>, %arg2: tensor<*xi32>, %arg3: tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) {
+    // CHECK: tf_device.replicate
+    // CHECK-SAME: [%[[ARG_0]], %[[ARG_1]]] as %[[RI_0:[a-z0-9]*]]: tensor<128x10xf32>
+    // CHECK-SAME: [%[[ARG_2]], %[[ARG_3]]] as %[[RI_1:[a-z0-9]*]]: tensor<*xi32>
+    %0:2, %1:2 = tf_device.replicate([%arg0, %arg1] as %ri_1: tensor<128x10xf32>, [%arg2, %arg3] as %ri_2: tensor<*xi32>) {n = 2 : i32} {
+      // CHECK:      %[[COMPILE:[a-z0-9]+]]:5 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
+      // CHECK-NEXT:   "tf._TPUCompileMlir"
+      // CHECK:      "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
+      // CHECK-NEXT:   "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0)
+      // CHECK:      %[[CONST_SPLIT_0_DIM:.*]] = "tf.Const"()
+      // CHECK:      %[[SPLIT_0_OUT:[a-z0-9]+]]:2 = "tf.Split"(%[[CONST_SPLIT_0_DIM]], %[[RI_0]])
+      // CHECK:      %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:5 = "tf_device.parallel_execute"
+      // CHECK-NEXT:   %[[LAUNCH_0_OUTPUT:[0-9]*]]:2 = "tf_device.launch"
+      // CHECK-NEXT:     %[[EXECUTE_0_OUTPUT:[0-9]*]]:2 = "tf.TPUExecute"(%[[SPLIT_0_OUT]]#0, %[[COMPILE]]#1)
+      // CHECK:          tf_device.return %[[EXECUTE_0_OUTPUT]]
+      // CHECK:        %[[LAUNCH_1_OUTPUT:[0-9]*]] = "tf_device.launch"
+      // CHECK-NEXT:     %[[EXECUTE_1_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[SPLIT_0_OUT]]#0, %[[RI_1]], %[[COMPILE]]#2)
+      // CHECK:          tf_device.return %[[EXECUTE_1_OUTPUT]]
+      // CHECK:        %[[LAUNCH_2_OUTPUT:[0-9]*]] = "tf_device.launch"
+      // CHECK-NEXT:     %[[EXECUTE_2_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[SPLIT_0_OUT]]#1, %[[COMPILE]]#3)
+      // CHECK:          tf_device.return %[[EXECUTE_2_OUTPUT]]
+      // CHECK:        %[[LAUNCH_3_OUTPUT:[0-9]*]] = "tf_device.launch"
+      // CHECK-NEXT:     %[[EXECUTE_3_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[SPLIT_0_OUT]]#1, %[[COMPILE]]#4)
+      // CHECK:          tf_device.return %[[EXECUTE_3_OUTPUT]]
+      %1, %2 = "tf_device.cluster_func"(%ri_1, %ri_2) {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @tpu0_func, num_cores_per_replica = 4, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "\0A\04\02\02\01\02\10\01\18\08\22 \00\00\00\00\00\00\00\01\01\00\00\00\01\00\00\01\00\01\00\00\00\01\00\01\01\01\00\00\01\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1], input_sharding_configuration = ["\08\03\1A\03\02\01\02\22\04\00\01\02\030\01", "\08\01\1A\01\01\22\01\01"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", ""], use_spmd_for_xla_partitioning = false} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>)
+      tf_device.return %1, %2 : tensor<*xi32>, tensor<*xi1>
+    }
+    func.return %0#0, %1#0 : tensor<*xi32>, tensor<*xi1>
+  }
+  func.func @tpu0_func(%arg0: tensor<128x10xf32>, %arg1: tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) {
+    %1, %2 = "tf.A"(%arg0) : (tensor<128x10xf32>) -> (tensor<*xi32>, tensor<*xi1>)
+    %4 = "tf.B"(%1, %arg1) : (tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi32>)
+    %3 = "tf.XlaSharding"(%2) { _XlaSharding = "", sharding = "" } : (tensor<*xi1>) -> tensor<*xi1>
+    func.return %4, %3 : tensor<*xi32>, tensor<*xi1>
+  }
+}
+
+// -----
+
 // Tests that tiled output with multiple dimension sharding works properly.
 
 // The following OpSharding is used for TPU computation outputs in below test:
@@ -2278,6 +2345,73 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc
 
 // -----
 
+// Tests that tiled output with multiple dimension sharding works properly with
+// replicate_on_last_tile_dim set.
+
+// The following OpSharding is used for TPU computation outputs in below test:
+// output 0
+//  Proto debug string:
+//   type: OTHER
+//   tile_assignment_dimensions: 2
+//   tile_assignment_dimensions: 1
+//   tile_assignment_dimensions: 2
+//   tile_assignment_devices: 0
+//   tile_assignment_devices: 1
+//   tile_assignment_devices: 2
+//   tile_assignment_devices: 3
+//   replicate_on_last_tile_dim: true
+// Serialized string:
+//  "\08\03\1A\03\02\01\02\22\04\00\01\02\030\01"
+//
+// output 1
+//  Proto debug string:
+//  type: MAXIMAL
+//  tile_assignment_dimensions: 1
+//  tile_assignment_devices: 0
+// Serialized string:
+//  "\08\01\1A\01\01\22\01\00"
+
+module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0"]} {
+  // CHECK-LABEL: func @multi_dimension_tiled_output_replicate_last_dim
+  func.func @multi_dimension_tiled_output_replicate_last_dim(%arg0: tensor<128x10xf32>, %arg1: tensor<128x10xf32>, %arg2: tensor<*xi32>, %arg3: tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) {
+    // CHECK: tf_device.replicate
+    // CHECK-SAME: [%[[ARG_0]], %[[ARG_1]]] as %[[RI_0:[a-z0-9]*]]: tensor<128x10xf32>
+    // CHECK-SAME: [%[[ARG_2]], %[[ARG_3]]] as %[[RI_1:[a-z0-9]*]]: tensor<*xi32>
+    %0:2, %1:2 = tf_device.replicate([%arg0, %arg1] as %ri_1: tensor<128x10xf32>, [%arg2, %arg3] as %ri_2: tensor<*xi32>) {n = 2 : i32} {
+      // CHECK:      %[[COMPILE:[a-z0-9]+]]:5 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
+      // CHECK-NEXT:   "tf._TPUCompileMlir"
+      // CHECK:      "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
+      // CHECK-NEXT:   "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0)
+      // CHECK:      %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:5 = "tf_device.parallel_execute"
+      // CHECK-NEXT:   %[[LAUNCH_0_OUTPUT:[0-9]*]]:2 = "tf_device.launch"
+      // CHECK-NEXT:     %[[EXECUTE_0_OUTPUT:[0-9]*]]:2 = "tf.TPUExecute"
+      // CHECK:          tf_device.return %[[EXECUTE_0_OUTPUT]]
+      // CHECK:        %[[LAUNCH_1_OUTPUT:[0-9]*]] = "tf_device.launch"
+      // CHECK-NEXT:     %[[EXECUTE_1_OUTPUT:[0-9]*]] = "tf.TPUExecute"
+      // CHECK:          tf_device.return %[[EXECUTE_1_OUTPUT]]
+      // CHECK:        %[[LAUNCH_2_OUTPUT:[0-9]*]] = "tf_device.launch"
+      // CHECK-NEXT:     %[[EXECUTE_2_OUTPUT:[0-9]*]] = "tf.TPUExecute"(
+      // CHECK:          tf_device.return %[[EXECUTE_2_OUTPUT]]
+      // CHECK:        %[[LAUNCH_3_OUTPUT:[0-9]*]] = "tf_device.launch"
+      // CHECK-NEXT:     %[[EXECUTE_3_OUTPUT:[0-9]*]] = "tf.TPUExecute"(
+      // CHECK:          tf_device.return %[[EXECUTE_3_OUTPUT]]
+      // CHECK:     %[[CONST_CONCAT_DIM:.*]] = "tf.Const"()
+      // CHECK:     %[[CONCAT_OUTPUT:[0-9]*]] = "tf.Concat"(%[[CONST_CONCAT_DIM]], %[[PARALLEL_EXECUTE_OUTPUT]]#0, %[[PARALLEL_EXECUTE_OUTPUT]]#3
+      %1, %2 = "tf_device.cluster_func"(%ri_1, %ri_2) {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @tpu0_func, num_cores_per_replica = 4, step_marker_location = "", topology = "\0A\04\02\02\01\02\10\01\18\08\22 \00\00\00\00\00\00\00\01\01\00\00\00\01\00\00\01\00\01\00\00\00\01\00\01\01\01\00\00\01\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\03\1A\03\02\01\02\22\04\00\01\02\030\01", "\08\01\1A\01\01\22\01\00"], use_spmd_for_xla_partitioning = false} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>)
+      tf_device.return %1, %2 : tensor<*xi32>, tensor<*xi1>
+    }
+    func.return %0#0, %1#0 : tensor<*xi32>, tensor<*xi1>
+  }
+  func.func @tpu0_func(%arg0: tensor<128x10xf32>, %arg1: tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>) {
+    %1, %2 = "tf.A"(%arg0) : (tensor<128x10xf32>) -> (tensor<*xi32>, tensor<*xi1>)
+    %4 = "tf.B"(%1, %arg1) : (tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi32>)
+    %3 = "tf.XlaSharding"(%2) { _XlaSharding = "", sharding = "" } : (tensor<*xi1>) -> tensor<*xi1>
+    func.return %4, %3 : tensor<*xi32>, tensor<*xi1>
+  }
+}
+
+// -----
+
 // Tests inputs device assignment order is well preserved for tiled input sharding.
 
 // The following OpSharding is used for TPU computation inputs in below test:
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD
index 2bbd90a3aeeebc..3d1cf1bd58fa38 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD
@@ -60,6 +60,7 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
 )
@@ -337,6 +338,7 @@ cc_library(
         "@llvm-project//mlir:MLProgramDialect",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
 )
@@ -458,9 +460,6 @@ cc_library(
         "device_index_selector.cc",
         "drop_while_shape_invariant.cc",
         "einsum.cc",
-        "embedding_pipelining.cc",
-        "embedding_program_key.cc",
-        "embedding_sequencing.cc",
         "executor_island_coarsening.cc",
         "executor_tpuv1_inline_tpu_island.cc",
         "executor_tpuv1_island_coarsening.cc",
@@ -652,11 +651,13 @@ cc_library(
         "@llvm-project//mlir:FunctionInterfaces",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:InferTypeOpInterface",
+        "@llvm-project//mlir:InliningUtils",
         "@llvm-project//mlir:Parser",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:QuantOps",
         "@llvm-project//mlir:Rewrite",
         "@llvm-project//mlir:SCFDialect",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TensorDialect",
         "@llvm-project//mlir:TransformUtils",
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc b/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc
index f265ac68fa5f27..4de43317677f63 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc
@@ -145,7 +145,7 @@ bool OnlyOperatesOnCompositeDevices(
       continue;
     }
     auto lattice =
-        solver.lookupState<TF::ResourceDataflowState>(arg.get())->getValue();
+        solver.lookupState<TF::IsCompositeDataflowState>(arg.get())->getValue();
     bool is_read = read_array.contains(arg.getOperandNumber());
     bool is_update = update_array.contains(arg.getOperandNumber());
     // We want the resource operands that are on composite devices to be the
@@ -214,7 +214,7 @@ void CollectChainResources(
       // device-specific (see below).
       bool resource_is_on_composite_device = false;
       for (Value value : alias_analysis.GetValuesForResourceId(resource_id)) {
-        auto lattice = solver.lookupState<TF::ResourceDataflowState>(value);
+        auto lattice = solver.lookupState<TF::IsCompositeDataflowState>(value);
         if (lattice) {
           resource_is_on_composite_device |=
               lattice->getValue().is_on_composite_device;
@@ -604,7 +604,7 @@ void ConvertControlToDataOutputsPass::runOnOperation() {
   DataFlowSolver solver;
   solver.load<dataflow::DeadCodeAnalysis>();
   solver.load<dataflow::SparseConstantPropagation>();
-  TF::LoadResourceDataflowAnalysis(solver);
+  TF::LoadIsCompositeDataflowAnalysis(solver);
   if (failed(solver.initializeAndRun(module))) return signalPassFailure();
 
   // This pass assumes that all functions are suitable for export i.e., each
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD
index 5d500453d17fe0..f8e75d9032f3e5 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/BUILD
@@ -26,11 +26,12 @@ cc_library(
     deps = [
         ":runtime_passes",
         "//tensorflow/compiler/jit:flags_headers",
+        "//tensorflow/compiler/mlir/tensorflow:attribute_utils",
         "//tensorflow/compiler/mlir/tensorflow:bridge_logger",
         "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util",
         "//tensorflow/compiler/mlir/tensorflow:error_util",
-        "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes",
         "//tensorflow/compiler/mlir/tensorflow/transforms:verify_no_outside_compilation_markers_pass",
+        "//tensorflow/compiler/mlir/tensorflow/transforms/sparsecore:sparsecore_passes",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core/platform:error_payloads",
@@ -62,6 +63,7 @@ tf_cc_test(
         ":lower_cluster_to_runtime_ops",
         "//tensorflow/compiler/mlir:register_common_dialects",
         "//tensorflow/compiler/mlir/tensorflow",
+        "//tensorflow/compiler/mlir/tensorflow:attribute_utils",
         "//tensorflow/compiler/tf2xla:xla_op_registry",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.cc
index 6f46766a3250fa..a239c7304a0ae0 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.cc
@@ -28,6 +28,8 @@ limitations under the License.
 #include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/runtime_passes.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.h"
+#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h"
@@ -121,6 +123,7 @@ void CreateNonTPULowerClusterToRuntimeOpsPassPipeline(
 // TODO(b/306728216): Move this out of the Bridge component and into a Host
 // runtime component.
 tensorflow::Status RecordIfErrorStatus(const std::string error_prefix,
+                                       std::string bridge_type,
                                        tsl::DeviceType device_type,
                                        absl::Status status) {
   if (status.ok()) {
@@ -129,11 +132,12 @@ tensorflow::Status RecordIfErrorStatus(const std::string error_prefix,
 
   VLOG(2) << error_prefix << " " << status;
   tensorflow::metrics::UpdateTfMlirBridgeFirstPhaseCounter(
-      device_type.type_string(), /*bridge_version=*/"v2",
+      bridge_type,
+      /*bridge_version=*/mlir::TF::kMlirPh1BridgeCounterV2,
+      device_type.type_string(),
       /*fallback_enabled=*/false,
       /*result=*/"failure");
 
-  constexpr char kBridgeComponent[] = "TFXLABridge";
   std::string bridge_subcomponent = "TFXLA_PHASE_ONE_MLIR_TPU_BRIDGE";
 
   tsl::OkOrSetErrorCounterPayload(
@@ -144,7 +148,7 @@ tensorflow::Status RecordIfErrorStatus(const std::string error_prefix,
     bridge_subcomponent = "TFXLA_PHASE_ONE_MLIR_CPU/GPU_BRIDGE";
   }
 
-  tsl::error_logging::Log(kBridgeComponent, bridge_subcomponent,
+  tsl::error_logging::Log(mlir::TF::kBridgeComponent, bridge_subcomponent,
                           status.ToString())
       .IgnoreError();
 
@@ -194,10 +198,13 @@ absl::Status RunLowerClusterToRuntimeOpsPassPipeline(
         module, llvm::StringRef(), &runtime_lowering);
   }
 
+  std::string bridge_type = xla_device_type == DeviceType(DEVICE_TPU_XLA_JIT)
+                                ? mlir::TF::kMlirPh1BridgeCounterReplicated
+                                : mlir::TF::kMlirPh1BridgeCounterNonReplicated;
   auto result_status = diag_handler.ConsumeStatus();
   TF_RETURN_IF_ERROR(
       RecordIfErrorStatus(/*error_prefix=*/"lower_cluster_to_runtime",
-                          xla_device_type, result_status));
+                          bridge_type, xla_device_type, result_status));
 
   return absl::OkStatus();
 }
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops_test.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops_test.cc
index 3e3e8db504f1da..1f0cf146203de2 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops_test.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops_test.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/register_common_dialects.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
+#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/lib/monitoring/cell_reader.h"
 #include "tensorflow/core/platform/env.h"
@@ -167,9 +168,11 @@ TEST_F(LowerClusterToRuntimeOpsTest, ErrorsWithBadCluster) {
                    *mlir_module_, DeviceType(DEVICE_TPU_XLA_JIT))
                    .ok());
 
-  EXPECT_EQ(compilation_status.Delta("XLA_TPU_JIT", "v2", "fallback_disabled",
-                                     "failure"),
-            1);
+  EXPECT_EQ(
+      compilation_status.Delta(mlir::TF::kMlirPh1BridgeCounterReplicated,
+                               mlir::TF::kMlirPh1BridgeCounterV2, "XLA_TPU_JIT",
+                               "fallback_disabled", "failure"),
+      1);
 }
 
 TEST_F(LowerClusterToRuntimeOpsTest, DumpsPipelinePasses) {
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h
index 9c475f1f9f5281..da89e77cb0862c 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h
@@ -446,13 +446,6 @@ std::unique_ptr<OperationPass<func::FuncOp>> CreateReplicateToIslandPass(
 std::unique_ptr<OperationPass<func::FuncOp>>
 CreateReplicaIDToDeviceOrdinalPass();
 
-// Creates a pass that adds pipelining to a graph that contains device
-// accelerated embeddings. The EmbeddingSequencingPass is a temporary fallback
-// while developing full pipelining capabilities.
-std::unique_ptr<OperationPass<ModuleOp>> CreateEmbeddingSequencingPass();
-std::unique_ptr<OperationPass<ModuleOp>> CreateEmbeddingPipeliningPass();
-std::unique_ptr<OperationPass<func::FuncOp>> CreateEmbeddingProgramKeyPass();
-
 // Creates a pass that creates `tf_executor.island` from a single
 // `tf_device.parallel_execute` island.
 std::unique_ptr<OperationPass<func::FuncOp>> CreateParallelExecuteToIslandsPass(
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/BUILD b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/BUILD
new file mode 100644
index 00000000000000..bff95d357c885f
--- /dev/null
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/BUILD
@@ -0,0 +1,123 @@
+load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library")
+load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
+
+package(
+    # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
+    default_visibility = [
+        "//tensorflow/compiler/mlir:__pkg__",
+        "//tensorflow/compiler/mlir/tensorflow/transforms:__pkg__",
+        "//tensorflow/compiler/mlir/tensorflow/transforms/host_runtime:__pkg__",
+        "//tensorflow/compiler/mlir/tf2xla/api:__subpackages__",
+        "//tensorflow/compiler/mlir/tf2xla/internal:__pkg__",
+    ],
+    licenses = ["notice"],
+)
+
+gentbl_cc_library(
+    name = "sparsecore_passes_inc_gen",
+    compatible_with = get_compatible_with_portable(),
+    tbl_outs = [
+        (
+            [
+                "-gen-pass-decls",
+                "-name=SparseCore",
+            ],
+            "sparsecore_passes.h.inc",
+        ),
+    ],
+    tblgen = "@llvm-project//mlir:mlir-tblgen",
+    td_file = "sparsecore_passes.td",
+    deps = [
+        "@llvm-project//mlir:PassBaseTdFiles",
+    ],
+)
+
+cc_library(
+    name = "sparsecore_passes",
+    hdrs = [
+        "sparsecore_passes.h",
+    ],
+    textual_hdrs = [
+        "sparsecore_passes.h.inc",
+    ],
+    deps = [
+        ":embedding_pipelining",
+        ":embedding_program_key",
+        ":embedding_sequencing",
+        ":sparsecore_passes_inc_gen",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:FuncDialect",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
+    ],
+)
+
+cc_library(
+    name = "embedding_pipelining",
+    srcs = ["embedding_pipelining.cc"],
+    hdrs = [
+        "sparsecore_passes.h",
+    ],
+    deps = [
+        ":sparsecore_passes_inc_gen",
+        "//tensorflow/compiler/jit:flags_headers",
+        "//tensorflow/compiler/mlir/tensorflow",
+        "//tensorflow/compiler/mlir/tensorflow:attribute_utils",
+        "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops",
+        "//tensorflow/compiler/mlir/tensorflow:tensorflow_types",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/strings",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:FuncDialect",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:InliningUtils",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
+    ],
+)
+
+cc_library(
+    name = "embedding_sequencing",
+    srcs = ["embedding_sequencing.cc"],
+    hdrs = [
+        "sparsecore_passes.h",
+    ],
+    deps = [
+        ":sparsecore_passes_inc_gen",
+        "//tensorflow/compiler/jit:flags_headers",
+        "//tensorflow/compiler/mlir/tensorflow",
+        "//tensorflow/compiler/mlir/tensorflow:attribute_utils",
+        "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops",
+        "//tensorflow/compiler/mlir/tensorflow:tensorflow_types",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/strings",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:FuncDialect",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:InliningUtils",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
+    ],
+)
+
+cc_library(
+    name = "embedding_program_key",
+    srcs = ["embedding_program_key.cc"],
+    hdrs = [
+        "sparsecore_passes.h",
+    ],
+    deps = [
+        ":sparsecore_passes_inc_gen",
+        "//tensorflow/compiler/mlir/tensorflow",
+        "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:FuncDialect",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
+        "@local_xla//xla/mlir_hlo",
+    ],
+)
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/embedding_pipelining.cc b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_pipelining.cc
similarity index 99%
rename from tensorflow/compiler/mlir/tensorflow/transforms/embedding_pipelining.cc
rename to tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_pipelining.cc
index ee334b3f032155..0c450126e4e090 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/embedding_pipelining.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_pipelining.cc
@@ -157,7 +157,7 @@ return selected_results
 #include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h"
 
 #define GEN_PASS_DEF_EMBEDDINGPIPELININGPASS
-#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.h.inc"
 
 static constexpr char kEmbeddingPipelining[] = "_embedding_pipelining";
 static constexpr char kEmbeddingPipeliningInlineAttr[] =
@@ -1289,7 +1289,7 @@ LogicalResult StartStep0(OpBuilder& builder, Location& loc,
   func::FuncOp orig_parent_func =
       callers.backward->getParentOfType<func::FuncOp>();
 
-  std::vector<Value> operands = loop_operands_nm0;
+  const std::vector<Value>& operands = loop_operands_nm0;
 
   // Input types will be the same as the original loop body.
   std::vector<Type> input_types = GetValueTypes(operands);
@@ -1373,7 +1373,7 @@ LogicalResult StartStep1(OpBuilder& builder, Location& loc,
   func::FuncOp orig_parent_func =
       callers.backward->getParentOfType<func::FuncOp>();
 
-  std::vector<Value> operands = loop_operands_1;
+  const std::vector<Value>& operands = loop_operands_1;
 
   // Input types will be the same as the original loop body.
   std::vector<Type> input_types = GetValueTypes(operands);
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/embedding_program_key.cc b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_program_key.cc
similarity index 99%
rename from tensorflow/compiler/mlir/tensorflow/transforms/embedding_program_key.cc
rename to tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_program_key.cc
index a5575ef156ddb9..3e41762feb16c2 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/embedding_program_key.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_program_key.cc
@@ -31,7 +31,6 @@ limitations under the License.
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
-#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
 #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
 
 namespace mlir {
@@ -42,7 +41,7 @@ constexpr char kMiniBatchSplitsAttr[] = "mini_batch_splits";
 constexpr char kMiniBatchCsrAttr[] = "mini_batch_in_csr";
 
 #define GEN_PASS_DEF_EMBEDDINGPROGRAMKEYPASS
-#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.h.inc"
 
 struct EmbeddingProgramKeyPass
     : public impl::EmbeddingProgramKeyPassBase<EmbeddingProgramKeyPass> {
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/embedding_sequencing.cc b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_sequencing.cc
similarity index 98%
rename from tensorflow/compiler/mlir/tensorflow/transforms/embedding_sequencing.cc
rename to tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_sequencing.cc
index a77dd6f498a144..7ed29a3ed58cc3 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/embedding_sequencing.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_sequencing.cc
@@ -32,6 +32,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/log/log.h"
+#include "absl/strings/str_cat.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Support/Casting.h"
@@ -40,6 +42,7 @@ limitations under the License.
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
+#include "mlir/IR/DialectRegistry.h"  // from @llvm-project
 #include "mlir/IR/Location.h"  // from @llvm-project
 #include "mlir/IR/Operation.h"  // from @llvm-project
 #include "mlir/IR/Region.h"  // from @llvm-project
@@ -47,17 +50,20 @@ limitations under the License.
 #include "mlir/IR/Types.h"  // from @llvm-project
 #include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/IR/Visitors.h"  // from @llvm-project
+#include "mlir/Interfaces/CallInterfaces.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Transforms/InliningUtils.h"  // from @llvm-project
 #include "mlir/Transforms/RegionUtils.h"  // from @llvm-project
 #include "tensorflow/compiler/jit/flags.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h"
 
 #define GEN_PASS_DEF_EMBEDDINGSEQUENCINGPASS
-#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.h.inc"
 
 static constexpr char kEmbeddingPipelining[] = "_embedding_pipelining";
 static constexpr char kEmbeddingForward[] = "forward";
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.h
new file mode 100644
index 00000000000000..8944745dd3fff9
--- /dev/null
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.h
@@ -0,0 +1,50 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_SPARSECORE_SPARSECORE_PASSES_H_
+#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_SPARSECORE_SPARSECORE_PASSES_H_
+
+#include <memory>
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/Pass/Pass.h"  // from @llvm-project
+
+namespace mlir {
+namespace TFDevice {
+
+// For architectures that support accelerated embedding lookups, this pass will
+// rewrite the graph to use pipelining for better device utilization.
+std::unique_ptr<OperationPass<ModuleOp>> CreateEmbeddingSequencingPass();
+
+// This is a strictly sequential and formally correct fallback option for the
+// embedding pipelining pass intended for debugging during pipelining
+// development.
+std::unique_ptr<OperationPass<ModuleOp>> CreateEmbeddingPipeliningPass();
+
+// Passes in the program key to embedding ops, by moving the embedding ops
+// after the _TPUCompileMlir op.
+std::unique_ptr<OperationPass<func::FuncOp>> CreateEmbeddingProgramKeyPass();
+
+#define GEN_PASS_REGISTRATION
+#define GEN_PASS_DECL_EMBEDDINGSEQUENCINGPASS
+#define GEN_PASS_DECL_EMBEDDINGPIPELININGPASS
+#define GEN_PASS_DECL_EMBEDDINGPROGRAMKEYPASS
+#include "tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.h.inc"
+
+}  // namespace TFDevice
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_SPARSECORE_SPARSECORE_PASSES_H_
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.td b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.td
new file mode 100644
index 00000000000000..a9c5981393df6c
--- /dev/null
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.td
@@ -0,0 +1,83 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+include "mlir/Pass/PassBase.td"
+
+def EmbeddingPipeliningPass : Pass<"tf-embedding-pipelining", "mlir::ModuleOp"> {
+  let summary = "Rewrite graph for embedding pipelining";
+  let constructor = "TFDevice::CreateEmbeddingPipeliningPass()";
+    let description = [{
+    For architectures that support accelerated embedding lookups, this pass will
+    rewrite the graph to use pipelining for better device utilization.
+  }];
+}
+
+def EmbeddingSequencingPass : Pass<"tf-embedding-sequencing", "mlir::ModuleOp"> {
+  let summary = "Rewrite graph for sequential execution of embeddings";
+  let constructor = "TFDevice::CreateEmbeddingSequencingPass()";
+    let description = [{
+    This is a strictly sequential and formally correct fallback option for the
+    embedding pipelining pass intended for debugging during pipelining
+    development.
+  }];
+}
+
+def EmbeddingProgramKeyPass : Pass<"tf-embedding-program-key", "mlir::func::FuncOp"> {
+  let summary = "Sets the program key for embedding ops.";
+  let constructor = "TFDevice::CreateEmbeddingProgramKeyPass()";
+    let description = [{
+    Passes in the program key to embedding ops. Will move the embedding ops
+    after a _TPUCompileMlir op if there is no predecessor _TPUCompileMlir op.
+    Both the embedding op and compile op are assumed to be wrapped in separate
+    tf_device.launch() ops. This is because the embedding op is head outside
+    compiled and the compile op is wrapped in launch to execute on host
+    during TPURewritePass.
+
+    For example, the tf.OpA with the `mini_batch_splits` attribute will be
+    moved after _TPUCompileMlir and the first input will use the
+    _TPUCompileMlir program output:
+
+    ```mlir
+    "tf_device.launch"() ({
+     %cst_0 = "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string>
+     "tf.OpA"(%cst_0) { mini_batch_splits = ""} : (tensor<1x!tf_type.string>) -> ()
+     tf_device.return
+   }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> ()
+   %0:2 = "tf_device.launch"() ({
+     %compilation_status, %program = "tf._TPUCompileMlir"() { metadata = "...", mlir_module = "..." } : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
+     tf_device.return %compilation_status, %program : tensor<!tf_type.string>, tensor<3x!tf_type.string>
+   }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
+   ```
+
+   becomes:
+
+   ```mlir
+     %0:2 = "tf_device.launch"() ({
+       %compilation_status, %program = "tf._TPUCompileMlir"() {metadata = "...", mlir_module = "..."} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
+       tf_device.return %compilation_status, %program : tensor<!tf_type.string>, tensor<3x!tf_type.string>
+     }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
+     "tf_device.launch"() ({
+       %cst = "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string>
+       "tf.OpA"(%0#1) {mini_batch_splits = ""} : (tensor<3x!tf_type.string>) -> ()
+       tf_device.return
+     }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> ()
+   ```
+  }];
+
+  let dependentDialects = [
+    "mhlo::MhloDialect",
+    "tf_device::TensorFlowDeviceDialect"
+  ];
+}
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td b/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td
index b00e70eb73c4cc..6b53cae7099688 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td
@@ -329,73 +329,6 @@ def ReplicaIDToDeviceOrdinalPass : Pass<"tf-replica-id-to-device-ordinal", "mlir
   }];
 }
 
-def EmbeddingPipeliningPass : Pass<"tf-embedding-pipelining", "mlir::ModuleOp"> {
-  let summary = "Rewrite graph for embedding pipelining";
-  let constructor = "TFDevice::CreateEmbeddingPipeliningPass()";
-    let description = [{
-    For architectures that support accelerated embedding lookups, this pass will
-    rewrite the graph to use pipelining for better device utilization.
-  }];
-}
-
-def EmbeddingProgramKeyPass : Pass<"tf-embedding-program-key", "mlir::func::FuncOp"> {
-  let summary = "Sets the program key for embedding ops.";
-  let constructor = "TFDevice::CreateEmbeddingProgramKeyPass()";
-    let description = [{
-    Passes in the program key to embedding ops. Will move the embedding ops
-    after a _TPUCompileMlir op if there is no predecessor _TPUCompileMlir op.
-    Both the embedding op and compile op are assumed to be wrapped in separate
-    tf_device.launch() ops. This is because the embedding op is head outside
-    compiled and the compile op is wrapped in launch to execute on host
-    during TPURewritePass.
-
-    For example, the tf.OpA with the `mini_batch_splits` attribute will be
-    moved after _TPUCompileMlir and the first input will use the
-    _TPUCompileMlir program output:
-
-    ```mlir
-    "tf_device.launch"() ({
-     %cst_0 = "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string>
-     "tf.OpA"(%cst_0) { mini_batch_splits = ""} : (tensor<1x!tf_type.string>) -> ()
-     tf_device.return
-   }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> ()
-   %0:2 = "tf_device.launch"() ({
-     %compilation_status, %program = "tf._TPUCompileMlir"() { metadata = "...", mlir_module = "..." } : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
-     tf_device.return %compilation_status, %program : tensor<!tf_type.string>, tensor<3x!tf_type.string>
-   }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
-   ```
-
-   becomes:
-
-   ```mlir
-     %0:2 = "tf_device.launch"() ({
-       %compilation_status, %program = "tf._TPUCompileMlir"() {metadata = "...", mlir_module = "..."} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
-       tf_device.return %compilation_status, %program : tensor<!tf_type.string>, tensor<3x!tf_type.string>
-     }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
-     "tf_device.launch"() ({
-       %cst = "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string>
-       "tf.OpA"(%0#1) {mini_batch_splits = ""} : (tensor<3x!tf_type.string>) -> ()
-       tf_device.return
-     }) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> ()
-   ```
-  }];
-
-  let dependentDialects = [
-    "mhlo::MhloDialect",
-    "tf_device::TensorFlowDeviceDialect"
-  ];
-}
-
-def EmbeddingSequencingPass : Pass<"tf-embedding-sequencing", "mlir::ModuleOp"> {
-  let summary = "Rewrite graph for sequential execution of embeddings";
-  let constructor = "TFDevice::CreateEmbeddingSequencingPass()";
-    let description = [{
-    This is a strictly sequential and formally correct fallback option for the
-    embedding pipelining pass intended for debugging during pipelining
-    development.
-  }];
-}
-
 def ConvertReadonlyReferenceVariablesToResourceVariablesPass :
   Pass<"tf-readonly-references-to-resources", "mlir::func::FuncOp"> {
   let summary = "Convert readonly reference variables to resource variables.";
diff --git a/tensorflow/compiler/mlir/tensorflow/translate/BUILD b/tensorflow/compiler/mlir/tensorflow/translate/BUILD
index 46af8590c8108e..59d7cfd7081106 100644
--- a/tensorflow/compiler/mlir/tensorflow/translate/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/translate/BUILD
@@ -122,6 +122,7 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TranslateLib",
+        "@local_tsl//tsl/platform:protobuf",
     ],
     alwayslink = 1,
 )
@@ -287,6 +288,7 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:TranslateLib",
+        "@local_tsl//tsl/platform:protobuf",
         "@local_xla//xla/client:client_library",
         "@local_xla//xla/client:compile_only_client",
         "@local_xla//xla/service/cpu:cpu_compiler",
diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc
index 6042ae37ee8fa2..523048cd7cd582 100644
--- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc
+++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc
@@ -421,7 +421,7 @@ Status Exporter::AddInstructionNode(Operation* inst) {
                           inst, name, /*ignore_unregistered_attrs=*/false));
   UseOriginalFunctionNames(*node_def);
 
-  TF_ASSIGN_OR_RETURN(Node * node, graph_->AddNode(*node_def));
+  TF_ASSIGN_OR_RETURN(Node * node, graph_->AddNode(std::move(*node_def)));
   DCHECK(node != nullptr);
   nodes_[inst] = node;
   return OkStatus();
@@ -436,7 +436,7 @@ bool IsEntryFunctionArg(BlockArgument arg) {
 Status Exporter::AddArgumentNode(BlockArgument arg, unsigned index,
                                  llvm::StringRef name) {
   TF_ASSIGN_OR_RETURN(auto node_def, GetArgumentNode(arg, index, name));
-  TF_ASSIGN_OR_RETURN(Node * node, graph_->AddNode(*node_def));
+  TF_ASSIGN_OR_RETURN(Node * node, graph_->AddNode(std::move(*node_def)));
   args_[arg] = node;
   return OkStatus();
 }
@@ -455,7 +455,7 @@ Status Exporter::AddFetchNode(FuncOp function, mlir::tf_executor::FetchOp fetch,
         GetReturnNode(function, operand_and_idx.value(),
                       operand_and_idx.index(),
                       names.empty() ? "" : names[operand_and_idx.index()]));
-    TF_ASSIGN_OR_RETURN(Node * node, graph_->AddNode(*node_def));
+    TF_ASSIGN_OR_RETURN(Node * node, graph_->AddNode(std::move(*node_def)));
     return_nodes.push_back(node);
   }
   return OkStatus();
@@ -687,15 +687,6 @@ Status Exporter::ConvertLibFunction(
   TF_RETURN_IF_ERROR(
       GraphToFunctionDef(*sub_graph, function_name, control_ret, &func_def));
 
-  // The node defs in FunctionDef might contain debug info which was added
-  // by the GraphToFunctionDef method. We should remove it if we don't want
-  // to export them to avoid failing the roundtrip test.
-  if (!configs.export_debug_info) {
-    for (auto& node_def : *func_def.mutable_node_def()) {
-      node_def.clear_experimental_debug_info();
-    }
-  }
-
   // Checks for gradient attribute. If present converts the gradient function
   // and populates the GradientDef.
   auto grad_string = mlir::TF::TensorFlowDialect::GetGradientAttrName();
@@ -831,17 +822,6 @@ StatusOr<std::unique_ptr<GraphDef>> ConvertMlirToGraphdef(
 
   auto graphdef = std::make_unique<GraphDef>();
   graph->ToGraphDef(graphdef.get());
-  if (!configs.export_library) graphdef->clear_library();
-  if (!configs.export_shapes) {
-    for (auto& node_def : *graphdef->mutable_node()) {
-      node_def.mutable_attr()->erase("shape");
-    }
-  }
-  if (!configs.export_debug_info) {
-    for (auto& node_def : *graphdef->mutable_node()) {
-      node_def.clear_experimental_debug_info();
-    }
-  }
   return graphdef;
 }
 
diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h
index 00fd5b7de6aa4d..fca039c2601636 100644
--- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h
+++ b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h
@@ -102,12 +102,6 @@ struct GraphImportConfig {
 };
 
 struct GraphExportConfig {
-  // Whether to export shape attribute for the NodeDefs in the GraphDef.
-  bool export_shapes = true;
-  // Whether to export library field in the GraphDef.
-  bool export_library = true;
-  // Whether to export debug original node name in the GraphDef.
-  bool export_debug_info = true;
   // Whether to export the entry function to function library instead of the
   // graph.
   bool export_entry_func_to_flib = false;
diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc
index 57b0d0e2ff2389..eb9bf3db34106d 100644
--- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc
+++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/graph_constructor.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tsl/platform/protobuf.h"
 
 namespace mlir {
 using tsl::Status;
@@ -152,7 +153,7 @@ static LogicalResult MlirToGraphTranslateFunction(ModuleOp module,
   // Print the graph to the output after going through GraphDef conversion.
   // The DumpGraphToFile would do this anyway so just skip straight to it.
   graph->ToGraphDef(graphdef.get());
-  output << graphdef->DebugString();
+  output << tsl::LegacyUnredactedDebugString(*graphdef);
 
   return success();
 }
@@ -167,7 +168,6 @@ static LogicalResult MlirToGraphdefTranslateFunction(
     ModuleOp module, llvm::raw_ostream& output) {
   if (!module) return failure();
 
-  // TODO(fengliuai): Add exporter flags.
   tensorflow::GraphExportConfig confs;
   confs.export_entry_func_to_flib = export_entry_func_to_flib;
   confs.export_original_tf_func_name = export_original_tf_func_name;
@@ -179,7 +179,7 @@ static LogicalResult MlirToGraphdefTranslateFunction(
     return mlir::failure();
   }
 
-  output << graphdef_or.value()->DebugString();
+  output << tsl::LegacyUnredactedDebugString(*graphdef_or.value());
   return success();
 }
 
diff --git a/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc b/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc
index 08cbb51a576760..856db032e501ae 100644
--- a/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc
+++ b/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "mlir/Tools/mlir-translate/Translation.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h"
 #include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h"
+#include "tsl/platform/protobuf.h"
 
 namespace mlir {
 static mlir::Operation* ExtractOnlyOp(mlir::ModuleOp module) {
@@ -61,7 +62,7 @@ static LogicalResult MlirToTfNodeDef(ModuleOp module,
     return failure();
   }
 
-  output << node_def_or.value()->DebugString();
+  output << tsl::LegacyUnredactedDebugString(*node_def_or.value());
   return success();
 }
 
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h
index b50135c9bdfac3..5a99806d4295f3 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h
+++ b/tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h
@@ -121,6 +121,18 @@ inline constexpr llvm::StringRef kDynamicArgIndexAttr = "_dynamic_arg_index";
 inline constexpr llvm::StringRef kParallelExecAnnotation =
     "_parallel_execution_ids";
 
+// Logging
+
+// Name of component for error logging. This name is fixed and required to
+// enable logging.
+inline const char kBridgeComponent[] = "TFXLABridge";
+inline const char kMlirPh1BridgeCounterReplicated[] = "replicated";
+inline const char kMlirPh1BridgeCounterNonReplicated[] = "nonreplicated";
+inline const char kMlirPh1BridgeCounterV1[] = "v1";
+inline const char kMlirPh1BridgeCounterV2[] = "v2";
+inline const char kMlirPh1BridgeCounterTpu[] = "tpu";
+inline const char kMlirPh1BridgeCounterNonTpu[] = "cpu/gpu";
+
 // Copies attributes that satisfy the given predicate from `from` to `to`.
 template <typename Predicate>
 void CopyAttributes(Operation *from, Operation *to, Predicate P) {
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc
index bb474b1413f7ac..2efd63b29b04ef 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc
@@ -125,7 +125,7 @@ TEST(DumpCrashReproducerTest, RoundtripDumpAndReadValid) {
   EXPECT_TRUE(mlir::MlirOptMain(output_stream->os(), std::move(input_file),
                                 registry,
                                 mlir::MlirOptMainConfig{}
-                                    .splitInputFile(false)
+                                    .splitInputFile("")
                                     .verifyDiagnostics(false)
                                     .verifyPasses(false)
                                     .allowUnregisteredDialects(false)
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc
index f9dc740cee1aae..f01a3f0e09d19b 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc
@@ -61,6 +61,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace {
@@ -395,12 +396,12 @@ Status ConvertAttributes(
     if (auto symbol_ref = attr.dyn_cast<mlir::SymbolRefAttr>()) {
       TF_RETURN_IF_ERROR(
           ConvertAttribute(symbol_ref.cast<mlir::FlatSymbolRefAttr>(), &value));
-      func_call_attrs[string(name)] = value;
+      func_call_attrs[string(name)] = std::move(value);
       continue;
     }
     if (auto func_attr = attr.dyn_cast<mlir::TF::FuncAttr>()) {
       TF_RETURN_IF_ERROR(ConvertAttribute(func_attr, remove_ref_type, &value));
-      func_call_attrs[string(name)] = value;
+      func_call_attrs[string(name)] = std::move(value);
       continue;
     }
     if (attr.isa<mlir::AffineMapAttr>()) {
@@ -434,13 +435,14 @@ Status ConvertAttributes(
     TF_RET_CHECK(name_tokens.size() <= 2);
     auto it = func_call_attrs.find(name_tokens[0]);
     if (it == func_call_attrs.end()) {
-      (*values)[string(name)] = value;
+      (*values)[string(name)] = std::move(value);
     } else {
-      (*it->second.mutable_func()->mutable_attr())[name_tokens[1]] = value;
+      (*it->second.mutable_func()->mutable_attr())[name_tokens[1]] =
+          std::move(value);
     }
   }
-  for (const auto& it : func_call_attrs) {
-    (*values)[it.first] = it.second;
+  for (auto& it : func_call_attrs) {
+    (*values)[it.first] = std::move(it.second);
   }
   return OkStatus();
 }
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc
index 58adaa41349b14..ea76adb284b7e2 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc
@@ -15,10 +15,15 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h"
 
+#include <cstdint>
+#include <map>
 #include <numeric>
 #include <string>
 #include <utility>
 
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
@@ -152,16 +157,21 @@ mlir::LogicalResult HandleTileShardedInputs(
   // are created such that input data is sharded in row major order.
   // Split nodes at ith depth from the original input node represent nodes
   // that split the input data at i-th dimension.
-  const auto& dimension_splits = input_sharding.tile_assignment_dimensions();
-  for (const auto& num_splits_and_index : llvm::enumerate(dimension_splits)) {
-    const int num_splits = num_splits_and_index.value();
-    const int dimension_index = num_splits_and_index.index();
-    if (num_splits == 1) continue;
+  auto dimension_to_splits_map =
+      GetDimensionIndicesAndNumSplitsFromSharding(input_sharding);
+  if (!dimension_to_splits_map.ok()) {
+    LOG(ERROR) << dimension_to_splits_map.status();
+    return mlir::failure();
+  }
+
+  for (const auto& dimension_and_num_splits : *dimension_to_splits_map) {
+    const int dimension = dimension_and_num_splits.first;
+    const int num_splits = dimension_and_num_splits.second;
 
     // Creates root split op.
     if (split_ops_for_tiled_input.empty()) {
       mlir::TF::SplitOp root_split_op;
-      auto result = CreateSplitOp(num_splits, dimension_index, location,
+      auto result = CreateSplitOp(num_splits, dimension, location,
                                   original_source, builder, &root_split_op);
       if (mlir::failed(result)) return mlir::failure();
 
@@ -176,7 +186,7 @@ mlir::LogicalResult HandleTileShardedInputs(
       for (auto parent_split_output_value : split_op.getResults()) {
         mlir::TF::SplitOp child_split_op;
         auto result =
-            CreateSplitOp(num_splits, dimension_index, location,
+            CreateSplitOp(num_splits, dimension, location,
                           parent_split_output_value, builder, &child_split_op);
         if (mlir::failed(result)) return mlir::failure();
 
@@ -188,12 +198,21 @@ mlir::LogicalResult HandleTileShardedInputs(
   }
 
   // `split_ops_for_tiled_input` now includes final split nodes
-  // from which sharded data will be fed into TPUExcute ops -- sorted by
+  // from which sharded data will be fed into TPUExecute ops -- sorted by
   // row major order.
+  tiled_inputs->clear();
   tiled_inputs->reserve(input_sharding.tile_assignment_devices_size());
-  for (auto split_op : split_ops_for_tiled_input)
-    tiled_inputs->append(split_op.getResults().begin(),
-                         split_op.getResults().end());
+  for (auto split_op : split_ops_for_tiled_input) {
+    for (auto split_op_output : split_op.getResults()) {
+      int64_t repeat_count =
+          input_sharding.replicate_on_last_tile_dim()
+              ? *input_sharding.tile_assignment_dimensions().rbegin()
+              : 1;
+      for (int64_t i = 0; i < repeat_count; ++i) {
+        tiled_inputs->push_back(split_op_output);
+      }
+    }
+  }
 
   return mlir::success();
 }
@@ -205,6 +224,29 @@ bool UnsupportedPartitionedShardingType(xla::OpSharding::Type sharding) {
 
 }  // namespace
 
+absl::StatusOr<std::map<int, int>> GetDimensionIndicesAndNumSplitsFromSharding(
+    const xla::OpSharding& sharding) {
+  int64_t tensor_tile_rank = sharding.tile_assignment_dimensions_size();
+  if (sharding.replicate_on_last_tile_dim()) {
+    tensor_tile_rank--;
+  }
+
+  std::map<int, int> dimension_to_splits_map;
+  for (int dim_index = 0; dim_index < tensor_tile_rank; ++dim_index) {
+    if (sharding.tile_assignment_dimensions(dim_index) > 1) {
+      dimension_to_splits_map.emplace(
+          dim_index, sharding.tile_assignment_dimensions(dim_index));
+    }
+  }
+
+  if (dimension_to_splits_map.empty()) {
+    return absl::InvalidArgumentError(absl::StrCat(
+        "Arg has unnecessary tiled sharding: ", sharding.DebugString()));
+  }
+
+  return dimension_to_splits_map;
+}
+
 int GetDimsFromXLAShardingTiled(const xla::OpSharding& xla_sharding) {
   return xla_sharding.tile_assignment_dimensions_size() -
          (xla_sharding.replicate_on_last_tile_dim() ? 1 : 0) -
@@ -478,15 +520,25 @@ mlir::LogicalResult GetTileShardedOutputsToMerge(
   const xla::OpSharding& sharding =
       output_sharding_config[cluster_func_output_index];
   outputs_to_merge->reserve(sharding.tile_assignment_devices_size());
-  for (const auto logical_device_id : sharding.tile_assignment_devices()) {
+  for (const auto& core_id_and_index :
+       llvm::enumerate(sharding.tile_assignment_devices())) {
+    auto core_id = core_id_and_index.value();
+    auto tile_index = core_id_and_index.index();
+
+    int last_tile_dim_size = *sharding.tile_assignment_dimensions().rbegin();
+    if (sharding.replicate_on_last_tile_dim() &&
+        tile_index % last_tile_dim_size != 0) {
+      continue;
+    }
+
     int region_output_index;
-    auto status = LookupClusterToCoreIndex(
-        location, cluster_to_core_index, logical_device_id,
-        cluster_func_output_index, &region_output_index);
+    auto status = LookupClusterToCoreIndex(location, cluster_to_core_index,
+                                           core_id, cluster_func_output_index,
+                                           &region_output_index);
     if (failed(status)) return mlir::failure();
     const auto output_from_logical_device =
-        new_parallel_execute.GetRegionOutputs(
-            cluster_idx + logical_device_id)[region_output_index];
+        new_parallel_execute.GetRegionOutputs(cluster_idx +
+                                              core_id)[region_output_index];
     outputs_to_merge->emplace_back(output_from_logical_device);
   }
 
@@ -518,12 +570,18 @@ mlir::LogicalResult HandleTileShardedOutputs(
   // devices to a single replica output.
   const xla::OpSharding& sharding =
       output_sharding_config[cluster_func_output_index];
-  int concat_dimension = sharding.tile_assignment_dimensions_size() - 1;
-  for (auto num_splits : llvm::reverse(sharding.tile_assignment_dimensions())) {
-    if (num_splits == 1) {
-      --concat_dimension;
-      continue;
-    }
+
+  auto dimension_to_splits_map =
+      GetDimensionIndicesAndNumSplitsFromSharding(sharding);
+  if (!dimension_to_splits_map.ok()) {
+    LOG(ERROR) << dimension_to_splits_map.status();
+    return mlir::failure();
+  }
+
+  for (auto it = dimension_to_splits_map->rbegin();
+       it != dimension_to_splits_map->rend(); ++it) {
+    int concat_dimension = it->first;
+    int num_splits = it->second;
 
     llvm::SmallVector<mlir::Value, 4> new_outputs;
     new_outputs.reserve(num_splits);
@@ -539,7 +597,6 @@ mlir::LogicalResult HandleTileShardedOutputs(
     }
 
     std::swap(new_outputs, outputs_to_merge);
-    --concat_dimension;
   }
 
   assert(outputs_to_merge.size() == 1);
@@ -552,33 +609,35 @@ mlir::LogicalResult ValidateAndGetTiledExecuteOutputShape(
     const mlir::TensorType cluster_func_output_type,
     const xla::OpSharding& output_sharding,
     mlir::Type* tiled_logical_computation_type) {
-  auto new_output_shape =
-      llvm::to_vector<4>(cluster_func_output_type.getShape());
-  for (const auto& dimension_and_output_splits :
-       llvm::enumerate(output_sharding.tile_assignment_dimensions())) {
-    const auto dimension_index = dimension_and_output_splits.index();
-    const auto output_splits = dimension_and_output_splits.value();
-    const auto output_shape = cluster_func_output_type.getShape();
-
-    if (output_shape[dimension_index] == mlir::ShapedType::kDynamic) {
+  const auto output_shape = cluster_func_output_type.getShape();
+  auto new_output_shape = llvm::to_vector<4>(output_shape);
+  auto dimension_to_splits_map =
+      GetDimensionIndicesAndNumSplitsFromSharding(output_sharding);
+  if (!dimension_to_splits_map.ok()) {
+    LOG(ERROR) << dimension_to_splits_map.status();
+    return mlir::failure();
+  }
+
+  for (const auto& dimension_and_output_splits : *dimension_to_splits_map) {
+    const auto dimension = dimension_and_output_splits.first;
+    const auto output_splits = dimension_and_output_splits.second;
+
+    if (output_shape[dimension] == mlir::ShapedType::kDynamic) {
       *tiled_logical_computation_type = cluster_func_output_type;
       break;
     }
 
-    auto output_shape_at_dim =
-        cluster_func_output_type.getShape()[dimension_index];
-    if (output_shape_at_dim % output_splits != 0) {
+    if (output_shape[dimension] % output_splits != 0) {
       mlir::emitError(
           location,
           llvm::formatv("incorrect output sharding received. "
                         "{0}-th dimension of the output must be "
                         "evenly divisible by {1}, got dimension "
                         "shape {2}",
-                        dimension_index, output_splits, output_shape_at_dim));
+                        dimension, output_splits, output_shape[dimension]));
     }
 
-    new_output_shape[dimension_index] =
-        output_shape[dimension_index] / output_splits;
+    new_output_shape[dimension] = output_shape[dimension] / output_splits;
   }
 
   *tiled_logical_computation_type = mlir::RankedTensorType::get(
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h
index 6295be3776416e..ab22eb978214ad 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h
+++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h
@@ -16,8 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_SHARDING_UTIL_H_
 #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_SHARDING_UTIL_H_
 
+#include <map>
 #include <string>
 
+#include "absl/status/statusor.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
@@ -122,6 +124,9 @@ bool IsSplitSharding(const xla::OpSharding& sharding);
 // REPLICATED type and replicated OTHER type.
 bool IsReplicatedSharding(const xla::OpSharding& sharding);
 
+// Returns a map of dimension indices and number of splits for tiled sharding.
+absl::StatusOr<std::map<int, int>> GetDimensionIndicesAndNumSplitsFromSharding(
+    const xla::OpSharding& sharding);
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_SHARDING_UTIL_H_
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD
index 38094bf7067d1b..53a65bd3ae3662 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD
@@ -77,16 +77,28 @@ tf_cc_test(
     srcs = ["compile_mlir_util_test.cc"],
     deps = [
         ":compile_mlir_util_no_tf_dialect_passes",
+        "//tensorflow/compiler/jit:xla_compile_util",
         "//tensorflow/compiler/mlir/tensorflow",
         "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils",
+        "//tensorflow/compiler/mlir/utils:array_container_utils",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla:xla_helpers",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
+        "//tensorflow/core:portable_gif_internal",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/lib/monitoring:cell_reader",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest_main",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@local_tsl//tsl/lib/core:status_test_util",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:statusor",
+        "@local_xla//xla/client:xla_builder",
     ],
 )
 
@@ -182,6 +194,7 @@ cc_library(
     ],
     deps = [
         ":tf_dialect_to_executor",
+        "//tensorflow/compiler/mlir/tensorflow:attribute_utils",
         "//tensorflow/compiler/mlir/tensorflow:bridge_logger",
         "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util",
         "//tensorflow/compiler/mlir/tensorflow:error_util",
@@ -220,6 +233,7 @@ tf_cc_test(
     deps = [
         ":cluster_tf",
         "//tensorflow/compiler/mlir:register_common_dialects",
+        "//tensorflow/compiler/mlir/tensorflow:attribute_utils",
         "//tensorflow/compiler/mlir/tensorflow:tf_dialect_lib",
         "//tensorflow/core/lib/monitoring:cell_reader",
         "//tensorflow/core/platform:resource_loader",
@@ -229,7 +243,6 @@ tf_cc_test(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Parser",
         "@local_tsl//tsl/lib/core:status_test_util",
-        "@local_tsl//tsl/lib/monitoring:test_utils",
         "@local_tsl//tsl/platform:status",
     ],
 )
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.cc
index 09209d8673524c..38c11ec857f072 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
+#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h"
@@ -60,10 +61,6 @@ using mlir::func::FuncOp;
 
 namespace {
 
-// Name of component for error logging. This name is fixed and required to
-// enable logging.
-constexpr char kBridgeComponent[] = "TFXLABridge";
-
 void CreateReplicatedBridgePipelineV1(OpPassManager &pm) {
   pm.addPass(mlir::tf2xla::internal::CreateInferenceMetricsPass());
 
@@ -152,10 +149,12 @@ tensorflow::Status RecordStatusIfError(const std::string error_prefix,
   }
 
   tensorflow::metrics::UpdateTfMlirBridgeFirstPhaseCounter(
-      /*device_type=*/"tpu", /*bridge_version=*/"v1",
+      /*bridge_type=*/mlir::TF::kMlirPh1BridgeCounterReplicated,
+      /*bridge_version=*/mlir::TF::kMlirPh1BridgeCounterV1,
+      /*device_type*/ mlir::TF::kMlirPh1BridgeCounterTpu,
       /*fallback_enabled=*/is_in_fallback_enabled_mode,
       /*result=*/"failure");
-  tsl::error_logging::Log(kBridgeComponent,
+  tsl::error_logging::Log(mlir::TF::kBridgeComponent,
                           "TFXLA_PHASE_ONE_MLIR_TPU_V1_COMPAT_BRIDGE",
                           status.ToString())
       .IgnoreError();
@@ -221,7 +220,9 @@ tensorflow::Status RunSessionTf2xlaClusteringBridge(
       RunClusteringPipelineOnSubmodule(module, is_in_fallback_enabled_mode));
 
   tensorflow::metrics::UpdateTfMlirBridgeFirstPhaseCounter(
-      /*device_type=*/"tpu", /*bridge_version=*/"v1",
+      /*bridge_type=*/mlir::TF::kMlirPh1BridgeCounterReplicated,
+      /*bridge_version=*/mlir::TF::kMlirPh1BridgeCounterV1,
+      /*device_type*/ mlir::TF::kMlirPh1BridgeCounterTpu,
       /*n_fallback_enabled*/ is_in_fallback_enabled_mode,
       /*result=*/"success");
 
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf_test.cc
index 44eafb25f579c8..e674989d2174ba 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf_test.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "mlir/IR/OwningOpRef.h"  // from @llvm-project
 #include "mlir/Parser/Parser.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/register_common_dialects.h"
+#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h"
 #include "tensorflow/core/lib/monitoring/cell_reader.h"
 #include "tensorflow/core/platform/resource_loader.h"
 #include "tsl/lib/core/status_test_util.h"
@@ -84,8 +85,11 @@ TEST_F(SessionClusterTensorflowDialectTest, ClustersTf) {
   TF_EXPECT_OK(
       RunSessionTf2xlaClusteringBridge(*mlir_module_,
                                        /*is_in_fallback_enabled_mode=*/false));
-  EXPECT_EQ(
-      compilation_status.Delta("tpu", "v1", "fallback_disabled", "success"), 1);
+  EXPECT_EQ(compilation_status.Delta(mlir::TF::kMlirPh1BridgeCounterReplicated,
+                                     mlir::TF::kMlirPh1BridgeCounterV1,
+                                     mlir::TF::kMlirPh1BridgeCounterTpu,
+                                     "fallback_disabled", "success"),
+            1);
 }
 
 TEST_F(SessionClusterTensorflowDialectTest, FailsWithMultipleSubmodules) {
@@ -98,8 +102,11 @@ TEST_F(SessionClusterTensorflowDialectTest, FailsWithMultipleSubmodules) {
                                        /*is_in_fallback_enabled_mode=*/false)
           .ok());
 
-  EXPECT_EQ(
-      compilation_status.Delta("tpu", "v1", "fallback_disabled", "failure"), 1);
+  EXPECT_EQ(compilation_status.Delta(mlir::TF::kMlirPh1BridgeCounterReplicated,
+                                     mlir::TF::kMlirPh1BridgeCounterV1,
+                                     mlir::TF::kMlirPh1BridgeCounterTpu,
+                                     "fallback_disabled", "failure"),
+            1);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc
index 20fff0cc549d0f..59fb22e87eab58 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.cc
@@ -989,7 +989,8 @@ Status CompileGraphToXlaHlo(
 }
 
 absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> GraphToModule(
-    const Graph& graph, llvm::ArrayRef<std::string> control_rets,
+    bool unconditionally_use_set_output_shapes, const Graph& graph,
+    llvm::ArrayRef<std::string> control_rets,
     const FunctionLibraryDefinition& flib_def, const GraphDebugInfo& debug_info,
     mlir::MLIRContext* context) {
   mlir::DialectRegistry registry;
@@ -1004,20 +1005,27 @@ absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> GraphToModule(
   // the shape inference pass is run early in the pass pipeline, shape inference
   // during import is not necessary.
   config.enable_shape_inference = false;
+  // Some graphs may require _output_shapes (an unregistered attribute)
+  // to override shapes. It is unfortunately not always set correctly so only
+  // do it optionally.
+  config.unconditionally_use_set_output_shapes =
+      unconditionally_use_set_output_shapes;
   return ConvertGraphToMlir(graph, debug_info, flib_def, config, context);
 }
 
 Status BuildHloFromGraph(
     const Graph& graph, xla::XlaBuilder& builder,
     mlir::MLIRContext& mlir_context, llvm::ArrayRef<xla::XlaOp> xla_params,
-    std::vector<xla::XlaOp>& returns, llvm::ArrayRef<XlaArgument> args,
-    llvm::ArrayRef<std::string> control_rets, llvm::StringRef device_type,
-    const FunctionLibraryDefinition& flib_def, const GraphDebugInfo& debug_info,
+    std::vector<xla::XlaOp>& returns, bool unconditionally_use_output_shapes,
+    llvm::ArrayRef<XlaArgument> args, llvm::ArrayRef<std::string> control_rets,
+    llvm::StringRef device_type, const FunctionLibraryDefinition& flib_def,
+    const GraphDebugInfo& debug_info,
     llvm::MutableArrayRef<std::unique_ptr<mlir::Pass>>
         custom_legalization_passes) {
   TF_ASSIGN_OR_RETURN(
       mlir::OwningOpRef<mlir::ModuleOp> module,
-      GraphToModule(graph, control_rets, flib_def, debug_info, &mlir_context));
+      GraphToModule(unconditionally_use_output_shapes, graph, control_rets,
+                    flib_def, debug_info, &mlir_context));
   return BuildHloFromModule(module.get(), builder, xla_params, returns, args,
                             device_type, custom_legalization_passes);
 }
@@ -1034,7 +1042,8 @@ Status CompileGraphToXlaHlo(
   mlir::MLIRContext context;
   TF_ASSIGN_OR_RETURN(
       mlir::OwningOpRef<mlir::ModuleOp> module,
-      GraphToModule(graph, control_rets, flib_def, debug_info, &context));
+      GraphToModule(/*unconditionally_use_set_output_shapes=*/false, graph,
+                    control_rets, flib_def, debug_info, &context));
   return CompileGraphToXlaHlo(
       module.get(), args, device_type, use_tuple_args, enable_op_fallback,
       /*use_return_tuple=*/true, shape_determination_fns, compilation_result,
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h
index 3f6e446ca28fd9..aaccd39a3db398 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h
@@ -191,7 +191,7 @@ Status CompileGraphToXlaHlo(
 // Compiles a TensorFlow Graph into XLA HLO, generates all accompanying metadata
 // and stores them in CompilationResult.
 ABSL_DEPRECATED(
-    "Use v1/compile_tf_graph.h::CompileTensorflowGraphToHloinstead.")
+    "Use v1/compile_tf_graph.h::CompileTensorflowGraphToHlo instead.")
 Status CompileGraphToXlaHlo(
     const Graph& graph, llvm::ArrayRef<XlaArgument> args,
     llvm::ArrayRef<std::string> control_rets, llvm::StringRef device_type,
@@ -206,14 +206,17 @@ Status CompileGraphToXlaHlo(
 // XlaBuilder. This function adds HLO to a larger HLO computation, so
 // HLO-level inputs are supplied, and HLO-level outputs are produced.
 // xla_params is the HLO-level inputs and returns is the HLO-level outputs.
+// If unconditionally_use_output_shapes is true then the unregistered
+// attribute _output_shapes is always used to set the output shapes of the ops.
 ABSL_DEPRECATED(
-    "Use v1/compile_tf_graph.h::CompileTensorflowGraphToHloinstead.")
+    "Use v1/compile_tf_graph.h::CompileTensorflowGraphToHlo instead.")
 Status BuildHloFromGraph(
     const Graph& graph, xla::XlaBuilder& builder,
     mlir::MLIRContext& mlir_context, llvm::ArrayRef<xla::XlaOp> xla_params,
-    std::vector<xla::XlaOp>& returns, llvm::ArrayRef<XlaArgument> args,
-    llvm::ArrayRef<std::string> control_rets, llvm::StringRef device_type,
-    const FunctionLibraryDefinition& flib_def, const GraphDebugInfo& debug_info,
+    std::vector<xla::XlaOp>& returns, bool unconditionally_use_output_shapes,
+    llvm::ArrayRef<XlaArgument> args, llvm::ArrayRef<std::string> control_rets,
+    llvm::StringRef device_type, const FunctionLibraryDefinition& flib_def,
+    const GraphDebugInfo& debug_info,
     llvm::MutableArrayRef<std::unique_ptr<mlir::Pass>>
         custom_legalization_passes = {});
 
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc
index d7d8e8e4f4e894..62fbf4bb94381f 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc
@@ -15,21 +15,37 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h"
 
+#include <initializer_list>
+#include <memory>
 #include <string>
 #include <vector>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/raw_ostream.h"
 #include "mlir/IR/DialectRegistry.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
+#include "tensorflow/compiler/jit/xla_compile_util.h"
 #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "xla/client/xla_builder.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/monitoring/cell_reader.h"
+#include "tensorflow/core/platform/types.h"
+#include "tsl/lib/core/status_test_util.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/statusor.h"
 
 namespace tensorflow {
 namespace {
@@ -182,5 +198,59 @@ TEST(LegalizeMlirTest, LegalizesModuleWithDynamicShape) {
   EXPECT_TRUE(status.ok());
 }
 
+absl::StatusOr<std::unique_ptr<Graph>> BuildOpGraphWithOutputShapes() {
+  DataType data_type = DT_INT32;
+  std::initializer_list<int64_t> dims = {2, 3, 4, 5};
+  Tensor tensor(data_type, TensorShape(dims));
+  for (int i = 0; i < 2 * 3 * 4 * 5; ++i) {
+    tensor.flat<int32>()(i) = i;
+  }
+
+  NodeDef node;
+  auto builder = NodeDefBuilder("some_node", "Const")
+                     .Attr("dtype", data_type)
+                     .Attr("value", tensor);
+  // Create a bad output shape attr.
+  AttrValue shape_attr;
+  TensorShapeProto* shape_proto = shape_attr.mutable_list()->add_shape();
+  shape_proto->add_dim()->set_size(1);
+  builder.Attr("_output_shapes", shape_attr);
+
+  TF_RETURN_IF_ERROR(builder.Finalize(&node));
+
+  return CreateSingleOpGraph(node, {}, {DataType::DT_INT32});
+}
+
+absl::Status BuildHloFromGraph(Graph& graph, bool use_output_shapes) {
+  xla::XlaBuilder builder(
+      ::testing::UnitTest::GetInstance()->current_test_info()->name());
+  mlir::MLIRContext mlir_context;
+  llvm::SmallVector<xla::XlaOp, 4> xla_params;
+  std::vector<xla::XlaOp> returns(1);
+  return BuildHloFromGraph(graph, builder, mlir_context, xla_params, returns,
+                           use_output_shapes, /*args=*/{},
+                           /*control_rets=*/{}, DEVICE_TPU,
+                           FunctionLibraryDefinition(OpRegistry::Global()),
+                           /*debug_info=*/{},
+                           /*custom_legalization_passes=*/{});
+}
+
+TEST(CompileMlirUtil, UsesCorrectOriginalShapeWithoutOutputShapes) {
+  TF_ASSERT_OK_AND_ASSIGN(auto graph, BuildOpGraphWithOutputShapes());
+
+  auto build_result = BuildHloFromGraph(*graph, /*use_output_shapes=*/false);
+  TF_ASSERT_OK(build_result);
+}
+
+TEST(CompileMlirUtil, UsesIncorrectOutputShapesWhenPresent) {
+  TF_ASSERT_OK_AND_ASSIGN(auto graph, BuildOpGraphWithOutputShapes());
+
+  auto build_result = BuildHloFromGraph(*graph, /*use_output_shapes=*/true);
+  ASSERT_FALSE(build_result.ok());
+  EXPECT_THAT(build_result.message(),
+              HasSubstr("op operand type 'tensor<2x3x4x5xi32>' and result type "
+                        "'tensor<1xi32>' are cast incompatible"));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
index a92239e8dbba69..545203ad20ea23 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
@@ -119,12 +119,11 @@ cc_library(
     ],
     deps = [
         ":device_type_proto_cc",
-        ":tf_dialect_to_executor",
+        "//tensorflow/compiler/mlir/tensorflow:attribute_utils",
         "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util",
         "//tensorflow/compiler/mlir/tensorflow:error_util",
         "//tensorflow/compiler/mlir/tensorflow:tensorflow_types",
         "//tensorflow/compiler/mlir/tensorflow/transforms:verify_no_outside_compilation_markers_pass",
-        "//tensorflow/compiler/mlir/tensorflow/transforms/host_runtime:lower_cluster_to_runtime_ops",
         "//tensorflow/compiler/mlir/tf2xla/internal:clustering_bridge_passes",
         "//tensorflow/compiler/mlir/tf2xla/internal:logging_hooks",
         "//tensorflow/core:framework",
@@ -133,7 +132,6 @@ cc_library(
         "//tensorflow/core/platform:errors",
         "//tensorflow/core/platform:stacktrace",
         "//tensorflow/core/platform:status",
-        "//tensorflow/core/tpu:tpu_defs",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
         "@llvm-project//llvm:Support",
@@ -143,7 +141,6 @@ cc_library(
         "@llvm-project//mlir:Support",
         "@local_tsl//tsl/platform:error_logging",
         "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:status",
     ],
 )
 
@@ -159,6 +156,7 @@ tf_cc_test(
         ":cluster_tf",
         "//tensorflow/compiler/mlir:register_common_dialects",
         "//tensorflow/compiler/mlir/tensorflow",
+        "//tensorflow/compiler/mlir/tensorflow:attribute_utils",
         "//tensorflow/compiler/mlir/tensorflow:tf_dialect_lib",
         "//tensorflow/core/lib/monitoring:cell_reader",
         "//tensorflow/core/platform:resource_loader",
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc
index 23480374032aaa..41df5eb0750459 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
+#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h"
 #include "tensorflow/compiler/mlir/tf2xla/api/v2/device_type.pb.h"
@@ -52,8 +53,6 @@ using mlir::OpPassManager;
 using mlir::PassManager;
 using mlir::func::FuncOp;
 
-constexpr char kBridgeComponent[] = "TFXLABridge";
-
 // Run the TF XLA Bridge based on the input pipeline, which can be either TPU
 // bridge pipeline or non TPU bridge pipeline.
 tensorflow::Status RunTFXLABridge(
@@ -114,6 +113,7 @@ tensorflow::Status RunTFXLABridge(
 
 tensorflow::Status RecordIfErrorStatus(const std::string error_prefix,
                                        bool fallback_enabled,
+                                       std::string bridge_type,
                                        std::string device_type,
                                        absl::Status status) {
   if (status.ok()) {
@@ -122,7 +122,7 @@ tensorflow::Status RecordIfErrorStatus(const std::string error_prefix,
 
   VLOG(2) << error_prefix << " " << status;
   tensorflow::metrics::UpdateTfMlirBridgeFirstPhaseCounter(
-      device_type, /*bridge_version=*/"v2",
+      /*bridge_type*/ bridge_type, /*bridge_version=*/"v2", device_type,
       /*fallback_enabled=*/fallback_enabled,
       /*result=*/"failure");
 
@@ -135,7 +135,7 @@ tensorflow::Status RecordIfErrorStatus(const std::string error_prefix,
     bridge_subcomponent = "TFXLA_PHASE_ONE_MLIR_CPU/GPU_BRIDGE";
   }
 
-  tsl::error_logging::Log(kBridgeComponent, bridge_subcomponent,
+  tsl::error_logging::Log(mlir::TF::kBridgeComponent, bridge_subcomponent,
                           status.ToString())
       .IgnoreError();
 
@@ -162,8 +162,9 @@ void CreateReplicatedClusteringPipelineV2(OpPassManager &pm) {
 tensorflow::Status RunFunctionTf2xlaClusteringBridge(
     ModuleOp module, bool is_supported_by_replicated_brige,
     bool is_in_fallback_enabled_mode, llvm::StringRef module_name) {
-  std::string device_type_filter =
-      is_supported_by_replicated_brige ? "tpu" : "cpu/gpu";
+  std::string device_type = is_supported_by_replicated_brige
+                                ? mlir::TF::kMlirPh1BridgeCounterTpu
+                                : mlir::TF::kMlirPh1BridgeCounterNonTpu;
 
   VLOG(2)
       << (is_supported_by_replicated_brige ? "Replicated" : "NonReplicated")
@@ -186,14 +187,17 @@ tensorflow::Status RunFunctionTf2xlaClusteringBridge(
                 },
                 module_name, /*dump_prefix=*/"tf_xla_bridge_v2_nonreplicated");
 
+  std::string bridge_type = is_supported_by_replicated_brige
+                                ? mlir::TF::kMlirPh1BridgeCounterReplicated
+                                : mlir::TF::kMlirPh1BridgeCounterNonReplicated;
   // TODO(b/317798386): add is_supported_by_replicated_brige as a filter.
   TF_RETURN_IF_ERROR(RecordIfErrorStatus(
       /*error_prefix=*/"clustering_v2", is_in_fallback_enabled_mode,
-      device_type_filter, clustering_status));
+      bridge_type, device_type, clustering_status));
 
   // TODO(b/317798386): add is_supported_by_replicated_brige as a filter.
   tensorflow::metrics::UpdateTfMlirBridgeFirstPhaseCounter(
-      device_type_filter, /*bridge_version=*/"v2",
+      bridge_type, /*bridge_version=*/"v2", device_type,
       /*fallback_enabled=*/is_in_fallback_enabled_mode,
       /*result=*/"success");
 
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc
index c4a96702533c49..a5f64a91cd8cb4 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/register_common_dialects.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
+#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h"
 #include "tensorflow/core/lib/monitoring/cell_reader.h"
 #include "tensorflow/core/platform/resource_loader.h"
 #include "tsl/lib/core/status_test_util.h"
@@ -94,8 +95,11 @@ TEST_F(FunctionClusterTensorflowDialectTest, ClustersTfReplicatedBridge) {
   FuncOp main = mlir_module_->lookupSymbol<mlir::func::FuncOp>("main");
   ASSERT_TRUE(main);
 
-  EXPECT_EQ(
-      compilation_status.Delta("tpu", "v2", "fallback_disabled", "success"), 1);
+  EXPECT_EQ(compilation_status.Delta(mlir::TF::kMlirPh1BridgeCounterReplicated,
+                                     mlir::TF::kMlirPh1BridgeCounterV2,
+                                     mlir::TF::kMlirPh1BridgeCounterTpu,
+                                     "fallback_disabled", "success"),
+            1);
 }
 
 TEST_F(FunctionClusterTensorflowDialectTest,
@@ -118,8 +122,11 @@ TEST_F(FunctionClusterTensorflowDialectTest,
   });
 
   EXPECT_TRUE(has_cluster_op);
-  EXPECT_EQ(
-      compilation_status.Delta("tpu", "v2", "fallback_disabled", "success"), 1);
+  EXPECT_EQ(compilation_status.Delta(mlir::TF::kMlirPh1BridgeCounterReplicated,
+                                     mlir::TF::kMlirPh1BridgeCounterV2,
+                                     mlir::TF::kMlirPh1BridgeCounterTpu,
+                                     "fallback_disabled", "success"),
+            1);
 }
 
 TEST_F(FunctionClusterTensorflowDialectTest, ClustersTFNonReplicatedBridge) {
@@ -135,7 +142,10 @@ TEST_F(FunctionClusterTensorflowDialectTest, ClustersTFNonReplicatedBridge) {
   ASSERT_TRUE(main);
 
   EXPECT_EQ(
-      compilation_status.Delta("cpu/gpu", "v2", "fallback_disabled", "success"),
+      compilation_status.Delta(mlir::TF::kMlirPh1BridgeCounterNonReplicated,
+                               mlir::TF::kMlirPh1BridgeCounterV2,
+                               mlir::TF::kMlirPh1BridgeCounterNonTpu,
+                               "fallback_disabled", "success"),
       1);
 }
 
@@ -148,8 +158,11 @@ TEST_F(FunctionClusterTensorflowDialectTest, LogsFallbackMode) {
       *mlir_module_, /*is_supported_by_replicated_brige*/ true,
       /*is_in_fallback_enabled_mode=*/true));
 
-  EXPECT_EQ(
-      compilation_status.Delta("tpu", "v2", "fallback_enabled", "success"), 1);
+  EXPECT_EQ(compilation_status.Delta(mlir::TF::kMlirPh1BridgeCounterReplicated,
+                                     mlir::TF::kMlirPh1BridgeCounterV2,
+                                     mlir::TF::kMlirPh1BridgeCounterTpu,
+                                     "fallback_enabled", "success"),
+            1);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/mlir/tf2xla/internal/BUILD b/tensorflow/compiler/mlir/tf2xla/internal/BUILD
index 246481c5cab7db..7e937d2ce49f8b 100644
--- a/tensorflow/compiler/mlir/tf2xla/internal/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/internal/BUILD
@@ -187,6 +187,7 @@ cc_library(
         "//tensorflow/compiler/jit:flags_headers",
         "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes",
         "//tensorflow/compiler/mlir/tensorflow/transforms:verify_no_outside_compilation_markers_pass",
+        "//tensorflow/compiler/mlir/tensorflow/transforms/sparsecore:sparsecore_passes",
         "//tensorflow/compiler/mlir/tf2xla/internal/passes:clustering_passes",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/log",
diff --git a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc
index 603d928daf9032..e289934b69fbe0 100644
--- a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc
+++ b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "mlir/Transforms/Passes.h"  // from @llvm-project
 #include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.h"
 #include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD b/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD
index b1d7863e860aa6..4c6f68a3419656 100644
--- a/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD
@@ -1,7 +1,7 @@
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.default.bzl", "filegroup", "get_compatible_with_portable")
+load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
diff --git a/tensorflow/compiler/mlir/tf2xla/tests/BUILD b/tensorflow/compiler/mlir/tf2xla/tests/BUILD
index 97bb01c30d1855..c68c485954de1b 100644
--- a/tensorflow/compiler/mlir/tf2xla/tests/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/tests/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow:tensorflow.default.bzl", "filegroup")
+load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/BUILD b/tensorflow/compiler/mlir/tf2xla/transforms/BUILD
index 28a459ccff2eac..b76b52c9fd774a 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/BUILD
@@ -2,10 +2,10 @@
 #    TF2XLA Bridge transforms
 
 load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library")
+load("@local_tsl//tsl/platform:build_config_root.bzl", "if_static")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-load("@local_tsl//tsl/platform:build_config_root.bzl", "if_static")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -196,6 +196,7 @@ cc_library(
         "@llvm-project//mlir:TransformUtils",
         "@local_xla//xla/mlir_hlo",
         "@stablehlo//:chlo_ops",
+        "@stablehlo//:stablehlo_ops",
     ],
 )
 
@@ -286,6 +287,7 @@ cc_library(
         "@llvm-project//mlir:SparseTensorDialect",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TensorDialect",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@local_xla//xla:shape_util",
         "@local_xla//xla:side_effect_util",
@@ -293,13 +295,15 @@ cc_library(
         "@local_xla//xla/client:padding",
         "@local_xla//xla/client:sharding_builder",
         "@local_xla//xla/mlir_hlo",
-        "@local_xla//xla/mlir_hlo:chlo_legalize_to_hlo",
         "@local_xla//xla/mlir_hlo:convert_op_folder",
+        "@local_xla//xla/mlir_hlo:mhlo_passes",
+        "@local_xla//xla/mlir_hlo:type_conversion",
         "@local_xla//xla/stream_executor/tpu:c_api_conversions",
         "@local_xla//xla/stream_executor/tpu:tpu_api",
         "@local_xla//xla/translate/hlo_to_mhlo:attribute_importer",
         "@local_xla//xla/translate/mhlo_to_hlo:type_to_shape",
         "@stablehlo//:chlo_ops",
+        "@stablehlo//:stablehlo_ops",
     ],
 )
 
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.cc b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.cc
index 3d46b98d9bac90..816b9a5e8b7706 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "mlir/Dialect/Tensor/IR/Tensor.h"  // from @llvm-project
 #include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
 #include "stablehlo/dialect/ChloOps.h"  // from @stablehlo
+#include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
@@ -34,6 +35,7 @@ ConversionTarget GetDefaultLegalConversionTargets(MLIRContext& mlir_context,
 
   if (legalize_chlo) {
     target.addIllegalDialect<chlo::ChloDialect>();
+    target.addIllegalDialect<stablehlo::StablehloDialect>();
   } else {
     target.addLegalDialect<chlo::ChloDialect>();
   }
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc
index 7336c8fe625447..d3c9ff7e8bd157 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
 #include "stablehlo/dialect/ChloOps.h"  // from @stablehlo
+#include "stablehlo/dialect/StablehloOps.h"  // from @stablehlo
 #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h"
 #include "tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config.h"
@@ -45,6 +46,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.h"
 #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
 #include "xla/mlir_hlo/mhlo/transforms/rewriters.h"
+#include "xla/mlir_hlo/mhlo/utils/type_conversion.h"
 #include "tensorflow/core/lib/monitoring/counter.h"
 
 namespace mlir {
@@ -203,9 +205,9 @@ LogicalResult legalizeTF(Operation *op, bool legalize_chlo,
 
   // Populate with CHLO->HLO lowerings to account for TF ops legalized to
   // CHLO first.
+  stablehlo::StablehloToHloTypeConverter hlo_converter;
   if (legalize_chlo) {
-    chlo::populateDecomposeChloPatterns(context, &patterns);
-    chlo::populateChloBroadcastingPatterns(context, &patterns);
+    chlo::populateChloToHloPatterns(context, &hlo_converter, &patterns);
   }
   // ConstantLike op is convenient to create splat constants, but is
   // canonicalized to plain HLO constant if statically shaped. Add the
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.td b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.td
index 4c3f664af9cb83..19c31018185c82 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.td
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.td
@@ -45,12 +45,16 @@ def LegalizeTF : Pass<"xla-legalize-tf", "ModuleOp"> {
   ];
 
   let constructor = "mlir::mhlo::createLegalizeTFPass()";
-  let dependentDialects = ["arith::ArithDialect, chlo::ChloDialect",
-                           "mhlo::MhloDialect",
-                           "quant::QuantizationDialect",
-                           "shape::ShapeDialect",
-                           "func::FuncDialect",
-                           "sparse_tensor::SparseTensorDialect"];
+  let dependentDialects = [
+    "arith::ArithDialect",
+    "chlo::ChloDialect",
+    "func::FuncDialect",
+    "mhlo::MhloDialect",
+    "quant::QuantizationDialect",
+    "shape::ShapeDialect",
+    "sparse_tensor::SparseTensorDialect",
+    "stablehlo::StablehloDialect"
+  ];
 }
 
 def LegalizeTFCollective : Pass<"xla-legalize-tf-collective", "ModuleOp"> {
diff --git a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc
index 2c49198be7bad8..1ce45fe7345c11 100644
--- a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc
+++ b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "mlir/InitAllPasses.h"  // from @llvm-project
+#include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Tools/mlir-opt/MlirOptMain.h"  // from @llvm-project
 #include "mlir/Transforms/Passes.h"  // from @llvm-project
 #include "tensorflow//compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h"
@@ -24,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/runtime_passes.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
+#include "tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/sparsecore_passes.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/test_passes.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/mlprogram_util.h"
@@ -35,7 +37,6 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tosa/tf_tfl_passes.h"
 #include "tensorflow/compiler/mlir/tosa/tfl_passes.h"
 #include "tensorflow/compiler/mlir/tosa/transforms/passes.h"
-#include "xla/mlir/framework/ir/xla_framework.h"
 #include "xla/mlir/framework/transforms/passes.h"
 #include "xla/mlir_hlo/lhlo/transforms/passes.h"
 #include "xla/mlir_hlo/mhlo/transforms/passes.h"
@@ -69,6 +70,8 @@ int main(int argc, char **argv) {
   tensorflow::RegisterGraphOptimizationPasses();
   tensorflow::RegisterMlProgramPasses();
   mlir::TFTPU::registerRuntimeLoweringPasses();
+  mlir::TFDevice::registerSparseCorePasses();
+
   tensorflow::tfrt_compiler::RegisterTPULowerClusterToRuntimeOpsPassPipeline();
   tensorflow::tfrt_compiler::
       RegisterNonTPULowerClusterToRuntimeOpsPassPipeline();
diff --git a/tensorflow/compiler/mlir/tfr/BUILD b/tensorflow/compiler/mlir/tfr/BUILD
index 04cd4282e5c451..cfb2a9b0b86a35 100644
--- a/tensorflow/compiler/mlir/tfr/BUILD
+++ b/tensorflow/compiler/mlir/tfr/BUILD
@@ -1,3 +1,8 @@
+load(
+    "@llvm-project//mlir:tblgen.bzl",
+    "gentbl_cc_library",
+    "td_library",
+)
 load("//tensorflow:strict.default.bzl", "py_strict_library")
 load(
     "//tensorflow:tensorflow.bzl",
@@ -5,13 +10,8 @@ load(
     "tf_cc_test",
 )
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable", "tf_py_strict_test", "tf_python_pybind_extension")
-load("//tensorflow/compiler/mlir/tfr:build_defs.bzl", "gen_op_libraries")
-load(
-    "@llvm-project//mlir:tblgen.bzl",
-    "gentbl_cc_library",
-    "td_library",
-)
 load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
+load("//tensorflow/compiler/mlir/tfr:build_defs.bzl", "gen_op_libraries")
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
@@ -114,6 +114,7 @@ cc_library(
         "@llvm-project//mlir:FunctionInterfaces",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:InferTypeOpInterface",
+        "@llvm-project//mlir:InliningUtils",
         "@llvm-project//mlir:QuantOps",
         "@llvm-project//mlir:ShapeDialect",
         "@llvm-project//mlir:SideEffectInterfaces",
@@ -163,6 +164,7 @@ cc_library(
         "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:InliningUtils",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:QuantOps",
         "@llvm-project//mlir:SCFDialect",
diff --git a/tensorflow/compiler/mlir/tfr/build_defs.bzl b/tensorflow/compiler/mlir/tfr/build_defs.bzl
index 090523ce7da3e9..e9dd5e9178080b 100644
--- a/tensorflow/compiler/mlir/tfr/build_defs.bzl
+++ b/tensorflow/compiler/mlir/tfr/build_defs.bzl
@@ -1,8 +1,8 @@
 """BUILD extension for TF composition project."""
 
+load("//tensorflow:strict.default.bzl", "py_strict_binary", "py_strict_library")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_library", "tf_gen_op_wrapper_py")
 load("//tensorflow:tensorflow.default.bzl", "tf_custom_op_py_library")
-load("//tensorflow:strict.default.bzl", "py_strict_binary", "py_strict_library")
 
 def gen_op_libraries(
         name,
diff --git a/tensorflow/compiler/mlir/tfrt/BUILD b/tensorflow/compiler/mlir/tfrt/BUILD
index e2157630ceb1b5..21cdf1203a3554 100644
--- a/tensorflow/compiler/mlir/tfrt/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/BUILD
@@ -145,6 +145,7 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@tf_runtime//:basic_kernels_opdefs",
         "@tf_runtime//:core_runtime_opdefs",
@@ -165,6 +166,7 @@ cc_library(
         "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_async_opdefs",
         "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_opdefs",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@tf_runtime//:basic_kernels_opdefs",
         "@tf_runtime//:core_runtime_opdefs",
@@ -180,6 +182,7 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
 )
@@ -252,7 +255,9 @@ cc_library(
         "@llvm-project//mlir:FuncExtensions",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@tf_runtime//:basic_kernels_opdefs",
         "@tf_runtime//:core_runtime_opdefs",
@@ -444,6 +449,7 @@ cc_library(
         "//tensorflow/compiler/mlir/tensorflow",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:SideEffectInterfaces",
     ],
 )
 
diff --git a/tensorflow/compiler/mlir/tfrt/ir/BUILD b/tensorflow/compiler/mlir/tfrt/ir/BUILD
index 80257d4812ecd3..68e9624e118453 100644
--- a/tensorflow/compiler/mlir/tfrt/ir/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/ir/BUILD
@@ -28,6 +28,7 @@ cc_library(
     deps = [
         ":tfrt_fallback_opdefs_inc_gen",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:SideEffectInterfaces",
     ],
 )
 
@@ -50,8 +51,12 @@ cc_library(
         ":tfrt_fallback_common",
         ":tfrt_fallback_opdefs",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:BytecodeOpInterface",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:InliningUtils",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@tf_runtime//:basic_kernels_opdefs",
         "@tf_runtime//:compiler_tfrt_op_interfaces",
@@ -78,6 +83,7 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:InferTypeOpInterface",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@tf_runtime//:basic_kernels_opdefs",
         "@tf_runtime//:core_runtime_opdefs",
         "@tf_runtime//:tensor_opdefs",
@@ -251,6 +257,7 @@ cc_library(
         ":tfrt_fallback_opdefs",
         ":tfrt_gpu_opdefs_inc_gen",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:BytecodeOpInterface",
         "@llvm-project//mlir:IR",
     ],
 )
diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD
index 4b2b0576430bd1..ce69fa85189423 100644
--- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD
@@ -58,6 +58,7 @@ cc_library(
         ":mlrt_ops_inc_gen",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:InliningUtils",
     ],
 )
 
@@ -166,6 +167,8 @@ cc_library(
         "//tensorflow/compiler/mlir/tensorflow",
         "//tensorflow/compiler/mlir/tensorflow:tensorflow_side_effects",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:InliningUtils",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:Transforms",
         "@tf_runtime//:compiler_tfrt_op_interfaces",
         "@tf_runtime//:compiler_tfrt_traits",
@@ -183,5 +186,6 @@ cc_library(
         ":tf_mlrt_tpu_ops_inc_gen",
         "//tensorflow/compiler/mlir/tensorflow",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:SideEffectInterfaces",
     ],
 )
diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td
index 7fbc42ad3db93f..6ff38dda69bd85 100644
--- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td
+++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td
@@ -35,6 +35,24 @@ def CreateOp: TensorflowMlrt_Op<"createop", []> {
   let assemblyFormat = "attr-dict";
 }
 
+def ConstOp: TensorflowMlrt_Op<"constop", []> {
+  let summary = "The tf_mlrt ConstOp";
+
+  let description = [{
+    The ConstOp creates a constant tensorflow::Tensor from serialized proto.
+  }];
+
+  let arguments = (ins
+    StrAttr:$tensor_proto
+  );
+
+  let results = (outs
+    TFTensorType:$result
+  );
+
+  let assemblyFormat = "attr-dict";
+}
+
 def ExecuteOp : TensorflowMlrt_Op<"executeop", []> {
   let summary = "The Fallback ExecuteOp";
   let description = [{
@@ -427,7 +445,7 @@ def AsyncWhileOp : TensorflowMlrt_Op<"async_while", [Pure]> {
   }];
 }
 
-def IfrtLoadVariableOp: TensorflowMlrt_Op<"ifrt_load_variable", []> {
+def IfrtLoadVariableOp: TensorflowMlrt_Op<"ifrt_load_variable", [Pure]> {
   let summary = "Loads a variable tensor as an IFRT array for mlrt";
 
   let description = [{
@@ -458,5 +476,31 @@ def IfrtLoadVariableOp: TensorflowMlrt_Op<"ifrt_load_variable", []> {
   );
 }
 
+def IfrtRestoreVariableOp: TensorflowMlrt_Op<"ifrt_restore_variable", []> {
+  let summary = "Restore variable tensors";
+  let description = [{
+    This is the MLRT version of tf.IfrtRestoreVariableOp.
+
+    This Op is similar to a combination of RestoreV2 and AssignVariable Op, but
+    this Op's execution is asynchronous.
+
+    This Op is specific to MLRT runtime and is not a stable interface for
+    serialization.
+
+    This Op will restore the tensors asynchronously and allow the runtime to look
+    for them.
+    The runtime shall handle the possibility that the tensors are not ready when requested
+    because the tensors are loaded asynchronously.
+
+  }];
+
+  let arguments = (ins
+    TFTensorType:$prefix,
+    TFTensorType:$tensor_names,
+    TFTensorType:$shape_and_slices,
+    Variadic<TFTensorType>:$var_handles,
+    TypeArrayAttr: $restored_dtypes
+  );
+}
 
 #endif
diff --git a/tensorflow/compiler/mlir/tfrt/tests/BUILD b/tensorflow/compiler/mlir/tfrt/tests/BUILD
index cdae75eea036d2..1efb1ac7a16322 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/tests/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow:tensorflow.bzl", "if_oss")
+load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD b/tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD
index c9b64b7b4fb625..cfe04b0689155d 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow:tensorflow.bzl", "if_oss", "tf_cc_test")
+load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops.mlir b/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops.mlir
index e6d5aec8285a0b..cf14af8f3d35f8 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops.mlir
+++ b/tensorflow/compiler/mlir/tfrt/tests/hoist_invariant_ops.mlir
@@ -96,6 +96,30 @@ func.func @hoist_const_return(%arg: tensor<i32> {tf_saved_model.index_path = ["i
 
 module attributes {tf_saved_model.semantics} {
 
+// Test not hoisting `tf.BatchFunction`.
+
+// CHECK-LABEL: func @_tfrt_resource_init
+// CHECK: [[const:%.*]] = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> {device = "/CPU:0"} : () -> tensor<1xi32>
+// CHECK: "tf._TfrtSetResource"([[const]]) <{index = 0 : i64}> {device = "/CPU:0"} : (tensor<1xi32>) -> ()
+
+// CHECK-LABEL: func.func private @func_with_batch_function
+func.func private @func_with_batch_function() -> tensor<*xi32> attributes {tf.entry_function = {control_outputs = "", inputs = "", outputs = "StatefulPartitionedCall:0"}} {
+  // CHECK:  "tf._TfrtGetResource"()
+  %cst = "tf.Const"() <{value = dense<1> : tensor<1xi32>}> {device = "/CPU:0"} : () -> tensor<1xi32>
+  // CHECK:  "tf.BatchFunction"
+  %0 = "tf.BatchFunction"(%cst) <{allowed_batch_sizes = [1], batch_timeout_micros = 5000 : i64, batching_queue = "", container = "", enable_large_batch_splitting = true, f = @_batched, low_priority_allowed_batch_sizes = [], low_priority_batch_timeout_micros = 0 : i64, low_priority_max_batch_size = 0 : i64, low_priority_max_enqueued_batches = 0 : i64, max_batch_size = 1 : i64, max_enqueued_batches = 1 : i64, num_batch_threads = 1 : i64, operandSegmentSizes = array<i32: 1, 0>, shared_name = "batch_function___inference_signature_wrapper_fn_with_defaults_36"}> {device = "/CPU:0"} : (tensor<1xi32>) -> tensor<*xi32>
+  return %0 : tensor<*xi32>
+}
+func.func private @_batched(%arg0: tensor<1xi32>) -> tensor<1xi32> {
+  return %arg0 : tensor<1xi32>
+}
+
+}
+
+// -----
+
+module attributes {tf_saved_model.semantics} {
+
 // Test hoisting write side-effect ops.
 
 // CHECK-LABEL: func @_tfrt_resource_init
diff --git a/tensorflow/compiler/mlir/tfrt/tests/ifrt/lower_to_ifrt_restore_variable.mlir b/tensorflow/compiler/mlir/tfrt/tests/ifrt/lower_to_ifrt_restore_variable.mlir
index 46f7f52195deca..5052694566de89 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/ifrt/lower_to_ifrt_restore_variable.mlir
+++ b/tensorflow/compiler/mlir/tfrt/tests/ifrt/lower_to_ifrt_restore_variable.mlir
@@ -25,6 +25,32 @@ module {
   }
 }
 
+// -----
+// single variable: VarHandleOp is before RestoreV2
+
+// CHECK-LABEL:   func.func @varhandle_before_restore() {
+// CHECK-NEXT:     [[PREFIX:%.*]] = "tf.Const"() <{value = dense<"restore_ariables"> : tensor<!tf_type.string>}> : () -> tensor<!tf_type.string>
+// CHECK-NEXT:     [[SLICE:%.*]] = "tf.Const"() <{value = dense<""> : tensor<1x!tf_type.string>}> : () -> tensor<1x!tf_type.string>
+// CHECK-NEXT:     [[NAME:%.*]] = "tf.Const"() <{value = dense<"y"> : tensor<1x!tf_type.string>}> : () -> tensor<1x!tf_type.string>
+// CHECK-NEXT:     [[HANDLEY:%.*]] = "tf.VarHandleOp"() <{container = "", shared_name = "y"}> : () -> tensor<!tf_type.resource<tensor<3x1xf32>>>
+// CHECK-NEXT:     "tf.IfrtRestoreVariableOp"([[PREFIX]], [[NAME]], [[SLICE]], [[HANDLEY]])
+// CHECK-SAME:        {restored_dtypes = [f32]}
+// CHECK-NOT:       "tf.RestoreV2"
+// CHECK-NEXT:     return
+
+module {
+  func.func @varhandle_before_restore() {
+    %cst = "tf.Const"() <{value = dense<"restore_ariables"> : tensor<!tf_type.string>}> : () -> tensor<!tf_type.string>
+    %cst_0 = "tf.Const"() <{value = dense<""> : tensor<1x!tf_type.string>}> : () -> tensor<1x!tf_type.string>
+    %cst_1 = "tf.Const"() <{value = dense<"y"> : tensor<1x!tf_type.string>}> : () -> tensor<1x!tf_type.string>
+    %1 = "tf.VarHandleOp"() <{container = "", shared_name = "y"}> : () -> tensor<!tf_type.resource<tensor<3x1xf32>>>
+    %0 = "tf.RestoreV2"(%cst, %cst_1, %cst_0): (tensor<!tf_type.string>, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>) -> tensor<3x1xf32>
+    "tf.AssignVariableOp"(%1, %0) : (tensor<!tf_type.resource<tensor<3x1xf32>>>, tensor<3x1xf32>) -> ()
+    return
+  }
+}
+
+
 // -----
 // multiple variables
 
diff --git a/tensorflow/compiler/mlir/tfrt/tests/ifrt/tf_identity_propagation.mlir b/tensorflow/compiler/mlir/tfrt/tests/ifrt/tf_identity_propagation.mlir
new file mode 100644
index 00000000000000..6ff8613283472d
--- /dev/null
+++ b/tensorflow/compiler/mlir/tfrt/tests/ifrt/tf_identity_propagation.mlir
@@ -0,0 +1,38 @@
+// RUN: tf-tfrt-opt %s -tf-identity-propagation -canonicalize | FileCheck %s
+
+// CHECK-LABEL: func @identity
+// CHECK-SAME:    (%[[ARG0:.*]]: tensor<i32>)
+func.func @identity(%arg0: tensor<i32>) -> tensor<i32> {
+  // CHECK-NOT: "tf.Identity"
+  %0 = "tf.Identity"(%arg0) : (tensor<i32>) -> tensor<i32>
+  // CHECK: return %[[ARG0]]
+  func.return %0 : tensor<i32>
+}
+
+// CHECK-LABEL: func @identity_terminator
+// CHECK-SAME:    (%[[ARG0:.*]]: tensor<i32>)
+func.func @identity_terminator(%arg0: tensor<i32>) -> (tensor<*xi32>, tensor<i32>) {
+  // CHECK: %[[IDENTITY:.*]] = "tf.Identity"
+  %0 = "tf.Identity"(%arg0) : (tensor<i32>) -> tensor<*xi32>
+  // CHECK-NOT: "tf.Identity"
+  %1 = "tf.Identity"(%arg0) : (tensor<i32>) -> tensor<i32>
+  // CHECK: return %[[IDENTITY]], %[[ARG0]]
+  func.return %0, %1 : tensor<*xi32>, tensor<i32>
+}
+
+// CHECK-LABEL: func @xla_sharding
+func.func @xla_sharding(%arg0: tensor<i32>) -> tensor<i32> {
+  // CHECK: %[[OUTPUT:.*]] = "tf.Identity"
+  %0 = "tf.Identity"(%arg0) {_XlaSharding = ""} : (tensor<i32>) -> tensor<i32>
+  // CHECK: return %[[OUTPUT]]
+  func.return %0 : tensor<i32>
+}
+
+// CHECK-LABEL: func @identity_n
+// CHECK-SAME:    (%[[ARG0:.*]]: tensor<i32>, %[[ARG1:.*]]: tensor<f32>)
+func.func @identity_n(%arg0: tensor<i32>, %arg1: tensor<f32>) -> (tensor<i32>, tensor<f32>) {
+  // CHECK-NOT: "tf.IdentityN"
+  %0:2 = "tf.IdentityN"(%arg0, %arg1) : (tensor<i32>, tensor<f32>) -> (tensor<i32>, tensor<f32>)
+  // CHECK: return %[[ARG0]], %[[ARG1]]
+  func.return %0#0, %0#1 : tensor<i32>, tensor<f32>
+}
diff --git a/tensorflow/compiler/mlir/tfrt/tests/ifrt/tf_restore_pruning.mlir b/tensorflow/compiler/mlir/tfrt/tests/ifrt/tf_restore_pruning.mlir
new file mode 100644
index 00000000000000..3055438d5c468d
--- /dev/null
+++ b/tensorflow/compiler/mlir/tfrt/tests/ifrt/tf_restore_pruning.mlir
@@ -0,0 +1,25 @@
+// RUN: tf-tfrt-opt -tf-restore-pruning %s | FileCheck %s
+
+// CHECK-LABEL:   func.func @prune_unused_restore
+func.func @prune_unused_restore() {
+  %cst = "tf.Const"() <{value = dense<"restore_ariables"> : tensor<!tf_type.string>}> : () -> tensor<!tf_type.string>
+  %cst_0 = "tf.Const"() <{value = dense<""> : tensor<1x!tf_type.string>}> : () -> tensor<1x!tf_type.string>
+  %cst_1 = "tf.Const"() <{value = dense<"y"> : tensor<1x!tf_type.string>}> : () -> tensor<1x!tf_type.string>
+  // CHECK-NOT: tf.RestoreV2
+  %0 = "tf.RestoreV2"(%cst, %cst_1, %cst_0): (tensor<!tf_type.string>, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>) -> tensor<3x1xf32>
+  %1 = "tf.VarHandleOp"() <{container = "", shared_name = "y"}> : () -> tensor<!tf_type.resource<tensor<3x1xf32>>>
+  return
+}
+
+
+// CHECK-LABEL: func.func @used_restore_remains
+func.func @used_restore_remains() {
+  %cst = "tf.Const"() <{value = dense<"restore_ariables"> : tensor<!tf_type.string>}> : () -> tensor<!tf_type.string>
+  %cst_0 = "tf.Const"() <{value = dense<""> : tensor<1x!tf_type.string>}> : () -> tensor<1x!tf_type.string>
+  %cst_1 = "tf.Const"() <{value = dense<"y"> : tensor<1x!tf_type.string>}> : () -> tensor<1x!tf_type.string>
+  // CHECK: tf.RestoreV2
+  %0 = "tf.RestoreV2"(%cst, %cst_1, %cst_0): (tensor<!tf_type.string>, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>) -> tensor<3x1xf32>
+  %1 = "tf.VarHandleOp"() <{container = "", shared_name = "y"}> : () -> tensor<!tf_type.resource<tensor<3x1xf32>>>
+  "tf.AssignVariableOp"(%1, %0) : (tensor<!tf_type.resource<tensor<3x1xf32>>>, tensor<3x1xf32>) -> ()
+  return
+}
diff --git a/tensorflow/compiler/mlir/tfrt/tests/ir/BUILD b/tensorflow/compiler/mlir/tfrt/tests/ir/BUILD
index 8d49d08b1025f8..44fc2c0f6945b4 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/ir/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/tests/ir/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow:tensorflow.bzl", "if_oss", "tf_cc_test")
+load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt/BUILD b/tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt/BUILD
index 90bd835edb2828..1d2b470c8adb91 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow:tensorflow.bzl", "if_oss")
+load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/inline.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/inline.mlir
index de2a29c017df30..88bc197c8e88d6 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/mlrt/inline.mlir
+++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/inline.mlir
@@ -29,7 +29,7 @@ func.func @while_body_if(%cond: tensor<i1>, %x: tensor<i1>, %y: tensor<i1>, %z:
 // CHECK-LABEL: func @while_test_if
 // CHECK-SAME: -> !tf_mlrt.tensor
 func.func @while_test_if(%cond: tensor<i1>, %x: tensor<i1>, %y: tensor<i1>) -> (tensor<i32>) {
-  // CHECK: [[CONST:%.*]] = tf_mlrt.executeop
+  // CHECK: [[CONST:%.*]] = tf_mlrt.constop {tensor_proto = "\08\03\12\00"}
   %cst = "tf.Const"() {__op_key = 2: i32, device = "/device:CPU:0", value = dense<0> : tensor<i32>} : () -> tensor<i32>
   // Predicate should be inlined.
   // CHECK-NEXT: tf_mlrt.predicate
diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir
index eb2e0587364d6e..3cb879dabe97f7 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir
+++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir
@@ -236,7 +236,7 @@ func.func @while_body_add2(%arg0: tensor<i32>) -> tensor<i32> {
 // CHECK-LABEL: func @while_test()
 // CHECK-SAME: -> !tf_mlrt.tensor
 func.func @while_test() -> (tensor<i32>) {
-  // CHECK: [[CONST:%.*]] = tf_mlrt.executeop
+  // CHECK: [[CONST:%.*]] = tf_mlrt.constop
   %0 = "tf.Const"() {__op_key = 4: i32, device = "/device:CPU:0", value = dense<0> : tensor<i32>} : () -> tensor<i32>
   // CHECK: [[pred_res:%.*]] = call @"while_cond_lt9/tf_mlrt_predicate"([[CONST]]) : (!tf_mlrt.tensor) -> i1
   // CHECK: [[while_res:%.*]]:2 = mlrt.while
@@ -353,8 +353,7 @@ func.func @main(%input0: tensor<i32>) -> tensor<i32> {
     {callee = @main_stream_0} :
     (tensor<i32>, !mlrt.promise) -> !mlrt.async_handle
 
-  // CHECK: [[const:%.*]]  = tf_mlrt.executeop
-  // CHECK-SAME: Const
+  // CHECK: [[const:%.*]]  = tf_mlrt.const
   %const = "tf.Const"() {__op_key = 1: i32, value = dense<2> : tensor<i32>} : () -> tensor<i32>
 
   // CHECK: [[b:%.*]] = tf_mlrt.await [[futures]]
@@ -476,3 +475,24 @@ func.func @ifrt_load_variable_test() -> () {
   func.return
 }
 
+// -----
+
+// Test lowering of IfrtRestoreVariableOp
+
+// CHECK-LABEL: func @ifrt_restore_variable_test
+func.func @ifrt_restore_variable_test() -> () {
+  // CHECK-NEXT: [[PREFIX:%.*]] = tf_mlrt.constop
+  %cst = "tf.Const"() {__op_key = 0: i32, value = dense<"restore_ariables"> : tensor<!tf_type.string>} : () -> tensor<!tf_type.string>
+  // CHECK-NEXT: [[SLICE:%.*]] = tf_mlrt.constop
+  %cst_0 = "tf.Const"()  {__op_key = 1: i32, value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string>
+  // CHECK-NEXT: [[NAME:%.*]] = tf_mlrt.constop
+  %cst_1 = "tf.Const"()  {__op_key = 2: i32, value = dense<["y"]> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string>
+  // CHECK-NEXT: [[HANDLE:%.*]] = tf_mlrt.executeop
+  %handle = "tf.VarHandleOp"() {__op_key = 3: i32, container = "x", shared_name = "y"} : () -> tensor<!tf_type.resource<tensor<3x1xf32>>>
+  // CHECK-NEXT: "tf_mlrt.ifrt_restore_variable"([[PREFIX]], [[NAME]], [[SLICE]], [[HANDLE]]) {restored_dtypes = [f32]}
+  "tf.IfrtRestoreVariableOp"(%cst, %cst_1, %cst_0, %handle) {restored_dtypes = [f32]} : (tensor<!tf_type.string>, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>, tensor<!tf_type.resource<tensor<3x1xf32>>>) -> ()
+  // CHECK-NEXT: return
+  func.return
+}
+
+
diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD
index 60823b2abba41c..70c2235a20a104 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow:tensorflow.bzl", "if_oss")
+load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/BUILD b/tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/BUILD
index 1065a5fc1a682a..bbcc6e963788c9 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/tests/tfrt_fallback/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow:tensorflow.bzl", "tf_cc_shared_test")
 load("@tf_runtime//tools:mlir_to_bef.bzl", "glob_tfrt_lit_tests", "mlir_to_bef")
+load("//tensorflow:tensorflow.bzl", "tf_cc_shared_test")
 # copybara:uncomment load("//third_party/tf_runtime_google/cpp_tests:gen_tests.bzl", "tfrt_cc_test_and_strict_benchmark")
 
 # copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD
index 8e15b9fcfee8ac..6ef5c011d0a11d 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD
@@ -17,6 +17,7 @@ package_group(
         "//learning/brain/tfrt/cpp_tests/...",
         "//learning/pathways/serving/runtime/...",
         "//learning/pathways/serving/tests/...",
+        "//learning/brain/tfrt/mlir/mlrt/application/pathways/compiler/...",
         # Allow visibility from the mlir language server.
         "//learning/brain/mlir/mlir_lsp_server/...",
     ]),
@@ -49,14 +50,27 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "ifrt_types",
+    srcs = [],
+    hdrs = ["ifrt_types.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 cc_library(
     name = "tf_ifrt_passes",
     srcs = [
         "lower_to_ifrt_restore_variable.cc",
         "rewrite_cluster_to_ifrt_call.cc",
         "sink_variable_as_named_array.cc",
+        "tf_identity_propagation.cc",
         "tf_ifrt_passes.cc",
         "tf_restore_merging.cc",
+        "tf_restore_pruning.cc",
         "tf_restore_splitting.cc",
     ],
     hdrs = [
@@ -108,6 +122,7 @@ cc_library(
     hdrs = ["tf2hlo.h"],
     deps = [
         ":ifrt_constants",
+        ":ifrt_types",
         "//tensorflow/compiler/jit:xla_cpu_jit",
         "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util",
         "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils",
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h
new file mode 100644
index 00000000000000..c64672cdb10e69
--- /dev/null
+++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h
@@ -0,0 +1,33 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_IFRT_TYPES_H_
+#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_IFRT_TYPES_H_
+
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.pb.h"
+
+namespace tensorflow {
+namespace ifrt_serving {
+
+struct DtypeAndShape {
+  tensorflow::DataType dtype;
+  tensorflow::TensorShape shape;
+};
+
+}  // namespace ifrt_serving
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_IFRT_IFRT_TYPES_H_
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/lower_to_ifrt_restore_variable.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/lower_to_ifrt_restore_variable.cc
index 9effab181c1566..7c0fa364b593a7 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/lower_to_ifrt_restore_variable.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/lower_to_ifrt_restore_variable.cc
@@ -49,16 +49,15 @@ class LowerToIfrtRestoreVariablePass
   void runOnOperation() override {
     mlir::ModuleOp module = getOperation();
 
-    mlir::WalkResult walk_result =
-        module.walk([&](mlir::TF::RestoreV2Op restore_op) {
-          if (mlir::failed(RewriteRestore(restore_op))) {
-            return mlir::WalkResult::interrupt();
-          }
-          return mlir::WalkResult::advance();
-        });
-
-    if (walk_result.wasInterrupted()) {
-      return signalPassFailure();
+    std::vector<mlir::TF::RestoreV2Op> restore_ops;
+    module.walk([&](mlir::TF::RestoreV2Op restore_op) {
+      restore_ops.push_back(restore_op);
+    });
+
+    for (const auto& restore_op : restore_ops) {
+      if (mlir::failed(RewriteRestore(restore_op))) {
+        return signalPassFailure();
+      }
     }
   }
 
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/passes.td b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/passes.td
index 20bddd75722c63..7cdc5576ae5465 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/passes.td
+++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/passes.td
@@ -59,6 +59,20 @@ def LowerToIfrtRestoreVariablePass: Pass<"lower-to-ifrt-restore-variable", "mlir
   let constructor = "CreateLowerToIfrtRestoreVariablePass()";
  }
 
+def TfRestorePruningPass
+    : Pass<"tf-restore-pruning", "mlir::func::FuncOp"> {
+  let summary = "Prune unused`tf.RestoreV2` ops";
+
+  let description = [{
+    This pass prune unused `tf.RestoreV2` op. A typical use case is to combine
+    `TfRestoreSplittingPass`, this pass and `TfRestoreMergingPass` in sequence
+    so that the un-used restored tensors are not read into host memory.
+  }];
+
+  let constructor = "CreateTfRestorePruningPass()";
+}
+
+
 def TfRestoreSplittingPass
     : Pass<"tf-restore-splitting", "mlir::func::FuncOp"> {
   let summary = "Splits `tf.RestoreV2` ops";
@@ -89,4 +103,29 @@ def TfRestoreMergingPass : Pass<"tf-restore-merging", "mlir::func::FuncOp"> {
   }];
 
   let constructor = "CreateTfRestoreMergingPass()";
-}
\ No newline at end of file
+}
+
+def TfIdentityPropagationPass
+    : Pass<"tf-identity-propagation", "mlir::func::FuncOp"> {
+  let summary = "Propagates inputs of no-op identity ops to their outputs";
+
+  let description = [{
+    This pass finds identity ops that are no-op and propagates their inputs
+    directly to outputs so that identity ops can be skipped.
+
+    One example of identity ops that are not no-op is identity ops with XLA
+    sharding annotation. Since some models use identity ops with `_XlaSharding`
+    attributes to change output sharding, this pass doesn't propagate the inputs
+    of such identity ops in order to preserve the sharding changes.
+
+    This pass is useful to make sure that ineffective identity ops don't affect
+    the graph partitioning. For example, in a pipelined model, if there is a CPU
+    identity op between two TPU computation stages (which sometimes happens
+    because TensorFlow inserts it), this will unnecessarily route the
+    intermediate tensors through the CPU device. By forwarding the inputs of the
+    identity op directly to its outputs, we can avoid such inefficiency.
+  }];
+
+  let constructor = "CreateTfIdentityPropagationPass()";
+}
+
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.cc
index 312761a3ba06d7..a0b01ba1ffc3f7 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h"
 #include "tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.h"
 #include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_constants.h"
+#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h"
 #include "tensorflow/compiler/tf2xla/layout_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
@@ -127,6 +128,7 @@ absl::StatusOr<tensorflow::tpu::TPUCompileMetadataProto> GetCompileMetadata(
 
   // Create a default device assignment if one is not given by the model.
   if (!metadata.has_device_assignment()) {
+    // TODO(b/316068010): integrate core selection.
     TF_ASSIGN_OR_RETURN(
         auto device_assignment,
         ifrt_client.GetDefaultDeviceAssignment(
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h
index 74fa271401f547..fec9bbb2c740e7 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h
+++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/OwningOpRef.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "xla/python/ifrt/client.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -31,11 +32,6 @@ limitations under the License.
 namespace tensorflow {
 namespace ifrt_serving {
 
-struct DtypeAndShape {
-  tensorflow::DataType dtype;
-  tensorflow::TensorShape shape;
-};
-
 struct Tf2HloResult {
   mlir::OwningOpRef<mlir::ModuleOp> mlir_hlo_module;
   tensorflow::tpu::TPUCompileMetadataProto compile_metadata;
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_identity_propagation.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_identity_propagation.cc
new file mode 100644
index 00000000000000..873838727c9d33
--- /dev/null
+++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_identity_propagation.cc
@@ -0,0 +1,88 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "absl/strings/string_view.h"
+#include "llvm/ADT/STLExtras.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/IR/OpDefinition.h"  // from @llvm-project
+#include "mlir/IR/Operation.h"  // from @llvm-project
+#include "mlir/IR/Value.h"  // from @llvm-project
+#include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
+
+namespace tensorflow {
+namespace ifrt_serving {
+namespace {
+
+#define GEN_PASS_DEF_TFIDENTITYPROPAGATIONPASS
+#define GEN_PASS_DECL_TFIDENTITYPROPAGATIONPASS
+#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/passes.h.inc"  // IWYU pragma: keep
+
+constexpr absl::string_view kXlaShardingAttr = "_XlaSharding";
+
+bool IsTerminator(mlir::Operation* op) {
+  return op->hasTrait<mlir::OpTrait::IsTerminator>();
+}
+
+class TfIdentityPropagationPass
+    : public impl::TfIdentityPropagationPassBase<TfIdentityPropagationPass> {
+ public:
+  void runOnOperation() override {
+    mlir::func::FuncOp func = getOperation();
+
+    func.walk([](mlir::TF::IdentityOp identity) {
+      // Don't propagate inputs of identity ops with sharding annotation since
+      // identity ops are sometimes used to change output sharding.
+      if (identity->hasAttr(kXlaShardingAttr)) {
+        return;
+      }
+      // Identity outputs to terminator ops (e.g., `func.return`) cannot be
+      // replaced unless input/output types are exactly the same. Doing so may
+      // cause mismatch between the enclosing region's return type and the
+      // terminator's arg type.
+      const bool same_type =
+          identity.getInput().getType() == identity.getOutput().getType();
+      identity.getOutput().replaceUsesWithIf(
+          identity.getInput(), [&](mlir::OpOperand& operand) {
+            return same_type || !IsTerminator(operand.getOwner());
+          });
+    });
+
+    func.walk([](mlir::TF::IdentityNOp identity_n) {
+      if (identity_n->hasAttr(kXlaShardingAttr)) {
+        return;
+      }
+      for (auto [input, output] :
+           llvm::zip(identity_n.getInput(), identity_n.getOutput())) {
+        const bool same_type = input.getType() == output.getType();
+        output.replaceUsesWithIf(input, [&](mlir::OpOperand& operand) {
+          return same_type || !IsTerminator(operand.getOwner());
+        });
+      }
+    });
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<mlir::OperationPass<mlir::func::FuncOp>>
+CreateTfIdentityPropagationPass() {
+  return std::make_unique<TfIdentityPropagationPass>();
+}
+
+}  // namespace ifrt_serving
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.cc
index 53bd55cc0d2799..9737c681d28aa8 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.cc
@@ -70,6 +70,14 @@ void AddClusterToIfrtRuntimeOpsPassPipeline(OpPassManager& pm,
   pm.addNestedPass<mlir::func::FuncOp>(
       mlir::TF::CreateCanonicalizeCompileAndReplicateAttributesPass());
 
+  pm.addNestedPass<mlir::func::FuncOp>(CreateTfIdentityPropagationPass());
+
+  pm.addNestedPass<mlir::func::FuncOp>(CreateTfRestoreSplittingPass());
+  pm.addNestedPass<mlir::func::FuncOp>(CreateTfRestorePruningPass());
+  pm.addNestedPass<mlir::func::FuncOp>(CreateTfRestoreMergingPass());
+
+  pm.addPass(CreateLowerToIfrtRestoreVariablePass());
+
   pm.addPass(CreateRewriteClusterToIfrtCallPass());
 
   // Sink VarHandle with ReadVariableOp: subsequent SinkVariableAsNamedArrayPass
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.h b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.h
index 3835a77f04f93c..93713fbdc13646 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.h
+++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.h
@@ -44,6 +44,14 @@ CreateTfRestoreSplittingPass();
 std::unique_ptr<mlir::OperationPass<mlir::func::FuncOp>>
 CreateTfRestoreMergingPass();
 
+// Creates a pass that propagates inputs of no-op identity ops to their outputs.
+std::unique_ptr<mlir::OperationPass<mlir::func::FuncOp>>
+CreateTfIdentityPropagationPass();
+
+// Creates a pass that prunes unused `tf.RestoreV2` ops.
+std::unique_ptr<mlir::OperationPass<mlir::func::FuncOp>>
+CreateTfRestorePruningPass();
+
 // Creates a pass that lower `tf.RestoreVariableOp` to
 // `tf.IfrtRestoreVariableOp`.
 std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_pruning.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_pruning.cc
new file mode 100644
index 00000000000000..6491be3f7151fa
--- /dev/null
+++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_pruning.cc
@@ -0,0 +1,52 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/Pass/Pass.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
+
+namespace tensorflow {
+namespace ifrt_serving {
+namespace {
+
+#define GEN_PASS_DEF_TFRESTOREPRUNINGPASS
+#define GEN_PASS_DECL_TFRESTOREPRUNINGPASS
+#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/passes.h.inc"  // IWYU pragma: keep
+
+// Prune unused RestoreV2 Op.
+class TfRestorePruningPass
+    : public impl::TfRestorePruningPassBase<TfRestorePruningPass> {
+ public:
+  void runOnOperation() override {
+    mlir::func::FuncOp func = getOperation();
+    func.walk([&](mlir::TF::RestoreV2Op restore) {
+      if (restore.use_empty()) {
+        restore.erase();
+      }
+    });
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<mlir::OperationPass<mlir::func::FuncOp>>
+CreateTfRestorePruningPass() {
+  return std::make_unique<TfRestorePruningPass>();
+}
+
+}  // namespace ifrt_serving
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc b/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc
index 085d77df441e2e..17e3d8be95204d 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc
@@ -115,6 +115,10 @@ bool CanHoist(const llvm::DenseSet<mlir::TF::ResourceHandle> &read_only_vars,
   // return ops should not be hoisted.
   if (op->mightHaveTrait<mlir::OpTrait::IsTerminator>()) return false;
 
+  // Fixes a corner case where hoisting the tf.BatchFunction leads to
+  // a compilation error; such a case may occur in unit tests.
+  if (llvm::isa<mlir::TF::BatchFunctionOp>(op)) return false;
+
   // Non-side-effecting ops can be hoisted.
   if (mlir::isMemoryEffectFree(op)) return true;
 
@@ -402,7 +406,7 @@ void LowerTFSavedModelPass::HoistInvariantOps(mlir::ModuleOp module) {
     } else if (auto func = llvm::dyn_cast<mlir::func::FuncOp>(op)) {
       if (!IsSessionInitializer(func)) return;
       FindCalleesRecursive(symbol_table, func, init_callees);
-    } else if (op->getName().getStringRef().str() == "tf.XlaLaunch") {
+    } else if (llvm::isa<mlir::TF::XlaLaunchOp>(op)) {
       // TODO(b/275095412): Clean up MLIR XLA functions after they are written
       // back to function library, so that we don't need to do special handling
       // for those functions here.
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD
index 1bb99fa64ebaf7..7d28571db5030a 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD
@@ -65,6 +65,7 @@ cc_library(
         ":tpu_conversion_patterns",
         ":util",
         "//tensorflow/compiler/mlir/tensorflow",
+        "//tensorflow/compiler/mlir/tensorflow:convert_tensor",
         "//tensorflow/compiler/mlir/tensorflow:export_tf_dialect_op",
         "//tensorflow/compiler/mlir/tensorflow/ir/host_runtime:tensorflow_tfrt_ops_inc_gen",
         "//tensorflow/compiler/mlir/tfrt:constants",
@@ -78,12 +79,14 @@ cc_library(
         "//tensorflow/core/tfrt/fallback:fallback_state",
         "//tensorflow/core/tfrt/fallback:op_kernel_runner_cache",
         "@com_google_protobuf//:protobuf_headers",
+        "@llvm-project//llvm:Support",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FuncTransforms",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TransformUtils",
+        "@local_tsl//tsl/platform:status",
     ],
 )
 
@@ -220,6 +223,7 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TransformUtils",
     ],
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc
index 8271a5c796e5c4..37ddf0b1bf076d 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <utility>
 
 #include "google/protobuf/text_format.h"
+#include "llvm/Support/Casting.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/Dialect/Func/Transforms/FuncConversions.h"  // from @llvm-project
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
@@ -39,6 +40,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h"
+#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h"
 #include "tensorflow/compiler/mlir/tfrt/constants.h"
 #include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h"
 #include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.h"
@@ -52,6 +54,7 @@ limitations under the License.
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/tfrt/fallback/fallback_state.h"
 #include "tensorflow/core/tfrt/fallback/op_kernel_runner_cache.h"
+#include "tsl/platform/status.h"
 
 namespace tensorflow {
 namespace mlrt_compiler {
@@ -343,6 +346,26 @@ class IfrtLoadVariableOpConversion
   }
 };
 
+// Convert tf.IfrtRestoreVariableOp to tf_mlrt.IfrtRestoreVariableOp
+class IfrtRestoreVariableOpConversion
+    : public mlir::OpConversionPattern<mlir::TF::IfrtRestoreVariableOp> {
+ public:
+  using OpConversionPattern::OpConversionPattern;
+
+  mlir::LogicalResult matchAndRewrite(
+      mlir::TF::IfrtRestoreVariableOp op, OpAdaptor adaptor,
+      mlir::ConversionPatternRewriter &rewriter) const override {
+    auto new_op = rewriter.create<tf_mlrt::IfrtRestoreVariableOp>(
+        op.getLoc(), adaptor.getOperands()[0], adaptor.getOperands()[1],
+        adaptor.getOperands()[2],
+        adaptor.getOperands().slice(3, adaptor.getOperands().size() - 3),
+        op.getRestoredDtypes());
+    rewriter.replaceOp(op, new_op);
+
+    return mlir::success();
+  }
+};
+
 std::optional<std::string> DecodeLongName(mlir::Location loc) {
   if (auto name_loc = loc.dyn_cast<mlir::NameLoc>()) {
     return name_loc.getName().str();
@@ -422,6 +445,18 @@ class ExecuteOpConversion final : public mlir::ConversionPattern {
     // TODO(b/173017701): Avoid fallback for ops within XLA GPU clusters.
     if (!UseFallback(op)) return mlir::failure();
 
+    if (auto const_op = llvm::dyn_cast<mlir::TF::ConstOp>(op)) {
+      tensorflow::TensorProto tensor_proto;
+      auto status = ConvertToTensorProto(const_op.getValue(), &tensor_proto);
+      if (!status.ok())
+        return const_op.emitError(tsl::NullTerminatedMessage(status));
+
+      rewriter.replaceOpWithNewOp<tf_mlrt::ConstOp>(
+          op, rewriter.getType<tf_mlrt::TFTensorType>(),
+          tensor_proto.SerializeAsString());
+      return mlir::success();
+    }
+
     // The assign_op_key pass should have ran.
     if (!op->hasAttr(tensorflow::tfrt_compiler::kOpKeyAttrName))
       return op->emitError("does not have op_key defined");
@@ -1189,7 +1224,8 @@ class TfToMlrtConversionPass
     patterns.add<WhileOpConversion>(&context, &type_converter_, &symbol_table);
     patterns.add<AsyncOpConversion, GetResourceOpConversion,
                  SetResourceOpConversion, IfrtLoadVariableOpConversion,
-                 TFAwaitOpConversion, TFPromiseOpConversion>(&context);
+                 IfrtRestoreVariableOpConversion, TFAwaitOpConversion,
+                 TFPromiseOpConversion>(&context);
     patterns.add<BatchFunctionOpConversion, CaseOpConversion, CondOpConversion,
                  TFAsyncWhileOpConversion, TFMapFnOpConversion>(type_converter_,
                                                                 &context);
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc
index d9e1b7f73ac0c8..a1f9d401f5c485 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc
@@ -35,9 +35,9 @@ bool UseFallback(mlir::Operation *op) {
   return !llvm::isa<
       mlir::TF::_TfrtSetResourceOp, mlir::TF::_TfrtGetResourceOp,
       mlir::TF::BatchFunctionOp, mlir::TF::CaseOp, mlir::TF::IfrtLoadVariableOp,
-      mlir::TF::StatefulPartitionedCallOp, mlir::TF::PartitionedCallOp,
-      mlir::TF::LegacyCallOp, mlir::TF::IfOp, mlir::TF::WhileOp,
-      mlir::TF::TPUCompileMlirAndExecuteOp>(op);
+      mlir::TF::IfrtRestoreVariableOp, mlir::TF::StatefulPartitionedCallOp,
+      mlir::TF::PartitionedCallOp, mlir::TF::LegacyCallOp, mlir::TF::IfOp,
+      mlir::TF::WhileOp, mlir::TF::TPUCompileMlirAndExecuteOp>(op);
 }
 
 }  // namespace mlrt_compiler
diff --git a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc b/tensorflow/compiler/mlir/tfrt/translate/import_model.cc
index 66aee10db7e050..f61a087e782704 100644
--- a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc
+++ b/tensorflow/compiler/mlir/tfrt/translate/import_model.cc
@@ -147,35 +147,6 @@ StatusOr<std::vector<FunctionDef>> ExportXlaFunctions(
 
 }  // namespace
 
-Status ConvertFunctionToBef(
-    mlir::StringRef function_name, const tensorflow::FunctionBody* fbody,
-    const FunctionLibraryDefinition& flib_def,
-    tfrt::ArrayRef<tfrt::string_view> devices,
-    const tensorflow::TfrtFunctionCompileOptions& options,
-    tfrt::BefBuffer* bef_buffer) {
-  mlir::MLIRContext context;
-  // FunctionDef -> TF Dialect
-  auto expected_module =
-      tensorflow::ConvertFunctionToMlir(fbody, flib_def, &context);
-
-  if (!expected_module.ok())
-    return absl::InternalError(absl::StrCat(
-        "Failed to convert function to mlir for function ", function_name.str(),
-        ". Error: ", expected_module.status().message()));
-
-  auto module = std::move(expected_module).value();
-
-  // Attach devices to the MLIR module.
-  if (!devices.empty()) {
-    mlir::Builder builder(module->getContext());
-    module->getOperation()->setAttr("tf.devices",
-                                    builder.getStrArrayAttr(devices));
-  }
-
-  // TF Dialect -> BEF
-  return tensorflow::CompileTFMLIRToBEF(options, module.get(), bef_buffer);
-}
-
 Status ConvertTfMlirToRuntimeExecutable(
     const TfrtCompileOptions& options, mlir::ModuleOp module,
     absl::FunctionRef<Status(mlir::PassManager&, mlir::ModuleOp,
diff --git a/tensorflow/compiler/mlir/tfrt/translate/import_model.h b/tensorflow/compiler/mlir/tfrt/translate/import_model.h
index 09917ea61095fe..c8aece1e8f4706 100644
--- a/tensorflow/compiler/mlir/tfrt/translate/import_model.h
+++ b/tensorflow/compiler/mlir/tfrt/translate/import_model.h
@@ -37,17 +37,6 @@ namespace tensorflow {
 
 struct FunctionBody;
 
-// Converts FunctionDef to TFRT's Binary Executable Format. This is the entry
-// point of tf.function to TFRT. function_name and device_name are given from
-// the Python context. The lowered BEF will be stored in an external buffer
-// pointed by bef_buffer.
-Status ConvertFunctionToBef(
-    mlir::StringRef function_name, const tensorflow::FunctionBody* fbody,
-    const FunctionLibraryDefinition& flib_def,
-    tfrt::ArrayRef<tfrt::string_view> devices,
-    const tensorflow::TfrtFunctionCompileOptions& options,
-    tfrt::BefBuffer* bef_buffer);
-
 // Converts an MLIR `module` in TF dialect to TFRT's Binary Executable Format.
 // If `fallback_state` is not null, the MLIR functions for XLA clusters in
 // the form of XlaLaunch will be exported and added to the function library when
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
index d73899b6f85ecb..86e2e269e4d329 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
@@ -1,12 +1,20 @@
 load(
-    "//tensorflow:tensorflow.bzl",
-    "check_deps",
-    "tf_cc_binary",
+    "@local_config_rocm//rocm:build_defs.bzl",
+    "if_rocm_is_configured",
+)
+load(
+    "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
+    "if_cuda_is_configured",
 )
 load(
     "@local_xla//xla/stream_executor:build_defs.bzl",
     "if_gpu_is_configured",
 )
+load(
+    "//tensorflow:tensorflow.bzl",
+    "check_deps",
+    "tf_cc_binary",
+)
 load(
     "//tensorflow/core/platform:build_config.bzl",
     "tf_proto_library",
@@ -19,14 +27,6 @@ load(
     "if_llvm_system_z_available",
     "if_llvm_x86_available",
 )
-load(
-    "@local_config_rocm//rocm:build_defs.bzl",
-    "if_rocm_is_configured",
-)
-load(
-    "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
-    "if_cuda_is_configured",
-)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -65,6 +65,7 @@ cc_library(
         "@llvm-project//mlir:ComplexDialect",
         "@llvm-project//mlir:ComplexToStandard",
         "@llvm-project//mlir:ControlFlowDialect",
+        "@llvm-project//mlir:DataLayoutInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:GPUDialect",
         "@llvm-project//mlir:GPUToGPURuntimeTransforms",
@@ -88,6 +89,7 @@ cc_library(
         "@llvm-project//mlir:ShapeDialect",
         "@llvm-project//mlir:ShapeToStandard",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@local_xla//xla/mlir_hlo",
         "@local_xla//xla/mlir_hlo:all_passes",  # fixdeps: keep
@@ -125,6 +127,7 @@ tf_cc_binary(
         "@llvm-project//llvm:Support",
         "@llvm-project//llvm:Target",
         "@llvm-project//llvm:TargetParser",
+        "@llvm-project//mlir:BufferizationInterfaces",
         "@llvm-project//mlir:ExecutionEngineUtils",
         "@llvm-project//mlir:LLVMToLLVMIRTranslation",
         "@llvm-project//mlir:MemRefTransforms",
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD
index acec5d7ae27ff5..42d679c35d0173 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD
@@ -1,6 +1,6 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -86,6 +86,7 @@ cc_library(
         "@com_google_absl//absl/status",
         "@llvm-project//mlir:AllocationOpInterface",
         "@llvm-project//mlir:BufferizationDialect",
+        "@llvm-project//mlir:BytecodeOpInterface",
         "@llvm-project//mlir:ControlFlowInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD
index 18c5ab830d4722..c4abb6420d9b38 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD
@@ -1,5 +1,4 @@
 load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library")
-load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load(
     "@local_config_rocm//rocm:build_defs.bzl",
     "if_rocm_is_configured",
@@ -8,6 +7,7 @@ load(
     "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
     "if_cuda_is_configured",
 )
+load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -39,6 +39,7 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LLVMCommonConversion",
         "@llvm-project//mlir:LLVMDialect",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
 )
@@ -57,6 +58,7 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:MemRefDialect",
         "@llvm-project//mlir:SCFDialect",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@stablehlo//:chlo_ops",
     ],
@@ -73,6 +75,7 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:MemRefDialect",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
 )
@@ -148,6 +151,7 @@ cc_library(
         "@llvm-project//mlir:TensorDialect",
         "@llvm-project//mlir:TensorTransforms",
         "@llvm-project//mlir:ToLLVMIRTranslation",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@llvm-project//mlir:VectorDialect",
         "@llvm-project//mlir:VectorToLLVM",
@@ -213,8 +217,10 @@ cc_library(
         "@llvm-project//mlir:ShapeDialect",
         "@llvm-project//mlir:ShapeToStandard",
         "@llvm-project//mlir:ShapeTransforms",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TensorDialect",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@local_xla//xla/mlir_hlo:transforms_passes",
     ],
diff --git a/tensorflow/compiler/mlir/tosa/BUILD b/tensorflow/compiler/mlir/tosa/BUILD
index e25d2229c605c8..a7d9610a472308 100644
--- a/tensorflow/compiler/mlir/tosa/BUILD
+++ b/tensorflow/compiler/mlir/tosa/BUILD
@@ -102,6 +102,7 @@ cc_library(
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TensorDialect",
         "@llvm-project//mlir:TosaDialect",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@local_tsl//tsl/framework/fixedpoint",
     ],
@@ -157,6 +158,7 @@ cc_library(
         "@llvm-project//mlir:QuantOps",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TosaDialect",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
 )
@@ -219,6 +221,7 @@ cc_library(
         "@llvm-project//mlir:ReconcileUnrealizedCasts",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TosaDialect",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
 )
@@ -248,6 +251,7 @@ cc_library(
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TosaDialect",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
 )
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 498c9bfe11bbae..d255b67ccff83f 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -1,7 +1,7 @@
 load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test")
-load("//tensorflow/compiler/tests:build_defs.bzl", "generate_backend_suites", "tf_xla_py_strict_test")
 load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test", "tf_cuda_cc_test")
 load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
+load("//tensorflow/compiler/tests:build_defs.bzl", "generate_backend_suites", "tf_xla_py_strict_test")
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "tf_cuda_tests_tags",
@@ -260,6 +260,7 @@ tf_xla_py_strict_test(
         "cpu",
         "gpu",
         "gpu_a100",
+        "gpu_h100",
     ],
     python_version = "PY3",
     shard_count = 2,
@@ -679,6 +680,7 @@ tf_xla_py_strict_test(
         "cpu",
         "gpu",
         "gpu_a100",
+        "gpu_h100",
     ],
     python_version = "PY3",
     tags = [
@@ -925,6 +927,7 @@ tf_xla_py_strict_test(
         "cpu",
         "gpu",
         "gpu_a100",
+        "gpu_h100",
     ],
     python_version = "PY3",
     shard_count = 10,
@@ -1517,6 +1520,7 @@ tf_xla_py_strict_test(
     disabled_backends = [
         "gpu",
         "gpu_a100",
+        "gpu_h100",
     ],
     enable_mlir_bridge = True,
     python_version = "PY3",
@@ -1554,6 +1558,7 @@ tf_xla_py_strict_test(
     disabled_backends = [
         "gpu",
         "gpu_a100",
+        "gpu_h100",
     ],
     enable_mlir_bridge = True,
     python_version = "PY3",
@@ -1591,6 +1596,7 @@ tf_xla_py_strict_test(
     disabled_backends = [
         "gpu",
         "gpu_a100",
+        "gpu_h100",
     ],
     # TODO(b/232442915): Enable MLIR.
     enable_mlir_bridge = False,
@@ -2343,6 +2349,7 @@ tf_xla_py_strict_test(
     disabled_backends = [
         "gpu",
         "gpu_a100",
+        "gpu_h100",
     ],
     enable_mlir_bridge = True,
     python_version = "PY3",
@@ -2378,6 +2385,7 @@ tf_xla_py_strict_test(
     disabled_backends = [
         "gpu",
         "gpu_a100",
+        "gpu_h100",
     ],
     enable_mlir_bridge = False,
     python_version = "PY3",
@@ -2605,6 +2613,7 @@ tf_xla_py_strict_test(
         "cpu",
         "gpu",
         "gpu_a100",
+        "gpu_h100",
     ],
     tags = [
         "no_pip",
@@ -2633,6 +2642,7 @@ tf_xla_py_strict_test(
         "cpu_ondemand",
         "gpu",
         "gpu_a100",
+        "gpu_h100",
     ],
     enable_mlir_bridge = False,
     main = "where_op_test.py",
@@ -2719,6 +2729,7 @@ tf_xla_py_strict_test(
         "cpu_ondemand",
         "gpu",
         "gpu_a100",
+        "gpu_h100",
     ],
     enable_mlir_bridge = False,
     python_version = "PY3",
@@ -2883,6 +2894,7 @@ tf_xla_py_strict_test(
         "cpu",
         "gpu",
         "gpu_a100",
+        "gpu_h100",
     ],
     python_version = "PY3",
     tags = [
diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl
index 7343bb9b89efce..ce6b626683e281 100644
--- a/tensorflow/compiler/tests/build_defs.bzl
+++ b/tensorflow/compiler/tests/build_defs.bzl
@@ -1,7 +1,7 @@
 """Build rules for Tensorflow/XLA testing."""
 
-load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:strict.default.bzl", "py_strict_test")
+load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow/compiler/tests:plugin.bzl", "plugins")
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
@@ -84,7 +84,7 @@ def tf_xla_py_test(
                 "--test_device=" + cpu_xla_device,
                 "--types=DT_HALF,DT_FLOAT,DT_DOUBLE,DT_UINT8,DT_QUINT8,DT_INT8,DT_QINT8,DT_INT32,DT_QINT32,DT_INT64,DT_BOOL,DT_COMPLEX64,DT_COMPLEX128",
             ]
-        elif backend in ("gpu", "gpu_a100"):
+        elif backend in ("gpu", "gpu_a100", "gpu_h100"):
             backend_args += [
                 "--test_device=" + gpu_xla_device,
                 "--types=DT_HALF,DT_FLOAT,DT_DOUBLE,DT_UINT8,DT_QUINT8,DT_INT8,DT_QINT8,DT_INT32,DT_QINT32,DT_INT64,DT_BOOL,DT_COMPLEX64,DT_COMPLEX128,DT_BFLOAT16",
@@ -125,7 +125,7 @@ def tf_xla_py_test(
                 #
                 # This is for testing book keeping because the bridge does not have any gpu specific
                 # logic at this time, so CPU testing is good enough and cheaper.
-                extra_tag = ["ondemand"] if backend in ("gpu", "gpu_a100") else []
+                extra_tag = ["ondemand"] if backend in ("gpu", "gpu_a100", "gpu_h100") else []
             elif has_mlir_dep:
                 # Some tests run only with mlir_bridge by explicitly adding the MLIR
                 # bridge dep so if the dep is already present skip non MLIR
diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD
index 91ef722b52db86..498aa0f91e487a 100644
--- a/tensorflow/compiler/tf2tensorrt/BUILD
+++ b/tensorflow/compiler/tf2tensorrt/BUILD
@@ -3,6 +3,11 @@
 #   and provide TensorRT operators and converter package.
 #   APIs are meant to change over time.
 
+load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt")
+load(
+    "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
+    "cuda_rpath_flags",
+)
 load("//tensorflow:strict.default.bzl", "py_strict_library")
 
 # Placeholder: load py_proto_library
@@ -28,11 +33,6 @@ load(
     "if_static",
 )
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-load(
-    "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
-    "cuda_rpath_flags",
-)
-load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 7e1d80e9e8676d..01e85cc7c6cfc7 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -1,24 +1,25 @@
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
+
 # copybara:uncomment_begin(google-only)
 # load("//devtools/deps/check:deps_check.bzl", "check_dependencies")
 # copybara:uncomment_end
 
+load("@local_tsl//tsl/platform:build_config_root.bzl", "if_static")
+load(
+    "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
+    "if_cuda_is_configured",
+)
+load("@local_xla//xla:xla.bzl", "xla_py_proto_library")
+load("@local_xla//xla/service/cpu:build_defs.bzl", "runtime_copts")
 load("//tensorflow:strict.default.bzl", "py_strict_library")
 load("//tensorflow:tensorflow.bzl", "if_google", "if_libtpu", "tf_cc_binary", "tf_cc_test", "tf_copts", "tf_cuda_cc_test", "tf_gen_op_wrapper_py", "tf_openmp_copts")
 load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test", "filegroup", "get_compatible_with_portable")
-load("@local_xla//xla:xla.bzl", "xla_py_proto_library")
-load("@local_xla//xla/service/cpu:build_defs.bzl", "runtime_copts")
 load(
     "//tensorflow/core/platform:build_config.bzl",
     "tf_additional_tensor_coding_deps",
     "tf_proto_library",
 )
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
-load("@local_tsl//tsl/platform:build_config_root.bzl", "if_static")
-load(
-    "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
-    "if_cuda_is_configured",
-)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -302,6 +303,7 @@ cc_library(
         "@com_google_absl//absl/synchronization",
         "@ducc//:fft_wrapper",
         "@eigen_archive//:eigen3",
+        "@llvm-project//mlir:TransformUtils",
         "@local_xla//xla:empty",
         "//tensorflow/core/framework:numeric_types",
         "//tensorflow/core/platform:bfloat16",
@@ -418,6 +420,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/platform:errors",
+        "@com_google_absl//absl/types:span",
         "@local_xla//xla:cpu_function_runtime",
         "@local_xla//xla:shape_util",
         "@local_xla//xla:statusor",
@@ -1169,11 +1172,11 @@ cc_library(
     hdrs = ["mlir_bridge_pass.h"],
     visibility = [":internal"],
     deps = [
-        ":tf2xla_defs",
         ":xla_op_registry",
         "//tensorflow/compiler/jit:flags",
         "//tensorflow/compiler/mlir:mlir_graph_optimization_pass",
         "//tensorflow/compiler/mlir/tensorflow",
+        "//tensorflow/compiler/mlir/tensorflow:attribute_utils",
         "//tensorflow/compiler/mlir/tensorflow:device_util",
         "//tensorflow/compiler/mlir/tensorflow/transforms/host_runtime:lower_cluster_to_runtime_ops",
         "//tensorflow/compiler/mlir/tf2xla:mlir_bridge_rollout_policy",
diff --git a/tensorflow/compiler/tf2xla/cc/BUILD b/tensorflow/compiler/tf2xla/cc/BUILD
index ccfd67e223b1b3..7ead605ca65c07 100644
--- a/tensorflow/compiler/tf2xla/cc/BUILD
+++ b/tensorflow/compiler/tf2xla/cc/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.default.bzl", "tf_gen_op_wrapper_cc")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index f06596bfe6530b..6a60149d7cc4a1 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -1,3 +1,4 @@
+load("@local_tsl//tsl/platform:build_config_root.bzl", "if_static")
 load(
     "//tensorflow:tensorflow.bzl",
     "if_cuda_or_rocm",
@@ -11,7 +12,6 @@ load(
     "tf_proto_library",
 )
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-load("@local_tsl//tsl/platform:build_config_root.bzl", "if_static")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc
index 6ab1793d493eaf..5c19b9fe1014d3 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc
@@ -160,7 +160,7 @@ absl::Status XlaCallModuleLoader::SetPlatformIndex(
     }
   }
 
-  if (platform_index < 0) return tsl::OkStatus();
+  if (platform_index < 0) return absl::OkStatus();
   VLOG(3) << "XlaCallModule setting the platform_index to " << platform_index
           << " for platform " << compilation_platform << ".";
   mlir::Block &main_body = main_.front();
@@ -193,7 +193,7 @@ absl::Status XlaCallModuleLoader::SetPlatformIndex(
 
   main_.eraseArgument(0);
   platform_index_arg_set_ = true;
-  return tsl::OkStatus();
+  return absl::OkStatus();
 }
 
 static mlir::stablehlo::CustomCallOp MakeShapeRefinementOperandWrapper(
@@ -232,13 +232,13 @@ absl::Status XlaCallModuleLoader::RefineDynamicShapes(
         VLOG(3) << "XlaCallModule skipping shape refinement due to module "
                 << " attribute " << kUsesShapePolymorphismAttr.str() << "="
                 << mlir::debugString(uses_shape_poly_attr);
-        return tsl::OkStatus();
+        return absl::OkStatus();
       }
     } else {
       VLOG(3) << "XlaCallModule skipping shape refinement due to module "
               << " attribute " << kUsesShapePolymorphismAttr.str()
               << " missing";
-      return tsl::OkStatus();
+      return absl::OkStatus();
     }
   }
   // Add the tokens to the input_shapes. Starting with version 9, the main
@@ -430,7 +430,7 @@ absl::Status XlaCallModuleLoader::RefineDynamicShapes(
     DumpMlirOpToFile("xla_call_module.after_shape_refinement", *module_);
   }
 
-  return tsl::OkStatus();
+  return absl::OkStatus();
 }
 
 absl::Status XlaCallModuleLoader::LoadModule(
@@ -527,7 +527,7 @@ absl::Status XlaCallModuleLoader::LoadModule(
         " arguments of which ", nr_platform_args, " platform index arguments, ",
         "and ", nr_token_arguments, " token arguments."));
   }
-  return tsl::OkStatus();
+  return absl::OkStatus();
 }
 
 absl::Status XlaCallModuleLoader::ValidateDialect() {
@@ -550,7 +550,7 @@ absl::Status XlaCallModuleLoader::ValidateDialect() {
         absl::StrCat("Module has unsupported dialects: ",
                      diag_handler.ConsumeStatus().ToString()));
   }
-  return tsl::OkStatus();
+  return absl::OkStatus();
 }
 
 absl::Status XlaCallModuleLoader::ValidateStaticShapes() {
@@ -563,8 +563,8 @@ absl::Status XlaCallModuleLoader::LowerModuleToMhlo() {
   mlir::PassManager pm(module_->getContext());
   applyTensorflowAndCLOptions(pm);
   pm.addPass(mlir::mhlo::createStablehloLegalizeToHloPass());
-  pm.addNestedPass<mlir::func::FuncOp>(mlir::mhlo::createChloLegalizeToHloPass(
-      /*legalizeBroadcasts=*/true, /*expandCompositions=*/true));
+  pm.addNestedPass<mlir::func::FuncOp>(
+      mlir::mhlo::createChloLegalizeToHloPass());
   pm.addNestedPass<mlir::func::FuncOp>(mlir::createCanonicalizerPass());
   // In order to export to XLA, we must sink constants to control flow
   // regions, since XLA uses functional control flow.
diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc
index 0402508fe92f56..c24654c894b34f 100644
--- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc
+++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc
@@ -29,13 +29,14 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/lower_cluster_to_runtime_ops.h"
+#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h"
 #include "tensorflow/compiler/mlir/tf2xla/api/v1/cluster_tf.h"
 #include "tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h"
 #include "tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h"
 #include "tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h"
 #include "tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.h"
-#include "tensorflow/compiler/tf2xla/tf2xla_defs.h"
+// #include "tensorflow/compiler/tf2xla/tf2xla_defs.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/framework/device.h"
@@ -162,16 +163,28 @@ MlirOptimizationPassState GetPassStateImpl(
               << " Bridge, disabled by user. "
                  "The fallback will evaluate.";
       metrics::UpdateTfMlirBridgeFirstPhaseCounter(
-          is_supported_by_replicated_brige ? "tpu" : "cpu/gpu", "v2", true,
-          "disabled_by_user");
+          /*bridge_type*/ is_supported_by_replicated_brige
+              ? mlir::TF::kMlirPh1BridgeCounterReplicated
+              : mlir::TF::kMlirPh1BridgeCounterNonReplicated,
+          /*bridge_version*/ mlir::TF::kMlirPh1BridgeCounterV2,
+          /*device_type*/
+          is_supported_by_replicated_brige
+              ? mlir::TF::kMlirPh1BridgeCounterTpu
+              : mlir::TF::kMlirPh1BridgeCounterNonTpu,
+          /*fallback_enabled*/ true,
+          /*result*/ "disabled_by_user");
       return MlirOptimizationPassState::Disabled;
     }
     case MlirBridgeRolloutPolicy::kDisabledAfterGraphAnalysis:
       // Graph analysis only runs on TPU graph.
       VLOG(1) << "Skipping MLIR TPU Bridge, disabled because the "
                  "graph has unsupported features. The fallback will evaluate.";
-      metrics::UpdateTfMlirBridgeFirstPhaseCounter("tpu", "v2", true,
-                                                   "invalid_graph");
+      metrics::UpdateTfMlirBridgeFirstPhaseCounter(
+          /*bridge_type*/ mlir::TF::kMlirPh1BridgeCounterReplicated,
+          /*bridge_version*/ mlir::TF::kMlirPh1BridgeCounterV2,
+          /*device_type*/ mlir::TF::kMlirPh1BridgeCounterTpu,
+          /*fallback_enabled*/ true,
+          /*result*/ "invalid_graph");
       // We set `uses_uninitialized_resource_args` to false here because the
       // first phase of the bridge is not affected by uninitialized resource
       // args.
@@ -305,16 +318,24 @@ MlirOptimizationPassState MlirBridgeV1CompatPass::GetPassState(
       VLOG(1) << "Skipping MLIR Replicated Bridge V1 Compat, MLIR Replicated "
                  "bridge disabled "
                  "by user. Fallback will evaluate.";
-      metrics::UpdateTfMlirBridgeFirstPhaseCounter("tpu", "v1", true,
-                                                   "disabled_by_user");
+      metrics::UpdateTfMlirBridgeFirstPhaseCounter(
+          /*bridge_type*/ mlir::TF::kMlirPh1BridgeCounterReplicated,
+          /*bridge_version*/ mlir::TF::kMlirPh1BridgeCounterV1,
+          /*device_type*/ mlir::TF::kMlirPh1BridgeCounterTpu,
+          /*fallback_enabled*/ true,
+          /*result*/ "disabled_by_user");
       return MlirOptimizationPassState::Disabled;
     case MlirBridgeRolloutPolicy::kDisabledAfterGraphAnalysis:
       VLOG(1) << "Skipping MLIR Replicated Bridge V1 Compat, MLIR Replicated "
                  "bridge disabled "
                  "because graph has unsupported features. Old bridge will "
                  "evaluate.";
-      metrics::UpdateTfMlirBridgeFirstPhaseCounter("tpu", "v1", true,
-                                                   "invalid_graph");
+      metrics::UpdateTfMlirBridgeFirstPhaseCounter(
+          /*bridge_type*/ mlir::TF::kMlirPh1BridgeCounterReplicated,
+          /*bridge_version*/ mlir::TF::kMlirPh1BridgeCounterV1,
+          /*device_type*/ mlir::TF::kMlirPh1BridgeCounterTpu,
+          /*fallback_enabled*/ true,
+          /*result*/ "invalid_graph");
       // We set `uses_uninitialized_resource_args` to false here because the
       // first phase of the bridge is not affected by uninitialized resource
       // args.
diff --git a/tensorflow/compiler/tf2xla/mlir_xla_op_kernel.cc b/tensorflow/compiler/tf2xla/mlir_xla_op_kernel.cc
index 3c453e88c9dc10..b2e52f6d0dbda5 100644
--- a/tensorflow/compiler/tf2xla/mlir_xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/mlir_xla_op_kernel.cc
@@ -137,12 +137,24 @@ Status MlirXlaOpKernel::ConstructXlaOp(XlaOpKernelContext* ctx) {
   // Compile the graph to HLO.
   GraphDebugInfo debug_info;
   std::vector<xla::XlaOp> returns(1);
-  TF_RETURN_IF_ERROR(BuildHloFromGraph(
-      *graph, *ctx->builder(), *ctx_res->GetContext(), xla_params, returns,
-      mlir::SpanToArrayRef<XlaCompiler::Argument>(xla_args), control_rets,
-      device->device_type(),
-      *ctx->function_library()->GetFunctionLibraryDefinition(), debug_info,
-      {}));
+  auto build_hlo = [&](bool unconditionally_use_output_shapes) {
+    return BuildHloFromGraph(
+        *graph, *ctx->builder(), *ctx_res->GetContext(), xla_params, returns,
+        unconditionally_use_output_shapes,
+        mlir::SpanToArrayRef<XlaCompiler::Argument>(xla_args), control_rets,
+        device->device_type(),
+        *ctx->function_library()->GetFunctionLibraryDefinition(), debug_info,
+        {});
+  };
+
+  // Some of the operations that come through here do not know how to set their
+  // own output shapes (e.g. _XlaHostComputeMlir') so we may need to use the
+  // unconditional output shapes option. However, many graphs fail if we do it
+  // unconditionally so try both.
+  if (!build_hlo(/*unconditionally_use_output_shapes=*/false).ok()) {
+    // If that failed, then try again with the unconditional set true
+    TF_RETURN_IF_ERROR(build_hlo(/*unconditionally_use_output_shapes=*/true));
+  }
 
   // Set context outputs.
   for (int i = 0, end = returns.size(); i < end; ++i) {
diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD
index ba26c4fe54b31a..6adab4c6c7f6b4 100644
--- a/tensorflow/compiler/tf2xla/ops/BUILD
+++ b/tensorflow/compiler/tf2xla/ops/BUILD
@@ -1,10 +1,10 @@
 load("//tensorflow:strict.default.bzl", "py_strict_library")
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_custom_op_library",
     "tf_gen_op_wrapper_py",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/tf2xla/python/BUILD b/tensorflow/compiler/tf2xla/python/BUILD
index 128d865b8c63cf..50dbf000b03501 100644
--- a/tensorflow/compiler/tf2xla/python/BUILD
+++ b/tensorflow/compiler/tf2xla/python/BUILD
@@ -1,8 +1,8 @@
+load("//tensorflow:tensorflow.default.bzl", "tf_custom_op_py_library")
 load(
     "//tensorflow/core/platform:build_config.bzl",
     "tf_py_clif_cc",
 )
-load("//tensorflow:tensorflow.default.bzl", "tf_custom_op_py_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 8001c6dc47e18e..8af2c21994d4c4 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -752,71 +752,25 @@ Status XlaCompiler::CompileSingleOp(
     const XlaCompiler::CompileOptions& compile_options,
     const XlaCompiler::SingleOpCompileArgument& single_op_compile_argument,
     absl::Span<const Argument> args, XlaCompiler::CompilationResult* result) {
-  const std::vector<DataType>& result_dtypes =
-      single_op_compile_argument.output_dtypes;
   const NodeDef& node_def = single_op_compile_argument.node_def;
   TF_ASSIGN_OR_RETURN(
       auto graph,
       CreateSingleOpGraph(node_def, args,
                           single_op_compile_argument.output_dtypes));
 
-  auto compile_with_old_bridge = [&]() {
-    *result = {};
-    Status status = ADD_SOURCE_LOCATION(CompileGraph(
-        compile_options, node_def.name(), std::move(graph), args, result));
-    if (status.ok()) {
-      tensorflow::metrics::IncrementPhase2XlaCompilerCounter(
-          tensorflow::metrics::Phase2XlaCompilerMetric::
-              kCompileSingleOpXlaBuilderSuccess);
-    } else {
-      tensorflow::metrics::IncrementPhase2XlaCompilerCounter(
-          tensorflow::metrics::Phase2XlaCompilerMetric::
-              kCompileSingleOpXlaBuilderFailure);
-    }
-    return status;
-  };
-
-  const ConfigProto* config = &(single_op_compile_argument.config_proto);
-  auto bridge_rollout = GetMlirBridgeRolloutState(
-      config ? std::optional<ConfigProto>(*config) : std::nullopt);
-  if (bridge_rollout ==
-          ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_DISABLED ||
-      node_def.op() == "VarIsInitializedOp" ||
-      (bridge_rollout !=
-           ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_ENABLED &&
-       options_.device_type.type_string() != DEVICE_TPU_XLA_JIT)) {
-    return compile_with_old_bridge();
-  }
-
-  GraphDebugInfo debug_info;
-  std::vector<std::string> control_rets;
-  if (result_dtypes.empty()) {
-    control_rets.push_back(node_def.name());
-  }
-
-  bool mlir_enabled = (bridge_rollout ==
-                       ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_ENABLED);
-  VLOG(1) << "Attempting MLIR bridge."
-          << (mlir_enabled ? " MLIR is explicitly enabled." : "");
-  auto mlir_result = CompileGraphToXlaHlo(
-      *graph, mlir::SpanToArrayRef<XlaCompiler::Argument>(args), control_rets,
-      options_.device_type.type_string(), compile_options.use_tuple_arg,
-      /*analyse_graph=*/!mlir_enabled, *options_.flib_def, debug_info,
-      options_.shape_determination_fns, result);
-
-  if (mlir_result.ok() || mlir_enabled) {
+  *result = {};
+  Status status = ADD_SOURCE_LOCATION(CompileGraph(
+      compile_options, node_def.name(), std::move(graph), args, result));
+  if (status.ok()) {
+    tensorflow::metrics::IncrementPhase2XlaCompilerCounter(
+        tensorflow::metrics::Phase2XlaCompilerMetric::
+            kCompileSingleOpXlaBuilderSuccess);
+  } else {
     tensorflow::metrics::IncrementPhase2XlaCompilerCounter(
         tensorflow::metrics::Phase2XlaCompilerMetric::
-            kCompileSingleOpMlirSuccess);
-    return mlir_result;
+            kCompileSingleOpXlaBuilderFailure);
   }
-  tensorflow::metrics::IncrementPhase2XlaCompilerCounter(
-      tensorflow::metrics::Phase2XlaCompilerMetric::
-          kCompileSingleOpMlirFailure);
-  VLOG(1) << "Failed second phase of the MLIR bridge. Will "
-             "retry with the old bridge. MLIR bridge compilation status: "
-          << mlir_result;
-  return compile_with_old_bridge();
+  return status;
 }
 
 Status XlaCompiler::CompileFunction(
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
index 784c012a0274bc..754d018cc5781c 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "absl/types/span.h"
 #include "tensorflow/compiler/tf2xla/tf2xla.h"
 #include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
 #include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h"
@@ -50,6 +51,18 @@ absl::StatusOr<size_t> ComputeResultIndex(
   return result_slice.index();
 }
 
+// Returns the number of results.
+int CountResults(
+    absl::Span<const xla::cpu_function_runtime::BufferInfo> buffer_infos) {
+  int num_results = 0;
+  for (const auto& info : buffer_infos) {
+    if (info.is_result_parameter()) {
+      ++num_results;
+    }
+  }
+  return num_results;
+}
+
 // Collect names from `entries`, where T is one of
 // tf2xla::{Feed,Fetch,Variable}. We hold the actual strings in nonempty_names,
 // and hold arrays of pointers in name_ptrs, terminated by a nullptr entry.
@@ -146,6 +159,7 @@ XlaJitCompiledCpuFunction::Compile(
       xla::cpu::CreateArgIndexTableFromBufferInfos(buffer_infos);
   TF_ASSIGN_OR_RETURN(size_t result_index,
                       ComputeResultIndex(buffer_assignment));
+  const int num_results = CountResults(buffer_infos);
 
   std::unique_ptr<XlaJitCompiledCpuFunction> jit_unique_ptr(
       new XlaJitCompiledCpuFunction);
@@ -173,6 +187,8 @@ XlaJitCompiledCpuFunction::Compile(
       &jit->static_data_, jit->arg_index_table_.size());
   XlaCompiledCpuFunction::set_static_data_num_variables(&jit->static_data_,
                                                         config.variable_size());
+  XlaCompiledCpuFunction::set_static_data_num_results(&jit->static_data_,
+                                                      num_results);
   XlaCompiledCpuFunction::set_static_data_result_index(&jit->static_data_,
                                                        result_index);
   // Optional metadata is collected and set below.
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc
index 399826ac12ed55..787d67674a2c8e 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc
@@ -176,6 +176,8 @@ TEST(XlaJitCompiledCpuFunction, Sum) {
       XlaJitCompiledCpuFunction::Compile(graph_def, config,
                                          xla::ExecutableBuildOptions()));
   XlaCompiledCpuFunction function(jit->StaticData());
+  ASSERT_EQ(function.num_args(), 2);
+  ASSERT_EQ(function.num_results(), 1);
 
   // Run the function and check results.
   *static_cast<int32*>(function.arg_data(0)) = 10;
@@ -258,6 +260,8 @@ TEST(XlaJitCompiledCpuFunction, SumVariable) {
       XlaJitCompiledCpuFunction::Compile(graph_def, config,
                                          xla::ExecutableBuildOptions()));
   XlaCompiledCpuFunction function(jit->StaticData());
+  ASSERT_EQ(function.num_args(), 2);
+  ASSERT_EQ(function.num_results(), 2);
 
   // Run the function and check results.
   *static_cast<int32*>(function.arg_data(0)) = 10;
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index de77413927f52e..50a5319db026b2 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -63,6 +63,11 @@
 
 # Placeholder: load py_proto_library
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt")
+load(
+    "@local_xla//xla/tsl/mkl:build_defs.bzl",
+    "if_mkl",
+)
 load(
     "//tensorflow:tensorflow.bzl",
     "if_android",
@@ -83,10 +88,6 @@ load(
     "transitive_hdrs",
 )
 load("//tensorflow:tensorflow.default.bzl", "cc_header_only_library", "filegroup", "get_compatible_with_portable", "tensorflow_opensource_extra_deps", "tf_monitoring_framework_deps", "tf_selective_registration_deps")
-load(
-    "@local_xla//xla/tsl/mkl:build_defs.bzl",
-    "if_mkl",
-)
 
 # For platform specific build config
 load(
@@ -117,7 +118,6 @@ load(
     "//tensorflow/core/platform:rules_cc.bzl",
     "cc_library",
 )
-load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -1035,7 +1035,6 @@ cc_library(
         "//tensorflow/core:mobile_additional_lib_deps",
         "//tensorflow/core/platform:resource",
         "//tensorflow/core/util:stats_calculator_portable",
-        "@local_xla//xla:bazel_issue_21519",
     ] + tf_portable_proto_lib() + tf_portable_deps_no_runtime(),
     alwayslink = 1,
 )
@@ -1717,8 +1716,8 @@ tf_cuda_library(
         "@local_tsl//tsl/framework:cancellation",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:stringpiece",
-        "@local_tsl//tsl/util:command_line_flags",
-        "@local_tsl//tsl/util:device_name_utils",
+        "@local_xla//xla/tsl/util:command_line_flags",
+        "@local_xla//xla/tsl/util:device_name_utils",
     ] + if_cuda([
         "@local_config_cuda//cuda:cudnn_header",
     ]) + if_static(
diff --git a/tensorflow/core/activity_watcher/BUILD b/tensorflow/core/activity_watcher/BUILD
index 0526bccd5a1672..d471f4f892c4fc 100644
--- a/tensorflow/core/activity_watcher/BUILD
+++ b/tensorflow/core/activity_watcher/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.bzl", "if_not_mobile")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/api_def/BUILD b/tensorflow/core/api_def/BUILD
index 8d96a2cbaff5f8..76b8cc01324619 100644
--- a/tensorflow/core/api_def/BUILD
+++ b/tensorflow/core/api_def/BUILD
@@ -7,19 +7,19 @@
 #   :java_api_def
 
 load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_cc_binary",
-    "tf_cc_test",
+    "@local_config_tensorrt//:build_defs.bzl",
+    "if_tensorrt",
 )
 load(
     "@local_xla//xla/tsl/mkl:build_defs.bzl",
     "if_mkl",
 )
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
-    "@local_config_tensorrt//:build_defs.bzl",
-    "if_tensorrt",
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_binary",
+    "tf_cc_test",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/api_def/base_api/api_def_ComputeDedupDataSizeV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ComputeDedupDataSizeV2.pbtxt
new file mode 100644
index 00000000000000..f8066663fa20c3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ComputeDedupDataSizeV2.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "ComputeDedupDataSizeV2"
+  visibility: HIDDEN
+  out_arg {
+    name: "num_elements"
+    description: <<END
+The size of the deduplicated data from infeed.
+END
+  }
+  attr {
+    name: "config"
+    description: <<END
+Serialized TPUEmbeddingConfiguration proto.
+END
+  }
+  attr {
+    name: "embedding_partitions"
+    description: <<END
+Serialized EmbeddingPartitionsProto proto.
+END
+  }
+  attr {
+    name: "hbm_buffers_config"
+    description: <<END
+Serialized HbmBuffersConfig proto.
+END
+  }
+  attr {
+    name: "tpu_topology"
+    description: <<END
+Serialized TpuTopologyArgsProto proto.
+END
+  }
+  summary: "An op computes the size of the deduplication data from embedding core and returns the updated config."
+  description: <<END
+This op is to compute size of the deduplication data so to provide this
+information to the op that computes the tuple mask of deduplication data can
+have static output shape.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ComputeDedupDataTupleMaskV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ComputeDedupDataTupleMaskV2.pbtxt
new file mode 100644
index 00000000000000..a8c9e86ca16c82
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ComputeDedupDataTupleMaskV2.pbtxt
@@ -0,0 +1,45 @@
+op {
+  graph_op_name: "ComputeDedupDataTupleMaskV2"
+  visibility: HIDDEN
+  out_arg {
+    name: "output_shape"
+    description: <<END
+A 2-D int tensor represent mask of deduplication data tuple generated by
+`XlaRecvTPUEmbeddingDeduplicationData`. The tuple has several integer and float
+type 1-D tensor tuple elements. The first dimenion of this output_shape 2-D
+tensor is tensor type of tuple elements, `0` represents integer tensor, `1`
+represents float tensor. The second dimension of `output_shape` gives length of
+each tuple element.
+END
+  }
+  attr {
+    name: "config"
+    description: <<END
+Serialized TPUEmbeddingConfiguration proto.
+END
+  }
+  attr {
+    name: "embedding_partitions"
+    description: <<END
+Serialized EmbeddingPartitionsProto proto.
+END
+  }
+  attr {
+    name: "hbm_buffers_config"
+    description: <<END
+Serialized HbmBuffersConfig proto.
+END
+  }
+  attr {
+    name: "tpu_topology"
+    description: <<END
+Serialized TpuTopologyArgsProto proto.
+END
+  }
+  summary: "An op computes tuple mask of deduplication data from embedding core."
+  description: <<END
+The deduplication data receiving from embedding core is a Tensor with
+type=DT_VARIANT. The tensor itself is an XLA nested tuple, whose elements are
+rank 1 tensors. This op is to represents types and length of these elements.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FinalizeTPUEmbeddingV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_FinalizeTPUEmbeddingV2.pbtxt
new file mode 100644
index 00000000000000..825e07973a4728
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FinalizeTPUEmbeddingV2.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "FinalizeTPUEmbeddingV2"
+  in_arg {
+    name: "common_config"
+    description: <<END
+A string-encoded common configuration proto containing metadata
+about the TPUEmbedding partitioner output and the HBM size (in bytes) required
+for operation.
+END
+  }
+  in_arg {
+    name: "memory_config"
+    description: <<END
+A string-encoded memory config proto containing metadata about
+the memory allocations reserved for TPUEmbedding.
+END
+  }
+  out_arg {
+    name: "embedding_partitions"
+    description: <<END
+A string-encoded embedding partitions proto describing how embedding tables are
+partitioned along their feature and ID.
+END
+  }
+  out_arg {
+    name: "hbm_buffers_config"
+    description: <<END
+A string-encoded HBM buffers config proto specifies where HBM buffers are
+located.
+END
+  }
+  summary: "An op that finalizes the TPUEmbedding configuration."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_XlaRecvTPUEmbeddingActivationsV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_XlaRecvTPUEmbeddingActivationsV2.pbtxt
new file mode 100644
index 00000000000000..fe7d4d629e541f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_XlaRecvTPUEmbeddingActivationsV2.pbtxt
@@ -0,0 +1,61 @@
+op {
+  graph_op_name: "XlaRecvTPUEmbeddingActivationsV2"
+  visibility: HIDDEN
+  in_arg {
+    name: "deduplication_data"
+    description: <<END
+A Tensor with type=DT_VARIANT containing the deduplication
+data. The tensor is an XLA nested tuple containing N elements (where N is
+the ratio of the number of embedding to tensor cores per TPU chip). Each
+element of the nested tuple is a tuple of rank 1 tensors. Each tensor either
+contains indices (DT_UINT32) for embedding lookup on the TensorCore or
+weights (DT_FLOAT) to apply to the output of the embedding lookup operation.
+END
+  }
+  out_arg {
+    name: "outputs"
+    description: <<END
+A TensorList of embedding activations containing one Tensor per
+embedding table in the model.
+END
+  }
+  attr {
+    name: "num_tables"
+    description: <<END
+The number of output activation tensors. If feature descriptor is
+present in the tpu embedding config, it is equal to the number of features
+otherwise equal to number of embedding tables in the model.
+END
+  }
+  attr {
+    name: "config"
+    description: <<END
+Serialized TPUEmbeddingConfiguration proto.
+END
+  }
+  attr {
+    name: "embedding_partitions"
+    description: <<END
+Serialized EmbeddingPartitionsProto proto.
+END
+  }
+  attr {
+    name: "hbm_buffers_config"
+    description: <<END
+Serialized HbmBuffersConfig proto.
+END
+  }
+  attr {
+    name: "tpu_topology"
+    description: <<END
+Serialized TpuTopologyArgsProto proto.
+END
+  }
+  summary: "An op that receives embedding activations on the TPU."
+  description: <<END
+The TPU system performs the embedding lookups and aggregations. The results of
+these aggregations are visible to the Tensorflow Graph as the outputs of a
+XlaRecvTPUEmbeddingActivations Op. This op returns a list containing one
+Tensor of activations per table specified in the model.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_XlaRecvTPUEmbeddingDeduplicationDataV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_XlaRecvTPUEmbeddingDeduplicationDataV2.pbtxt
new file mode 100644
index 00000000000000..00c4a44a0af01f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_XlaRecvTPUEmbeddingDeduplicationDataV2.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "XlaRecvTPUEmbeddingDeduplicationDataV2"
+  visibility: HIDDEN
+  attr {
+    name: "config"
+    description: <<END
+Serialized TPUEmbeddingConfiguration proto.
+END
+  }
+  attr {
+    name: "embedding_partitions"
+    description: <<END
+Serialized EmbeddingPartitionsProto proto.
+END
+  }
+  attr {
+    name: "hbm_buffers_config"
+    description: <<END
+Serialized HbmBuffersConfig proto.
+END
+  }
+  attr {
+    name: "tpu_topology"
+    description: <<END
+Serialized TpuTopologyArgsProto proto.
+END
+  }
+  summary: "Receives deduplication data (indices and weights) from the embedding core."
+  description: <<END
+The deduplication data is a Tensor with type=DT_VARIANT. The tensor itself is an
+XLA nested tuple containing N elements (where N is the ratio of the number of
+embedding to tensor cores per TPU chip). Each element of the nested tuple is a
+tuple of rank 1 tensors. Each tensor either contains indices (DT_UINT32) for
+embedding lookup on the TensorCore or weights (DT_FLOAT) to apply to the output
+of the embedding lookup operation.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_XlaSendTPUEmbeddingGradientsV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_XlaSendTPUEmbeddingGradientsV2.pbtxt
new file mode 100644
index 00000000000000..dea9b98e80ab5a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_XlaSendTPUEmbeddingGradientsV2.pbtxt
@@ -0,0 +1,74 @@
+op {
+  graph_op_name: "XlaSendTPUEmbeddingGradientsV2"
+  visibility: HIDDEN
+  in_arg {
+    name: "gradients"
+    description: <<END
+A TensorList of gradients with which to update embedding tables.
+END
+  }
+  in_arg {
+    name: "learning_rates"
+    description: <<END
+A TensorList of learning rates used for updating the embedding
+tables via the optimizer. The length of the TensorList must be equal to the
+number of dynamic learning rate tags specified in the
+TPUEmbeddingConfiguration proto.
+END
+  }
+  in_arg {
+    name: "deduplication_data"
+    description: <<END
+A Tensor with type=DT_VARIANT containing the deduplication
+data. The tensor is an XLA nested tuple containing N elements (where N is
+the ratio of the number of embedding to tensor cores per TPU chip). Each
+element of the nested tuple is a tuple of rank 1 tensors. Each tensor either
+contains indices (DT_UINT32) for embedding lookup on the TensorCore or
+weights (DT_FLOAT) to apply to the output of the embedding lookup operation.
+END
+  }
+  attr {
+    name: "NumTables"
+    description: <<END
+number of tables
+END
+  }
+  attr {
+    name: "NumLearningRateTags"
+    description: <<END
+number of learning rate tags
+END
+  }
+  attr {
+    name: "config"
+    description: <<END
+Serialized TPUEmbeddingConfiguration proto.
+END
+  }
+  attr {
+    name: "embedding_partitions"
+    description: <<END
+Serialized EmbeddingPartitionsProto proto.
+END
+  }
+  attr {
+    name: "hbm_buffers_config"
+    description: <<END
+Serialized HbmBuffersConfig proto.
+END
+  }
+  attr {
+    name: "tpu_topology"
+    description: <<END
+Serialized TpuTopologyArgsProto proto.
+END
+  }
+  summary: "An op that performs gradient updates of embedding tables."
+  description: <<END
+The gradients argument is a TensorList having the same length and shapes as the
+return value of XlaRecvTPUEmbeddingActivations, but contains gradients of the
+model's loss with respect to the embedding activations. The embedding tables are
+updated from these gradients via the optimizer specified in the
+TPUEmbeddingConfiguration proto given to tpu.initialize_system.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_XlaSplitND.pbtxt b/tensorflow/core/api_def/base_api/api_def_XlaSplitND.pbtxt
index c7deacb4fec21d..a31cfa5c4d85ec 100644
--- a/tensorflow/core/api_def/base_api/api_def_XlaSplitND.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_XlaSplitND.pbtxt
@@ -5,6 +5,7 @@ op {
     name: "input"
     description: <<END
 Input tensor to split across all dimensions.
+END
   }
   out_arg {
     name: "outputs"
diff --git a/tensorflow/core/common_runtime/BUILD b/tensorflow/core/common_runtime/BUILD
index 676d2b55b76694..1f358ae1f28c29 100644
--- a/tensorflow/core/common_runtime/BUILD
+++ b/tensorflow/core/common_runtime/BUILD
@@ -1,4 +1,9 @@
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load(
+    "@local_xla//xla/tsl/mkl:build_defs.bzl",
+    "if_mkl",
+    "if_mkl_ml",
+)
 load(
     "//tensorflow:tensorflow.bzl",
     "if_libtpu",
@@ -13,11 +18,6 @@ load(
     "tf_openmp_lopts",
 )
 load("//tensorflow:tensorflow.default.bzl", "cc_header_only_library", "filegroup", "tf_cuda_cc_test")
-load(
-    "@local_xla//xla/tsl/mkl:build_defs.bzl",
-    "if_mkl",
-    "if_mkl_ml",
-)
 
 # For platform specific build config
 load(
@@ -2910,6 +2910,7 @@ tf_cc_test(
         "//tensorflow/core/kernels:queue_ops",
         "//tensorflow/core/kernels:session_ops",
         "//tensorflow/core/kernels:variable_ops",
+        "@local_tsl//tsl/platform:protobuf",
     ],
 )
 
diff --git a/tensorflow/core/common_runtime/colocate_predecessor_trees_pass.cc b/tensorflow/core/common_runtime/colocate_predecessor_trees_pass.cc
index 2b2debbd90458a..31a101a421a284 100644
--- a/tensorflow/core/common_runtime/colocate_predecessor_trees_pass.cc
+++ b/tensorflow/core/common_runtime/colocate_predecessor_trees_pass.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
+#include "xla/tsl/util/device_name_utils.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -36,7 +37,6 @@ limitations under the License.
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/dump_graph.h"
-#include "tsl/util/device_name_utils.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/core/common_runtime/cost_constants.h b/tensorflow/core/common_runtime/cost_constants.h
index d580a95370bfd7..a19c567dc0a2f4 100644
--- a/tensorflow/core/common_runtime/cost_constants.h
+++ b/tensorflow/core/common_runtime/cost_constants.h
@@ -30,12 +30,15 @@ inline constexpr char kNoOpCostName[] = "no_op";
 // '_with_smear" includes this part.
 inline constexpr char kWithSmearSuffix[] = "_with_smear";
 inline constexpr char kNoSmearSuffix[] = "_no_smear";
+inline constexpr char kNonBatchingSuffix[] = "_non_batching";
 
 // Full names of per-request cost.
 inline constexpr char kTpuWithSmearCostName[] = "tpu_with_smear";
 inline constexpr char kTpuNoSmearCostName[] = "tpu_no_smear";
+inline constexpr char kTpuNonBatchingCostName[] = "tpu_non_batching";
 inline constexpr char kGcuWithSmearCostName[] = "gcu_with_smear";
 inline constexpr char kGcuNoSmearCostName[] = "gcu_no_smear";
+inline constexpr char kGcuNonBatchingCostName[] = "gcu_non_batching";
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index 68cba285ec4a8a..7ba65fe96edd22 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -56,6 +56,7 @@ limitations under the License.
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/util/device_name_utils.h"
+#include "tsl/platform/protobuf.h"
 
 #if GOOGLE_CUDA
 #include "third_party/gpus/cuda/include/cuda.h"
@@ -1165,7 +1166,8 @@ class SessionMetadataReaderOp : public OpKernel {
     OP_REQUIRES_OK(ctx,
                    ctx->allocate_output("y", TensorShape({}), &out_tensor));
     if (ctx->session_metadata() != nullptr) {
-      out_tensor->scalar<tstring>()() = ctx->session_metadata()->DebugString();
+      out_tensor->scalar<tstring>()() =
+          tsl::LegacyUnredactedDebugString(*ctx->session_metadata());
     } else {
       out_tensor->scalar<tstring>()() = "";
     }
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index b2af36ec848ce8..cb4d241e639f44 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -1,5 +1,13 @@
+load("@bazel_skylib//lib:selects.bzl", "selects")
+load(
+    "@local_xla//xla/tsl/mkl:build_defs.bzl",
+    "if_mkl",
+    "mkl_deps",
+)
 load(
     "//tensorflow:tensorflow.bzl",
+    "clean_dep",
+    "if_google",
     "if_zendnn",
     "tf_cc_test",
     "tf_cc_test_mkl",
@@ -8,11 +16,6 @@ load(
     "tf_mkl_kernel_library",
 )
 load("//tensorflow:tensorflow.default.bzl", "filegroup")
-load(
-    "@local_xla//xla/tsl/mkl:build_defs.bzl",
-    "if_mkl",
-    "mkl_deps",
-)
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
@@ -121,7 +124,7 @@ tf_cuda_library(
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/util:env_var",
+        "@local_xla//xla/tsl/util:env_var",
     ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:portable_tensorflow_lib_lite",
@@ -234,12 +237,50 @@ tf_cuda_library(
             "//tensorflow/core/distributed_runtime/eager:cluster_function_library_runtime",
             "//tensorflow/core/distributed_runtime/eager:eager_client",
             "//tensorflow/core/distributed_runtime/eager:remote_mgr",
+            "//tensorflow/core/tfrt/common:global_state",
+            "//tensorflow/core/tfrt/common:pjrt_state",
+            "//tensorflow/core/tfrt/common:pjrt_util",
+            "@com_google_absl//absl/base:core_headers",
+            "@com_google_absl//absl/container:flat_hash_map",
+            "@com_google_absl//absl/log",
+            "@com_google_absl//absl/status",
+            "@com_google_absl//absl/status:statusor",
+            "@com_google_absl//absl/strings",
+            "@com_google_absl//absl/synchronization",
+            "@com_google_absl//absl/time",
             "@local_tsl//tsl/distributed_runtime/coordination:coordination_service",
             "@local_tsl//tsl/distributed_runtime/coordination:coordination_service_agent",
             "@local_tsl//tsl/distributed_runtime/coordination:coordination_service_impl",
             "@local_tsl//tsl/distributed_runtime/preemption:preemption_notifier",
+            "@local_tsl//tsl/platform:mutex",
+            "@local_tsl//tsl/platform:statusor",
+            "@local_xla//xla/pjrt:pjrt_stream_executor_client",
+            "@local_xla//xla/pjrt/gpu:se_gpu_pjrt_client",
         ],
-    }),
+    }) + if_google(
+        # TODO(b/282068262): PJRT pulls in TFRT components that are incompatible with ARM platform.
+        # Clean up so that PJRT can run on ARM (and remove "#if defined(PLATFORM_GOOGLE) ..." use
+        # from gpu_util.cc).
+        # Also it won't build with WeightWatcher which tracks OSS build binaries.
+        # TODO(b/290533709): Clean up this build rule.
+        selects.with_or({
+            clean_dep("//tensorflow:linux_x86_64_with_weightwatcher"): [],
+            (
+                clean_dep("//tensorflow:linux_x86_64"),
+                clean_dep("//tensorflow:haswell"),
+            ): [
+                "//tensorflow/core",
+                "//tensorflow/core/framework:resource_base",
+                "@local_xla//xla/pjrt/distributed:key_value_store_interface",
+                "@local_xla//xla/pjrt:local_device_state",
+                "@local_xla//xla/pjrt:pjrt_client",
+                "@local_xla//xla/pjrt:pjrt_compiler",
+                "@local_xla//xla/service/gpu:gpu_executable_run_options",
+                "//tensorflow/core/common_runtime/gpu:gpu_runtime_impl",
+            ],
+            "//conditions:default": [],
+        }),
+    ),
 )
 
 tf_cc_test(
@@ -597,10 +638,12 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
+        "@local_xla//xla/tsl/util:env_var",
     ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:portable_tensorflow_lib_lite",
@@ -765,10 +808,12 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
+        "@local_xla//xla/tsl/util:env_var",
     ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:portable_tensorflow_lib_lite",
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index 594ea4bf045388..0677e45b4c83a6 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -46,6 +46,7 @@ limitations under the License.
 
 #include "tensorflow/c/tf_tensor.h"
 #include "tensorflow/c/tf_tensor_internal.h"
+#include "xla/tsl/util/env_var.h"
 #include "tensorflow/core/common_runtime/collective_executor_mgr.h"
 #include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
 #include "tensorflow/core/common_runtime/colocation_graph.h"
@@ -63,7 +64,6 @@ limitations under the License.
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tsl/platform/refcount.h"
 #include "tsl/platform/statusor.h"
-#include "tsl/util/env_var.h"
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/distributed_runtime/cluster_function_library_runtime.h"
 #include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
diff --git a/tensorflow/core/common_runtime/eager/context_distributed_manager.cc b/tensorflow/core/common_runtime/eager/context_distributed_manager.cc
index 3393e08481fb72..3abc2605144133 100644
--- a/tensorflow/core/common_runtime/eager/context_distributed_manager.cc
+++ b/tensorflow/core/common_runtime/eager/context_distributed_manager.cc
@@ -18,14 +18,17 @@ limitations under the License.
 #include <algorithm>
 #include <cstdint>
 #include <iterator>
+#include <map>
 #include <memory>
 #include <numeric>
 #include <string>
+#include <string_view>
 #include <unordered_set>
 #include <utility>
 #include <vector>
 
 #include "google/protobuf/any.pb.h"
+#include "absl/time/time.h"
 #include "tensorflow/core/common_runtime/copy_tensor.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -48,9 +51,19 @@ limitations under the License.
 #include "tensorflow/core/protobuf/device_filters.pb.h"
 #include "tensorflow/core/protobuf/error_codes.pb.h"
 #include "tensorflow/core/util/device_name_utils.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/macros.h"
+#include "tsl/platform/statusor.h"
 #include "tsl/protobuf/coordination_config.pb.h"
 
 #if !defined(IS_MOBILE_PLATFORM)
+#include "absl/base/thread_annotations.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
+#include "absl/synchronization/mutex.h"
 #include "tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h"
 #include "tensorflow/core/distributed_runtime/eager/eager_client.h"
 #include "tensorflow/core/distributed_runtime/eager/remote_mgr.h"
@@ -63,23 +76,315 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
 #endif  // !IS_MOBILE_PLATFORM
 
+#if (defined(PLATFORM_GOOGLE) && defined(TF_PLATFORM_LINUX_X86_64))
+#define TF_GPU_USE_PJRT
+#include "xla/pjrt/distributed/key_value_store_interface.h"
+#include "xla/pjrt/gpu/se_gpu_pjrt_client.h"
+#include "xla/pjrt/local_device_state.h"
+#include "xla/pjrt/pjrt_compiler.h"
+#include "xla/pjrt/pjrt_stream_executor_client.h"
+#include "xla/service/gpu/gpu_executable_run_options.h"
+#include "tensorflow/core/framework/resource_base.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/tfrt/common/global_state.h"
+#include "tensorflow/core/tfrt/common/pjrt_state.h"
+#include "tensorflow/core/tfrt/common/pjrt_util.h"
+#endif
+
 namespace tensorflow {
 
+#if !defined(IS_MOBILE_PLATFORM)
+namespace {
+
 // We don't use the TF_RETURN_IF_ERROR macro directly since that destroys the
 // server object (which currently CHECK-fails) and we miss the error, instead,
 // we log the error, and then return to allow the user to see the error
 // message.
-#define LOG_AND_RETURN_IF_ERROR(...)                  \
-  do {                                                \
-    const tensorflow::Status _status = (__VA_ARGS__); \
-    if (TF_PREDICT_FALSE(!_status.ok())) {            \
-      LOG(ERROR) << _status.message();                \
-      return _status;                                 \
-    }                                                 \
+#define LOG_AND_RETURN_IF_ERROR(...)            \
+  do {                                          \
+    const absl::Status _status = (__VA_ARGS__); \
+    if (TF_PREDICT_FALSE(!_status.ok())) {      \
+      LOG(ERROR) << _status.message();          \
+      return _status;                           \
+    }                                           \
   } while (0);
 
-#if !defined(IS_MOBILE_PLATFORM)
-namespace {
+#ifdef TF_GPU_USE_PJRT
+// Provide a KeyValue interface to the coordination service agent for use by
+// BuildDistributedDevices.
+class XlaKeyValueStore : public xla::KeyValueStoreInterface {
+ public:
+  explicit XlaKeyValueStore(
+      tsl::CoordinationServiceAgent* coordination_service_agent,
+      std::string key_prefix = "gpu:")
+      : coordination_service_agent_(coordination_service_agent),
+        key_prefix_(key_prefix) {}
+  absl::StatusOr<std::string> Get(std::string_view key,
+                                  absl::Duration timeout) override {
+    return coordination_service_agent_->GetKeyValue(
+        absl::StrCat(key_prefix_, key), timeout);
+  }
+
+  absl::Status Set(std::string_view key, std::string_view value) override {
+    return coordination_service_agent_->InsertKeyValue(
+        absl::StrCat(key_prefix_, key), value);
+  };
+
+ private:
+  tsl::CoordinationServiceAgent* coordination_service_agent_;
+  std::string key_prefix_;
+};
+
+// Remove LocalDeviceState objects from
+// info->local_device_states that have unique hardware IDs
+// (i.e. ignore duplicate virtual devices) and return them in a map.
+static std::map<int, std::unique_ptr<xla::LocalDeviceState>>
+GetUniqueDeviceStates(PjRtGpuClientCreationInfo* info) {
+  // Only consider each hardware device once. In test environments, one
+  // physical GPU (e.g. hardware_id 0) might be shared as virtual GPUs (e.g.
+  // local_id 0 and 1) by multiple workers (multiple processes on the same
+  // computer). If there is a need to not ignore these for an actual case, a
+  // possible solution is to add a flag to only enable the use of
+  // hardware_id_to_local_id for tests.
+
+  auto input_states = std::move(info->local_device_states);
+
+  absl::flat_hash_map<int, int> hardware_id_to_local_id;
+  for (const auto& id_state : input_states) {
+    int local_id = id_state.second->local_device_id().value();
+    int hardware_id = id_state.second->local_hardware_id().value();
+    if (hardware_id_to_local_id.contains(hardware_id)) {
+      if (hardware_id_to_local_id[hardware_id] > local_id) {
+        // Use the device with the smallest local_id, ignore others.
+        hardware_id_to_local_id[hardware_id] = local_id;
+      }
+    } else {
+      hardware_id_to_local_id[hardware_id] = local_id;
+    }
+  }
+  std::map<int, std::unique_ptr<xla::LocalDeviceState>> local_device_states;
+  for (auto& id_state : input_states) {
+    int local_id = id_state.second->local_device_id().value();
+    int hardware_id = id_state.second->local_hardware_id().value();
+    if (hardware_id_to_local_id[hardware_id] != local_id) {
+      VLOG(1) << "For hardware_id=" << hardware_id
+              << ", ignoring redundant local_id=" << local_id
+              << ". local_id=" << hardware_id_to_local_id[hardware_id]
+              << " will be used instead.";
+      continue;
+    }
+    local_device_states.emplace(id_state.first, std::move(id_state.second));
+  }
+  return local_device_states;
+}
+
+// Coordinate creation of a PjRt GPU client with distributed devices when there
+// are multiple threads (which typically occurs in test environments that use
+// multiple threads to simulate multiple workers).
+class ClientCreationState : public ResourceBase {
+ public:
+  static ClientCreationState* Create() { return new ClientCreationState(); }
+
+  // Returns true for the thread that should create the PjRt GPU client,
+  // which is first thread to call this method.
+  // Each thread should only call this once (i.e. it will return
+  // false after the first call). This modifies internal state (i.e. the first
+  // call clears `first_task_`).
+  bool FirstThread() ABSL_LOCKS_EXCLUDED(mu_) {
+    absl::MutexLock lock(&mu_);
+    if (first_task_) {
+      first_task_ = false;
+      return true;
+    }
+    return false;
+  }
+
+  // The first task signals other tasks that it is ready to run
+  // BuildDistributedDevices.
+  void SetReady() { ready_notification_.Notify(); }
+
+  // The first task signals other tasks that either it has finished either after
+  // creating a client or an error occurred.
+  void SetDone() { done_notification_.Notify(); }
+
+  // The first task signals other tasks that an error occurred.
+  // done_notification_ is notified first and then ready_notification_ is
+  // notified second so any task waiting for ready_notification_ will have the
+  // proper value for done_notification_ before it unblocks to use for
+  // determining if an error occurred.
+  void SetErrorBeforeReady() {
+    done_notification_.Notify();
+    ready_notification_.Notify();
+  }
+
+  // Wait for the first task to notify that it is either ready to run
+  // BuildDistributedDevices and return true or there is an error and return
+  // false.
+  bool WaitForReadyOrError() {
+    ready_notification_.WaitForNotification();
+    return !done_notification_.HasBeenNotified();
+  }
+
+  // Wait for first task to notify that it created a client or there is an
+  // error.
+  void WaitForDone() { done_notification_.WaitForNotification(); };
+
+  std::string DebugString() const override { return "ClientCreationState"; }
+
+ private:
+  absl::Mutex mu_;
+
+  // Only the task with `first_task_ == true` (the first task) creates
+  // the PjRt GPU client.
+  bool first_task_ ABSL_GUARDED_BY(mu_) = true;
+
+  // The first task notifies any other tasks just before it is ready to run
+  // BuildDistributedDevices or if there is an error. It must notify in every
+  // codepath in the first thread, esp. every early return for errors, etc.,
+  // i.e. an error might need to notify both ready_notification_ and
+  // done_notification_.
+  Notification ready_notification_;
+
+  // The first task notifies after the PjRT GPU client is created or if
+  // there is an error. It must notify every codepath in the first
+  // thread, esp. every early return for errors, etc., i.e. an error might need
+  // to notify both ready_notification_ and done_notification_.
+  Notification done_notification_;
+};
+
+absl::StatusOr<ClientCreationState*> GetOrCreateClientCreationState() {
+  ResourceMgr* rmgr = tfrt_global::GetTFGlobalResourceMgr();
+  ClientCreationState* client_creation_state;
+  TF_RETURN_IF_ERROR(rmgr->LookupOrCreate<ClientCreationState>(
+      rmgr->default_container(), kPjRtStateResourceName, &client_creation_state,
+      [&](ClientCreationState** ret) {
+        *ret = ClientCreationState::Create();
+        return absl::OkStatus();
+      }));
+  core::ScopedUnref client_creation_state_ref(client_creation_state);
+  return client_creation_state;
+}
+
+// PjRt GPU specific code for creating a PjRt GPU client that knows about
+// remote devices. If run by multiple threads (which is done by tests using
+// threads to simulate multiple workers), all threads run
+// BuildDistributedDevices but only one thread creates the client.
+// Synchronization assures that if the task designated to create the client
+// exits early without running BuildDistributedDevices, all tasks exit early.
+// For the usual case where a client is created, synchonization assures that the
+// tasks that do not create the client wait for client creation before exiting.
+absl::Status CreateClientOnce(
+    int node_id, int num_nodes,
+    tsl::CoordinationServiceAgent* coordination_service_agent) {
+  TF_ASSIGN_OR_RETURN(auto creation_state, GetOrCreateClientCreationState());
+  bool use_creation_info = creation_state->FirstThread();
+
+  PjRtGpuClientCreationInfo* info = nullptr;
+  if (use_creation_info) {
+    auto obtained_info = GetPjRtGpuClientCreationInfo();
+    if (!obtained_info.ok()) {
+      creation_state->SetErrorBeforeReady();
+      return obtained_info.status();
+    }
+    info = obtained_info.value();
+    if (info == nullptr) {
+      VLOG(2) << "No PjRtGpuClientCreationInfo in CreateClientOnce";
+      creation_state->SetErrorBeforeReady();
+      return absl::OkStatus();
+    }
+    VLOG(2) << "Creating PjRtGpuClientCreationInfo in CreateClientOnce.";
+  } else {
+    LOG(INFO)
+        << "Skipping using GetPjRtGpuClientCreationInfo in CreateClientOnce "
+           "since it has already been used. This is expected in tests that use "
+           "multiple threads to simulate multiple workers. If this occurs in "
+           "production and op execution on GPU fails, this could be related.";
+    if (!creation_state->WaitForReadyOrError()) {
+      LOG(INFO) << "In CreateClientOnce, first thread exited early, causing "
+                   "this thread to exit.";
+      return absl::OkStatus();
+    }
+  }
+
+  std::vector<std::unique_ptr<xla::PjRtStreamExecutorDevice>> pjrt_devices;
+  auto gpu_run_options = std::make_unique<xla::gpu::GpuExecutableRunOptions>();
+#if TENSORFLOW_USE_ROCM
+  auto platform_name = xla::RocmName();
+#elif TENSORFLOW_USE_SYCL
+  auto pjrt_platform_name = xla::SyclName();
+#else   // TENSORFLOW_USE_ROCM
+  auto platform_name = xla::CudaName();
+#endif  // TENSORFLOW_USE_ROCM
+
+  auto kv_store =
+      std::make_shared<XlaKeyValueStore>(coordination_service_agent);
+  std::map<int, std::unique_ptr<xla::LocalDeviceState>>
+      unique_local_device_states;
+  if (use_creation_info) {
+    unique_local_device_states = GetUniqueDeviceStates(info);
+  }
+  if (use_creation_info) {
+    // Tell any other threads are waiting to call BuildDistributedDevices to
+    // proceed.
+    creation_state->SetReady();
+  }
+  auto status = BuildDistributedDevices(
+      platform_name, std::move(unique_local_device_states), node_id, num_nodes,
+      &pjrt_devices, gpu_run_options.get(), kv_store,
+      /*enable_mock_nccl=*/false);
+  if (!status.ok()) {
+    if (use_creation_info) {
+      creation_state->SetDone();
+    }
+    return status;
+  }
+
+  VLOG(2) << "Distributed devices built with size=" << pjrt_devices.size();
+  int i = 0;
+  for (const auto& pjrt_device : pjrt_devices) {
+    if (pjrt_device != nullptr) {
+      VLOG(2) << "  pjrt_device " << i++ << ":"
+              << pjrt_device->description().DebugString();
+    } else {
+      VLOG(2) << "  pjrt_device " << i++ << ":" << "nullptr";
+    }
+  }
+
+  if (use_creation_info) {
+    std::unique_ptr<xla::PjRtClient> pjrt_client =
+        std::make_unique<xla::StreamExecutorGpuClient>(
+            platform_name, info->local_client, std::move(pjrt_devices),
+            /*process_index=*/node_id,
+            /*allocator=*/std::move(info->allocator),
+            /*host_memory_allocator=*/std::move(info->host_memory_allocator),
+            /*should_stage_host_to_device_transfers=*/true,
+            /*gpu_run_options=*/std::move(gpu_run_options));
+    VLOG(2) << "PJRT GPU client with remote devices created.";
+    status = SetPjRtClientInTFGlobalResourceManager(DeviceType(DEVICE_GPU),
+                                                    std::move(pjrt_client));
+    creation_state->SetDone();
+    return status;
+  } else {
+    LOG(INFO) << "Skipping creating PJRT GPU client, another thread has "
+                 "already created the client.";
+    creation_state->WaitForDone();
+    return absl::OkStatus();
+  }
+}
+#endif  // TF_GPU_USE_PJRT
+
+absl::Status CreatePjRtGpuClientWithDistributedDevices(
+    int node_id, int num_nodes,
+    tsl::CoordinationServiceAgent* coordination_service_agent) {
+#ifdef TF_GPU_USE_PJRT
+  if (num_nodes <= 1) {
+    return absl::OkStatus();
+  }
+  return CreateClientOnce(node_id, num_nodes, coordination_service_agent);
+#else   // TF_GPU_USE_PJRT
+  return absl::OkStatus();
+#endif  // TF_GPU_USE_PJRT
+}
 
 bool AreLocalDevicesCompatible(const EagerContext* context,
                                const ServerDef& server_def) {
@@ -416,19 +721,6 @@ Status UpdateContextWithServerDef(EagerContext* context,
                                   bool reset_context, int keep_alive_secs,
                                   int64_t init_timeout_in_ms, int retries,
                                   bool clear_existing_contexts = false) {
-  // We don't use the TF_RETURN_IF_ERROR macro directly since that destroys the
-  // server object (which currently CHECK-fails) and we miss the error, instead,
-  // we log the error, and then return to allow the user to see the error
-  // message.
-#define LOG_AND_RETURN_IF_ERROR(...)                  \
-  do {                                                \
-    const tensorflow::Status _status = (__VA_ARGS__); \
-    if (TF_PREDICT_FALSE(!_status.ok())) {            \
-      LOG(ERROR) << _status.message();                \
-      return _status;                                 \
-    }                                                 \
-  } while (0);
-
   string worker_name =
       strings::StrCat("/job:", server_def.job_name(),
                       "/replica:0/task:", server_def.task_index());
@@ -699,12 +991,16 @@ Status EagerContextDistributedManager::SetOrUpdateServerDef(
   Status s = UpdateContextWithServerDef(context_, server_def, reset_context,
                                         keep_alive_secs, init_timeout_in_ms,
                                         retries, clear_existing_contexts);
+  if (!s.ok()) {
+    coordination_service_agent_ = nullptr;
+    return s;
+  }
   // If context is reset, make sure pointer is set to the new agent.
   coordination_service_agent_ =
       context_->GetServer()
           ->worker_env()
           ->session_mgr->GetCoordinationServiceAgent();
-  return s;
+  return absl::OkStatus();
 }
 
 Status EagerContextDistributedManager::InitializeLocalOnlyContext(
@@ -844,6 +1140,27 @@ Status EagerContextDistributedManager::EnableCollectiveOps(
       LOG_AND_RETURN_IF_ERROR(coordination_service_agent_->Connect());
       LOG_AND_RETURN_IF_ERROR(
           coordination_service_agent_->WaitForAllTasks(devices));
+      // Coordination service agent is now connected.
+
+      // Convert nested "job name" and "task index" into a flat "node_id" index
+      // in 0..num_nodes-1. num_nodes is the sum of the number of tasks in each
+      // job.
+      int num_nodes = 0;
+      int node_id = -1;
+      const std::string& job_name = server_def.job_name();
+      int task_index = server_def.task_index();
+      for (const auto& job : server_def.cluster().job()) {
+        if (job.name() == job_name) {
+          node_id = num_nodes + task_index;
+        }
+        num_nodes += job.tasks().size();
+      }
+      VLOG(2)
+          << "Creating PJRT client with distributed devices with num_nodes: "
+          << num_nodes << ", node_id: " << node_id;
+
+      LOG_AND_RETURN_IF_ERROR(CreatePjRtGpuClientWithDistributedDevices(
+          node_id, num_nodes, coordination_service_agent_));
 
       // Add remote devices to eager context.
       std::vector<std::unique_ptr<Device>> remote_devices;
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 5faa7f1d2e9a86..57af63ddb05e3d 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -59,6 +59,7 @@ limitations under the License.
 #include "absl/types/optional.h"
 #include "tensorflow/c/tf_tensor_internal.h"
 #include "tensorflow/compiler/jit/defs.h"
+#include "xla/tsl/util/env_var.h"
 #include "tensorflow/core/common_runtime/colocation_graph.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_set.h"
@@ -121,6 +122,16 @@ auto* top_level_jit_compilation_counter = monitoring::Counter<1>::New(
     "The number of times a top-level JIT-compiled function is called.",
     "device");
 
+bool SendAsProtosWhenPossible() {
+  static bool send_as_protos_when_possible = []() {
+    bool result;
+    TF_CHECK_OK(tsl::ReadBoolFromEnvVar("TF_SEND_AS_PROTOS_WHEN_POSSIBLE",
+                                        false, &result));
+    return result;
+  }();
+  return send_as_protos_when_possible;
+}
+
 const string& DeviceNameOrUnspecified(Device* device) {
   static string* unspecified_string = new string("<unspecified>");
   return (device == nullptr) ? *unspecified_string : device->name();
@@ -1915,21 +1926,41 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
                                               ctx.GetContextViewId()));
         }
       }
-      auto* input_handle = remote_op->add_op_inputs()->mutable_remote_handle();
-      // For a remote component function, a function execution request and an
-      // input generation request may come from different workers. We need to
-      // guarantee that the input generation request is processed before the
-      // function execution request, so wait until the remote input is ready
-      // before sending it to the multi-device function device.
-      bool wait_until_ready =
-          SkipRemoteHandleWaitReady() ? false : op->is_function();
-      TF_RETURN_IF_ERROR(ctx.RemoteMgr()->SerializeRemoteTensorHandle(
-          input, wait_until_ready, input_handle, input_device,
-          *input_device_name, serialize_resource_dtype_and_shape));
-      if (!input_handle->resource_dtypes_and_shapes().empty()) {
-        TF_RETURN_IF_ERROR(
-            input->AddResourceShapeMirror(op_device, input_handle->op_id(),
-                                          input_handle->output_num(), &ctx));
+      int64_t num_elements;
+      TF_RETURN_IF_ERROR(input->NumElements(&num_elements));
+      if ((input->Type() == TensorHandle::HandleType::LOCAL) &&
+          (num_elements == 1) && (input->DataType() != DT_VARIANT) &&
+          SendAsProtosWhenPossible()) {
+        auto* input_tensor_proto = remote_op->add_op_inputs()->mutable_tensor();
+        const tensorflow::Tensor* input_tensor = nullptr;
+        TensorHandle* local_cpu_input_handle = nullptr;
+        TF_RETURN_IF_ERROR(EagerCopyToDevice(input, &ctx, &ctx.Executor(),
+                                             ctx.HostCPU(), false,
+                                             &local_cpu_input_handle));
+        TF_RETURN_IF_ERROR(local_cpu_input_handle->Tensor(&input_tensor));
+        input_tensor->AsProtoTensorContent(input_tensor_proto);
+        // `TensorHandle::AddResourceShapeMirror` can change `input` but only if
+        // `TensorHandle::handle_dtypes_and_shapes_` is not empty. And that
+        // requires `TensorHandle::dtype` to be equal to `DT_RESOURCE` which
+        // cannot be the case when we are here. So nothing else to do.
+      } else {
+        auto* input_handle =
+            remote_op->add_op_inputs()->mutable_remote_handle();
+        // For a remote component function, a function execution request and an
+        // input generation request may come from different workers. We need to
+        // guarantee that the input generation request is processed before the
+        // function execution request, so wait until the remote input is ready
+        // before sending it to the multi-device function device.
+        bool wait_until_ready =
+            SkipRemoteHandleWaitReady() ? false : op->is_function();
+        TF_RETURN_IF_ERROR(ctx.RemoteMgr()->SerializeRemoteTensorHandle(
+            input, wait_until_ready, input_handle, input_device,
+            *input_device_name, serialize_resource_dtype_and_shape));
+        if (!input_handle->resource_dtypes_and_shapes().empty()) {
+          TF_RETURN_IF_ERROR(
+              input->AddResourceShapeMirror(op_device, input_handle->op_id(),
+                                            input_handle->output_num(), &ctx));
+        }
       }
     }
   }
diff --git a/tensorflow/core/common_runtime/eager/execute_node.cc b/tensorflow/core/common_runtime/eager/execute_node.cc
index ebedabf3eef3ee..02e032e604e1de 100644
--- a/tensorflow/core/common_runtime/eager/execute_node.cc
+++ b/tensorflow/core/common_runtime/eager/execute_node.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/eager/execute_node.h"
 
+#include "xla/tsl/util/env_var.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tsl/util/env_var.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/common_runtime/eager/mkl_eager_op_rewrite.cc b/tensorflow/core/common_runtime/eager/mkl_eager_op_rewrite.cc
index db1069acf6fc01..724f998e6f0f30 100644
--- a/tensorflow/core/common_runtime/eager/mkl_eager_op_rewrite.cc
+++ b/tensorflow/core/common_runtime/eager/mkl_eager_op_rewrite.cc
@@ -1,4 +1,4 @@
-/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/core/common_runtime/gpu/BUILD b/tensorflow/core/common_runtime/gpu/BUILD
index 7ab30b3d1e063a..84e935fed6c684 100644
--- a/tensorflow/core/common_runtime/gpu/BUILD
+++ b/tensorflow/core/common_runtime/gpu/BUILD
@@ -449,6 +449,7 @@ cc_library(
     features = ["-layering_check"],
     deps = [
         ":gpu_scheduling_metrics_storage",
+        "//tensorflow/core/framework:resource_base",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:fixed_array",
         "@com_google_absl//absl/container:node_hash_map",
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index af4dc4dcb4bbe7..6680e9622f04e8 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -1410,12 +1410,33 @@ Status BaseGPUDeviceFactory::CreateDevices(
     num_gpus_to_use = iter->second;
   }
   const auto& gpu_options = options.config.gpu_options();
+  bool populate_pjrt_gpu_client_creation_info =
+      gpu_options.experimental().populate_pjrt_gpu_client_creation_info();
+
+#ifdef TF_GPU_USE_PJRT
+  absl::StatusOr<PjRtGpuClientCreationInfo*> obtained_info =
+      GetPjRtGpuClientCreationInfo();
+  if (obtained_info.ok() && obtained_info.value() != nullptr) {
+    populate_pjrt_gpu_client_creation_info = false;
+    VLOG(3) << "Previous GetPjRtGpuClientCreationInfo exists, setting "
+               "populate_pjrt_gpu_client_creation_info to false.";
+  } else {
+    VLOG(3)
+        << "Previous GetPjRtGpuClientCreationInfo does not exist. Will create.";
+  }
+#endif
+
   std::vector<tsl::PlatformDeviceId> visible_gpu_order;
   std::vector<tsl::PlatformDeviceId> valid_platform_device_ids;
   // If we aren't going to use any GPUs, don't initialize them.
   // We don't want to call ParseVisibleDeviceList if num_gpus_to_use is 0,
   // because it treats an empty gpu_options.visible_device_list as 'all GPUs
   // are visible'.
+  VLOG(3) << "CreateGPUDevice: num_gpus_to_use: " << num_gpus_to_use
+          << ", visible_device_list: " << gpu_options.visible_device_list()
+          << ", populate_pjrt_gpu_client_creation_info: "
+          << populate_pjrt_gpu_client_creation_info;
+
   if (num_gpus_to_use > 0) {
     TF_RETURN_IF_ERROR(tsl::ParseVisibleDeviceList(
         gpu_options.visible_device_list(), gpu_manager->VisibleDeviceCount(),
@@ -1764,7 +1785,13 @@ Status BaseGPUDeviceFactory::CreateDevices(
     }
 
     xla::LocalDeviceState* local_device_state = nullptr;
-    if (should_create_new_pjrt_client) {
+    if (should_create_new_pjrt_client ||
+        populate_pjrt_gpu_client_creation_info) {
+      VLOG(3) << "should_create_new_pjrt_client="
+              << should_create_new_pjrt_client
+              << ", populate_pjrt_gpu_client_creation_info="
+              << populate_pjrt_gpu_client_creation_info
+              << " for device ordinal " << di;
       const int priority = GetPriority(tf_device_id.value(), gpu_options);
       xla::LocalDeviceState::StreamOptions stream_options;
       int num_d2d_streams =
@@ -1787,12 +1814,17 @@ Status BaseGPUDeviceFactory::CreateDevices(
                   /*allow_event_reuse=*/true, /*use_callback_stream=*/true,
                   /*device_ordinal=*/di, /*stream_options=*/stream_options));
       if (!emplace_result.second) {
+        LOG(ERROR) << "Creating LocalDeviceState for device ordinal: " << di
+                   << " already exists! Returning an error";
         return absl::InternalError(absl::StrCat(
             "GPU local device state for tf_device_id: ", tf_device_id.value(),
             " already exists."));
       }
       local_device_state = emplace_result.first->second.get();
     } else {
+      VLOG(3) << "should_create_new_pjrt_client="
+              << should_create_new_pjrt_client << " for device ordinal " << di
+              << ". Re-using local_device_state";
       auto* pjrt_se_client =
           tensorflow::down_cast<xla::PjRtStreamExecutorClient*>(
               *obtained_pjrt_client);
@@ -1806,7 +1838,8 @@ Status BaseGPUDeviceFactory::CreateDevices(
         /*dev_locality=*/it->second,
         /*xla_local_device_state=*/local_device_state, gpu_allocator, devices));
 
-    if (should_create_new_pjrt_client) {
+    if (should_create_new_pjrt_client ||
+        populate_pjrt_gpu_client_creation_info) {
       auto gpu_allocator_ptr = std::unique_ptr<Allocator>(gpu_allocator);
       allocator_id_stream_tuples.emplace_back(
           std::move(gpu_allocator_ptr), local_device_state->compute_stream(), 0,
@@ -1819,7 +1852,10 @@ Status BaseGPUDeviceFactory::CreateDevices(
     return OkStatus();
   }
 
-  if (should_create_new_pjrt_client) {
+  if (should_create_new_pjrt_client || populate_pjrt_gpu_client_creation_info) {
+    VLOG(3) << "should_create_new_pjrt_client=" << should_create_new_pjrt_client
+            << ", populate_pjrt_gpu_client_creation_info="
+            << populate_pjrt_gpu_client_creation_info;
     auto allocator_adapter = std::make_unique<se::MultiDeviceAdapter>(
         gpu_manager, std::move(allocator_id_stream_tuples));
 
@@ -1829,34 +1865,83 @@ Status BaseGPUDeviceFactory::CreateDevices(
     std::unique_ptr<tsl::Allocator> pjrt_gpu_host_allocator(
         process_state->GetGpuHostAllocator(/*options=*/{}, numa_node));
 
-    std::vector<std::unique_ptr<xla::PjRtStreamExecutorDevice>> pjrt_devices =
-        xla::BuildLocalDevices(std::move(local_device_states),
-                               /*node_id=*/numa_node);
+    if (populate_pjrt_gpu_client_creation_info &&
+        !should_create_new_pjrt_client) {
+      auto pjrt_gpu_client_creation_info =
+          std::make_unique<PjRtGpuClientCreationInfo>();
+
+      pjrt_gpu_client_creation_info->allocator = std::move(allocator_adapter);
+      pjrt_gpu_client_creation_info->host_memory_allocator =
+          std::move(pjrt_gpu_host_allocator);
+      pjrt_gpu_client_creation_info->local_device_states =
+          std::move(local_device_states);
+      pjrt_gpu_client_creation_info->local_client = xla_client;
+      pjrt_gpu_client_creation_info->allowed_devices =
+          std::move(allowed_devices);
 
-    auto& pjrt_rollout_config = GetXlaOpsCommonFlags()->tf_xla_use_device_api;
-    pjrt_rollout_config.AllowForDeviceInXlaLaunch(DEVICE_GPU);
-    pjrt_rollout_config.AllowForDeviceInXlaCompileOnDemand(DEVICE_GPU);
-    pjrt_rollout_config.AllowForDeviceInXlaCompileAndRun(DEVICE_GPU);
+      return SetPjRtGpuClientCreationInfoInTFGlobalResourceManager(
+          std::move(pjrt_gpu_client_creation_info));
+    }
 
-    // Creates PJRT GPU client and places it into a TF global resource manager.
-    auto gpu_run_options =
-        std::make_unique<xla::gpu::GpuExecutableRunOptions>();
+    if (should_create_new_pjrt_client) {
+      // The first time BaseGPUDeviceFactory::CreateDevices is called, a PJRT
+      // GPU client is always created (assuming the source code for PJRT GPU
+      // support is included in the build). This client only has information
+      // about local devices.
+      //
+      // If later a client is created with information about both local and
+      // remote devices (i.e. if there are multiple nodes/jobs/tasks and
+      // collectives are enabled), that newer client will be used instead from
+      // then on. Also, in certain test cases (e.g. to simulate changes to the
+      // number of GPUs), this code block may be run multiple times and create a
+      // client more than once. When a new client is created, the old client
+      // will not be used for any new executions but will be kept so that any
+      // buffers for computations in progress remain valid.
+      //
+      // Otherwise, once a client is created by the first call, it is the only
+      // client that is created/used and future calls skip this code block.
+      int node_id = gpu_options.experimental().node_id();
+      std::vector<std::unique_ptr<xla::PjRtStreamExecutorDevice>> pjrt_devices =
+          xla::BuildLocalDevices(std::move(local_device_states),
+                                 /*node_id=*/node_id);
+
+      auto& pjrt_rollout_config = GetXlaOpsCommonFlags()->tf_xla_use_device_api;
+      pjrt_rollout_config.AllowForDeviceInXlaLaunch(DEVICE_GPU);
+      pjrt_rollout_config.AllowForDeviceInXlaCompileOnDemand(DEVICE_GPU);
+      pjrt_rollout_config.AllowForDeviceInXlaCompileAndRun(DEVICE_GPU);
+
+      // Creates PJRT GPU client and places it into a TF global resource
+      // manager.
+      auto gpu_run_options =
+          std::make_unique<xla::gpu::GpuExecutableRunOptions>();
 #if TENSORFLOW_USE_ROCM
-    auto platform_name = xla::RocmName();
+      auto platform_name = xla::RocmName();
+#elif TENSORFLOW_USE_SYCL
+      auto pjrt_platform_name = xla::SyclName();
 #else   // TENSORFLOW_USE_ROCM
-    auto platform_name = xla::CudaName();
+      auto platform_name = xla::CudaName();
 #endif  // TENSORFLOW_USE_ROCM
-    std::unique_ptr<xla::PjRtClient> pjrt_client =
-        std::make_unique<xla::StreamExecutorGpuClient>(
-            platform_name, xla_client, std::move(pjrt_devices),
-            /*process_index=*/numa_node,
-            /*allocator=*/std::move(allocator_adapter),
-            /*host_memory_allocator=*/std::move(pjrt_gpu_host_allocator),
-            /*should_stage_host_to_device_transfers=*/true,
-            /*gpu_run_options=*/std::move(gpu_run_options));
-
-    return SetPjRtClientInTFGlobalResourceManager(DeviceType(DEVICE_GPU),
-                                                  std::move(pjrt_client));
+      std::unique_ptr<xla::PjRtClient> pjrt_client =
+          std::make_unique<xla::StreamExecutorGpuClient>(
+              platform_name, xla_client, std::move(pjrt_devices),
+              /*process_index=*/numa_node,
+              /*allocator=*/std::move(allocator_adapter),
+              /*host_memory_allocator=*/std::move(pjrt_gpu_host_allocator),
+              /*should_stage_host_to_device_transfers=*/true,
+              /*gpu_run_options=*/std::move(gpu_run_options));
+
+      return SetPjRtClientInTFGlobalResourceManager(DeviceType(DEVICE_GPU),
+                                                    std::move(pjrt_client));
+    }
+
+    LOG(INFO)
+        << "Unexpectedly returning OK STATUS in "
+           "BaseGPUDeviceFactory::CreateDevices without creating a PJRT GPU "
+           "client when one does not already exist. If there is any problem "
+           "with GPU computations, file a bug. (But if this occurs in a "
+           "test environment that doesn't actually perform GPU "
+           "computations, this might not be a problem.)";
+    return OkStatus();
   } else {
     return obtained_pjrt_client.status();
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index b3d85d8d792d7e..9dc7530fc1e5d9 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_init.h"
 #include "xla/stream_executor/integrations/device_mem_allocator.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "xla/tsl/util/env_var.h"
 #include "tensorflow/core/common_runtime/device/device_host_allocator.h"
 #include "tensorflow/core/common_runtime/device_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
@@ -50,7 +51,6 @@ limitations under the License.
 #include "tsl/platform/mutex.h"
 #include "tsl/platform/strcat.h"
 #include "tsl/platform/types.h"
-#include "tsl/util/env_var.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_serving_device_selector.h b/tensorflow/core/common_runtime/gpu/gpu_serving_device_selector.h
index a63a51508da008..1a3af85a22af24 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_serving_device_selector.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_serving_device_selector.h
@@ -24,10 +24,32 @@ limitations under the License.
 #include "absl/container/node_hash_map.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
+#include "tensorflow/core/framework/resource_base.h"
 #include "tsl/framework/serving_device_selector.h"
 
 namespace tensorflow {
 namespace gpu {
+class GpuServingDeviceSelector;
+const char kGpuServingDeviceSelectorResourceName[] =
+    "gpu_serving_device_selector";
+
+class GpuServingDeviceSelectorResource : public ResourceBase {
+ public:
+  explicit GpuServingDeviceSelectorResource(
+      int num_devices, std::unique_ptr<tsl::ServingDeviceSelector::Policy>
+                           device_selector_policy)
+      : selector_(std::make_unique<GpuServingDeviceSelector>(
+            num_devices, std::move(device_selector_policy))) {}
+
+  std::string DebugString() const override {
+    return "GpuServingDeviceSelectorResource";
+  };
+
+  GpuServingDeviceSelector* selector() const { return selector_.get(); }
+
+ private:
+  std::unique_ptr<GpuServingDeviceSelector> selector_;
+};
 
 class GpuServingDeviceSelector : public tsl::ServingDeviceSelector {
  public:
diff --git a/tensorflow/core/common_runtime/mkl_layout_pass.cc b/tensorflow/core/common_runtime/mkl_layout_pass.cc
index 3d36b12d44b3df..3272940402b737 100644
--- a/tensorflow/core/common_runtime/mkl_layout_pass.cc
+++ b/tensorflow/core/common_runtime/mkl_layout_pass.cc
@@ -1,4 +1,4 @@
-/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/core/common_runtime/next_pluggable_device/BUILD b/tensorflow/core/common_runtime/next_pluggable_device/BUILD
index 545e6017672410..b07376c1614788 100644
--- a/tensorflow/core/common_runtime/next_pluggable_device/BUILD
+++ b/tensorflow/core/common_runtime/next_pluggable_device/BUILD
@@ -246,9 +246,7 @@ cc_library(
     name = "loose_headers",
     tags = ["avoid_dep"],
     textual_hdrs = ["c_plugin_op_kernel.h"],
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
 )
 
 cc_library(
diff --git a/tensorflow/core/common_runtime/next_pluggable_device/c/BUILD b/tensorflow/core/common_runtime/next_pluggable_device/c/BUILD
index a49f2aa46aefc4..f9235ce4fadc23 100644
--- a/tensorflow/core/common_runtime/next_pluggable_device/c/BUILD
+++ b/tensorflow/core/common_runtime/next_pluggable_device/c/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object", "tf_cc_test")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/common_runtime/node_file_writer.cc b/tensorflow/core/common_runtime/node_file_writer.cc
index 10fb334b38f8d3..83b95c7010b359 100644
--- a/tensorflow/core/common_runtime/node_file_writer.cc
+++ b/tensorflow/core/common_runtime/node_file_writer.cc
@@ -66,7 +66,7 @@ const int kMaxInt32Elems = 10;
 
 namespace tensorflow {
 
-/*static*/ StatusOr<NodeFileWriter*>
+/*static*/ absl::StatusOr<NodeFileWriter*>
 tensorflow::NodeFileWriter::GetNodeFileWriterIfEnabled(
     const std::string& device_name, Env* env) {
   // First get the directory from TF_NODE_FILE_WRITER_DIRECTORY.
diff --git a/tensorflow/core/common_runtime/node_file_writer.h b/tensorflow/core/common_runtime/node_file_writer.h
index f0e0ae830c6e48..bf6a2ddafd114d 100644
--- a/tensorflow/core/common_runtime/node_file_writer.h
+++ b/tensorflow/core/common_runtime/node_file_writer.h
@@ -37,7 +37,7 @@ class NodeFileWriter {
   // TF_NODE_FILE_WRITER_DIRECTORY is set, which specifies the directory where
   // the node file will be created in. Otherwise, returns nullptr. When called
   // with the same device_name, the same NodeFileWriter will be returned.
-  static StatusOr<NodeFileWriter*> GetNodeFileWriterIfEnabled(
+  static absl::StatusOr<NodeFileWriter*> GetNodeFileWriterIfEnabled(
       const std::string& device_name, Env* env);
 
   // Records the execution of a node, if eligible, by writing the node to the
diff --git a/tensorflow/core/common_runtime/optimize_cross_host_control_deps.cc b/tensorflow/core/common_runtime/optimize_cross_host_control_deps.cc
index 6deb020f816b31..28c8309f823240 100644
--- a/tensorflow/core/common_runtime/optimize_cross_host_control_deps.cc
+++ b/tensorflow/core/common_runtime/optimize_cross_host_control_deps.cc
@@ -78,7 +78,7 @@ class DeviceLookup {
  public:
   DeviceLookup() = default;
 
-  static StatusOr<DeviceLookup> FromGraph(Graph* graph) {
+  static absl::StatusOr<DeviceLookup> FromGraph(Graph* graph) {
     DeviceLookup lookup;
     for (Node* n : graph->op_nodes()) {
       string device;
diff --git a/tensorflow/core/common_runtime/optimize_function_graph_utils.cc b/tensorflow/core/common_runtime/optimize_function_graph_utils.cc
index 357520c827c393..5ed05fc3486823 100644
--- a/tensorflow/core/common_runtime/optimize_function_graph_utils.cc
+++ b/tensorflow/core/common_runtime/optimize_function_graph_utils.cc
@@ -41,6 +41,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/placer.h"
 #include "tensorflow/core/common_runtime/replicate_per_replica_nodes.h"
 #include "tensorflow/core/framework/device.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/metrics.h"
 #include "tensorflow/core/framework/optimized_function_graph.pb.h"
@@ -207,8 +208,8 @@ Status WriteToCache(const std::string& dir_name, const std::string& file_name,
 
 // Retrieves the OptimizedFunctionGraphInfo from a cache file.
 // Returns error if cache file loading fails.
-StatusOr<OptimizedFunctionGraphInfo> ReadFromCache(const string& file_name,
-                                                   Env* env) {
+absl::StatusOr<OptimizedFunctionGraphInfo> ReadFromCache(
+    const string& file_name, Env* env) {
   absl::Time cache_reading_start_time = absl::Now();
 
   OptimizedFunctionGraph optimized_function_graph_proto;
@@ -463,7 +464,7 @@ Status PinArgsAndRets(const std::vector<string>& input_devices,
   return absl::OkStatus();
 }
 
-StatusOr<OptimizedFunctionGraphInfo> OptimizeFunctionGraph(
+absl::StatusOr<OptimizedFunctionGraphInfo> OptimizeFunctionGraph(
     const string& function_name, AttrSlice attrs,
     const FunctionLibraryRuntime::InstantiateOptions& options,
     const DeviceSet& dev_set, const FunctionLibraryDefinition* input_lib_def,
@@ -640,14 +641,19 @@ StatusOr<OptimizedFunctionGraphInfo> OptimizeFunctionGraph(
 
   graph->mutable_flib_def()->set_default_registry(nullptr);
   graph->mutable_flib_def()->Clear();
+
+  FunctionLibraryDefinition pruned_lib_def =
+      reachable_lib_def.ReachableDefinitions(*graph);
+
   return OptimizedFunctionGraphInfo(
-      function_name, std::move(graph), std::move(reachable_lib_def),
+      function_name, std::move(graph), std::move(pruned_lib_def),
       node_name_to_control_ret, ret_types, ret_nodes.size(),
       env->NowMicros() - graph_optimization_start_time_usecs,
       optimization_source);
 }
 
-StatusOr<OptimizedFunctionGraphInfo> OptimizeFunctionGraphOrReadFromFileCache(
+absl::StatusOr<OptimizedFunctionGraphInfo>
+OptimizeFunctionGraphOrReadFromFileCache(
     const string& function_name, AttrSlice attrs,
     const FunctionLibraryRuntime::InstantiateOptions& options,
     const DeviceSet& dev_set, const FunctionLibraryDefinition* input_lib_def,
@@ -688,7 +694,7 @@ StatusOr<OptimizedFunctionGraphInfo> OptimizeFunctionGraphOrReadFromFileCache(
         << "TensorFlow graph cache existed; reading from cache; function name: "
         << function_name << ", full cache file path: " << file_name;
 
-    StatusOr<OptimizedFunctionGraphInfo> optimized_function_graph_info =
+    absl::StatusOr<OptimizedFunctionGraphInfo> optimized_function_graph_info =
         ReadFromCache(file_name, env);
     if (optimized_function_graph_info.ok()) {
       metrics::UpdateFunctionGraphOptimizationSavingTime(
@@ -729,7 +735,7 @@ StatusOr<OptimizedFunctionGraphInfo> OptimizeFunctionGraphOrReadFromFileCache(
   // Step 1: Run the graph optimization passes normally.
   absl::Time optimization_start_time = absl::Now();
   TF_ASSIGN_OR_RETURN(
-      StatusOr<OptimizedFunctionGraphInfo> optimized_function_graph_info,
+      absl::StatusOr<OptimizedFunctionGraphInfo> optimized_function_graph_info,
       OptimizeFunctionGraph(function_name, attrs, options, dev_set,
                             input_lib_def, composite_devices, cpu_device,
                             default_device, env, OptimizedFunctionGraph::JIT));
@@ -765,7 +771,8 @@ StatusOr<OptimizedFunctionGraphInfo> OptimizeFunctionGraphOrReadFromFileCache(
   return optimized_function_graph_info;
 }
 
-StatusOr<std::unique_ptr<std::unordered_map<string, std::unique_ptr<Graph>>>>
+absl::StatusOr<
+    std::unique_ptr<std::unordered_map<string, std::unique_ptr<Graph>>>>
 PreprocessAndPartitionGraph(
     const std::string& function_name,
     OptimizedFunctionGraphInfo& input_optimized_graph,
diff --git a/tensorflow/core/common_runtime/optimize_function_graph_utils.h b/tensorflow/core/common_runtime/optimize_function_graph_utils.h
index d9bcd859dce794..2ab19703778461 100644
--- a/tensorflow/core/common_runtime/optimize_function_graph_utils.h
+++ b/tensorflow/core/common_runtime/optimize_function_graph_utils.h
@@ -54,7 +54,7 @@ Status PinArgsAndRets(const std::vector<string>& input_devices,
 // Outputs graph optimization result after all the graph optimization (up till
 // before graph partitioning); returns error if optimization fails. Note that
 // the `input_lib_def` will be used only if the lib_def in `options` is nullptr.
-StatusOr<OptimizedFunctionGraphInfo> OptimizeFunctionGraph(
+absl::StatusOr<OptimizedFunctionGraphInfo> OptimizeFunctionGraph(
     const string& function_name, AttrSlice attrs,
     const FunctionLibraryRuntime::InstantiateOptions& options,
     const DeviceSet& dev_set, const FunctionLibraryDefinition* input_lib_def,
@@ -67,7 +67,8 @@ StatusOr<OptimizedFunctionGraphInfo> OptimizeFunctionGraph(
 // the file cache if existent. If cache loading fails, it goes ahead and runs
 // the graph optimization passes. Returns error if running the optimization
 // passes fails.
-StatusOr<OptimizedFunctionGraphInfo> OptimizeFunctionGraphOrReadFromFileCache(
+absl::StatusOr<OptimizedFunctionGraphInfo>
+OptimizeFunctionGraphOrReadFromFileCache(
     const string& function_name, AttrSlice attrs,
     const FunctionLibraryRuntime::InstantiateOptions& options,
     const DeviceSet& dev_set, const FunctionLibraryDefinition* input_lib_def,
@@ -78,7 +79,8 @@ StatusOr<OptimizedFunctionGraphInfo> OptimizeFunctionGraphOrReadFromFileCache(
 // Pre-processes, partitions and post-optimizes the input graph; returns
 // subgraph result (maps from device name to the subgraph); returns error if any
 // optimization or partitioning step fails.
-StatusOr<std::unique_ptr<std::unordered_map<string, std::unique_ptr<Graph>>>>
+absl::StatusOr<
+    std::unique_ptr<std::unordered_map<string, std::unique_ptr<Graph>>>>
 PreprocessAndPartitionGraph(
     const std::string& function_name,
     OptimizedFunctionGraphInfo& input_optimized_graph,
diff --git a/tensorflow/core/common_runtime/optimize_function_graph_utils_test.cc b/tensorflow/core/common_runtime/optimize_function_graph_utils_test.cc
index e6d93b95658413..b2cf3d13e78766 100644
--- a/tensorflow/core/common_runtime/optimize_function_graph_utils_test.cc
+++ b/tensorflow/core/common_runtime/optimize_function_graph_utils_test.cc
@@ -65,7 +65,7 @@ void TestOptimizeFunctionGraphWithFunctionNotFound(bool load_from_cache) {
 
   // Try to optimize a function called "FindDevice" which does not exist in
   // library.
-  StatusOr<OptimizedFunctionGraphInfo> optimized_function_graph_info;
+  absl::StatusOr<OptimizedFunctionGraphInfo> optimized_function_graph_info;
   if (load_from_cache) {
     optimized_function_graph_info = OptimizeFunctionGraphOrReadFromFileCache(
         "FindDevice", {}, opts, device_set, lib_def.get(),
@@ -109,10 +109,11 @@ TEST(OptimizeFunctionGraphTest, OptimizeFunctionGraphReturnsCorrectResult) {
     device_set.AddDevice(device.get());
   }
 
-  const StatusOr<OptimizedFunctionGraphInfo> aot_result = OptimizeFunctionGraph(
-      "FindDevice", {}, opts, device_set, lib_def.get(),
-      /*composite_devices=*/{}, devices[0].get(), devices[1].get(),
-      Env::Default(), OptimizedFunctionGraph::AOT);
+  const absl::StatusOr<OptimizedFunctionGraphInfo> aot_result =
+      OptimizeFunctionGraph("FindDevice", {}, opts, device_set, lib_def.get(),
+                            /*composite_devices=*/{}, devices[0].get(),
+                            devices[1].get(), Env::Default(),
+                            OptimizedFunctionGraph::AOT);
   TF_EXPECT_OK(aot_result.status());
   EXPECT_EQ(aot_result->name, "FindDevice");
   // FindDevice function has one return node.
@@ -166,7 +167,7 @@ TEST(OptimizeFunctionGraphTest, OptimizeFunctionGraphAndWriteToCache) {
             0);
 
   // Expect no caching with an extremely high caching threshold.
-  StatusOr<OptimizedFunctionGraphInfo> optimized_info =
+  absl::StatusOr<OptimizedFunctionGraphInfo> optimized_info =
       OptimizeFunctionGraphOrReadFromFileCache(
           "FindDevice_1234", {}, opts, device_set, lib_def.get(),
           /*composite_devices=*/{}, devices[0].get(), devices[1].get(),
diff --git a/tensorflow/core/common_runtime/optimized_function_graph_info.cc b/tensorflow/core/common_runtime/optimized_function_graph_info.cc
index 03314351fb15c5..388c484954b690 100644
--- a/tensorflow/core/common_runtime/optimized_function_graph_info.cc
+++ b/tensorflow/core/common_runtime/optimized_function_graph_info.cc
@@ -42,8 +42,8 @@ OptimizedFunctionGraph OptimizedFunctionGraphInfo::ToProto(
   return proto;
 }
 
-StatusOr<OptimizedFunctionGraphInfo> OptimizedFunctionGraphInfo::FromProto(
-    OptimizedFunctionGraph&& proto) {
+absl::StatusOr<OptimizedFunctionGraphInfo>
+OptimizedFunctionGraphInfo::FromProto(OptimizedFunctionGraph&& proto) {
   // Reconstruct the lib_def.
   FunctionLibraryDefinition lib_def(OpRegistry::Global());
   FunctionDefLibrary proto_library;
diff --git a/tensorflow/core/common_runtime/optimized_function_graph_info.h b/tensorflow/core/common_runtime/optimized_function_graph_info.h
index 90008571de02fc..b2bd9af5bb1c5a 100644
--- a/tensorflow/core/common_runtime/optimized_function_graph_info.h
+++ b/tensorflow/core/common_runtime/optimized_function_graph_info.h
@@ -80,7 +80,7 @@ struct OptimizedFunctionGraphInfo {
 
   // Converts from the proto to struct OptimizedFunctionGraphInfo. Returns error
   // if the conversion fails.
-  static StatusOr<OptimizedFunctionGraphInfo> FromProto(
+  static absl::StatusOr<OptimizedFunctionGraphInfo> FromProto(
       OptimizedFunctionGraph&& proto);
 };
 
diff --git a/tensorflow/core/common_runtime/optimized_function_graph_info_test.cc b/tensorflow/core/common_runtime/optimized_function_graph_info_test.cc
index a7c03b13382453..800da5f50d6297 100644
--- a/tensorflow/core/common_runtime/optimized_function_graph_info_test.cc
+++ b/tensorflow/core/common_runtime/optimized_function_graph_info_test.cc
@@ -87,7 +87,8 @@ constexpr absl::string_view kLibraryPb =
          })pb";
 
 // Creates a simple graph with one trivial node.
-StatusOr<OptimizedFunctionGraphInfo> CreateSimpleOptimizedFunctionGraphInfo() {
+absl::StatusOr<OptimizedFunctionGraphInfo>
+CreateSimpleOptimizedFunctionGraphInfo() {
   NodeDef node_def;
   TF_RETURN_IF_ERROR(NodeDefBuilder("A", "OneOutput").Finalize(&node_def));
   auto graph = std::make_unique<Graph>(OpRegistry::Global());
@@ -162,7 +163,7 @@ TEST(OptimizedFunctionGraphUtilsTest, FromProtoProducesCorrectResult) {
           kLibraryPb),
       &proto);
 
-  const StatusOr<OptimizedFunctionGraphInfo> test_result =
+  const absl::StatusOr<OptimizedFunctionGraphInfo> test_result =
       OptimizedFunctionGraphInfo::FromProto(std::move(proto));
   TF_EXPECT_OK(test_result.status());
   // Compare graph.
diff --git a/tensorflow/core/common_runtime/partitioning_utils.cc b/tensorflow/core/common_runtime/partitioning_utils.cc
index c6a95632ee3b74..4110f956f74599 100644
--- a/tensorflow/core/common_runtime/partitioning_utils.cc
+++ b/tensorflow/core/common_runtime/partitioning_utils.cc
@@ -143,7 +143,7 @@ Status PartitionFunctionGraph(
   return absl::OkStatus();
 }
 
-StatusOr<std::unique_ptr<Graph>> InsertTransferOps(
+absl::StatusOr<std::unique_ptr<Graph>> InsertTransferOps(
     const DeviceSet& device_set, std::unique_ptr<Graph> graph) {
   // Skip transfer op insertion if the graph nodes are not assigned to multiple
   // devices.
diff --git a/tensorflow/core/common_runtime/partitioning_utils.h b/tensorflow/core/common_runtime/partitioning_utils.h
index f62633996ecf99..2fff5dc96a572e 100644
--- a/tensorflow/core/common_runtime/partitioning_utils.h
+++ b/tensorflow/core/common_runtime/partitioning_utils.h
@@ -45,7 +45,7 @@ Status PartitionFunctionGraph(
 // Note that, the returned graph is intended to be used by TF MLIR importer.
 // The dependencies between send/recv pairs ensure the importer will generate TF
 // MLIR ops in a valid order.
-StatusOr<std::unique_ptr<Graph>> InsertTransferOps(
+absl::StatusOr<std::unique_ptr<Graph>> InsertTransferOps(
     const DeviceSet& device_set, std::unique_ptr<Graph> graph);
 
 // This function performs bookkeeping to track which `Arg` and `Retval` nodes
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index e124d89c140b08..b1d491fa2fcd87 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/types/optional.h"
 #include "absl/types/variant.h"
+#include "xla/tsl/util/env_var.h"
 #include "tensorflow/core/common_runtime/build_graph_options.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -67,7 +68,6 @@ limitations under the License.
 #include "tensorflow/core/util/reffed_status_callback.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
-#include "tsl/util/env_var.h"
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/protobuf/remote_tensor_handle.pb.h"
 #endif  // IS_MOBILE_PLATFORM
@@ -573,7 +573,7 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
     }
   }
 
-  StatusOr<OptimizedFunctionGraphInfo> optimized_graph_info =
+  absl::StatusOr<OptimizedFunctionGraphInfo> optimized_graph_info =
       (!optimized_graph_proto.has_value() ||
        !optimized_graph_proto.value().ok())
           ? OptimizeFunctionGraphOrReadFromFileCache(
@@ -615,7 +615,7 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
   // We must preserve control returns in each of the function components,
   // otherwise after function inlining we might prune side-effectful nodes.
   const auto control_ret =
-      [&node_name_to_control_ret](const Node* n) -> absl::optional<string> {
+      [&node_name_to_control_ret](const Node* n) -> std::optional<string> {
     const auto it = node_name_to_control_ret.find(n->name());
     return it != node_name_to_control_ret.end()
                // NOLINTNEXTLINE
@@ -684,7 +684,8 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
     }
 
     FunctionDef shard;
-    s = GraphToFunctionDef(*subgraph, comp_data->name, control_ret, &shard);
+    s = GraphToFunctionDef(std::move(subgraph), comp_data->name, control_ret,
+                           &shard);
     if (!s.ok()) {
       done(s);
       return;
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
index eef20006684e6f..9a948cc8d32c71 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
@@ -42,6 +42,7 @@ limitations under the License.
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
+#include "tsl/platform/protobuf.h"
 
 #if GOOGLE_CUDA
 #include "third_party/gpus/cuda/include/cuda.h"
@@ -1243,7 +1244,8 @@ class SessionMetadataReaderOp : public OpKernel {
     OP_REQUIRES_OK(ctx,
                    ctx->allocate_output("y", TensorShape({}), &out_tensor));
     if (ctx->session_metadata() != nullptr) {
-      out_tensor->scalar<tstring>()() = ctx->session_metadata()->DebugString();
+      out_tensor->scalar<tstring>()() =
+          tsl::LegacyUnredactedDebugString(*ctx->session_metadata());
     } else {
       out_tensor->scalar<tstring>()() = "";
     }
diff --git a/tensorflow/core/common_runtime/type_inference.cc b/tensorflow/core/common_runtime/type_inference.cc
index 3ad86d8792bc46..fe1c15465810db 100644
--- a/tensorflow/core/common_runtime/type_inference.cc
+++ b/tensorflow/core/common_runtime/type_inference.cc
@@ -124,8 +124,8 @@ Status update_inferred_type(Node* target, const FullTypeDef& t, bool& updated) {
   return absl::OkStatus();
 }
 
-StatusOr<FullTypeDef> run_inference(const string& fn_name,
-                                    const TypeRefVector& in_types) {
+absl::StatusOr<FullTypeDef> run_inference(const string& fn_name,
+                                          const TypeRefVector& in_types) {
   // TODO(b/224776031): Things remaining to implement:
   //  * look up function by name
   //  * execute pass on its graph
diff --git a/tensorflow/core/data/BUILD b/tensorflow/core/data/BUILD
index 0816237fb0e1d6..c5191a995b60a2 100644
--- a/tensorflow/core/data/BUILD
+++ b/tensorflow/core/data/BUILD
@@ -181,7 +181,7 @@ tf_cc_test(
         "//tensorflow/core/platform:str_util",
         "@com_google_absl//absl/container:flat_hash_set",
         "@local_tsl//tsl/platform:status_matchers",
-        "@local_tsl//tsl/util:determinism_test_util",
+        "@local_xla//xla/tsl/util:determinism_test_util",
     ],
 )
 
diff --git a/tensorflow/core/data/dataset_utils.cc b/tensorflow/core/data/dataset_utils.cc
index 0118dc86b52fe0..91ee86ad1b2211 100644
--- a/tensorflow/core/data/dataset_utils.cc
+++ b/tensorflow/core/data/dataset_utils.cc
@@ -787,7 +787,7 @@ Status ProcessBatch(int64_t batch_size, int64_t num_elements,
   return absl::OkStatus();
 }
 
-Status CopyBatch(CopyBatchParams params,
+Status CopyBatch(AnyContext ctx,
                  std::vector<std::vector<Tensor>>&& batch_elements,
                  bool parallel_copy, std::vector<Tensor>* out_tensors) {
   const size_t num_tuple_components = batch_elements.at(0).size();
@@ -799,7 +799,7 @@ Status CopyBatch(CopyBatchParams params,
     TensorShape first_element_shape(first_element.shape());
     TensorShape batch_component_shape({num_batch_elements});
     batch_component_shape.AppendShape(first_element_shape);
-    out_tensors->emplace_back(params.allocator, first_element.dtype(),
+    out_tensors->emplace_back(ctx.allocator, first_element.dtype(),
                               batch_component_shape);
     if (!out_tensors->back().IsInitialized()) {
       return errors::ResourceExhausted(
@@ -837,7 +837,7 @@ Status CopyBatch(CopyBatchParams params,
     if (parallel_copy && total_bytes >= (1 << 20)) {
       Status status;
       mutex status_mu;
-      const auto num_threads = params.runner_threadpool_size;
+      const auto num_threads = ctx.runner_threadpool_size;
       const auto slice_size = num_batch_elements / num_threads;
       int64_t offset = 0;
       BlockingCounter counter(num_threads);
@@ -847,8 +847,8 @@ Status CopyBatch(CopyBatchParams params,
         // evenly, the size of some slices is incremented to guarantee their
         // sizes add up to the total number of elements.
         if (i < num_batch_elements % num_threads) ++length;
-        (*params.runner)([offset, length, &status, &status_mu, &counter,
-                          &copy_element_fn]() {
+        (*ctx.runner)([offset, length, &status, &status_mu, &counter,
+                       &copy_element_fn]() {
           Status s;
           for (size_t j = offset; j < offset + length; ++j) {
             s.Update(copy_element_fn(j));
diff --git a/tensorflow/core/data/dataset_utils.h b/tensorflow/core/data/dataset_utils.h
index b5089b965b7611..78fac87b213985 100644
--- a/tensorflow/core/data/dataset_utils.h
+++ b/tensorflow/core/data/dataset_utils.h
@@ -279,25 +279,6 @@ Status ProcessBatch(int64_t batch_size, int64_t num_elements,
                     IteratorContext* ctx, std::vector<Tensor>* output,
                     bool* end_of_sequence, std::vector<Tensor>* batch);
 
-// Constructs and stores the parameters for the CopyBatch function.
-struct CopyBatchParams {
-  Allocator* allocator;
-  std::function<void(std::function<void()>)>* runner;
-  int64 runner_threadpool_size;
-
-  explicit CopyBatchParams(IteratorContext* ctx) {
-    allocator = ctx->allocator({});
-    runner = ctx->runner();
-    runner_threadpool_size = ctx->runner_threadpool_size();
-  }
-
-  explicit CopyBatchParams(OpKernelContext* ctx) {
-    allocator = ctx->get_allocator({});
-    runner = ctx->runner();
-    runner_threadpool_size = GetRunnerThreadpoolSizeFromOpKernelContext(ctx);
-  }
-};
-
 // Copies the input elements to a batch.
 //
 // The `batch_elements` argument contains the individual elements to copy into a
@@ -305,7 +286,7 @@ struct CopyBatchParams {
 // copy.
 // The `out_tensors` argument will be used to store the resulting batch (one for
 // each component of the input).
-Status CopyBatch(CopyBatchParams params,
+Status CopyBatch(AnyContext ctx,
                  std::vector<std::vector<Tensor>>&& batch_elements,
                  bool parallel_copy, std::vector<Tensor>* out_tensors);
 
diff --git a/tensorflow/core/data/dataset_utils_test.cc b/tensorflow/core/data/dataset_utils_test.cc
index 853f5e6c5c0bfa..e581f6e3cbe3e8 100644
--- a/tensorflow/core/data/dataset_utils_test.cc
+++ b/tensorflow/core/data/dataset_utils_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_set.h"
+#include "xla/tsl/util/determinism_test_util.h"
 #include "tensorflow/core/data/compression_utils.h"
 #include "tensorflow/core/data/dataset_test_base.h"
 #include "tensorflow/core/data/serialization_utils.h"
@@ -39,7 +40,6 @@ limitations under the License.
 #include "tensorflow/core/protobuf/error_codes.pb.h"
 #include "tensorflow/core/util/work_sharder.h"
 #include "tsl/platform/status_matchers.h"
-#include "tsl/util/determinism_test_util.h"
 
 namespace tensorflow {
 namespace data {
diff --git a/tensorflow/core/data/global_shuffle_utils.cc b/tensorflow/core/data/global_shuffle_utils.cc
index 97a826ec7e9ac7..dd792763875be1 100644
--- a/tensorflow/core/data/global_shuffle_utils.cc
+++ b/tensorflow/core/data/global_shuffle_utils.cc
@@ -60,7 +60,8 @@ absl::Status GlobalShuffleIterator::GetNext(IteratorContext* ctx,
 
   absl::MutexLock l(&mu_);
   int64_t output_index = ctx->index_mapper()(element_count_++);
-  absl::Status status = dataset_->Get(output_index, out_tensors);
+  absl::Status status =
+      dataset_->Get(AnyContext(ctx), output_index, out_tensors);
   if (absl::IsOutOfRange(status)) {
     *end_of_sequence = true;
     return absl::OkStatus();
diff --git a/tensorflow/core/data/rewrite_utils_test.cc b/tensorflow/core/data/rewrite_utils_test.cc
index 21132e7682063f..0f075dc62c427b 100644
--- a/tensorflow/core/data/rewrite_utils_test.cc
+++ b/tensorflow/core/data/rewrite_utils_test.cc
@@ -48,8 +48,8 @@ NodeDef GetMapNode(absl::string_view name, absl::string_view input_node_name,
       name, /*op=*/"MapDataset", {std::string(input_node_name)},
       {{"f", FunctionDefHelper::FunctionRef(std::string(function_name))},
        {"Targuments", {}},
-       {"output_shapes", gtl::ArraySlice<TensorShape>{TensorShape()}},
-       {"output_types", gtl::ArraySlice<DataType>{DT_INT64}}});
+       {"output_shapes", absl::Span<const TensorShape>{TensorShape()}},
+       {"output_types", absl::Span<const DataType>{DT_INT64}}});
 }
 
 FunctionDef XTimesX() {
@@ -71,8 +71,8 @@ GraphDef GetRangeSquareDatasetDef(const int64_t range) {
        NDef("step", "Const", /*inputs=*/{},
             {{"value", AsScalar<int64_t>(1)}, {"dtype", DT_INT64}}),
        NDef("range", "RangeDataset", /*inputs=*/{"start", "stop", "step"},
-            {{"output_shapes", gtl::ArraySlice<TensorShape>{TensorShape()}},
-             {"output_types", gtl::ArraySlice<DataType>{DT_INT64}}}),
+            {{"output_shapes", absl::Span<const TensorShape>{TensorShape()}},
+             {"output_types", absl::Span<const DataType>{DT_INT64}}}),
        GetMapNode("map", "range", "XTimesX"),
        NDef("dataset", "_Retval", /*inputs=*/{"map"},
             {{"T", DT_VARIANT}, {"index", 0}})},
diff --git a/tensorflow/core/data/service/client/BUILD b/tensorflow/core/data/service/client/BUILD
index edc9244e290f00..16bd0efd808e78 100644
--- a/tensorflow/core/data/service/client/BUILD
+++ b/tensorflow/core/data/service/client/BUILD
@@ -1,9 +1,9 @@
 # tf.data service client library.
 
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.default.bzl", "tf_grpc_cc_dependencies")
 load("//tensorflow/core/platform:build_config.bzl", "tf_protos_profiler_service")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/data/service/client/data_service_client.cc b/tensorflow/core/data/service/client/data_service_client.cc
index 744b8042a3433d..7bba141740292a 100644
--- a/tensorflow/core/data/service/client/data_service_client.cc
+++ b/tensorflow/core/data/service/client/data_service_client.cc
@@ -206,8 +206,10 @@ void DataServiceClient::Cancel() TF_LOCKS_EXCLUDED(mu_) {
 TraceMeMetadata DataServiceClient::GetTraceMeMetadata() const {
   TraceMeMetadata result;
   int64_t num_tasks = -1;
+  int64_t autotuned_max_outstanding_requests = model::kAutotune;
   if (mu_.try_lock()) {
     num_tasks = tasks_.size() - finished_tasks_;
+    autotuned_max_outstanding_requests = max_outstanding_requests_;
     mu_.unlock();
   }
   result.push_back(std::make_pair(
@@ -220,6 +222,12 @@ TraceMeMetadata DataServiceClient::GetTraceMeMetadata() const {
       "max_outstanding_requests",
       strings::Printf(
           "%lld", static_cast<long long>(params_.max_outstanding_requests))));
+  if (params_.max_outstanding_requests == model::kAutotune) {
+    result.push_back(std::make_pair(
+        "autotuned_max_outstanding_requests",
+        strings::Printf("%lld", static_cast<long long>(
+                                    autotuned_max_outstanding_requests))));
+  }
   return result;
 }
 
@@ -392,12 +400,12 @@ DataServiceClient::CreateWorkerClient(const TaskInfo& task_info) {
       return CreateAlternativeWorkerClientWithGrpcFallback(*transfer_server,
                                                            task_info);
     }
-    LOG(INFO) << "Failed to find transfer server for default data transfer "
-                 "protocol '"
-              << default_protocol << "' for worker '"
-              << task_info.worker_address()
-              << "'; falling back to grpc. Original error: "
-              << transfer_server.status();
+    VLOG(1) << "Failed to find transfer server for default data transfer "
+               "protocol '"
+            << default_protocol << "' for worker '"
+            << task_info.worker_address()
+            << "'; falling back to grpc. Original error: "
+            << transfer_server.status();
     metrics::RecordTFDataServiceDataTransferProtocolFallback(
         default_protocol, error::Code::NOT_FOUND,
         "Failed to find transfer server for default protocol");
diff --git a/tensorflow/core/data/service/snapshot/BUILD b/tensorflow/core/data/service/snapshot/BUILD
index 4f145b270db425..717699dc1b21b0 100644
--- a/tensorflow/core/data/service/snapshot/BUILD
+++ b/tensorflow/core/data/service/snapshot/BUILD
@@ -1,9 +1,9 @@
 # Distributed snapshot library.
 
+load("@local_tsl//tsl:tsl.default.bzl", "get_compatible_with_portable")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.default.bzl", "tf_grpc_cc_dependencies", "tf_kernel_library")
 load("//tensorflow/core/platform:build_config.bzl", "tf_protos_profiler_service")
-load("@local_tsl//tsl:tsl.default.bzl", "get_compatible_with_portable")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index 9958819a1f179a..6c8211537271f9 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -13,8 +13,6 @@
 # ":debug_node_key" - Defines a struct used for tracking tensors.
 
 # Placeholder: load py_proto_library
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-load("//tensorflow:tensorflow.default.bzl", "tf_grpc_cc_dependencies")
 load(
     "//tensorflow:tensorflow.bzl",
     "check_deps",
@@ -24,6 +22,7 @@ load(
     "tf_copts",
     "tf_cuda_library",
 )
+load("//tensorflow:tensorflow.default.bzl", "tf_grpc_cc_dependencies")
 
 # For platform specific build config
 load(
@@ -31,6 +30,7 @@ load(
     "tf_additional_all_protos",
     "tf_proto_library",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index 4a4c234f56b943..c87c153ba61d28 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -3,7 +3,6 @@
 # to be distributed and performed in parallel across multiple
 # processes.
 
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "if_google",
@@ -21,6 +20,7 @@ load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "tf_cuda_tests_tags",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/distributed_runtime/coordination/BUILD b/tensorflow/core/distributed_runtime/coordination/BUILD
index b4706d29ddbffa..253393afd778d5 100644
--- a/tensorflow/core/distributed_runtime/coordination/BUILD
+++ b/tensorflow/core/distributed_runtime/coordination/BUILD
@@ -49,17 +49,18 @@ cc_library(
     # copybara:uncomment copts = ["-Wthread-safety-analysis"],
     deps = [
         "//tensorflow/core:framework_lite",
-        "//tensorflow/core/platform:env",
-        "//tensorflow/core/platform:errors",
         "//tensorflow/core/platform:macros",
         "//tensorflow/core/platform:status",
         "//tensorflow/core/platform:thread_annotations",
         "//tensorflow/core/profiler/lib:traceme",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/time",
         "@local_tsl//tsl/distributed_runtime/coordination:coordination_service_agent",
+        "@local_tsl//tsl/protobuf:coordination_service_proto_cc",
     ],
 )
 
@@ -74,9 +75,11 @@ tf_cc_test(
         "//tensorflow/core:test_main",
         "//tensorflow/core/platform:errors",
         "//tensorflow/core/platform:status",
-        "//tensorflow/core/platform:statusor",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/time",
+        "@com_google_googletest//:gtest_main",
         "@local_tsl//tsl/distributed_runtime:call_options",
         "@local_tsl//tsl/distributed_runtime/coordination:coordination_client",
         "@local_tsl//tsl/distributed_runtime/coordination:coordination_service_agent",
diff --git a/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy.cc b/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy.cc
index 952d38d852560f..61083bf86bcfe9 100644
--- a/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy.cc
+++ b/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy.cc
@@ -20,14 +20,17 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/log/check.h"
+#include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "absl/time/time.h"
-#include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/profiler/lib/traceme.h"
 #include "tsl/distributed_runtime/coordination/coordination_service_agent.h"
+#include "tsl/protobuf/coordination_service.pb.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy.h b/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy.h
index 3254a5d3369f1e..b0612fc2ee9770 100644
--- a/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy.h
+++ b/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy.h
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tsl/distributed_runtime/coordination/coordination_service_agent.h"
+#include "tsl/protobuf/coordination_service.pb.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy_test.cc b/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy_test.cc
index 464c82bfca3fd0..213c187cd75214 100644
--- a/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy_test.cc
+++ b/tensorflow/core/distributed_runtime/coordination/coordination_service_barrier_proxy_test.cc
@@ -24,12 +24,14 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include <gmock/gmock.h>
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
 #include "absl/time/time.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/status.h"
-#include "tensorflow/core/platform/statusor.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/threadpool.h"
 #include "tsl/distributed_runtime/call_options.h"
diff --git a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc
index 3c5d53cb0dc6b1..9349ae54a15052 100644
--- a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc
+++ b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc
@@ -113,7 +113,7 @@ void EagerClusterFunctionLibraryRuntime::Instantiate(
 
 void EagerClusterFunctionLibraryRuntime::Run(
     const FunctionLibraryRuntime::Options& opts,
-    FunctionLibraryRuntime::LocalHandle handle, gtl::ArraySlice<Tensor> args,
+    FunctionLibraryRuntime::LocalHandle handle, absl::Span<const Tensor> args,
     std::vector<Tensor>* rets, FunctionLibraryRuntime::DoneCallback done) {
   std::vector<FunctionArg> function_args;
   for (const auto& tensor : args) {
@@ -143,7 +143,7 @@ void EagerClusterFunctionLibraryRuntime::Run(
 void EagerClusterFunctionLibraryRuntime::Run(
     const FunctionLibraryRuntime::Options& opts,
     FunctionLibraryRuntime::LocalHandle handle,
-    gtl::ArraySlice<FunctionArg> args, std::vector<FunctionRet>* rets,
+    absl::Span<const FunctionArg> args, std::vector<FunctionRet>* rets,
     FunctionLibraryRuntime::DoneCallback done) {
   FunctionData* function_data = nullptr;
   {
diff --git a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h
index ed0cb32348f3d5..58af5ed93ae8ac 100644
--- a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h
+++ b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h
@@ -64,7 +64,7 @@ class EagerClusterFunctionLibraryRuntime
   // (i.e., the done callbacks triggered) before finishing its execution.
   void Run(const FunctionLibraryRuntime::Options& opts,
            FunctionLibraryRuntime::LocalHandle handle,
-           gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
+           absl::Span<const Tensor> args, std::vector<Tensor>* rets,
            FunctionLibraryRuntime::DoneCallback done) override;
 
   // The component function inputs `args` and outputs `rets` may refer to remote
@@ -72,7 +72,7 @@ class EagerClusterFunctionLibraryRuntime
   // the inputs/outputs are actually consumed.
   void Run(const FunctionLibraryRuntime::Options& opts,
            FunctionLibraryRuntime::LocalHandle handle,
-           gtl::ArraySlice<FunctionArg> args, std::vector<FunctionRet>* rets,
+           absl::Span<const FunctionArg> args, std::vector<FunctionRet>* rets,
            FunctionLibraryRuntime::DoneCallback done) override;
 
   void CleanUp(uint64 step_id, FunctionLibraryRuntime::LocalHandle handle,
diff --git a/tensorflow/core/distributed_runtime/eager/remote_mgr.cc b/tensorflow/core/distributed_runtime/eager/remote_mgr.cc
index 042e485394f764..d415ca1123780c 100644
--- a/tensorflow/core/distributed_runtime/eager/remote_mgr.cc
+++ b/tensorflow/core/distributed_runtime/eager/remote_mgr.cc
@@ -45,7 +45,7 @@ Status WithErrorSourcePayload(Status error) {
 namespace eager {
 
 void RemoteMgr::AddOperationOutputs(
-    const gtl::ArraySlice<tensorflow::TensorHandle*> handles,
+    const absl::Span<tensorflow::TensorHandle* const> handles,
     int64_t operation_id) {
   mutex_lock l(remote_tensor_handle_mu_);
   for (int i = 0, end = handles.size(); i < end; i++) {
diff --git a/tensorflow/core/distributed_runtime/eager/remote_mgr.h b/tensorflow/core/distributed_runtime/eager/remote_mgr.h
index 06893b54399d7f..8a2e9ea61ad27f 100644
--- a/tensorflow/core/distributed_runtime/eager/remote_mgr.h
+++ b/tensorflow/core/distributed_runtime/eager/remote_mgr.h
@@ -43,7 +43,7 @@ class RemoteMgr {
   bool IsMaster() { return is_master_; }
 
   void AddOperationOutputs(
-      const gtl::ArraySlice<tensorflow::TensorHandle*> handles,
+      const absl::Span<tensorflow::TensorHandle* const> handles,
       int64_t operation_id);
 
   void AddOperationOutput(tensorflow::TensorHandle* handles,
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 04b4a3abdb2160..9ceccd7ccfa015 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -1,7 +1,6 @@
 # Description:
 #   RPC communication interfaces and implementations for TensorFlow.
 
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "if_google",
@@ -21,6 +20,7 @@ load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "tf_cuda_tests_tags",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/BUILD b/tensorflow/core/distributed_runtime/rpc/eager/BUILD
index 37cc64b77409a0..84bce367bdd389 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/eager/BUILD
@@ -1,6 +1,6 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-load("//tensorflow:tensorflow.default.bzl", "tf_grpc_cc_dependencies")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.default.bzl", "tf_grpc_cc_dependencies")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index d5eacc7e6a16cd..42f0d3ea79c274 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -198,6 +198,13 @@ Status GrpcServer::Init(const GrpcServerOptions& opts) {
   VLOG(3) << "Grpc Server Init Definition: " << server_def_.DebugString();
   ConfigProto config = server_def_.default_session_config();
   sess_opts.config = config;
+  // Allow creation of PjRt client that knows about remote devices so
+  // collectives will work with MultiWorkerMirroredStrategy.
+  sess_opts.config.mutable_gpu_options()
+      ->mutable_experimental()
+      ->set_populate_pjrt_gpu_client_creation_info(true);
+  sess_opts.config.mutable_gpu_options()->mutable_experimental()->set_node_id(
+      server_def_.task_index());
 
   // Configure shared devices between master and worker.
   string name_prefix = strings::StrCat("/job:", server_def_.job_name(),
diff --git a/tensorflow/core/example/BUILD b/tensorflow/core/example/BUILD
index 2a1b70a5350e82..4527044f383921 100644
--- a/tensorflow/core/example/BUILD
+++ b/tensorflow/core/example/BUILD
@@ -1,5 +1,3 @@
-load("//tensorflow:tensorflow.default.bzl", "filegroup")
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "@io_bazel_rules_closure//closure:defs.bzl",
     "closure_proto_library",
@@ -9,12 +7,14 @@ load(
     "tf_cc_test",
     "tf_copts",
 )
+load("//tensorflow:tensorflow.default.bzl", "filegroup")
 load(
     "//tensorflow/core/platform:build_config.bzl",
     "tf_jspb_proto_library",
     "tf_proto_library",
     "tf_pyclif_proto_library",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/framework/BUILD b/tensorflow/core/framework/BUILD
index b5f24419245868..15db744c0b2201 100644
--- a/tensorflow/core/framework/BUILD
+++ b/tensorflow/core/framework/BUILD
@@ -875,7 +875,7 @@ tf_cuda_library(
         "@com_google_absl//absl/strings",
         "@eigen_archive//:eigen3",
         "@local_tsl//tsl/framework:device_type",
-        "@local_tsl//tsl/util:byte_swap_array",
+        "@local_xla//xla/tsl/util:byte_swap_array",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/core/framework/dataset.cc b/tensorflow/core/framework/dataset.cc
index d0b9743b112e19..6c841bb4768435 100644
--- a/tensorflow/core/framework/dataset.cc
+++ b/tensorflow/core/framework/dataset.cc
@@ -835,7 +835,8 @@ Status DatasetBase::Get(OpKernelContext* ctx, int64 index,
                                DebugString());
 }
 
-Status DatasetBase::Get(int64 index, std::vector<Tensor>* out_tensors) const {
+Status DatasetBase::Get(AnyContext ctx, int64 index,
+                        std::vector<Tensor>* out_tensors) const {
   return errors::Unimplemented("Random access is not implemented for dataset ",
                                DebugString());
 }
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index b1cc9e7514aa2b..ab823a5cf9fec0 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -979,6 +979,26 @@ class IteratorContext {
   MemoryCheckpoint checkpoint_;
 };
 
+// Generic context that can be constructed with either an `OpKernelContext` or
+// `IteratorContext`.
+struct AnyContext {
+  Allocator* allocator;
+  std::function<void(std::function<void()>)>* runner;
+  int64_t runner_threadpool_size;
+
+  explicit AnyContext(IteratorContext* ctx) {
+    allocator = ctx->allocator({});
+    runner = ctx->runner();
+    runner_threadpool_size = ctx->runner_threadpool_size();
+  }
+
+  explicit AnyContext(OpKernelContext* ctx) {
+    allocator = ctx->get_allocator({});
+    runner = ctx->runner();
+    runner_threadpool_size = GetRunnerThreadpoolSizeFromOpKernelContext(ctx);
+  }
+};
+
 // Represents the current position in a range of outputs, where the
 // range of outputs is typically represented by an `DatasetBase`,
 // defined below.
@@ -1346,9 +1366,11 @@ class DatasetBase : public core::RefCounted {
   virtual Status Get(OpKernelContext* ctx, int64 index,
                      std::vector<Tensor>* out_tensors) const;
 
-  // Same as above, but without an `OpKernelContext`. Used to support datasets
+  // Same as above, but with an `AnyContext`, which can be constructed from
+  // either an `OpKernelContext` or `IteratorContext`. Used to support datasets
   // that provide random access through both the dataset and iterator APIs.
-  virtual Status Get(int64 index, std::vector<Tensor>* out_tensors) const;
+  virtual Status Get(AnyContext ctx, int64 index,
+                     std::vector<Tensor>* out_tensors) const;
 
   // Returns true if the dataset and its inputs support random access.
   virtual absl::Status RandomIndexingCompatible() const {
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index 5a63dc61f019ac..0b6bacd94af0d9 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -1820,9 +1820,12 @@ namespace {
 
 constexpr char kApiImplements[] = "api_implements";
 
-std::set<string> ReachableFunctions(
-    const FunctionLibraryDefinition& flib,
-    const protobuf::RepeatedPtrField<NodeDef>& nodes) {
+template <typename NodeType, typename NodeIter, typename OpTypeGetter,
+          typename AttrGetter>
+std::set<string> ReachableFunctions(const FunctionLibraryDefinition& flib,
+                                    NodeIter begin, NodeIter end,
+                                    OpTypeGetter op_type_getter,
+                                    AttrGetter attr_getter) {
   // Functions that are reachable from the graph.
   std::set<string> reachable_funcs;
 
@@ -1860,31 +1863,33 @@ std::set<string> ReachableFunctions(
     }
   };
 
+  const auto process_attr_value = [&](const AttrValue& attr_value) {
+    // 1. AttrValue.func
+    if (attr_value.has_func()) {
+      add_to_func_queue(attr_value.func().name());
+    }
+
+    // 2. AttrValue.ListValue.func
+    if (attr_value.has_list()) {
+      for (const auto& func : attr_value.list().func()) {
+        add_to_func_queue(func.name());
+      }
+    }
+  };
+
   // Add all the functions that are reachable from the given node to the queue.
-  const auto process_node = [&](const NodeDef& node) {
+  const auto process_node = [&](NodeType node) {
     // Node itself can be a call to the function.
-    add_to_func_queue(node.op());
+    add_to_func_queue(op_type_getter(node));
 
     // Or node can have an attribute referencing a function.
-    for (const auto& attr : node.attr()) {
-      const auto& attr_value = attr.second;
-
-      // 1. AttrValue.func
-      if (attr_value.has_func()) {
-        add_to_func_queue(attr_value.func().name());
-      }
-
-      // 2. AttrValue.ListValue.func
-      if (attr_value.has_list()) {
-        for (const auto& func : attr_value.list().func()) {
-          add_to_func_queue(func.name());
-        }
-      }
+    for (const auto& attr : attr_getter(node)) {
+      process_attr_value(attr.second);
     }
   };
 
   // Add all functions that are directly called from the optimized graph.
-  std::for_each(nodes.begin(), nodes.end(), process_node);
+  std::for_each(begin, end, process_node);
 
   // Process all reachable functions.
   while (!func_queue.empty()) {
@@ -1901,7 +1906,18 @@ std::set<string> ReachableFunctions(
 
     // Find all the functions called from the function body.
     const auto& func_body = func->fdef().node_def();
-    std::for_each(func_body.begin(), func_body.end(), process_node);
+
+    const auto process_node_def = [&](const NodeDef node) {
+      // Node itself can be a call to the function.
+      add_to_func_queue(node.op());
+
+      // Or node can have an attribute referencing a function.
+      for (const auto& attr : node.attr()) {
+        process_attr_value(attr.second);
+      }
+    };
+
+    std::for_each(func_body.begin(), func_body.end(), process_node_def);
 
     // Check if the function has a registered gradient.
     const string grad_func_name = flib.FindGradient(func_name);
@@ -1911,10 +1927,13 @@ std::set<string> ReachableFunctions(
   return reachable_funcs;
 }
 
+template <typename NodeType, typename NodeIter, typename OpTypeGetter,
+          typename AttrGetter>
 FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
-    const FunctionLibraryDefinition& flib,
-    const protobuf::RepeatedPtrField<NodeDef>& nodes) {
-  std::set<string> reachable_funcs = ReachableFunctions(flib, nodes);
+    const FunctionLibraryDefinition& flib, NodeIter begin, NodeIter end,
+    OpTypeGetter op_type_getter, AttrGetter attr_getter) {
+  std::set<string> reachable_funcs = ReachableFunctions<NodeType>(
+      flib, begin, end, op_type_getter, attr_getter);
 
   FunctionLibraryDefinition reachable_flib(flib.default_registry(),
                                            FunctionDefLibrary());
@@ -1961,12 +1980,26 @@ const char* IsSet(void* ptr) { return ptr == nullptr ? "unset" : "set"; }
 
 FunctionLibraryDefinition FunctionLibraryDefinition::ReachableDefinitions(
     const GraphDef& graph) const {
-  return ReachableFunctionLibraryDefinition(*this, graph.node());
+  return ReachableFunctionLibraryDefinition<const NodeDef&>(
+      *this, graph.node().begin(), graph.node().end(),
+      [](const NodeDef& ndef) { return ndef.op(); },
+      [](const NodeDef& ndef) { return ndef.attr(); });
 }
 
 FunctionLibraryDefinition FunctionLibraryDefinition::ReachableDefinitions(
     const FunctionDef& func) const {
-  return ReachableFunctionLibraryDefinition(*this, func.node_def());
+  return ReachableFunctionLibraryDefinition<const NodeDef&>(
+      *this, func.node_def().begin(), func.node_def().end(),
+      [](const NodeDef& ndef) { return ndef.op(); },
+      [](const NodeDef& ndef) { return ndef.attr(); });
+}
+
+FunctionLibraryDefinition FunctionLibraryDefinition::ReachableDefinitions(
+    const Graph& graph) const {
+  return ReachableFunctionLibraryDefinition<const Node*>(
+      *this, graph.nodes().begin(), graph.nodes().end(),
+      [](const Node* node) { return node->type_string(); },
+      [](const Node* node) { return node->attrs(); });
 }
 
 absl::StatusOr<FunctionLibraryDefinition>
@@ -1975,7 +2008,10 @@ FunctionLibraryDefinition::ReachableDefinitions(
   auto* func = Find(function_name);
   if (func) {
     FunctionLibraryDefinition ret =
-        ReachableFunctionLibraryDefinition(*this, func->node_def());
+        ReachableFunctionLibraryDefinition<const NodeDef&>(
+            *this, func->node_def().begin(), func->node_def().end(),
+            [](const NodeDef& ndef) { return ndef.op(); },
+            [](const NodeDef& ndef) { return ndef.attr(); });
     TF_RETURN_IF_ERROR(ret.CopyFunctionDefFrom(function_name, *this));
     return ret;
   } else {
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index af956ac1524427..eb74ea58905405 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -569,6 +569,7 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // reachable from the nodes of `graph` or `func`.
   FunctionLibraryDefinition ReachableDefinitions(const GraphDef& graph) const;
   FunctionLibraryDefinition ReachableDefinitions(const FunctionDef& func) const;
+  FunctionLibraryDefinition ReachableDefinitions(const Graph& graph) const;
   absl::StatusOr<FunctionLibraryDefinition> ReachableDefinitions(
       const std::string& function_name) const;
 
diff --git a/tensorflow/core/framework/graph_to_functiondef.cc b/tensorflow/core/framework/graph_to_functiondef.cc
index 9b2d2d483bcda7..384d9cba6865a2 100644
--- a/tensorflow/core/framework/graph_to_functiondef.cc
+++ b/tensorflow/core/framework/graph_to_functiondef.cc
@@ -15,6 +15,9 @@ limitations under the License.
 
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 
+#include <memory>
+#include <utility>
+
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value_util.h"
@@ -156,7 +159,7 @@ Status FillFunctionBody(
     const std::vector<const Node*>& body_nodes,
     const absl::flat_hash_map<string, string>& tensor_renaming,
     bool set_stateful_from_nodes, bool copy_placeholder_attrs_from_nodes,
-    FunctionDef* fdef) {
+    bool allow_destructive_reads, FunctionDef* fdef) {
   absl::flat_hash_set<string> func_attr_names;
   for (const auto& func_attr : fdef->signature().attr()) {
     func_attr_names.insert(func_attr.name());
@@ -166,18 +169,45 @@ Status FillFunctionBody(
   std::vector<const Edge*> control_edges;
   for (const Node* node : body_nodes) {
     NodeDef* node_def = fdef->add_node_def();
+    NodeDebugInfo debug_info(node->def());
     // First, copy the node_def as is. We will patch it next.
-    *node_def = node->def();
+    if (allow_destructive_reads) {
+      // TODO(b/327983931): Add static_assert to catch the case where fields are
+      // added to `NodeDef`.
+      Node* mutable_node = const_cast<Node*>(node);
+      // NOTE: `node_def->name()`, `node_def->device()` and `node_def->input()`
+      // are set below.
+      *node_def->mutable_op() =
+          node->def()
+              .op();  // Must be retained for access via `Node::type_string()`.
+      // Do not copy `input` or `device` because these are overwritten below.
+      // After this point, the other fields of `dst->def()` should no longer be
+      // accessed.
+      *node_def->mutable_attr() =
+          std::move(*mutable_node->mutable_def()->mutable_attr());
+
+      if (node->def().has_experimental_debug_info()) {
+        *node_def->mutable_experimental_debug_info() = std::move(
+            *mutable_node->mutable_def()->mutable_experimental_debug_info());
+      }
+      if (node->def().has_experimental_type()) {
+        *node_def->mutable_experimental_type() = std::move(
+            *mutable_node->mutable_def()->mutable_experimental_type());
+      }
+    } else {
+      *node_def = node->def();
+      MergeDebugInfo(NodeDebugInfo(node->def()), node_def);
+
+      // Input names must be set based on nested names in tensor_renaming.
+      // Clear the flat input names we got from the original node_def
+      // from the graph.
+      node_def->clear_input();
+    }
+
     if (!node->assigned_device_name().empty()) {
       node_def->set_device(node->assigned_device_name());
     }
     node_def->set_name(node_names.Lookup(node->name()));
-    MergeDebugInfo(NodeDebugInfo(node->def()), node_def);
-
-    // Input names must be set based on nested names in tensor_renaming.
-    // Clear the flat input names we got from the original node_def
-    // from the graph.
-    node_def->clear_input();
 
     // Collect regular and control inputs. Regular inputs are indexed
     // by the index at which they come into the `node`. Control inputs
@@ -202,7 +232,7 @@ Status FillFunctionBody(
     // Add regular inputs.
     for (size_t i = 0; i < in_edges.size(); ++i) {
       const Edge* edge = in_edges[i];
-      string original_input_name;
+      std::string original_input_name;
       if (edge == nullptr) {
         // A backedge might not appear as a regular Edge, but be only present
         // in the node_def. Such edges are referred to as requested_inputs().
@@ -255,20 +285,20 @@ Status FillFunctionBody(
     if (!copy_placeholder_attrs_from_nodes) {
       continue;
     }
-    for (const auto& iter : node->attrs()) {
+    for (const auto& iter : node_def->attr()) {
       if (iter.second.placeholder().empty()) {
         continue;
       }
 
       // If we already added the attribute, skip it.
-      string func_attr_name = iter.second.placeholder();
+      const std::string& func_attr_name = iter.second.placeholder();
       if (func_attr_names.find(func_attr_name) != func_attr_names.end()) {
         continue;
       }
 
       // This node's attribute is a placeholder value, so it does not have type
       // information. We check node's OpDef for attribute type.
-      string node_attr_name = iter.first;
+      const std::string& node_attr_name = iter.first;
       const OpDef::AttrDef* node_attr_def = nullptr;
       for (const auto& node_attr : node->op_def().attr()) {
         if (node_attr.name() == node_attr_name) {
@@ -292,91 +322,15 @@ Status FillFunctionBody(
 }
 
 Status GraphToFunctionDefHelper(
-    const Graph& graph, const string& name,
-    const std::function<absl::optional<string>(const Node*)>& control_ret,
-    const std::vector<string>& output_names, FunctionDef* fdef) {
-  auto add_arg_or_retval = [](Node* node,
-                              std::vector<OutputTensor>* args_or_retvals) {
-    int index;
-    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "index", &index));
-    if (index >= args_or_retvals->size()) {
-      args_or_retvals->resize(index + 1);
-    }
-    if ((*args_or_retvals)[index].node == nullptr) {
-      (*args_or_retvals)[index].node = node;
-    } else {
-      return errors::InvalidArgument(
-          "Multiple '", node->type_string(), "' nodes found with index ", index,
-          "; originally we already have:\n",
-          (*args_or_retvals)[index].node->DebugString(), "\nNow we have:\n",
-          node->DebugString());
-    }
-    return OkStatus();
-  };
-
-  std::vector<const Node*> body_nodes;
-  std::vector<OutputTensor> inputs;
-  std::vector<OutputTensor> outputs;
-  std::vector<const Node*> control_outputs;
-  std::vector<string> control_output_names;
-  for (Node* node : graph.op_nodes()) {
-    if (node->IsArg()) {
-      TF_RETURN_IF_ERROR(add_arg_or_retval(node, &inputs));
-      continue;
-    }
-
-    if (node->IsRetval()) {
-      TF_RETURN_IF_ERROR(add_arg_or_retval(node, &outputs));
-      continue;
-    }
-
-    if (control_ret) {
-      auto control_ret_name = control_ret(node);
-      if (control_ret_name.has_value()) {
-        control_outputs.push_back(node);
-        control_output_names.push_back(control_ret_name.value());
-      }
-    }
-
-    body_nodes.push_back(node);
-  }
-
-  auto validate_args_retvals =
-      [](const std::vector<OutputTensor>& args_or_retvals,
-         const string& op_type) {
-        for (int i = 0, e = args_or_retvals.size(); i < e; ++i) {
-          if (args_or_retvals[i].node == nullptr) {
-            return errors::InvalidArgument("Missing '", op_type,
-                                           "' node at index ", i);
-          }
-        }
-        return OkStatus();
-      };
-
-  TF_RETURN_IF_ERROR(validate_args_retvals(inputs, "_Arg"));
-  TF_RETURN_IF_ERROR(validate_args_retvals(outputs, "_Retval"));
-
-  return GraphToFunctionDef(graph, name, /*append_hash_to_fn_name=*/false,
-                            /*set_stateful_from_nodes=*/false,
-                            /*copy_placeholder_attrs_from_nodes=*/false,
-                            body_nodes, inputs, outputs, output_names,
-                            control_outputs, control_output_names,
-                            /*description=*/nullptr, fdef);
-}
-
-}  // anonymous namespace
-
-Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
-                          bool append_hash_to_fn_name,
-                          bool set_stateful_from_nodes,
-                          bool copy_placeholder_attrs_from_nodes,
-                          const std::vector<const Node*>& body_nodes,
-                          const std::vector<OutputTensor>& inputs,
-                          const std::vector<OutputTensor>& outputs,
-                          const std::vector<string>& output_names,
-                          const std::vector<const Node*>& control_outputs,
-                          const std::vector<string>& control_output_names,
-                          const char* description, FunctionDef* fdef) {
+    const Graph& fn_body, const string& fn_name, bool append_hash_to_fn_name,
+    bool set_stateful_from_nodes, bool copy_placeholder_attrs_from_nodes,
+    const std::vector<const Node*>& body_nodes,
+    const std::vector<OutputTensor>& inputs,
+    const std::vector<OutputTensor>& outputs,
+    const std::vector<string>& output_names,
+    const std::vector<const Node*>& control_outputs,
+    const std::vector<string>& control_output_names, const char* description,
+    bool allow_destructive_reads, FunctionDef* fdef) {
   if (!output_names.empty()) {
     DCHECK_EQ(output_names.size(), outputs.size());
   }
@@ -501,9 +455,9 @@ Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
     }
   }
 
-  TF_RETURN_IF_ERROR(FillFunctionBody(fn_name, node_names, body_nodes,
-                                      tensor_renaming, set_stateful_from_nodes,
-                                      copy_placeholder_attrs_from_nodes, fdef));
+  TF_RETURN_IF_ERROR(FillFunctionBody(
+      fn_name, node_names, body_nodes, tensor_renaming, set_stateful_from_nodes,
+      copy_placeholder_attrs_from_nodes, allow_destructive_reads, fdef));
 
   // Remap return values.
   for (int r = 0; r < fdef->signature().output_arg_size(); ++r) {
@@ -585,12 +539,108 @@ Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
   return OkStatus();
 }
 
+Status GraphToFunctionDefHelper(
+    const Graph& graph, const string& name,
+    const std::function<absl::optional<string>(const Node*)>& control_ret,
+    const std::vector<string>& output_names, bool allow_destructive_reads,
+    FunctionDef* fdef) {
+  auto add_arg_or_retval = [](Node* node,
+                              std::vector<OutputTensor>* args_or_retvals) {
+    int index;
+    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "index", &index));
+    if (index >= args_or_retvals->size()) {
+      args_or_retvals->resize(index + 1);
+    }
+    if ((*args_or_retvals)[index].node == nullptr) {
+      (*args_or_retvals)[index].node = node;
+    } else {
+      return errors::InvalidArgument(
+          "Multiple '", node->type_string(), "' nodes found with index ", index,
+          "; originally we already have:\n",
+          (*args_or_retvals)[index].node->DebugString(), "\nNow we have:\n",
+          node->DebugString());
+    }
+    return OkStatus();
+  };
+
+  std::vector<const Node*> body_nodes;
+  std::vector<OutputTensor> inputs;
+  std::vector<OutputTensor> outputs;
+  std::vector<const Node*> control_outputs;
+  std::vector<string> control_output_names;
+  for (Node* node : graph.op_nodes()) {
+    if (node->IsArg()) {
+      TF_RETURN_IF_ERROR(add_arg_or_retval(node, &inputs));
+      continue;
+    }
+
+    if (node->IsRetval()) {
+      TF_RETURN_IF_ERROR(add_arg_or_retval(node, &outputs));
+      continue;
+    }
+
+    if (control_ret) {
+      auto control_ret_name = control_ret(node);
+      if (control_ret_name.has_value()) {
+        control_outputs.push_back(node);
+        control_output_names.push_back(control_ret_name.value());
+      }
+    }
+
+    body_nodes.push_back(node);
+  }
+
+  auto validate_args_retvals =
+      [](const std::vector<OutputTensor>& args_or_retvals,
+         const string& op_type) {
+        for (int i = 0, e = args_or_retvals.size(); i < e; ++i) {
+          if (args_or_retvals[i].node == nullptr) {
+            return errors::InvalidArgument("Missing '", op_type,
+                                           "' node at index ", i);
+          }
+        }
+        return OkStatus();
+      };
+
+  TF_RETURN_IF_ERROR(validate_args_retvals(inputs, "_Arg"));
+  TF_RETURN_IF_ERROR(validate_args_retvals(outputs, "_Retval"));
+
+  return GraphToFunctionDefHelper(
+      graph, name, /*append_hash_to_fn_name=*/false,
+      /*set_stateful_from_nodes=*/false,
+      /*copy_placeholder_attrs_from_nodes=*/false, body_nodes, inputs, outputs,
+      output_names, control_outputs, control_output_names,
+      /*description=*/nullptr, allow_destructive_reads, fdef);
+}
+
+}  // anonymous namespace
+
+Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
+                          bool append_hash_to_fn_name,
+                          bool set_stateful_from_nodes,
+                          bool copy_placeholder_attrs_from_nodes,
+                          const std::vector<const Node*>& body_nodes,
+                          const std::vector<OutputTensor>& inputs,
+                          const std::vector<OutputTensor>& outputs,
+                          const std::vector<string>& output_names,
+                          const std::vector<const Node*>& control_outputs,
+                          const std::vector<string>& control_output_names,
+                          const char* description, FunctionDef* fdef) {
+  return GraphToFunctionDefHelper(
+      fn_body, fn_name, append_hash_to_fn_name, set_stateful_from_nodes,
+      copy_placeholder_attrs_from_nodes, body_nodes, inputs, outputs,
+      output_names, control_outputs, control_output_names, description,
+      /*allow_destructive_reads=*/false, fdef);
+  return OkStatus();
+}
+
 Status GraphToFunctionDef(
     const Graph& graph, const string& name,
     const std::function<absl::optional<string>(const Node*)>& control_ret,
     FunctionDef* fdef) {
   return GraphToFunctionDefHelper(graph, name, control_ret,
-                                  /*output_names=*/{}, fdef);
+                                  /*output_names=*/{},
+                                  /*allow_destructive_reads=*/false, fdef);
 }
 
 Status GraphToFunctionDef(const Graph& graph, const string& name,
@@ -602,7 +652,17 @@ Status GraphToFunctionDef(const Graph& graph, const string& name,
                           const std::vector<std::string>& output_names,
                           FunctionDef* fdef) {
   return GraphToFunctionDefHelper(graph, name, /*control_ret=*/nullptr,
-                                  output_names, fdef);
+                                  output_names,
+                                  /*allow_destructive_reads=*/false, fdef);
+}
+
+Status GraphToFunctionDef(
+    std::unique_ptr<Graph> graph, const string& name,
+    const std::function<std::optional<string>(const Node*)>& control_ret,
+    FunctionDef* fdef) {
+  return GraphToFunctionDefHelper(*graph, name, control_ret,
+                                  /*output_names=*/{},
+                                  /*allow_destructive_reads=*/true, fdef);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/graph_to_functiondef.h b/tensorflow/core/framework/graph_to_functiondef.h
index 834bf50acccdde..5af678645527bd 100644
--- a/tensorflow/core/framework/graph_to_functiondef.h
+++ b/tensorflow/core/framework/graph_to_functiondef.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_FRAMEWORK_GRAPH_TO_FUNCTIONDEF_H_
 #define TENSORFLOW_CORE_FRAMEWORK_GRAPH_TO_FUNCTIONDEF_H_
 
+#include <optional>
 #include <string>
 #include <vector>
 
@@ -60,6 +61,11 @@ Status GraphToFunctionDef(const Graph& graph, const string& name,
                           const std::vector<std::string>& output_names,
                           FunctionDef* fdef);
 
+Status GraphToFunctionDef(
+    std::unique_ptr<Graph> graph, const string& name,
+    const std::function<std::optional<string>(const Node*)>& control_ret,
+    FunctionDef* fdef);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_FRAMEWORK_GRAPH_TO_FUNCTIONDEF_H_
diff --git a/tensorflow/core/framework/metrics.cc b/tensorflow/core/framework/metrics.cc
index d51711c8ce1c69..863f0c209513ac 100644
--- a/tensorflow/core/framework/metrics.cc
+++ b/tensorflow/core/framework/metrics.cc
@@ -341,7 +341,7 @@ auto* tf_data_error = tsl::monitoring::Counter<2>::New(
 
 auto* tf_data_framework_type = tsl::monitoring::Counter<1>::New(
     "/tensorflow/data/framework_type",
-    "The framework type used to build the tf.data.Dataset.", "framework_type");
+    "The framework type used to build the tf.data.Dataset.", "name");
 
 auto* parse_dense_feature_counter = tsl::monitoring::Counter<0>::New(
     "/tensorflow/data/dense_feature",
@@ -443,10 +443,10 @@ auto* eager_client_error_counter = tsl::monitoring::Counter<2>::New(
     "Count the errors in eager client as a central place.", "error_source",
     "error_type");
 
-auto* mlir_bridge_first_phase_counter = tsl::monitoring::Counter<4>::New(
+auto* mlir_bridge_first_phase_counter = tsl::monitoring::Counter<5>::New(
     "/tensorflow/core/tf_mlir_bridge_first_phase_count",
-    "Tracks processing state in first phase of mlir bridge", "device",
-    "version", "fallback", "result");
+    "Tracks processing state in first phase of mlir bridge", "bridge",
+    "version", "device", "fallback", "result");
 
 auto* mlir_second_phase_count = tensorflow::monitoring::Counter<1>::New(
     "/tensorflow/core/tf2xla/api/v2/phase2_compilation_status" /*metric_name*/,
@@ -948,14 +948,16 @@ void TestDelta::Reset() { last_value_ = cell_->value(); }
 
 int64 TestDelta::Get() { return cell_->value() - last_value_; }
 
-void UpdateTfMlirBridgeFirstPhaseCounter(const std::string& device_type,
+void UpdateTfMlirBridgeFirstPhaseCounter(const std::string& bridge_type,
                                          const std::string& bridge_version,
+                                         const std::string& device_type,
                                          bool fallback_enabled,
                                          const std::string& result) {
   std::string fallback_status =
       fallback_enabled ? "fallback_enabled" : "fallback_disabled";
   mlir_bridge_first_phase_counter
-      ->GetCell(device_type, bridge_version, fallback_status, result)
+      ->GetCell(bridge_type, bridge_version, device_type, fallback_status,
+                result)
       ->IncrementBy(1);
 }
 
diff --git a/tensorflow/core/framework/metrics.h b/tensorflow/core/framework/metrics.h
index 955a6461bd7c36..1a6ba8a88bf890 100644
--- a/tensorflow/core/framework/metrics.h
+++ b/tensorflow/core/framework/metrics.h
@@ -341,12 +341,14 @@ int64_t GetFunctionGraphOptimizationCacheLoadCount(
 
 // Records the activity of the first phase of the mlir bridge using the
 // tf_metadata.tf_mlir_bridge_first_phase_count metric.
-// device_type: tpu, cpu, gpu, etc.
+// bridge_type: replicated, nonreplicated, etc.
 // bridge_version: v1 compat, v2, etc.
+// device_type: tpu, cpu, gpu, etc.
 // fallback_enabled: true if fallback will happen, false if not
 // result: outcome of bridge (success, failure, disabled, invalid_graph, etc.)
-void UpdateTfMlirBridgeFirstPhaseCounter(const std::string& device_type,
+void UpdateTfMlirBridgeFirstPhaseCounter(const std::string& bridge_type,
                                          const std::string& bridge_version,
+                                         const std::string& device_type,
                                          bool fallback_enabled,
                                          const std::string& result);
 
diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc
index f25624b41675ab..cc970218ba76cf 100644
--- a/tensorflow/core/framework/model.cc
+++ b/tensorflow/core/framework/model.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/statusor.h"
+#include "tsl/platform/protobuf.h"
 
 namespace tensorflow {
 namespace data {
@@ -2265,7 +2266,7 @@ Model::Model(std::optional<std::string> dataset_name)
               if (dataset_name_.has_value()) {
                 model_proto.set_dataset_name(dataset_name_.value());
               }
-              return model_proto.DebugString();
+              return tsl::LegacyUnredactedDebugString(model_proto);
             }
             LOG(WARNING) << s.message();
           }
diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc
index fb477a50899601..ed4bb1bf7cdcf2 100644
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@@ -43,6 +43,7 @@ limitations under the License.
 #include "tensorflow/core/protobuf/error_codes.pb.h"
 #include "tensorflow/core/public/version.h"
 #include "tensorflow/core/util/device_name_utils.h"
+#include "tsl/platform/protobuf.h"
 
 class DummyKernel : public tensorflow::OpKernel {
  public:
@@ -350,15 +351,19 @@ TEST_F(OpKernelTest, NotFound) {
   const auto not_found = error::NOT_FOUND;
   // Something with that op type name exists, but only with a
   // different DeviceType.
-  ExpectFailure(CreateNodeDef("Test1", {DT_FLOAT, DT_INT32}).DebugString(),
+  ExpectFailure(tsl::LegacyUnredactedDebugString(
+                    CreateNodeDef("Test1", {DT_FLOAT, DT_INT32})),
                 DEVICE_GPU, not_found);
-  ExpectFailure(CreateNodeDef("Test3", {DT_INT8, DT_INT8}).DebugString(),
+  ExpectFailure(tsl::LegacyUnredactedDebugString(
+                    CreateNodeDef("Test3", {DT_INT8, DT_INT8})),
                 DEVICE_GPU, not_found);
-  ExpectFailure(CreateNodeDef("Test3", {DT_FLOAT, DT_FLOAT}).DebugString(),
+  ExpectFailure(tsl::LegacyUnredactedDebugString(
+                    CreateNodeDef("Test3", {DT_FLOAT, DT_FLOAT})),
                 DEVICE_CPU, not_found);
 
   // No kernel with that signature registered.
-  ExpectFailure(CreateNodeDef("Test3", {DT_INT32, DT_INT32}).DebugString(),
+  ExpectFailure(tsl::LegacyUnredactedDebugString(
+                    CreateNodeDef("Test3", {DT_INT32, DT_INT32})),
                 DEVICE_GPU, not_found);
 
   // Nothing with that op type name exists.
@@ -370,23 +375,27 @@ TEST_F(OpKernelTest, TooFewInputs) {
   const auto invalid = error::INVALID_ARGUMENT;
   NodeDef node_def = CreateNodeDef("Test1", {DT_FLOAT, DT_INT32});
   node_def.clear_input();
-  ExpectFailure(node_def.DebugString(), DEVICE_CPU, invalid);
+  ExpectFailure(tsl::LegacyUnredactedDebugString(node_def), DEVICE_CPU,
+                invalid);
   node_def.add_input("a");
-  ExpectFailure(node_def.DebugString(), DEVICE_CPU, invalid);
+  ExpectFailure(tsl::LegacyUnredactedDebugString(node_def), DEVICE_CPU,
+                invalid);
 }
 
 TEST_F(OpKernelTest, TooManyInputs) {
   const auto invalid = error::INVALID_ARGUMENT;
   NodeDef node_def = CreateNodeDef("Test1", {DT_FLOAT, DT_INT32});
   node_def.add_input("c");
-  ExpectFailure(node_def.DebugString(), DEVICE_CPU, invalid);
+  ExpectFailure(tsl::LegacyUnredactedDebugString(node_def), DEVICE_CPU,
+                invalid);
 }
 
 TEST_F(OpKernelTest, MatchSignatureFails) {
   const auto invalid = error::INVALID_ARGUMENT;
   foo::match_signature_ = true;
-  ExpectFailure(CreateNodeDef("Test2", {DT_FLOAT}).DebugString(), DEVICE_GPU,
-                invalid);
+  ExpectFailure(
+      tsl::LegacyUnredactedDebugString(CreateNodeDef("Test2", {DT_FLOAT})),
+      DEVICE_GPU, invalid);
   EXPECT_FALSE(foo::match_signature_);
 }
 
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index 010395a9a2c4bd..d2b0cd3efa0461 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include <utility>
 
 #include "absl/strings/escaping.h"
+#include "xla/tsl/util/byte_swap_array.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/resource_handle.h"
@@ -65,7 +66,6 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/tensor_coding.h"
 #include "tensorflow/core/platform/types.h"
-#include "tsl/util/byte_swap_array.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/function/capture/BUILD b/tensorflow/core/function/capture/BUILD
index 31ecbe1a79c16f..e1307342c6d60c 100644
--- a/tensorflow/core/function/capture/BUILD
+++ b/tensorflow/core/function/capture/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test")
 load("//tensorflow:pytype.default.bzl", "pytype_strict_library")
+load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/function/polymorphism/BUILD b/tensorflow/core/function/polymorphism/BUILD
index 67165dfc34deca..a33eee23597e77 100644
--- a/tensorflow/core/function/polymorphism/BUILD
+++ b/tensorflow/core/function/polymorphism/BUILD
@@ -1,6 +1,7 @@
-# Placeholder: load py_proto_library
 load("//tensorflow:pytype.default.bzl", "pytype_strict_library")
 load("//tensorflow:strict.default.bzl", "py_strict_test")
+
+# Placeholder: load py_proto_library
 load(
     "//tensorflow/core/platform:build_config.bzl",
     "tf_proto_library",
@@ -17,9 +18,7 @@ pytype_strict_library(
         "type_dispatch.py",
     ],
     srcs_version = "PY3",
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
     deps = [
         ":function_type",
     ],
@@ -96,18 +95,14 @@ tf_proto_library(
     protodeps = [
         "//tensorflow/core/function/trace_type:serialization_proto",
     ],
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
 )
 
 # copybara:uncomment_begin(google-only)
 # py_proto_library(
 #     name = "function_type_py_pb2",
 #     api_version = 2,
-#     visibility = [
-#         "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-#     ],
+#     visibility = ["//visibility:private"],
 #     deps = [":function_type_proto"],
 # )
 # copybara:uncomment_end
diff --git a/tensorflow/core/function/trace_type/BUILD b/tensorflow/core/function/trace_type/BUILD
index b3c0f2e05f8fa6..7e7acb23a29fe4 100644
--- a/tensorflow/core/function/trace_type/BUILD
+++ b/tensorflow/core/function/trace_type/BUILD
@@ -1,6 +1,7 @@
-# Placeholder: load py_proto_library
 load("//tensorflow:pytype.default.bzl", "pytype_strict_library")
 load("//tensorflow:strict.default.bzl", "py_strict_test")
+
+# Placeholder: load py_proto_library
 load(
     "//tensorflow/core/platform:build_config.bzl",
     "tf_proto_library",
diff --git a/tensorflow/core/graph/BUILD b/tensorflow/core/graph/BUILD
index 7016a2dde88be0..aac871128e24ed 100644
--- a/tensorflow/core/graph/BUILD
+++ b/tensorflow/core/graph/BUILD
@@ -1,13 +1,13 @@
+load(
+    "@local_xla//xla/tsl/mkl:build_defs.bzl",
+    "if_mkl",
+)
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
     "tf_cc_tests",
 )
 load("//tensorflow:tensorflow.default.bzl", "filegroup")
-load(
-    "@local_xla//xla/tsl/mkl:build_defs.bzl",
-    "if_mkl",
-)
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h
index 79a1867155ef33..00e2e74b684509 100644
--- a/tensorflow/core/graph/mkl_graph_util.h
+++ b/tensorflow/core/graph/mkl_graph_util.h
@@ -1,4 +1,4 @@
-/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/core/graph/mkl_testlib.cc b/tensorflow/core/graph/mkl_testlib.cc
index e8955da2f1748c..52a52ab828fe54 100644
--- a/tensorflow/core/graph/mkl_testlib.cc
+++ b/tensorflow/core/graph/mkl_testlib.cc
@@ -1,4 +1,4 @@
-/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -32,6 +32,7 @@ Node* oneDNNSoftmax(Graph* g, Node* input) {
   return ret;
 }
 
+#ifdef ENABLE_ONEDNN_V3
 Node* oneDNNSparseCSRMatmul(Graph* g, Node* csr_matrix_t, Node* b) {
   Node* ret = nullptr;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), "_MklNativeSparseMatrixMatMul")
@@ -42,6 +43,7 @@ Node* oneDNNSparseCSRMatmul(Graph* g, Node* csr_matrix_t, Node* b) {
                   .Finalize(g, &ret));
   return ret;
 }
+#endif  // ENABLE_ONEDNN_V3
 
 }  // namespace graph
 }  // namespace test
diff --git a/tensorflow/core/graph/mkl_testlib.h b/tensorflow/core/graph/mkl_testlib.h
index 1b783923c1f03c..3dffded1584465 100644
--- a/tensorflow/core/graph/mkl_testlib.h
+++ b/tensorflow/core/graph/mkl_testlib.h
@@ -1,4 +1,4 @@
-/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -26,7 +26,9 @@ namespace graph {
 
 Node* oneDNNSoftmax(Graph* g, Node* input);
 
+#ifdef ENABLE_ONEDNN_V3
 Node* oneDNNSparseCSRMatmul(Graph* g, Node* csr_matrix_t, Node* b);
+#endif  // ENABLE_ONEDNN_V3
 
 }  // namespace graph
 }  // namespace test
diff --git a/tensorflow/core/graph/optimizer_cse_test.cc b/tensorflow/core/graph/optimizer_cse_test.cc
index 55429b50eb31fc..f5fa5de61a218e 100644
--- a/tensorflow/core/graph/optimizer_cse_test.cc
+++ b/tensorflow/core/graph/optimizer_cse_test.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
+#include "tsl/platform/protobuf.h"
 
 namespace tensorflow {
 namespace {
@@ -333,7 +334,7 @@ TEST_F(OptimizerCSETest, Constant_Dedup) {
   }
   GraphDef gdef;
   test::graph::ToGraphDef(&g, &gdef);
-  InitGraph(gdef.DebugString());
+  InitGraph(tsl::LegacyUnredactedDebugString(gdef));
 
   EXPECT_EQ(OriginalGraph(),
             "n/_0(Const);n/_1(Const);n/_2(Const);n/_3(Const);"
diff --git a/tensorflow/core/graph/regularization/BUILD b/tensorflow/core/graph/regularization/BUILD
index 543e490dfe2bc0..4562878a3a80f3 100644
--- a/tensorflow/core/graph/regularization/BUILD
+++ b/tensorflow/core/graph/regularization/BUILD
@@ -1,12 +1,12 @@
 # Description:
 # A TensorFlow Graph Regularization Library
 
-load("//tensorflow:tensorflow.default.bzl", "filegroup")
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
 )
+load("//tensorflow:tensorflow.default.bzl", "filegroup")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index 63894b42422be7..5515c119893436 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -1,6 +1,6 @@
+load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_library")
 load("//tensorflow:tensorflow.default.bzl", "filegroup")
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD
index c2a7a2f85322d8..ae954ee3863c3d 100644
--- a/tensorflow/core/grappler/clusters/BUILD
+++ b/tensorflow/core/grappler/clusters/BUILD
@@ -1,11 +1,11 @@
-load("//tensorflow:tensorflow.default.bzl", "filegroup")
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_library")
+load("//tensorflow:tensorflow.default.bzl", "filegroup")
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "tf_cuda_tests_tags",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index 4d40fbe5f0812e..c627c09025a8a1 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -1,17 +1,17 @@
 # Placeholder: load py_proto_library
-load("//tensorflow:tensorflow.default.bzl", "filegroup")
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
     "tf_cuda_library",
 )
+load("//tensorflow:tensorflow.default.bzl", "filegroup")
 load(
     "//tensorflow/core/platform:build_config.bzl",
     "tf_proto_library",
     "tf_protos_grappler",
     "tf_pyclif_proto_library",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/grappler/graph_analyzer/BUILD b/tensorflow/core/grappler/graph_analyzer/BUILD
index 8b4f8c3ad0d5ba..cd77c2b35a00c1 100644
--- a/tensorflow/core/grappler/graph_analyzer/BUILD
+++ b/tensorflow/core/grappler/graph_analyzer/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/grappler/inputs/BUILD b/tensorflow/core/grappler/inputs/BUILD
index 73c6edc643adb6..3f2fddd7fef103 100644
--- a/tensorflow/core/grappler/inputs/BUILD
+++ b/tensorflow/core/grappler/inputs/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -31,6 +31,9 @@ tf_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core/framework:graph_proto_cc",
+        "//tensorflow/core/protobuf:for_core_protos_cc",
+        "@local_tsl//tsl/platform:status",
     ],
 )
 
@@ -53,15 +56,20 @@ cc_library(
     deps = [
         ":input_yielder",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
         "//tensorflow/core:all_kernels",
         "//tensorflow/core:framework",
         "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:math_ops_op_lib",
         "//tensorflow/core:no_op_op_lib",
+        "//tensorflow/core:portable_gif_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:sendrecv_ops_op_lib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/kernels:aggregate_ops",
+        "//tensorflow/core/platform:strcat",
+        "@local_tsl//tsl/platform:status",
     ],
 )
 
@@ -79,5 +87,7 @@ cc_library(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:grappler_item_builder",
         "//tensorflow/core/grappler:utils",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
     ],
 )
diff --git a/tensorflow/core/grappler/inputs/file_input_yielder.cc b/tensorflow/core/grappler/inputs/file_input_yielder.cc
index e63a38c9746ef9..e2511806ce35d8 100644
--- a/tensorflow/core/grappler/inputs/file_input_yielder.cc
+++ b/tensorflow/core/grappler/inputs/file_input_yielder.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include <unordered_set>
 #include <utility>
 
+#include "absl/log/check.h"
+#include "absl/log/log.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/grappler_item_builder.h"
@@ -26,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
 
diff --git a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc
index 9ce0284369af3c..3c72721e5099a6 100644
--- a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc
+++ b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc
@@ -17,11 +17,22 @@ limitations under the License.
 // and feed them as inputs to Grappler. This can be used for quick experiments
 // or to derive small regression tests.
 
-#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/data_flow_ops.h"
+#include "tensorflow/cc/ops/math_ops.h"
+#include "tensorflow/cc/ops/random_ops.h"
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
-#include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/platform/strcat.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/queue_runner.pb.h"
+#include "tsl/platform/status.h"
 
 namespace tensorflow {
 namespace grappler {
diff --git a/tensorflow/core/grappler/inputs/utils.cc b/tensorflow/core/grappler/inputs/utils.cc
index 03f59701cefd61..580a526d0b1b3d 100644
--- a/tensorflow/core/grappler/inputs/utils.cc
+++ b/tensorflow/core/grappler/inputs/utils.cc
@@ -17,7 +17,11 @@ limitations under the License.
 
 #include <vector>
 
+#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/status.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/meta_graph.pb.h"
 
 namespace tensorflow {
 namespace grappler {
diff --git a/tensorflow/core/grappler/inputs/utils.h b/tensorflow/core/grappler/inputs/utils.h
index 2588e380fed231..0f69913ed6fa99 100644
--- a/tensorflow/core/grappler/inputs/utils.h
+++ b/tensorflow/core/grappler/inputs/utils.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/grappler/inputs/utils_test.cc b/tensorflow/core/grappler/inputs/utils_test.cc
index 93d4fe956f8e37..ea38f6c4d8545b 100644
--- a/tensorflow/core/grappler/inputs/utils_test.cc
+++ b/tensorflow/core/grappler/inputs/utils_test.cc
@@ -14,9 +14,15 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/inputs/utils.h"
-#include "tensorflow/core/lib/io/path.h"
+
+#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/path.h"
+#include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/meta_graph.pb.h"
+#include "tsl/platform/status.h"
 
 namespace tensorflow {
 namespace grappler {
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 0085ca500cad97..e12fb53b24c2d7 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -1,6 +1,10 @@
+load(
+    "@local_config_rocm//rocm:build_defs.bzl",
+    "if_rocm_is_configured",
+)
+load("@local_xla//xla/tsl/mkl:build_defs.bzl", "if_mkl")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cc_test_mkl", "tf_copts", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.default.bzl", "filegroup", "tf_kernel_library")
-load("@local_xla//xla/tsl/mkl:build_defs.bzl", "if_mkl")
 
 # Platform specific build config
 load(
@@ -8,10 +12,6 @@ load(
     "if_static",
 )
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-load(
-    "@local_config_rocm//rocm:build_defs.bzl",
-    "if_rocm_is_configured",
-)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc
index 0dde08c8383fe5..4da6f454d2486e 100644
--- a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc
+++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc
@@ -47,6 +47,7 @@ limitations under the License.
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/platform/statusor.h"
 #include "tensorflow/core/util/env_var.h"
+#include "tensorflow/core/util/util.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -1053,7 +1054,8 @@ class AutoMixedPrecisionImpl {
         num_nonvar_casts_to_f16_(0),
         mode_(mode),
         target_dtype_((mode_ == AutoMixedPrecisionMode::CUDA ||
-                       mode_ == AutoMixedPrecisionMode::CPU)
+                       mode_ == AutoMixedPrecisionMode::CPU ||
+                       mode_ == AutoMixedPrecisionMode::FP16_CPU)
                           ? DT_HALF
                           : DT_BFLOAT16) {}
 
@@ -1065,16 +1067,18 @@ class AutoMixedPrecisionImpl {
   std::unique_ptr<AutoMixedPrecisionLists> get_mixed_precision_lists() const {
     switch (mode_) {
       case AutoMixedPrecisionMode::CUDA:
-        return std::make_unique<AutoMixedPrecisionListsCuda>(cuda_version_,
-                                                             cudnn_version_);
+        return std::make_unique<AutoMixedPrecisionListsFp16>(
+            cuda_version_, cudnn_version_, AutoMixedPrecisionMode::CUDA);
       case AutoMixedPrecisionMode::BF16:
         return std::make_unique<AutoMixedPrecisionListsMkl>();
       case AutoMixedPrecisionMode::CPU:
-        // Note: this is not a typo here. AutoMixedPrecisionListsCuda is used
-        // intentionally to make CPU and GPU have the same fp16 ops.
-        return std::make_unique<AutoMixedPrecisionListsCuda>(
-            /*cuda_version=*/10000,   // Hardcode cuda and cudnn version so
-            /*cudnn_version=*/8000);  // CPU emulates the same ops on GPU.
+        return std::make_unique<AutoMixedPrecisionListsFp16>(
+            /*cuda_version=*/10000,  // Hardcode cuda and cudnn version so
+            /*cudnn_version=*/8000,  // CPU emulates the same ops on GPU.
+            AutoMixedPrecisionMode::CPU);
+      case AutoMixedPrecisionMode::FP16_CPU:
+        return std::make_unique<AutoMixedPrecisionListsFp16>(
+            0, 0, AutoMixedPrecisionMode::FP16_CPU);
     }
   }
   Status PrintDebugLogs(bool preop, size_t timestamp);
@@ -1386,9 +1390,10 @@ Status AutoMixedPrecisionImpl::Optimize() {
       "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL", "", &optimization_level));
   optimization_level = absl::AsciiStrToUpper(optimization_level);
   force_all_fp16_ = optimization_level == "UNSAFE_FORCE_ALL";
-  if (force_all_fp16_ && mode_ == AutoMixedPrecisionMode::BF16) {
-    // Many ops do not support bfloat16 on the CPU so we disallowing forcing to
-    // bfloat16.
+  if (force_all_fp16_ && (mode_ == AutoMixedPrecisionMode::BF16 ||
+                          mode_ == AutoMixedPrecisionMode::FP16_CPU)) {
+    // Many ops do not support bfloat16/fp16 on the CPU. So, disallowing
+    // forcing to bfloat16/fp16.
     return errors::InvalidArgument(
         "TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL cannot be set to "
         "UNSAFE_FORCE_ALL when oneDNN is used");
@@ -1430,6 +1435,7 @@ Status AutoMixedPrecisionImpl::Optimize() {
         break;
       case AutoMixedPrecisionMode::BF16:
       case AutoMixedPrecisionMode::CPU:
+      case AutoMixedPrecisionMode::FP16_CPU:
         device_type = DEVICE_CPU;
         should_process = !MustPreserve(node) && IsOnDevice(node, device_type);
         break;
@@ -2301,9 +2307,15 @@ Status AutoMixedPrecision::Optimize(Cluster* cluster, const GrapplerItem& item,
 
   int num_gpus = GetNumGPUs(*cluster);
   if (num_gpus < 1 && mode_ == AutoMixedPrecisionMode::CUDA) {
-    // AutoMixedPrecision is currently only tuned for GPU.
-    LOG(WARNING) << "No (suitable) GPUs detected, skipping " << name()
-                 << " graph optimizer";
+    // No GPUs to run AutoMixedPrecision in FP16.
+    VLOG(1) << "No (suitable) GPUs detected, skipping " << name()
+            << " graph optimizer";
+    return absl::OkStatus();
+  }
+  // Check if CPU supports FP16
+  if (mode_ == AutoMixedPrecisionMode::FP16_CPU &&
+      !IsAMXDataTypeSupportedByOneDNNOnThisCPU(DT_HALF)) {
+    VLOG(1) << "No support for " << name() << " graph optimizer on CPU";
     return absl::OkStatus();
   }
 
diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision.h b/tensorflow/core/grappler/optimizers/auto_mixed_precision.h
index 0807d740f1448c..c26b640765f3d4 100644
--- a/tensorflow/core/grappler/optimizers/auto_mixed_precision.h
+++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision.h
@@ -26,15 +26,16 @@ namespace grappler {
 // CUDA: convert to float16 on GPU
 // BF16: convert to bfloat16 on CPU
 // CPU: emulate float16 on CPU without changing operator kernel
-enum class AutoMixedPrecisionMode { CUDA, BF16, CPU };
+// FP16_CPU : convert to float16 on CPU
+enum class AutoMixedPrecisionMode { CUDA, BF16, CPU, FP16_CPU };
 
 // Convert data types to float16 or bfloat16 where appropriate to improve
 // performance on GPUs or CPUs.
 class AutoMixedPrecision : public GraphOptimizer {
  public:
-  // If 'mode' is CUDA, converts nodes to float16 on Nvidia GPUs. If BF16,
-  // converts nodes to bfloat16 on CPUs in order to take advantage of oneDNN
-  // performance improvements with bfloat16.
+  // If 'mode' is CUDA, converts nodes to float16 on Nvidia GPUs. If BF16 or
+  // FP16_CPU, converts nodes to bfloat16/fp16 on CPUs in order to take
+  // advantage of oneDNN performance improvements with bfloat16/fp16.
   explicit AutoMixedPrecision(
       AutoMixedPrecisionMode mode = AutoMixedPrecisionMode::CUDA)
       : mode_(mode) {}
@@ -49,6 +50,9 @@ class AutoMixedPrecision : public GraphOptimizer {
         return "auto_mixed_precision_onednn_bfloat16";
       case AutoMixedPrecisionMode::CPU:
         return "auto_mixed_precision_cpu";
+      case AutoMixedPrecisionMode::FP16_CPU:
+        // Note: using different name than GPU for ease of debugging.
+        return "auto_mixed_precision_onednn_float16";
       default:
         LOG(FATAL) << "Invalid value for AutoMixedPrecisionMode: "  // Crash Ok
                    << static_cast<int>(mode_);
diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h b/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h
index 7ed3a3c05a1999..e7cb39d0f4d7fd 100644
--- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h
+++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <string>
 
+#include "tensorflow/core/grappler/optimizers/auto_mixed_precision.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/util/env_var.h"
@@ -94,7 +95,7 @@ class AutoMixedPrecisionLists {
   }
 };
 
-class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists {
+class AutoMixedPrecisionListsFp16 : public AutoMixedPrecisionLists {
  private:
   static bool IsPseudoFastMath() {
     string optimization_level;
@@ -106,52 +107,64 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists {
   }
 
  public:
-  AutoMixedPrecisionListsCuda(int cuda_version, int cudnn_version)
-      : cuda_version_(cuda_version), cudnn_version_(cudnn_version) {}
+  AutoMixedPrecisionListsFp16(
+      int cuda_version, int cudnn_version,
+      AutoMixedPrecisionMode mode = AutoMixedPrecisionMode::CUDA)
+      : cuda_version_(cuda_version), cudnn_version_(cudnn_version) {
+    if (mode == AutoMixedPrecisionMode::CUDA ||
+        mode == AutoMixedPrecisionMode::CPU) {
+      // Note: this is not a typo here. use_cuda_ is set to true for the CPU
+      // intentionally to make CPU and GPU have the same fp16 ops.
+      use_cuda_ = true;
+      use_onednn_ = false;
+    } else if (mode == AutoMixedPrecisionMode::FP16_CPU) {
+      use_onednn_ = true;
+      use_cuda_ = false;
+    }
+  }
 
   gtl::FlatSet<string> AllowList() override {
     auto list = gtl::FlatSet<string>{
-        "BlockLSTM",
-        "BlockLSTMV2",
-        "BlockLSTMGrad",
-        "BlockLSTMGradV2",
-        "Conv2D",
-        "Conv2DBackpropFilter",
-        "Conv2DBackpropInput",
-        "CudnnRNN",
-        "CudnnRNNBackprop",
-        "CudnnRNNBackpropV2",
-        "CudnnRNNBackpropV3",
-        "CudnnRNNV2",
-        "CudnnRNNV3",
-        "Einsum",
-        "Dropout",
-        "DropoutGrad",
-        "FusedConv2DBiasActivation",
-        "FusedSparseConvGpuV2",
-        "GRUBlockCell",
-        "GRUBlockCellGrad",
-        "LSTMBlockCell",
-        "LSTMBlockCellGrad",
+        "Conv2D", "Conv2DBackpropFilter", "Conv2DBackpropInput", "Einsum",
         "MatMul",
-        "Mha",
-        "MhaV2",
-        "Tmlp",
-        "TmlpV2",
-        "TmlpV3",
-        "Pmlp",
-        "FastUnsortedSegmentMax",
     };
+    if (use_cuda_) {
+      list.insert("BlockLSTM");
+      list.insert("BlockLSTMV2");
+      list.insert("BlockLSTMGrad");
+      list.insert("BlockLSTMGradV2");
+      list.insert("CudnnRNN");
+      list.insert("CudnnRNNBackprop");
+      list.insert("CudnnRNNBackpropV2");
+      list.insert("CudnnRNNBackpropV3");
+      list.insert("CudnnRNNV2");
+      list.insert("CudnnRNNV3");
+      list.insert("Dropout");
+      list.insert("DropoutGrad");
+      list.insert("FusedConv2DBiasActivation");
+      list.insert("FusedSparseConvGpuV2");
+      list.insert("GRUBlockCell");
+      list.insert("GRUBlockCellGrad");
+      list.insert("LSTMBlockCell");
+      list.insert("LSTMBlockCellGrad");
+      list.insert("Mha");
+      list.insert("MhaV2");
+      list.insert("Tmlp");
+      list.insert("TmlpV2");
+      list.insert("TmlpV3");
+      list.insert("Pmlp");
+      list.insert("FastUnsortedSegmentMax");
+    }
 #if TENSORFLOW_USE_ROCM
     if (true) {
 #else
-    if (cuda_version_ >= 9010) {
+    if ((use_cuda_ && cuda_version_ >= 9010) || use_onednn_) {
       // Fp16 BatchMatMul is slow before CUDA 9.1.
 #endif
       list.insert("BatchMatMul");
       list.insert("BatchMatMulV2");
     }
-    if (cudnn_version_ >= 7602) {
+    if ((use_cuda_ && cudnn_version_ >= 7602) || use_onednn_) {
       // Fp16 3D conv is slow before CUDNN 7.6.2.
       list.insert("Conv3D");
       list.insert("Conv3DBackpropFilter");
@@ -176,7 +189,7 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists {
   }
 
   gtl::FlatSet<string> InferList() override {
-    if (IsPseudoFastMath()) {
+    if (IsPseudoFastMath() && use_cuda_) {
       return gtl::FlatSet<string>{};
     }
 
@@ -246,7 +259,7 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists {
   }
 
   gtl::FlatSet<string> DenyList() override {
-    if (IsPseudoFastMath()) {
+    if (IsPseudoFastMath() && use_cuda_) {
       return gtl::FlatSet<string>{};
     }
 
@@ -269,7 +282,7 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists {
   }
 
   gtl::FlatSet<string> ClearList() override {
-    if (IsPseudoFastMath()) {
+    if (IsPseudoFastMath() && use_cuda_) {
       return gtl::FlatSet<string>{};
     }
 
@@ -378,8 +391,14 @@ class AutoMixedPrecisionListsCuda : public AutoMixedPrecisionLists {
  private:
   int cuda_version_;
   int cudnn_version_;
+  bool use_cuda_;
+  bool use_onednn_;
 };
 
+// TODO(reedwm): Remove this alias. Some Google-internal code still uses the
+// AutoMixedPrecisionListsCuda name.
+using AutoMixedPrecisionListsCuda = AutoMixedPrecisionListsFp16;
+
 class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists {
  public:
   AutoMixedPrecisionListsMkl() {}
@@ -427,7 +446,6 @@ class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists {
                                      "FusedBatchNormGradV3",
                                      "LeakyRelu",
                                      "LeakyReluGrad",
-                                     "Mean",
                                      "Mul",
                                      "Sub",
                                      "Elu",
@@ -453,7 +471,6 @@ class AutoMixedPrecisionListsMkl : public AutoMixedPrecisionLists {
                                      "Sqrt",
                                      "Square",
                                      "SquaredDifference",
-                                     "Sum",
                                      "Tanh",
                                      "TanhGrad"};
     UpdateList("INFERLIST", &list);
diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc
index 689185fb08923d..0b855be91f8099 100644
--- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc
+++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc
@@ -89,40 +89,54 @@ void VerifyGraphsEquivalent(const GraphDef& original_graph,
   }
 }
 
-// Currently, this test suite only passes when TensorFlow passes with CUDA/HIP,
-// because otherwise the optimizer will not turn clearlist nodes to float16.
-// When looking at clearlist nodes, this optimizer checks if the nodes have a
-// float16 GPU OpKernel, but without CUDA/HIP there are no GPU OpKernels at all.
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-
+// Currently on GPU, this test suite only passes when TensorFlow passes with
+// CUDA/HIP, because otherwise the optimizer will not turn clearlist nodes to
+// float16. When looking at clearlist nodes, this optimizer checks if the nodes
+// have a float16 GPU OpKernel, but without CUDA/HIP there are no GPU OpKernels
+// at all. And on CPU, this test suite passes when AMX FP16 is supported.
 const std::pair<int, int> kMinGPUArch = {7, 0};
 
 class AutoMixedPrecisionTest : public GrapplerTest {
  protected:
+  void SetMode(AutoMixedPrecisionMode mode) { mode_ = mode; }
   void SetUp() override {
-    int num_gpus = GetNumAvailableGPUs();
-    // If GPUs are available, require that they all satisfy the min arch.
-    gpu_available_ = (num_gpus > 0);
+    if (mode_ == AutoMixedPrecisionMode::CUDA) {
+      int num_gpus = GetNumAvailableGPUs();
+      // If GPUs are available, require that they all satisfy the min arch.
+      gpu_available_ = (num_gpus > 0);
 #if GOOGLE_CUDA
-    gpu_available_ =
-        gpu_available_ && (num_gpus == GetNumAvailableGPUs(kMinGPUArch));
+      gpu_available_ =
+          gpu_available_ && (num_gpus == GetNumAvailableGPUs(kMinGPUArch));
 #else  // Here we force Tensorflow to use the virtual GFX906
-    gpu_available_ = false;
+      gpu_available_ = false;
 #endif
-    if (gpu_available_) {
-      virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 1));
-    } else {
-      DeviceProperties device_properties;
-      device_properties.set_type("GPU");
+      if (gpu_available_) {
+        virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 1));
+      } else {
+        DeviceProperties device_properties;
+        device_properties.set_type("GPU");
 #if GOOGLE_CUDA
-      device_properties.mutable_environment()->insert({"architecture", "7"});
-      device_properties.mutable_environment()->insert({"cuda", "9010"});
+        device_properties.mutable_environment()->insert({"architecture", "7"});
+        device_properties.mutable_environment()->insert({"cuda", "9010"});
 #else
-      device_properties.mutable_environment()->insert(
-          {"architecture", "gfx906"});
+        device_properties.mutable_environment()->insert(
+            {"architecture", "gfx906"});
 #endif
-      virtual_cluster_.reset(
-          new VirtualCluster({{"/GPU:1", device_properties}}));
+        virtual_cluster_.reset(
+            new VirtualCluster({{"/GPU:1", device_properties}}));
+      }
+    } else if (mode_ == AutoMixedPrecisionMode::FP16_CPU) {
+      DeviceProperties device_properties;
+      device_properties.set_type("CPU");
+      virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 0));
+
+      bool is_fp16_enabled_on_cpu = false;
+#if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)
+      is_fp16_enabled_on_cpu = IsAMXDataTypeSupportedByOneDNNOnThisCPU(DT_HALF);
+#endif  // INTEL_MKL && ENABLE_ONEDNN_V3
+      if (!IsMKLEnabled() || !is_fp16_enabled_on_cpu) {
+        GTEST_SKIP() << "This device doesn't support FP16";
+      }
     }
     TF_CHECK_OK(virtual_cluster_->Provision());
   }
@@ -189,7 +203,7 @@ class AutoMixedPrecisionTest : public GrapplerTest {
     std::vector<std::pair<string, Tensor>> feed = {{"input", input_tensor}};
     auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed);
 
-    AutoMixedPrecision optimizer;
+    AutoMixedPrecision optimizer(mode_);
     GraphDef output;
     TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -212,9 +226,22 @@ class AutoMixedPrecisionTest : public GrapplerTest {
 
   std::unique_ptr<Cluster> virtual_cluster_;
   bool gpu_available_;
+  AutoMixedPrecisionMode mode_;
 };
 
-TEST_F(AutoMixedPrecisionTest, NoOp) {
+class AutoMixedPrecisionParamTest
+    : public AutoMixedPrecisionTest,
+      public ::testing::WithParamInterface<AutoMixedPrecisionMode> {
+ protected:
+  void SetUp() override {
+    mode_ = GetParam();
+    AutoMixedPrecisionTest::SetMode(mode_);
+    AutoMixedPrecisionTest::SetUp();
+  }
+  AutoMixedPrecisionMode mode_;
+};
+
+TEST_P(AutoMixedPrecisionParamTest, NoOp) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.234f, {32});
   Output deny1 = ops::Exp(s.WithOpName("deny1"), input);
@@ -228,7 +255,7 @@ TEST_F(AutoMixedPrecisionTest, NoOp) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -251,7 +278,7 @@ TEST_F(AutoMixedPrecisionTest, NoOp) {
   }
 }
 
-TEST_F(AutoMixedPrecisionTest, AlreadyFp16) {
+TEST_P(AutoMixedPrecisionParamTest, AlreadyFp16) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f, {32, 32});
   Output cst1 = ops::Cast(s.WithOpName("cst1"), input, DT_HALF);
@@ -266,7 +293,7 @@ TEST_F(AutoMixedPrecisionTest, AlreadyFp16) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
   VLOG(1) << output.DebugString();
@@ -289,7 +316,7 @@ TEST_F(AutoMixedPrecisionTest, AlreadyFp16) {
   }
 }
 
-TEST_F(AutoMixedPrecisionTest, Simple) {
+TEST_P(AutoMixedPrecisionParamTest, Simple) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output deny1 = ops::Exp(s.WithOpName("deny1"), input);
@@ -309,7 +336,7 @@ TEST_F(AutoMixedPrecisionTest, Simple) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -338,10 +365,9 @@ TEST_F(AutoMixedPrecisionTest, Simple) {
   }
 }
 
-TEST_F(AutoMixedPrecisionTest, NoInferOp) {
+TEST_P(AutoMixedPrecisionParamTest, NoInferOp) {
   setenv("TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL", "TREAT_INFER_AS_DENY",
          1 /* replace */);
-
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output deny1 = ops::Exp(s.WithOpName("deny1"), input);
@@ -361,7 +387,7 @@ TEST_F(AutoMixedPrecisionTest, NoInferOp) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -390,7 +416,7 @@ TEST_F(AutoMixedPrecisionTest, NoInferOp) {
   unsetenv("TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_LEVEL");
 }
 
-TEST_F(AutoMixedPrecisionTest, BidirectionalClearChain) {
+TEST_P(AutoMixedPrecisionParamTest, BidirectionalClearChain) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output clr1 = ops::Relu(s.WithOpName("clr1"), input);
@@ -406,7 +432,7 @@ TEST_F(AutoMixedPrecisionTest, BidirectionalClearChain) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -429,7 +455,7 @@ TEST_F(AutoMixedPrecisionTest, BidirectionalClearChain) {
   }
 }
 
-TEST_F(AutoMixedPrecisionTest, PreserveFetches) {
+TEST_P(AutoMixedPrecisionParamTest, PreserveFetches) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output allow1 = ops::MatMul(s.WithOpName("allow1"), input, input);
@@ -447,7 +473,7 @@ TEST_F(AutoMixedPrecisionTest, PreserveFetches) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -474,7 +500,10 @@ TEST_F(AutoMixedPrecisionTest, PreserveFetches) {
   }
 }
 
-TEST_F(AutoMixedPrecisionTest, PreserveCPUNodes) {
+TEST_P(AutoMixedPrecisionParamTest, PreserveCPUNodes) {
+  if (mode_ == AutoMixedPrecisionMode::FP16_CPU) {
+    GTEST_SKIP() << "This test is not required on CPU";
+  }
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output clr1 = ops::Relu(s.WithOpName("clr1"), input);
@@ -492,7 +521,7 @@ TEST_F(AutoMixedPrecisionTest, PreserveCPUNodes) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -515,7 +544,7 @@ TEST_F(AutoMixedPrecisionTest, PreserveCPUNodes) {
   }
 }
 
-TEST_F(AutoMixedPrecisionTest, PreserveIdentityAfterVariable) {
+TEST_P(AutoMixedPrecisionParamTest, PreserveIdentityAfterVariable) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output var1 = ops::Variable(s.WithOpName("var1"), {32, 32}, DT_FLOAT);
@@ -535,7 +564,7 @@ TEST_F(AutoMixedPrecisionTest, PreserveIdentityAfterVariable) {
   std::vector<std::pair<string, Tensor>> feed = {{"var1", var1_tensor}};
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -559,7 +588,7 @@ TEST_F(AutoMixedPrecisionTest, PreserveIdentityAfterVariable) {
   }
 }
 
-TEST_F(AutoMixedPrecisionTest, FusedBatchNorm) {
+TEST_P(AutoMixedPrecisionParamTest, FusedBatchNorm) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   // Uses NHWC data format because non-GPU execution does not support NCHW.
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {8, 56, 56, 16});
@@ -592,7 +621,7 @@ TEST_F(AutoMixedPrecisionTest, FusedBatchNorm) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -618,7 +647,7 @@ TEST_F(AutoMixedPrecisionTest, FusedBatchNorm) {
   }
 }
 
-TEST_F(AutoMixedPrecisionTest, RepeatedAndListTypeAttrs) {
+TEST_P(AutoMixedPrecisionParamTest, RepeatedAndListTypeAttrs) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output allow1 = ops::MatMul(s.WithOpName("allow1"), input, input);
@@ -634,7 +663,7 @@ TEST_F(AutoMixedPrecisionTest, RepeatedAndListTypeAttrs) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -658,7 +687,7 @@ TEST_F(AutoMixedPrecisionTest, RepeatedAndListTypeAttrs) {
   }
 }
 
-TEST_F(AutoMixedPrecisionTest, ExistingCast) {
+TEST_P(AutoMixedPrecisionParamTest, ExistingCast) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), true, {32, 32});
   Output cst1 = ops::Cast(s.WithOpName("cst1"), input, DT_FLOAT);
@@ -670,7 +699,7 @@ TEST_F(AutoMixedPrecisionTest, ExistingCast) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -690,7 +719,7 @@ TEST_F(AutoMixedPrecisionTest, ExistingCast) {
   }
 }
 
-TEST_F(AutoMixedPrecisionTest, RecurrentEdgeColorMismatch) {
+TEST_P(AutoMixedPrecisionParamTest, RecurrentEdgeColorMismatch) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output deny1 = ops::Exp(s.WithOpName("deny1"), input);
@@ -723,7 +752,7 @@ TEST_F(AutoMixedPrecisionTest, RecurrentEdgeColorMismatch) {
   const_node->add_input("^mrg1");
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -752,7 +781,7 @@ TEST_F(AutoMixedPrecisionTest, RecurrentEdgeColorMismatch) {
   }
 }
 
-TEST_F(AutoMixedPrecisionTest, TensorListSetGet) {
+TEST_P(AutoMixedPrecisionParamTest, TensorListSetGet) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   tensorflow::Input shape = {32, 32};
   auto tl1 = ops::TensorListReserve(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT);
@@ -794,7 +823,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListSetGet) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -823,7 +852,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListSetGet) {
   }
 }
 
-TEST_F(AutoMixedPrecisionTest, TensorListPushPop) {
+TEST_P(AutoMixedPrecisionParamTest, TensorListPushPop) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   tensorflow::Input shape = {32, 32};
   auto tl1 = ops::EmptyTensorList(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT);
@@ -857,7 +886,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListPushPop) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -886,7 +915,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListPushPop) {
   }
 }
 
-TEST_F(AutoMixedPrecisionTest, TensorListFromTensor) {
+TEST_P(AutoMixedPrecisionParamTest, TensorListFromTensor) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   tensorflow::Input shape = {32};
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
@@ -911,7 +940,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListFromTensor) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -936,7 +965,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListFromTensor) {
   }
 }
 
-TEST_F(AutoMixedPrecisionTest, TensorListPushBackBatchAndConcatLists) {
+TEST_P(AutoMixedPrecisionParamTest, TensorListPushBackBatchAndConcatLists) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   tensorflow::Input shape = {32, 32};
   auto tl1 = ops::EmptyTensorList(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT);
@@ -971,7 +1000,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListPushBackBatchAndConcatLists) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -996,7 +1025,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListPushBackBatchAndConcatLists) {
   }
 }
 
-TEST_F(AutoMixedPrecisionTest, TensorListThroughFunction) {
+TEST_P(AutoMixedPrecisionParamTest, TensorListThroughFunction) {
   // This test passes a tensor list handle through a function with its own
   // Tensor List ops inside to test that the types are not changed to a
   // conflicting state.
@@ -1057,7 +1086,7 @@ TEST_F(AutoMixedPrecisionTest, TensorListThroughFunction) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -1104,7 +1133,7 @@ bool IsSupportedGPU(const Cluster& cluster) {
 #endif
 }
 
-TEST_F(AutoMixedPrecisionTest, BatchMatMul) {
+TEST_P(AutoMixedPrecisionParamTest, BatchMatMul) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 33, {64, 32, 32});
   Output allow1 = ops::BatchMatMul(s.WithOpName("allow1"), input, input);
@@ -1115,7 +1144,7 @@ TEST_F(AutoMixedPrecisionTest, BatchMatMul) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
 
-  AutoMixedPrecision optimizer;
+  AutoMixedPrecision optimizer(mode_);
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
 
@@ -1139,7 +1168,7 @@ TEST_F(AutoMixedPrecisionTest, BatchMatMul) {
   }
 }
 
-TEST_F(AutoMixedPrecisionTest, EluOp) {
+TEST_P(AutoMixedPrecisionParamTest, EluOp) {
   TestSimpleUnaryInferOp(
       -5, 5, 1.0e-3, 1.0e-3,
       [](const tensorflow::Scope& scope, Output input) -> Output {
@@ -1147,7 +1176,7 @@ TEST_F(AutoMixedPrecisionTest, EluOp) {
       });
 }
 
-TEST_F(AutoMixedPrecisionTest, ErfOp) {
+TEST_P(AutoMixedPrecisionParamTest, ErfOp) {
   TestSimpleUnaryInferOp(
       -5, 5, 1.0e-3, -1,
       [](const tensorflow::Scope& scope, Output input) -> Output {
@@ -1155,7 +1184,7 @@ TEST_F(AutoMixedPrecisionTest, ErfOp) {
       });
 }
 
-TEST_F(AutoMixedPrecisionTest, ErfcOp) {
+TEST_P(AutoMixedPrecisionParamTest, ErfcOp) {
   TestSimpleUnaryInferOp(
       -5, 5, 1.0e-3, -1,
       [](const tensorflow::Scope& scope, Output input) -> Output {
@@ -1163,7 +1192,7 @@ TEST_F(AutoMixedPrecisionTest, ErfcOp) {
       });
 }
 
-TEST_F(AutoMixedPrecisionTest, InvOp) {
+TEST_P(AutoMixedPrecisionParamTest, InvOp) {
   TestSimpleUnaryInferOp(
       0.01, 10, -1, 1.0e-3,
       [](const tensorflow::Scope& scope, Output input) -> Output {
@@ -1171,7 +1200,7 @@ TEST_F(AutoMixedPrecisionTest, InvOp) {
       });
 }
 
-TEST_F(AutoMixedPrecisionTest, LogOp) {
+TEST_P(AutoMixedPrecisionParamTest, LogOp) {
   TestSimpleUnaryInferOp(
       0.01, 10, 1.0e-3, 2.0e-3,
       [](const tensorflow::Scope& scope, Output input) -> Output {
@@ -1179,7 +1208,7 @@ TEST_F(AutoMixedPrecisionTest, LogOp) {
       });
 }
 
-TEST_F(AutoMixedPrecisionTest, Log1pOp) {
+TEST_P(AutoMixedPrecisionParamTest, Log1pOp) {
   TestSimpleUnaryInferOp(
       -0.99, 9, 1.0e-3, 5.0e-3,
       [](const tensorflow::Scope& scope, Output input) -> Output {
@@ -1187,7 +1216,7 @@ TEST_F(AutoMixedPrecisionTest, Log1pOp) {
       });
 }
 
-TEST_F(AutoMixedPrecisionTest, LogSoftmaxOp) {
+TEST_P(AutoMixedPrecisionParamTest, LogSoftmaxOp) {
   TestSimpleUnaryInferOp(
       -8, 8, -1, 1.0e-2,
       [](const tensorflow::Scope& scope, Output input) -> Output {
@@ -1195,7 +1224,7 @@ TEST_F(AutoMixedPrecisionTest, LogSoftmaxOp) {
       });
 }
 
-TEST_F(AutoMixedPrecisionTest, ReciprocalOp) {
+TEST_P(AutoMixedPrecisionParamTest, ReciprocalOp) {
   TestSimpleUnaryInferOp(
       0.01, 10, -1, 1.0e-3,
       [](const tensorflow::Scope& scope, Output input) -> Output {
@@ -1203,7 +1232,7 @@ TEST_F(AutoMixedPrecisionTest, ReciprocalOp) {
       });
 }
 
-TEST_F(AutoMixedPrecisionTest, SigmoidOp) {
+TEST_P(AutoMixedPrecisionParamTest, SigmoidOp) {
   TestSimpleUnaryInferOp(
       -5, 5, 1.0e-3, -1,
       [](const tensorflow::Scope& scope, Output input) -> Output {
@@ -1211,7 +1240,7 @@ TEST_F(AutoMixedPrecisionTest, SigmoidOp) {
       });
 }
 
-TEST_F(AutoMixedPrecisionTest, SoftmaxOp) {
+TEST_P(AutoMixedPrecisionParamTest, SoftmaxOp) {
   TestSimpleUnaryInferOp(
       -8, 8, 2.0e-3, -1,
       [](const tensorflow::Scope& scope, Output input) -> Output {
@@ -1219,7 +1248,7 @@ TEST_F(AutoMixedPrecisionTest, SoftmaxOp) {
       });
 }
 
-TEST_F(AutoMixedPrecisionTest, SoftplusOp) {
+TEST_P(AutoMixedPrecisionParamTest, SoftplusOp) {
   TestSimpleUnaryInferOp(
       -5, 5, 2.0e-3, 2.0e-3,
       [](const tensorflow::Scope& scope, Output input) -> Output {
@@ -1227,7 +1256,7 @@ TEST_F(AutoMixedPrecisionTest, SoftplusOp) {
       });
 }
 
-TEST_F(AutoMixedPrecisionTest, SqrtOp) {
+TEST_P(AutoMixedPrecisionParamTest, SqrtOp) {
   TestSimpleUnaryInferOp(
       0, 10, 1.0e-3, 1.0e-3,
       [](const tensorflow::Scope& scope, Output input) -> Output {
@@ -1235,7 +1264,7 @@ TEST_F(AutoMixedPrecisionTest, SqrtOp) {
       });
 }
 
-TEST_F(AutoMixedPrecisionTest, TanhOp) {
+TEST_P(AutoMixedPrecisionParamTest, TanhOp) {
   TestSimpleUnaryInferOp(
       -5, 5, 1.0e-3, -1,
       [](const tensorflow::Scope& scope, Output input) -> Output {
@@ -1243,6 +1272,19 @@ TEST_F(AutoMixedPrecisionTest, TanhOp) {
       });
 }
 
+constexpr AutoMixedPrecisionMode kTestValues[] = {
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+    AutoMixedPrecisionMode::CUDA,
+#endif
+#if INTEL_MKL
+    AutoMixedPrecisionMode::FP16_CPU,
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(AutoMixedPrecisionTest, AutoMixedPrecisionParamTest,
+                         ::testing::ValuesIn(kTestValues));
+
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 class AutoMixedPrecisionCpuTest : public GrapplerTest {
  protected:
   void SetUp() override {
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 5fe1d762b2c517..84afab6e12badf 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -231,6 +231,8 @@ std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
          new AutoMixedPrecision(AutoMixedPrecisionMode::CUDA));
 #ifdef INTEL_MKL
   if (IsMKLEnabled()) {
+    MK_OPT("auto_mixed_precision", "auto_mixed_precision",
+           new AutoMixedPrecision(AutoMixedPrecisionMode::FP16_CPU));
     MK_OPT("auto_mixed_precision_mkl", "auto_mixed_precision_mkl",
            new AutoMixedPrecision(AutoMixedPrecisionMode::BF16));
     MK_OPT("auto_mixed_precision_onednn_bfloat16",
@@ -374,6 +376,8 @@ Status MetaOptimizer::InitializeOptimizers(
   if (AutoMixedPrecisionEnabled(cfg_.auto_mixed_precision()) &&
       AutoMixedPrecisionEnabled(
           plugin_configs.toggle_config["auto_mixed_precision"])) {
+    optimizers->push_back(
+        std::make_unique<AutoMixedPrecision>(AutoMixedPrecisionMode::FP16_CPU));
     optimizers->push_back(
         std::make_unique<AutoMixedPrecision>(AutoMixedPrecisionMode::CUDA));
   }
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index e73a367d8420d2..71d75dcbf9c704 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -1,10 +1,10 @@
-load("//tensorflow:tensorflow.default.bzl", "filegroup")
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.default.bzl", "filegroup")
 load(
     "//tensorflow/core/platform:build_config.bzl",
     "tf_protos_grappler",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/grappler/verifiers/BUILD b/tensorflow/core/grappler/verifiers/BUILD
index 2d3cc54c72e510..959a9955b8eab5 100644
--- a/tensorflow/core/grappler/verifiers/BUILD
+++ b/tensorflow/core/grappler/verifiers/BUILD
@@ -1,6 +1,6 @@
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.default.bzl", "filegroup")
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/ir/BUILD b/tensorflow/core/ir/BUILD
index e29a03da7ded2e..5fc887ce213d76 100644
--- a/tensorflow/core/ir/BUILD
+++ b/tensorflow/core/ir/BUILD
@@ -1,6 +1,6 @@
 load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
-load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -140,10 +140,12 @@ cc_library(
         ":InterfacesIncGen",
         "//tensorflow/core/ir/types:Dialect",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:CallOpInterfaces",
         "@llvm-project//mlir:ControlFlowInterfaces",
         "@llvm-project//mlir:FunctionInterfaces",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:InferTypeOpInterface",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:Support",
     ],
 )
diff --git a/tensorflow/core/ir/importexport/BUILD b/tensorflow/core/ir/importexport/BUILD
index c97dc466c7e826..5694f228b32b7b 100644
--- a/tensorflow/core/ir/importexport/BUILD
+++ b/tensorflow/core/ir/importexport/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
diff --git a/tensorflow/core/ir/importexport/tests/BUILD b/tensorflow/core/ir/importexport/tests/BUILD
index f25d004408d86f..32d6b56232765c 100644
--- a/tensorflow/core/ir/importexport/tests/BUILD
+++ b/tensorflow/core/ir/importexport/tests/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/BUILD b/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/BUILD
index f25d004408d86f..32d6b56232765c 100644
--- a/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/BUILD
+++ b/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/ir/importexport/tests/mlir_to_graphdef/BUILD b/tensorflow/core/ir/importexport/tests/mlir_to_graphdef/BUILD
index f25d004408d86f..32d6b56232765c 100644
--- a/tensorflow/core/ir/importexport/tests/mlir_to_graphdef/BUILD
+++ b/tensorflow/core/ir/importexport/tests/mlir_to_graphdef/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/ir/importexport/tests/roundtrip/BUILD b/tensorflow/core/ir/importexport/tests/roundtrip/BUILD
index 68789a9126b4a1..e80b874a1ddcd4 100644
--- a/tensorflow/core/ir/importexport/tests/roundtrip/BUILD
+++ b/tensorflow/core/ir/importexport/tests/roundtrip/BUILD
@@ -1,6 +1,7 @@
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+
 # Regression tests for bridge.
 load(":roundtrip.bzl", "glob_roundtrip_tests")
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/ir/tests/BUILD b/tensorflow/core/ir/tests/BUILD
index 14304dfdee7d5e..97e8aa27a235ff 100644
--- a/tensorflow/core/ir/tests/BUILD
+++ b/tensorflow/core/ir/tests/BUILD
@@ -1,6 +1,6 @@
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow:tensorflow.bzl", "tf_native_cc_binary")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/ir/types/BUILD b/tensorflow/core/ir/types/BUILD
index eee60c360ce497..2181219addb51f 100644
--- a/tensorflow/core/ir/types/BUILD
+++ b/tensorflow/core/ir/types/BUILD
@@ -1,6 +1,6 @@
 load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
-load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 49af6254d918df..43177edefc1bca 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -2,6 +2,16 @@ load("@bazel_skylib//lib:selects.bzl", "selects")
 load("@bazel_skylib//rules:build_test.bzl", "build_test")
 load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load(
+    "@local_config_rocm//rocm:build_defs.bzl",
+    "if_rocm",
+    "if_rocm_hipblaslt",
+)
+load(
+    "@local_xla//xla/tsl/mkl:build_defs.bzl",
+    "if_mkl",
+    "mkl_deps",
+)
 load(
     "//tensorflow:tensorflow.bzl",
     "if_android",
@@ -20,11 +30,6 @@ load(
     "tf_opts_nortti_if_lite_protos",
 )
 load("//tensorflow:tensorflow.default.bzl", "cc_header_only_library", "filegroup", "get_compatible_with_portable", "tf_cc_shared_library", "tf_cuda_cc_test", "tf_cuda_cc_tests", "tf_disable_ptxas_warning_flags", "tf_kernel_library")
-load(
-    "@local_xla//xla/tsl/mkl:build_defs.bzl",
-    "if_mkl",
-    "mkl_deps",
-)
 load(
     "//tensorflow/core/kernels/mlir_generated:build_defs.bzl",
     "if_mlir_generated_experimental_kernels_enabled",
@@ -52,11 +57,6 @@ load(
     "tf_cuda_tests_tags",
 )
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-load(
-    "@local_config_rocm//rocm:build_defs.bzl",
-    "if_rocm",
-    "if_rocm_hipblaslt",
-)
 
 # Description:
 # Op kernel implementations for TensorFlow.
@@ -1657,19 +1657,34 @@ cc_library(
 
 tf_cc_test(
     name = "batch_kernels_test",
-    size = "small",
+    size = "medium",
     srcs = ["batch_kernels_test.cc"],
     features = ["-layering_check"],
     deps = [
         ":batch_kernel_test_util",
         ":batch_kernels",
+        ":cwise_op",
         ":function_ops",
         ":shape_ops",
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/framework:types_proto_cc",
         "//tensorflow/core/kernels/batching_util:warmup",
+        "//tensorflow/core/platform:status",
+        "//tensorflow/core/protobuf:for_core_protos_cc",
+        "//tensorflow/core/public:version",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest_main",
         "@local_tsl//tsl/platform:blocking_counter",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:refcount",
+        "@local_tsl//tsl/platform:status",
     ],
 )
 
@@ -3430,7 +3445,7 @@ tf_cc_tests(
         "//tensorflow/core:testlib",
         "//tensorflow/core/platform:status_matchers",
         "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/util:determinism_test_util",
+        "@local_xla//xla/tsl/util:determinism_test_util",
     ],
 )
 
diff --git a/tensorflow/core/kernels/batch_kernels_test.cc b/tensorflow/core/kernels/batch_kernels_test.cc
index 68ae309504ffb6..4b3b04bda413ae 100644
--- a/tensorflow/core/kernels/batch_kernels_test.cc
+++ b/tensorflow/core/kernels/batch_kernels_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
+#include "absl/status/status.h"
 #include "absl/strings/match.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/device_factory.h"
@@ -39,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/public/version.h"
 #include "tsl/lib/core/status_test_util.h"
 #include "tsl/platform/blocking_counter.h"
+#include "tsl/platform/criticality.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/refcount.h"
 #include "tsl/platform/status.h"
@@ -65,10 +67,364 @@ TEST_P(BatchFunctionKernelTest, EnableAdaptiveScheduler) {
 
 INSTANTIATE_TEST_SUITE_P(Params, BatchFunctionKernelTest, ::testing::Bool());
 
-class BatchFunctionKernelParallelWarmupTestState : public OpsTestBase {
+class SharedBatchFunctionTestState : public OpsTestBase {
  public:
   // Init test fixture with a batch kernel instance.
-  Status Init(bool enable_splitting) {
+  void CreateFunctionLibraryRuntime() {
+    pflr_ = std::make_unique<ProcessFunctionLibraryRuntime>(
+        device_mgr_.get(), Env::Default(), /*config=*/nullptr,
+        TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions(),
+        /*thread_pool=*/nullptr, /*parent=*/nullptr,
+        /*session_metadata=*/nullptr,
+        Rendezvous::Factory{[](const int64_t, const DeviceMgr *device_mgr,
+                               tsl::core::RefCountPtr<Rendezvous> *r) {
+          *r = tsl::core::RefCountPtr<Rendezvous>(
+              new IntraProcessRendezvous(device_mgr));
+          return absl::OkStatus();
+        }});
+  }
+};
+
+class BatchFunctionTestState : public SharedBatchFunctionTestState {
+ public:
+  // Init test fixture with a batch kernel instance. The caller guarantees that
+  // the device pointer is valid throughout the life of this class.
+  absl::Status Init(Device *device, bool enable_low_priority_queue,
+                    int64_t expected_batch_size) {
+    // Override the per-test/per-op device with a given device so that it can
+    // be shared between ops.
+    device_ = device;
+
+    NameAttrList f;
+    f.set_name("ShapeEnforcingFunction");
+    FunctionDef func = FunctionDefHelper::Create(
+        // function_name
+        f.name(),
+        // in_def
+        {"x:int64"},
+        // out_def
+        {"o:int64"},
+        // attr_def
+        {},
+        // node_def
+        {{{"o"},
+          "EnsureShape",
+          {"x"},
+          {{"T", DataType::DT_INT64},
+           {"shape", TensorShape({expected_batch_size, 2})}}}},
+        // ret_def
+        {{"o", "o:output"}});
+    TF_RETURN_IF_ERROR(flib_def_->AddFunctionDef(func));
+    SharedBatchFunctionTestState::CreateFunctionLibraryRuntime();
+
+    std::vector<NodeDefBuilder::NodeOut> inputs(
+        {NodeDefBuilder::NodeOut({"n1", 0, DataType::DT_INT64})});
+    TF_RETURN_IF_ERROR(NodeDefBuilder("BatchTPUInput", "BatchFunction")
+                           .Attr("max_batch_size", 4)
+                           .Attr("num_batch_threads", 4)
+                           .Attr("allowed_batch_sizes", {4})
+                           .Attr("batch_timeout_micros", 1000000)
+                           .Attr("max_enqueued_batches", 10)
+                           .Attr("low_priority_max_batch_size",
+                                 enable_low_priority_queue ? 8 : 0)
+                           .Attr("low_priority_batch_timeout_micros",
+                                 enable_low_priority_queue ? 2000000 : 0)
+                           .Attr("low_priority_allowed_batch_sizes",
+                                 enable_low_priority_queue ? std::vector<int>{8}
+                                                           : std::vector<int>())
+                           .Attr("low_priority_max_enqueued_batches",
+                                 enable_low_priority_queue ? 2 : 0)
+                           .Attr("Tin", {DataType::DT_INT64})
+                           .Input(inputs)
+                           .Attr("Tcaptured", std::vector<DataType>{})
+                           .Input(std::vector<NodeDefBuilder::NodeOut>{})
+                           .Attr("Tout", std::vector<DataType>{DT_INT64})
+                           .Attr("f", f)
+                           .Finalize(node_def()));
+    return OpsTestBase::InitOp();
+  }
+
+  void TestBody() override {}
+};
+
+class BatchFunctionTest : public ::testing::TestWithParam<bool> {
+ protected:
+  void SetUp() override {
+    // The device needs to be shared in each test case and within each test case
+    // only.
+    cpu_device_ =
+        DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0");
+  }
+  std::unique_ptr<Device> cpu_device_;
+};
+
+TEST_P(BatchFunctionTest, BatchingWorksWithoutCriticality) {
+  SessionMetadata session_metadata;
+  session_metadata.set_name("test_model");
+  session_metadata.set_version(123);
+
+  bool enable_low_priority_queue = GetParam();
+  {
+    tsl::BlockingCounter blocking_counter(4);
+    // 8 threads run the batch op with no explicit criticality set. They are
+    // eventually batched to form a tensor with [4, 2] shape which is verified
+    // within the function.
+    for (int i = 0; i < 4; ++i) {
+      Env::Default()->SchedClosure([&]() {
+        ASSERT_EQ(tsl::criticality::GetCriticality(),
+                  tsl::criticality::Criticality::kCritical);
+
+        BatchFunctionTestState test_state;
+        test_state.set_session_metadata(session_metadata);
+        TF_ASSERT_OK(test_state.Init(cpu_device_.get(),
+                                     enable_low_priority_queue,
+                                     /*expected_batch_size=*/4));
+        test_state.AddInputFromList<int64_t>(TensorShape({1, 2}), {123, 456});
+        TF_EXPECT_OK(test_state.RunOpKernel());
+
+        test::ExpectTensorEqual<int64_t>(
+            *test_state.GetOutput(0),
+            test::AsTensor<int64_t>({123, 456}, TensorShape({1, 2})));
+        blocking_counter.DecrementCount();
+      });
+    }
+
+    blocking_counter.Wait();
+  }
+}
+
+TEST_P(BatchFunctionTest, PaddingWorksWithoutCriticality) {
+  SessionMetadata session_metadata;
+  session_metadata.set_name("test_model");
+  session_metadata.set_version(123);
+
+  bool enable_low_priority_queue = GetParam();
+  {
+    tsl::BlockingCounter blocking_counter(2);
+    // 2 threads run the batch op with no explicit criticality set. They are
+    // eventually batched and padded to form a tensor with [4, 2] shape which is
+    // verified within the function.
+    for (int i = 0; i < 2; ++i) {
+      Env::Default()->SchedClosure([&]() {
+        ASSERT_EQ(tsl::criticality::GetCriticality(),
+                  tsl::criticality::Criticality::kCritical);
+
+        BatchFunctionTestState test_state;
+        test_state.set_session_metadata(session_metadata);
+        TF_ASSERT_OK(test_state.Init(cpu_device_.get(),
+                                     enable_low_priority_queue,
+                                     /*expected_batch_size=*/4));
+        test_state.AddInputFromList<int64_t>(TensorShape({1, 2}), {123, 456});
+        TF_EXPECT_OK(test_state.RunOpKernel());
+
+        test::ExpectTensorEqual<int64_t>(
+            *test_state.GetOutput(0),
+            test::AsTensor<int64_t>({123, 456}, TensorShape({1, 2})));
+        blocking_counter.DecrementCount();
+      });
+    }
+
+    blocking_counter.Wait();
+  }
+}
+
+#if defined(PLATFORM_GOOGLE)
+TEST_P(BatchFunctionTest, LowPriorityTaskPaddingHighPriorityBatch) {
+  SessionMetadata session_metadata;
+  session_metadata.set_name("test_model");
+  session_metadata.set_version(123);
+
+  bool enable_low_priority_queue = GetParam();
+  {
+    tsl::BlockingCounter blocking_counter(4);
+    // 2 threads run the batch op with critical plus and 2 threads run the batch
+    // op with sheddable. They are eventually batched to form a tensor with [4,
+    // 2] shape which is verified within the function.
+    for (int i = 0; i < 2; ++i) {
+      Env::Default()->SchedClosure([&]() {
+        tsl::criticality::ScopedCriticality scoped_criticality(
+            tsl::criticality::Criticality::kCriticalPlus);
+        ASSERT_EQ(tsl::criticality::GetCriticality(),
+                  tsl::criticality::Criticality::kCriticalPlus);
+
+        BatchFunctionTestState test_state;
+        test_state.set_session_metadata(session_metadata);
+        TF_ASSERT_OK(test_state.Init(cpu_device_.get(),
+                                     enable_low_priority_queue,
+                                     /*expected_batch_size=*/4));
+        test_state.AddInputFromList<int64_t>(TensorShape({1, 2}), {123, 456});
+        TF_EXPECT_OK(test_state.RunOpKernel());
+
+        test::ExpectTensorEqual<int64_t>(
+            *test_state.GetOutput(0),
+            test::AsTensor<int64_t>({123, 456}, TensorShape({1, 2})));
+        blocking_counter.DecrementCount();
+      });
+    }
+
+    for (int i = 0; i < 2; ++i) {
+      Env::Default()->SchedClosure([&]() {
+        tsl::criticality::ScopedCriticality scoped_criticality(
+            tsl::criticality::Criticality::kSheddable);
+        ASSERT_EQ(tsl::criticality::GetCriticality(),
+                  tsl::criticality::Criticality::kSheddable);
+
+        BatchFunctionTestState test_state;
+        test_state.set_session_metadata(session_metadata);
+        TF_ASSERT_OK(test_state.Init(cpu_device_.get(),
+                                     enable_low_priority_queue,
+                                     /*expected_batch_size=*/4));
+        test_state.AddInputFromList<int64_t>(TensorShape({1, 2}), {234, 567});
+        TF_EXPECT_OK(test_state.RunOpKernel());
+
+        test::ExpectTensorEqual<int64_t>(
+            *test_state.GetOutput(0),
+            test::AsTensor<int64_t>({234, 567}, TensorShape({1, 2})));
+        blocking_counter.DecrementCount();
+      });
+    }
+
+    blocking_counter.Wait();
+  }
+}
+
+TEST_P(BatchFunctionTest,
+       LowPriorityTaskPaddingHighPriorityBatchWithExtraPadding) {
+  SessionMetadata session_metadata;
+  session_metadata.set_name("test_model");
+  session_metadata.set_version(123);
+
+  bool enable_low_priority_queue = GetParam();
+  {
+    tsl::BlockingCounter blocking_counter(2);
+    // 1 thread run the batch op with critical plus and 1 threads run the batch
+    // op with sheddable. They are eventually batched and padded to form a
+    // tensor with [4, 2] shape which is verified within the function.
+    Env::Default()->SchedClosure([&]() {
+      tsl::criticality::ScopedCriticality scoped_criticality(
+          tsl::criticality::Criticality::kCriticalPlus);
+      ASSERT_EQ(tsl::criticality::GetCriticality(),
+                tsl::criticality::Criticality::kCriticalPlus);
+
+      BatchFunctionTestState test_state;
+      test_state.set_session_metadata(session_metadata);
+      TF_ASSERT_OK(test_state.Init(cpu_device_.get(), enable_low_priority_queue,
+                                   /*expected_batch_size=*/4));
+      test_state.AddInputFromList<int64_t>(TensorShape({1, 2}), {123, 456});
+      TF_EXPECT_OK(test_state.RunOpKernel());
+
+      test::ExpectTensorEqual<int64_t>(
+          *test_state.GetOutput(0),
+          test::AsTensor<int64_t>({123, 456}, TensorShape({1, 2})));
+      blocking_counter.DecrementCount();
+    });
+
+    Env::Default()->SchedClosure([&]() {
+      tsl::criticality::ScopedCriticality scoped_criticality(
+          tsl::criticality::Criticality::kSheddable);
+      ASSERT_EQ(tsl::criticality::GetCriticality(),
+                tsl::criticality::Criticality::kSheddable);
+
+      BatchFunctionTestState test_state;
+      test_state.set_session_metadata(session_metadata);
+      TF_ASSERT_OK(test_state.Init(cpu_device_.get(), enable_low_priority_queue,
+                                   /*expected_batch_size=*/4));
+      test_state.AddInputFromList<int64_t>(TensorShape({1, 2}), {234, 567});
+      TF_EXPECT_OK(test_state.RunOpKernel());
+
+      test::ExpectTensorEqual<int64_t>(
+          *test_state.GetOutput(0),
+          test::AsTensor<int64_t>({234, 567}, TensorShape({1, 2})));
+      blocking_counter.DecrementCount();
+    });
+
+    blocking_counter.Wait();
+  }
+}
+#endif
+
+INSTANTIATE_TEST_SUITE_P(BatchFunctionTest, BatchFunctionTest,
+                         ::testing::Bool());
+
+#if defined(PLATFORM_GOOGLE)
+TEST_F(BatchFunctionTest, LowPriorityOnlyBatchAtMaxLowPriorityBatchSize) {
+  SessionMetadata session_metadata;
+  session_metadata.set_name("test_model");
+  session_metadata.set_version(123);
+
+  {
+    tsl::BlockingCounter blocking_counter(8);
+    // 8 threads run the batch op with sheddable. They are eventually batched to
+    // form a tensor with [8, 2] shape, which is verified within the function,
+    // since the low priority max batch size is set to 8.
+    for (int i = 0; i < 8; ++i) {
+      Env::Default()->SchedClosure([&]() {
+        tsl::criticality::ScopedCriticality scoped_criticality(
+            tsl::criticality::Criticality::kSheddable);
+        ASSERT_EQ(tsl::criticality::GetCriticality(),
+                  tsl::criticality::Criticality::kSheddable);
+
+        BatchFunctionTestState test_state;
+        test_state.set_session_metadata(session_metadata);
+        TF_ASSERT_OK(test_state.Init(cpu_device_.get(),
+                                     /*enable_low_priority_queue=*/true,
+                                     /*expected_batch_size=*/8));
+        test_state.AddInputFromList<int64_t>(TensorShape({1, 2}), {234, 567});
+        TF_EXPECT_OK(test_state.RunOpKernel());
+
+        test::ExpectTensorEqual<int64_t>(
+            *test_state.GetOutput(0),
+            test::AsTensor<int64_t>({234, 567}, TensorShape({1, 2})));
+        blocking_counter.DecrementCount();
+      });
+    }
+
+    blocking_counter.Wait();
+  }
+}
+
+TEST_F(BatchFunctionTest, LowPriorityBatchPaddedToLowPriorityAllowedBatchSize) {
+  SessionMetadata session_metadata;
+  session_metadata.set_name("test_model");
+  session_metadata.set_version(123);
+
+  {
+    tsl::BlockingCounter blocking_counter(2);
+    // 2 threads run the batch op with sheddable. They are eventually batched
+    // and padded to form a tensor with [8, 2] shape, which is verified within
+    // the function, since the low priority allowed batch size is set to [8].
+    for (int i = 0; i < 2; ++i) {
+      Env::Default()->SchedClosure([&]() {
+        tsl::criticality::ScopedCriticality scoped_criticality(
+            tsl::criticality::Criticality::kSheddable);
+        ASSERT_EQ(tsl::criticality::GetCriticality(),
+                  tsl::criticality::Criticality::kSheddable);
+
+        BatchFunctionTestState test_state;
+        test_state.set_session_metadata(session_metadata);
+        TF_ASSERT_OK(test_state.Init(cpu_device_.get(),
+                                     /*enable_low_priority_queue=*/true,
+                                     /*expected_batch_size=*/8));
+        test_state.AddInputFromList<int64_t>(TensorShape({1, 2}), {234, 567});
+        TF_EXPECT_OK(test_state.RunOpKernel());
+
+        test::ExpectTensorEqual<int64_t>(
+            *test_state.GetOutput(0),
+            test::AsTensor<int64_t>({234, 567}, TensorShape({1, 2})));
+        blocking_counter.DecrementCount();
+      });
+    }
+
+    blocking_counter.Wait();
+  }
+}
+#endif
+
+class BatchFunctionKernelParallelWarmupTestState
+    : public SharedBatchFunctionTestState {
+ public:
+  // Init test fixture with a batch kernel instance.
+  absl::Status Init(bool enable_splitting) {
     static auto *const cpu_device = []() {
       auto device =
           DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0");
@@ -98,40 +454,29 @@ class BatchFunctionKernelParallelWarmupTestState : public OpsTestBase {
         // ret_def
         {{"o", "o:output"}});
     TF_RETURN_IF_ERROR(flib_def_->AddFunctionDef(func));
-
-    pflr_ = std::make_unique<ProcessFunctionLibraryRuntime>(
-        device_mgr_.get(), Env::Default(), /*config=*/nullptr,
-        TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions(),
-        /*thread_pool=*/nullptr, /*parent=*/nullptr,
-        /*session_metadata=*/nullptr,
-        Rendezvous::Factory{[](const int64_t, const DeviceMgr *device_mgr,
-                               tsl::core::RefCountPtr<Rendezvous> *r) {
-          *r = tsl::core::RefCountPtr<Rendezvous>(
-              new IntraProcessRendezvous(device_mgr));
-          return absl::OkStatus();
-        }});
+    SharedBatchFunctionTestState::CreateFunctionLibraryRuntime();
 
     std::vector<NodeDefBuilder::NodeOut> inputs(
         {NodeDefBuilder::NodeOut({"n1", 0, DataType::DT_INT64})});
-    TF_CHECK_OK(NodeDefBuilder("BatchTPUInput", "BatchFunction")
-                    .Attr("max_batch_size", enable_splitting ? 16 : 8)
-                    .Attr("num_batch_threads", 8)
-                    .Attr("allowed_batch_sizes", {2, 4, 8})
-                    .Attr("batch_timeout_micros", 1000000)
-                    .Attr("max_enqueued_batches", 10)
-                    .Attr("enable_large_batch_splitting", true)
-                    .Attr("low_priority_max_batch_size", 64)
-                    .Attr("low_priority_batch_timeout_micros", 8000)
-                    .Attr("low_priority_allowed_batch_sizes", {32, 64})
-                    .Attr("low_priority_max_enqueued_batches", 1000)
-                    .Attr("Tin", {DataType::DT_INT64})
-                    .Input(inputs)
-                    .Attr("Tcaptured", std::vector<DataType>{})
-                    .Input(std::vector<NodeDefBuilder::NodeOut>{})
-                    .Attr("Tout", std::vector<DataType>{DT_INT64})
-                    .Attr("f", f)
-                    .Finalize(node_def()));
-    return InitOp();
+    TF_RETURN_IF_ERROR(NodeDefBuilder("BatchTPUInput", "BatchFunction")
+                           .Attr("max_batch_size", enable_splitting ? 16 : 8)
+                           .Attr("num_batch_threads", 8)
+                           .Attr("allowed_batch_sizes", {2, 4, 8})
+                           .Attr("batch_timeout_micros", 1000000)
+                           .Attr("max_enqueued_batches", 10)
+                           .Attr("enable_large_batch_splitting", true)
+                           .Attr("low_priority_max_batch_size", 64)
+                           .Attr("low_priority_batch_timeout_micros", 8000)
+                           .Attr("low_priority_allowed_batch_sizes", {32, 64})
+                           .Attr("low_priority_max_enqueued_batches", 1000)
+                           .Attr("Tin", {DataType::DT_INT64})
+                           .Input(inputs)
+                           .Attr("Tcaptured", std::vector<DataType>{})
+                           .Input(std::vector<NodeDefBuilder::NodeOut>{})
+                           .Attr("Tout", std::vector<DataType>{DT_INT64})
+                           .Attr("f", f)
+                           .Finalize(node_def()));
+    return OpsTestBase::InitOp();
   }
 
   void TestBody() override {}
@@ -200,5 +545,6 @@ TEST_P(BatchFunctionKernelParallelWarmupTest, ParallelWarmup) {
 INSTANTIATE_TEST_SUITE_P(BatchFunctionKernelParallelWarmupTestSuite,
                          BatchFunctionKernelParallelWarmupTest,
                          ::testing::Bool());
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/batching_util/BUILD b/tensorflow/core/kernels/batching_util/BUILD
index 2d9e06650e54b5..50c5631d8e6ba1 100644
--- a/tensorflow/core/kernels/batching_util/BUILD
+++ b/tensorflow/core/kernels/batching_util/BUILD
@@ -160,7 +160,9 @@ cc_library(
         "//tensorflow/core/profiler/lib:traceme_encode",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/time",
+        "@local_tsl//tsl/platform:criticality",
         "@local_tsl//tsl/platform:errors",
     ],
 )
@@ -179,7 +181,9 @@ cc_library(
         "//tensorflow/core/profiler/lib:traceme_encode",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/time",
+        "@local_tsl//tsl/platform:criticality",
     ],
     alwayslink = 1,
 )
@@ -199,6 +203,7 @@ tf_cc_test(
         "//tensorflow/core/platform:status_matchers",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/container:fixed_array",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/time",
     ],
 )
@@ -391,6 +396,7 @@ cc_library(
         "//tensorflow/core/util:incremental_barrier",
         "@com_google_absl//absl/container:fixed_array",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/functional:bind_front",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/time",
diff --git a/tensorflow/core/kernels/batching_util/batch_resource_base.cc b/tensorflow/core/kernels/batching_util/batch_resource_base.cc
index 98a83fda8833a5..2f3195bd9fc3b7 100644
--- a/tensorflow/core/kernels/batching_util/batch_resource_base.cc
+++ b/tensorflow/core/kernels/batching_util/batch_resource_base.cc
@@ -29,6 +29,7 @@ limitations under the License.
 
 #include "absl/container/fixed_array.h"
 #include "absl/container/flat_hash_map.h"
+#include "absl/functional/bind_front.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
@@ -50,6 +51,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/kernels/batching_util/batch_scheduler.h"
 #include "tensorflow/core/kernels/batching_util/concat_split_util.h"
 #include "tensorflow/core/kernels/batching_util/input_split_metadata.h"
 #include "tensorflow/core/kernels/batching_util/threadsafe_status.h"
@@ -69,6 +71,7 @@ limitations under the License.
 #include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/core/profiler/lib/traceme_encode.h"
 #include "tensorflow/core/util/incremental_barrier.h"
+#include "tsl/platform/criticality.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/statusor.h"
 
@@ -272,6 +275,17 @@ const string& GetModelName(OpKernelContext* ctx) {
   return ctx->session_metadata()->name();
 }
 
+// Returns the sum of the task sizes. The caller must guarantee that the
+// unique_ptrs in the argument vectors are not null.
+int GetTotalTaskSize(
+    const std::vector<std::unique_ptr<BatchResourceBase::BatchTask>>& tasks) {
+  int tasks_size = 0;
+  for (const auto& task : tasks) {
+    tasks_size += task->size();
+  }
+  return tasks_size;
+}
+
 }  // namespace
 
 std::unique_ptr<BatchResourceBase::BatchTask>
@@ -519,6 +533,15 @@ BatchResourceBase::GetBatcherQueueOptions(
       low_priority_max_enqueued_batches;
   batcher_queue_options.low_priority_queue_options.batch_timeout_micros =
       low_priority_batch_timeout_micros;
+  if (low_priority_allowed_batch_sizes.empty()) {
+    batcher_queue_options.low_priority_queue_options.max_execution_batch_size =
+        low_priority_max_batch_size;
+  } else {
+    batcher_queue_options.low_priority_queue_options.max_execution_batch_size =
+        *low_priority_allowed_batch_sizes.rbegin();
+  }
+  batcher_queue_options.low_priority_queue_options.allowed_batch_sizes =
+      low_priority_allowed_batch_sizes;
   batcher_queue_options.enable_large_batch_splitting =
       enable_large_batch_splitting;
   if (enable_large_batch_splitting) {
@@ -541,14 +564,6 @@ BatchResourceBase::GetBatcherQueueOptions(
           .max_execution_batch_size = *allowed_batch_sizes.rbegin();
       batcher_queue_options.allowed_batch_sizes = allowed_batch_sizes;
     }
-    if (low_priority_allowed_batch_sizes.empty()) {
-      batcher_queue_options.low_priority_queue_options
-          .max_execution_batch_size = low_priority_max_batch_size;
-    } else {
-      batcher_queue_options.low_priority_queue_options
-          .max_execution_batch_size =
-          *low_priority_allowed_batch_sizes.rbegin();
-    }
   }
   batcher_queue_options.disable_padding = disable_padding;
 
@@ -598,14 +613,32 @@ BatchResourceBase::GetAdaptiveBatcherQueueOptions(
   return absl::OkStatus();
 }
 
+bool BatchResourceBase::IsLowPriorityBatch(const BatchT& batch) const {
+  if (!batcher_queue_options_.enable_priority_queue) return false;
+  if (batch.empty()) return false;
+
+  // TODO(b/316379576): Once the criticality and priority become configurable,
+  // this should rely on the batch parameters instead of the hard coded value.
+  return batch.task(0).criticality() ==
+             tsl::criticality::Criticality::kSheddablePlus ||
+         batch.task(0).criticality() ==
+             tsl::criticality::Criticality::kSheddable;
+}
+
 // Returns the smallest entry in 'allowed_batch_sizes_' that is greater than
 // or equal to 'batch_size'. If 'allowed_batch_sizes_' is empty, simply
 // returns 'batch_size'.
-int BatchResourceBase::RoundToLowestAllowedBatchSize(int batch_size) const {
-  if (batcher_queue_options_.disable_padding || allowed_batch_sizes_.empty()) {
+int BatchResourceBase::RoundToLowestAllowedBatchSize(
+    int batch_size, bool is_low_priority_batch) const {
+  const std::vector<int32>& allowed_batch_sizes =
+      is_low_priority_batch ? batcher_queue_options_.low_priority_queue_options
+                                  .allowed_batch_sizes
+                            : allowed_batch_sizes_;
+
+  if (batcher_queue_options_.disable_padding || allowed_batch_sizes.empty()) {
     return batch_size;
   }
-  for (int allowed_size : allowed_batch_sizes_) {
+  for (int allowed_size : allowed_batch_sizes) {
     if (allowed_size >= batch_size) {
       return allowed_size;
     }
@@ -617,17 +650,23 @@ int BatchResourceBase::RoundToLowestAllowedBatchSize(int batch_size) const {
 }
 
 Status BatchResourceBase::ConcatInputTensors(
-    const BatchT& batch, OpKernelContext* context,
-    std::vector<Tensor>* concatenated_tensors) const {
+    const BatchT& batch,
+    const std::vector<std::unique_ptr<BatchTask>>& unbatched_tasks,
+    OpKernelContext* context, std::vector<Tensor>* concatenated_tensors) const {
   if (batch.num_tasks() == 0) {
     return errors::InvalidArgument("Empty batch.");
   }
+
+  int unbatched_tasks_size = GetTotalTaskSize(unbatched_tasks);
   const bool just_for_warmup = batch.task(0).forced_warmup_batch_size > 0;
   const int padded_batch_size =
-      just_for_warmup ? batch.task(0).forced_warmup_batch_size
-                      : RoundToLowestAllowedBatchSize(batch.size());
+      just_for_warmup
+          ? batch.task(0).forced_warmup_batch_size
+          : RoundToLowestAllowedBatchSize(batch.size() + unbatched_tasks_size,
+                                          IsLowPriorityBatch(batch));
   const int padding_amount =
-      just_for_warmup ? padded_batch_size : padded_batch_size - batch.size();
+      just_for_warmup ? padded_batch_size
+                      : padded_batch_size - batch.size() - unbatched_tasks_size;
   profiler::TraceMe trace_me([padded_batch_size, padding_amount,
                               disable_padding =
                                   batcher_queue_options_.disable_padding]() {
@@ -636,6 +675,9 @@ Status BatchResourceBase::ConcatInputTensors(
                                {"padding_amount", padding_amount},
                                {"disable_padding", disable_padding}});
   });
+  // TODO(b/316379576): Add metrics for the breakdown between the size of the
+  // original batch size and the unbatched task size and update the batch size
+  // to include the unbatched tasks.
   RecordPaddingSize(padding_amount, GetModelName(context), padded_batch_size,
                     context->op_kernel().name());
   RecordPaddingSizeV2(padding_amount, GetModelName(context), padded_batch_size,
@@ -660,10 +702,14 @@ Status BatchResourceBase::ConcatInputTensors(
     if (just_for_warmup) {
       to_concatenate.reserve(padding_amount);
     } else {
-      to_concatenate.reserve(batch.num_tasks() + padding_amount);
+      to_concatenate.reserve(batch.num_tasks() + unbatched_tasks.size() +
+                             padding_amount);
       for (int task_idx = 0; task_idx < batch.num_tasks(); ++task_idx) {
         to_concatenate.push_back(batch.task(task_idx).inputs.at(i));
       }
+      for (int task_idx = 0; task_idx < unbatched_tasks.size(); ++task_idx) {
+        to_concatenate.push_back(unbatched_tasks[task_idx]->inputs.at(i));
+      }
     }
 
     // Add padding as needed if padding is allowed. Use the first row of the
@@ -794,7 +840,8 @@ Status BatchResourceBase::ConcatInputTensors(
 }
 
 Status BatchResourceBase::SplitOutputTensors(
-    const std::vector<Tensor>& combined_outputs, BatchT* batch) const {
+    const std::vector<Tensor>& combined_outputs, BatchT* batch,
+    std::vector<std::unique_ptr<BatchTask>>& unbatched_tasks) const {
   DCHECK_GE(batch->num_tasks(), 1);
   if (batch->num_tasks() < 1) {
     return errors::Internal("Batch size expected to be positive; was ",
@@ -802,14 +849,21 @@ Status BatchResourceBase::SplitOutputTensors(
   }
 
   std::vector<int64_t> task_sizes_plus_optional_padding;
-  task_sizes_plus_optional_padding.reserve(batch->num_tasks());
+  task_sizes_plus_optional_padding.reserve(batch->num_tasks() +
+                                           unbatched_tasks.size());
   for (int i = 0; i < batch->num_tasks(); ++i) {
     task_sizes_plus_optional_padding.push_back(batch->task(i).size());
   }
+  for (int i = 0; i < unbatched_tasks.size(); ++i) {
+    task_sizes_plus_optional_padding.push_back(unbatched_tasks[i]->size());
+  }
+  int unbatched_tasks_size = GetTotalTaskSize(unbatched_tasks);
   const int padding_size =
       batcher_queue_options_.disable_padding
           ? 0
-          : RoundToLowestAllowedBatchSize(batch->size()) - batch->size();
+          : RoundToLowestAllowedBatchSize(batch->size() + unbatched_tasks_size,
+                                          IsLowPriorityBatch(*batch)) -
+                batch->size() - unbatched_tasks_size;
   if (padding_size > 0) {
     task_sizes_plus_optional_padding.push_back(padding_size);
   }
@@ -829,7 +883,8 @@ Status BatchResourceBase::SplitOutputTensors(
           "Batched output tensor has 0 dimensions");
     }
     if (output_tensor.shape().dim_size(0) !=
-        static_cast<int64_t>(batch->size() + padding_size)) {
+        static_cast<int64_t>(batch->size() + unbatched_tasks_size +
+                             padding_size)) {
       return errors::FailedPrecondition(
           "Batched output tensor's 0th dimension does not equal the sum of "
           "the 0th dimension sizes of the input tensors");
@@ -861,12 +916,35 @@ Status BatchResourceBase::SplitOutputTensors(
         task.context->set_output(i, split_tensor[j]);
       }
     }
+    for (int j = 0; j < unbatched_tasks.size(); ++j) {
+      // The unbatched tasks are not split, so no need to handle the partial
+      // case separately.
+      unbatched_tasks[j]->context->set_output(
+          i, split_tensor[batch->num_tasks() + j]);
+    }
   }
 
   return absl::OkStatus();
 }
 
-void BatchResourceBase::ProcessFuncBatch(std::unique_ptr<BatchT> batch) const {
+void BatchResourceBase::CleanUpFunctionHelper(BatchTask& task,
+                                              const Status& status) const {
+  WithContext wc(task.propagated_context);
+  if (!status.ok()) {
+    if (!absl::StrContains(status.message(),
+                           "Function was cancelled before it was started")) {
+      task.status->Update(status);
+    } else {
+      // Do not propagate this error; Prefer a more helpful error message.
+      LOG(ERROR) << "ERROR!!!! " << status.message();
+    }
+  }
+  task.done_callback();
+}
+
+void BatchResourceBase::ProcessFuncBatch(
+    std::unique_ptr<BatchT> batch,
+    std::vector<std::unique_ptr<BatchTask>> unbatched_tasks) const {
   if (batch->empty()) {
     return;
   }
@@ -896,24 +974,19 @@ void BatchResourceBase::ProcessFuncBatch(std::unique_ptr<BatchT> batch) const {
     if (cleanup_done) {
       return;
     }
+    // TODO(b/316379576): Update this to take the unbatch task cost into
+    // consideration when excluding the wasted cost and propagate cost to the
+    // unbatched tasks.
     SplitBatchCostsAndRecordMetrics(model_name, batch_cost_measurements,
                                     processed_size, *batch);
     // Clear the measurements before unblocking the batch task, as measurements
     // are associated with the task's thread context.
     batch_cost_measurements.clear();
     for (int i = 0; i < batch->num_tasks(); ++i) {
-      WithContext wc(batch->task(i).propagated_context);
-      if (!status.ok()) {
-        if (!absl::StrContains(
-                status.message(),
-                "Function was cancelled before it was started")) {
-          batch->mutable_task(i)->status->Update(status);
-        } else {
-          // Do not propagate this error; Prefer a more helpful error message.
-          LOG(ERROR) << "ERROR!!!! " << status.message();
-        }
-      }
-      batch->mutable_task(i)->done_callback();
+      CleanUpFunctionHelper(*batch->mutable_task(i), status);
+    }
+    for (int i = 0; i < unbatched_tasks.size(); ++i) {
+      CleanUpFunctionHelper(*unbatched_tasks[i], status);
     }
     cleanup_done = true;
   };
@@ -927,7 +1000,8 @@ void BatchResourceBase::ProcessFuncBatch(std::unique_ptr<BatchT> batch) const {
   }
 
   std::vector<Tensor> concatenated_tensors;
-  status = ConcatInputTensors(*batch, last_task_context, &concatenated_tensors);
+  status = ConcatInputTensors(*batch, unbatched_tasks, last_task_context,
+                              &concatenated_tensors);
   processed_size = RoundToLowestAllowedBatchSize(batch->size());
   if (!status.ok()) {
     return;
@@ -969,7 +1043,8 @@ void BatchResourceBase::ProcessFuncBatch(std::unique_ptr<BatchT> batch) const {
           return;
         }
         if (last_task.forced_warmup_batch_size == 0) {
-          final_status = SplitOutputTensors(combined_outputs, batch.get());
+          final_status = SplitOutputTensors(combined_outputs, batch.get(),
+                                            unbatched_tasks);
         }
       });
 }
@@ -1011,7 +1086,7 @@ void BatchResourceBase::ProcessBatch(std::unique_ptr<BatchT> batch) const {
   const int num_input_edges = batch->task(0).inputs.size();
   std::vector<Tensor> concatenated_tensors;
   const Status concat_status =
-      ConcatInputTensors(*batch, last_task_context, &concatenated_tensors);
+      ConcatInputTensors(*batch, {}, last_task_context, &concatenated_tensors);
   processed_size = RoundToLowestAllowedBatchSize(batch->size());
   OP_REQUIRES_OK_ASYNC(last_task_context, concat_status, last_task_callback);
 
@@ -1081,6 +1156,20 @@ void BatchResourceBase::ProcessBatch(std::unique_ptr<BatchT> batch) const {
   return absl::OkStatus();
 }
 
+void BatchResourceBase::ProcessBatchCallBack(
+    std::unique_ptr<Batch<BatchTask>> batch,
+    std::vector<std::unique_ptr<BatchTask>> unbatched_tasks) {
+  if (!session_metadata().name().empty()) {
+    absl::MutexLock lock(&outstanding_batch_mu_);
+    num_outstanding_batched_items_ -= batch->size();
+  }
+  if (!has_process_batch_function_) {
+    ProcessBatch(std::move(batch));
+  } else {
+    ProcessFuncBatch(std::move(batch), std::move(unbatched_tasks));
+  }
+}
+
 // Looks up the batcher queue for 'queue_name'. If it didn't previously exist,
 // creates it.
 Status BatchResourceBase::LookupOrCreateBatcherQueue(const string& queue_name,
@@ -1094,23 +1183,19 @@ Status BatchResourceBase::LookupOrCreateBatcherQueue(const string& queue_name,
   }
 
   std::unique_ptr<BatcherQueueT> new_queue;
-  auto process_batch_callback = [this](std::unique_ptr<BatchT> batch) {
-    if (!session_metadata().name().empty()) {
-      absl::MutexLock lock(&outstanding_batch_mu_);
-      num_outstanding_batched_items_ -= batch->size();
-    }
-    if (!has_process_batch_function_) {
-      ProcessBatch(std::move(batch));
-    } else {
-      ProcessFuncBatch(std::move(batch));
-    }
-  };
   if (batcher_) {
-    TF_RETURN_IF_ERROR(batcher_->AddQueue(batcher_queue_options_,
-                                          process_batch_callback, &new_queue));
+    TF_RETURN_IF_ERROR(batcher_->AddQueue(
+        batcher_queue_options_,
+        absl::bind_front(&BatchResourceBase::ProcessBatchCallBack, this),
+        &new_queue));
   } else if (adaptive_batcher_) {
+    std::function<void(std::unique_ptr<Batch<BatchTask>>)>
+        reduced_process_batch_callback = [this](std::unique_ptr<BatchT> batch) {
+          ProcessBatchCallBack(std::move(batch), {});
+        };
     TF_RETURN_IF_ERROR(adaptive_batcher_->AddQueue(
-        adaptive_batcher_queue_options_, process_batch_callback, &new_queue));
+        adaptive_batcher_queue_options_, reduced_process_batch_callback,
+        &new_queue));
   } else {
     return errors::Internal("No batcher defined.");
   }
diff --git a/tensorflow/core/kernels/batching_util/batch_resource_base.h b/tensorflow/core/kernels/batching_util/batch_resource_base.h
index 1bd122e1d1dc9e..83b0840d5f7e48 100644
--- a/tensorflow/core/kernels/batching_util/batch_resource_base.h
+++ b/tensorflow/core/kernels/batching_util/batch_resource_base.h
@@ -266,22 +266,47 @@ class BatchResourceBase : public ResourceBase {
   // Assumes the batch is non-empty.
   static Status ValidateBatch(const BatchT& batch);
 
+  // Returns a boolean indicating whether a batch is formed from low priority
+  // tasks only or not.
+  bool IsLowPriorityBatch(const BatchT& batch) const;
+
   // Returns the smallest entry in 'allowed_batch_sizes_' that is greater than
   // or equal to 'batch_size'. If 'allowed_batch_sizes_' is empty, simply
   // returns 'batch_size'.
-  int RoundToLowestAllowedBatchSize(int batch_size) const;
-
-  Status ConcatInputTensors(const BatchT& batch, OpKernelContext* context,
-                            std::vector<Tensor>* concatenated_tensors) const;
-
-  Status SplitOutputTensors(const std::vector<Tensor>& combined_outputs,
-                            BatchT* batch) const;
-
-  void ProcessFuncBatch(std::unique_ptr<BatchT> batch) const;
+  int RoundToLowestAllowedBatchSize(int batch_size,
+                                    bool is_low_priority_batch = false) const;
+
+  // Helper function to propagate the status to the task's context and call the
+  // done callback on the task.
+  void CleanUpFunctionHelper(BatchTask& task, const Status& status) const;
+
+  // Concatenates the input tensors of the tasks from the batch and the
+  // unbatched task vector. When padding is enabled in the batcher queue, they
+  // are padded with garbage value up to the nearest allowed batch size.
+  Status ConcatInputTensors(
+      const BatchT& batch,
+      const std::vector<std::unique_ptr<BatchTask>>& unbatched_tasks,
+      OpKernelContext* context,
+      std::vector<Tensor>* concatenated_tensors) const;
+
+  Status SplitOutputTensors(
+      const std::vector<Tensor>& combined_outputs, BatchT* batch,
+      std::vector<std::unique_ptr<BatchTask>>& unbatched_tasks) const;
+
+  void ProcessFuncBatch(
+      std::unique_ptr<BatchT> batch,
+      std::vector<std::unique_ptr<BatchTask>> unbatched_tasks = {}) const;
 
   // Processes a batch of one or more BatchTask entries.
   void ProcessBatch(std::unique_ptr<BatchT> batch) const;
 
+  // Callback function that wraps the Process*Batch functions above. The caller
+  // of the callback must guarantee that the unique pointers passed as argument
+  // are not null.
+  void ProcessBatchCallBack(
+      std::unique_ptr<Batch<BatchTask>> batch,
+      std::vector<std::unique_ptr<BatchTask>> unbatched_tasks);
+
   // Emits an index tensor, which the Unbatch op will use to un-concatenate
   // the tensor and attribute the pieces to the right batch keys. The index
   // tensor contains, for each input: [batch_key, start_offset, end_offset]
diff --git a/tensorflow/core/kernels/batching_util/batch_scheduler.h b/tensorflow/core/kernels/batching_util/batch_scheduler.h
index fcee8f4463392f..f6ed5cf47aef91 100644
--- a/tensorflow/core/kernels/batching_util/batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/batch_scheduler.h
@@ -34,6 +34,7 @@ limitations under the License.
 #include <deque>
 #include <functional>
 #include <memory>
+#include <optional>
 #include <utility>
 #include <vector>
 
@@ -87,8 +88,16 @@ class TaskQueue {
  public:
   TaskQueue() = default;
 
-  // Appends a task to the end of the queue.
-  void AddTask(std::unique_ptr<TaskType> task);
+  struct TaskWrapper {
+    std::unique_ptr<TaskType> task;
+    uint64 start_time_micros;
+
+    TaskWrapper(std::unique_ptr<TaskType> task, uint64 start_time_micros)
+        : task(std::move(task)), start_time_micros(start_time_micros) {}
+  };
+
+  // Appends a task to the end of the queue with the given start time.
+  void AddTask(std::unique_ptr<TaskType> task, uint64 start_time_micros);
 
   // Removes a task from the front of the queue, i.e., the oldest task in the
   // queue.
@@ -99,6 +108,10 @@ class TaskQueue {
   // argument.
   std::vector<std::unique_ptr<TaskType>> RemoveTask(int size);
 
+  // Returns the start time of the earliest task in the queue. If the queue is
+  // empty, return the null value.
+  std::optional<uint64> EarliestTaskStartTime() const;
+
   // Returns true iff the queue contains 0 tasks.
   bool empty() const;
 
@@ -112,7 +125,7 @@ class TaskQueue {
   mutable mutex mu_;
 
   // Tasks in the queue.
-  std::deque<std::unique_ptr<TaskType>> tasks_ TF_GUARDED_BY(mu_);
+  std::deque<TaskWrapper> tasks_ TF_GUARDED_BY(mu_);
 
   // The sum of the sizes of the tasks in 'tasks_'.
   int size_ TF_GUARDED_BY(mu_) = 0;
@@ -126,11 +139,12 @@ class TaskQueue {
 };
 
 template <typename TaskType>
-void TaskQueue<TaskType>::AddTask(std::unique_ptr<TaskType> task) {
+void TaskQueue<TaskType>::AddTask(std::unique_ptr<TaskType> task,
+                                  uint64 start_time_micros) {
   {
     mutex_lock l(mu_);
     size_ += task->size();
-    tasks_.emplace_back(std::move(task));
+    tasks_.emplace_back(std::move(task), start_time_micros);
     empty_.store(false);
   }
 }
@@ -142,7 +156,7 @@ std::unique_ptr<TaskType> TaskQueue<TaskType>::RemoveTask() {
     if (tasks_.empty()) {
       return nullptr;
     }
-    std::unique_ptr<TaskType> task = std::move(tasks_.front());
+    std::unique_ptr<TaskType> task = std::move(tasks_.front().task);
     size_ -= task->size();
     tasks_.pop_front();
     if (tasks_.empty()) {
@@ -164,10 +178,10 @@ std::vector<std::unique_ptr<TaskType>> TaskQueue<TaskType>::RemoveTask(
     int size_lower_bound = size_ - size;
     std::vector<std::unique_ptr<TaskType>> remove_tasks;
     while (!tasks_.empty() &&
-           size_ - static_cast<int>(tasks_.front()->size()) >=
+           size_ - static_cast<int>(tasks_.front().task->size()) >=
                size_lower_bound) {
-      size_ -= static_cast<int>(tasks_.front()->size());
-      remove_tasks.push_back(std::move(tasks_.front()));
+      size_ -= static_cast<int>(tasks_.front().task->size());
+      remove_tasks.push_back(std::move(tasks_.front().task));
       tasks_.pop_front();
       if (tasks_.empty()) {
         empty_.store(true);
@@ -185,6 +199,19 @@ bool TaskQueue<TaskType>::empty() const {
   }
 }
 
+template <typename TaskType>
+std::optional<uint64> TaskQueue<TaskType>::EarliestTaskStartTime() const {
+  {
+    mutex_lock l(mu_);
+
+    if (tasks_.empty()) {
+      return std::nullopt;
+    }
+
+    return tasks_.front().start_time_micros;
+  }
+}
+
 template <typename TaskType>
 int TaskQueue<TaskType>::num_tasks() const {
   {
diff --git a/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc
index c9def0cd49916d..916e709c67015d 100644
--- a/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc
+++ b/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc
@@ -16,7 +16,9 @@ limitations under the License.
 #include "tensorflow/core/kernels/batching_util/batch_scheduler.h"
 
 #include <cstddef>
+#include <cstdint>
 #include <memory>
+#include <optional>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -64,7 +66,7 @@ TEST(TaskQueueTest, EmptyTaskQueue) {
 TEST(TaskQueueTest, AddTaskToTaskQueue) {
   TaskQueue<FakeTask> task_queue;
 
-  task_queue.AddTask(std::make_unique<FakeTask>(1));
+  task_queue.AddTask(std::make_unique<FakeTask>(1), 1);
   EXPECT_FALSE(task_queue.empty());
   EXPECT_EQ(1, task_queue.num_tasks());
   EXPECT_EQ(1, task_queue.size());
@@ -73,17 +75,17 @@ TEST(TaskQueueTest, AddTaskToTaskQueue) {
 TEST(TaskQueueTest, AddTasksToTaskQueue) {
   TaskQueue<FakeTask> task_queue;
 
-  task_queue.AddTask(std::make_unique<FakeTask>(1));
+  task_queue.AddTask(std::make_unique<FakeTask>(1), 1);
   EXPECT_FALSE(task_queue.empty());
   EXPECT_EQ(1, task_queue.num_tasks());
   EXPECT_EQ(1, task_queue.size());
 
-  task_queue.AddTask(std::make_unique<FakeTask>(2));
+  task_queue.AddTask(std::make_unique<FakeTask>(2), 2);
   EXPECT_FALSE(task_queue.empty());
   EXPECT_EQ(2, task_queue.num_tasks());
   EXPECT_EQ(3, task_queue.size());
 
-  task_queue.AddTask(std::make_unique<FakeTask>(3));
+  task_queue.AddTask(std::make_unique<FakeTask>(3), 3);
   EXPECT_FALSE(task_queue.empty());
   EXPECT_EQ(3, task_queue.num_tasks());
   EXPECT_EQ(6, task_queue.size());
@@ -92,7 +94,7 @@ TEST(TaskQueueTest, AddTasksToTaskQueue) {
 TEST(TaskQueueTest, RemoveTaskFromTaskQueueWithSingleTask) {
   TaskQueue<FakeTask> task_queue;
 
-  task_queue.AddTask(std::make_unique<FakeTask>(1));
+  task_queue.AddTask(std::make_unique<FakeTask>(1), 1);
   EXPECT_FALSE(task_queue.empty());
   EXPECT_EQ(1, task_queue.num_tasks());
   EXPECT_EQ(1, task_queue.size());
@@ -107,12 +109,12 @@ TEST(TaskQueueTest, RemoveTaskFromTaskQueueWithSingleTask) {
 TEST(TaskQueueTest, RemoveTaskFromTaskQueueWithMultipleTasks) {
   TaskQueue<FakeTask> task_queue;
 
-  task_queue.AddTask(std::make_unique<FakeTask>(2));
+  task_queue.AddTask(std::make_unique<FakeTask>(2), 1);
   EXPECT_FALSE(task_queue.empty());
   EXPECT_EQ(1, task_queue.num_tasks());
   EXPECT_EQ(2, task_queue.size());
 
-  task_queue.AddTask(std::make_unique<FakeTask>(1));
+  task_queue.AddTask(std::make_unique<FakeTask>(1), 2);
   EXPECT_FALSE(task_queue.empty());
   EXPECT_EQ(2, task_queue.num_tasks());
   EXPECT_EQ(3, task_queue.size());
@@ -127,17 +129,17 @@ TEST(TaskQueueTest, RemoveTaskFromTaskQueueWithMultipleTasks) {
 TEST(TaskQueueTest, RemoveTasksFromTaskQueue) {
   TaskQueue<FakeTask> task_queue;
 
-  task_queue.AddTask(std::make_unique<FakeTask>(1));
+  task_queue.AddTask(std::make_unique<FakeTask>(1), 1);
   EXPECT_FALSE(task_queue.empty());
   EXPECT_EQ(1, task_queue.num_tasks());
   EXPECT_EQ(1, task_queue.size());
 
-  task_queue.AddTask(std::make_unique<FakeTask>(2));
+  task_queue.AddTask(std::make_unique<FakeTask>(2), 2);
   EXPECT_FALSE(task_queue.empty());
   EXPECT_EQ(2, task_queue.num_tasks());
   EXPECT_EQ(3, task_queue.size());
 
-  task_queue.AddTask(std::make_unique<FakeTask>(3));
+  task_queue.AddTask(std::make_unique<FakeTask>(3), 3);
   EXPECT_FALSE(task_queue.empty());
   EXPECT_EQ(3, task_queue.num_tasks());
   EXPECT_EQ(6, task_queue.size());
@@ -155,17 +157,17 @@ TEST(TaskQueueTest, RemoveTasksFromTaskQueue) {
 TEST(TaskQueueTest, RemoveTasksFewerThanArgFromTaskQueue) {
   TaskQueue<FakeTask> task_queue;
 
-  task_queue.AddTask(std::make_unique<FakeTask>(1));
+  task_queue.AddTask(std::make_unique<FakeTask>(1), 1);
   EXPECT_FALSE(task_queue.empty());
   EXPECT_EQ(1, task_queue.num_tasks());
   EXPECT_EQ(1, task_queue.size());
 
-  task_queue.AddTask(std::make_unique<FakeTask>(2));
+  task_queue.AddTask(std::make_unique<FakeTask>(2), 2);
   EXPECT_FALSE(task_queue.empty());
   EXPECT_EQ(2, task_queue.num_tasks());
   EXPECT_EQ(3, task_queue.size());
 
-  task_queue.AddTask(std::make_unique<FakeTask>(3));
+  task_queue.AddTask(std::make_unique<FakeTask>(3), 3);
   EXPECT_FALSE(task_queue.empty());
   EXPECT_EQ(3, task_queue.num_tasks());
   EXPECT_EQ(6, task_queue.size());
@@ -183,17 +185,17 @@ TEST(TaskQueueTest, RemoveTasksFewerThanArgFromTaskQueue) {
 TEST(TaskQueueTest, RemoveAllTasksWhenArgGreaterThanTaskSize) {
   TaskQueue<FakeTask> task_queue;
 
-  task_queue.AddTask(std::make_unique<FakeTask>(1));
+  task_queue.AddTask(std::make_unique<FakeTask>(1), 1);
   EXPECT_FALSE(task_queue.empty());
   EXPECT_EQ(1, task_queue.num_tasks());
   EXPECT_EQ(1, task_queue.size());
 
-  task_queue.AddTask(std::make_unique<FakeTask>(2));
+  task_queue.AddTask(std::make_unique<FakeTask>(2), 2);
   EXPECT_FALSE(task_queue.empty());
   EXPECT_EQ(2, task_queue.num_tasks());
   EXPECT_EQ(3, task_queue.size());
 
-  task_queue.AddTask(std::make_unique<FakeTask>(3));
+  task_queue.AddTask(std::make_unique<FakeTask>(3), 3);
   EXPECT_FALSE(task_queue.empty());
   EXPECT_EQ(3, task_queue.num_tasks());
   EXPECT_EQ(6, task_queue.size());
@@ -208,6 +210,42 @@ TEST(TaskQueueTest, RemoveAllTasksWhenArgGreaterThanTaskSize) {
   EXPECT_EQ(0, task_queue.size());
 }
 
+TEST(TaskQueueTest, EarliestStartTimeWithEmptyQueue) {
+  TaskQueue<FakeTask> task_queue;
+  EXPECT_FALSE(task_queue.EarliestTaskStartTime().has_value());
+}
+
+TEST(TaskQueueTest, EarliestStartTimeWithMultipleTasksInQueue) {
+  TaskQueue<FakeTask> task_queue;
+
+  task_queue.AddTask(std::make_unique<FakeTask>(1), 1);
+  task_queue.AddTask(std::make_unique<FakeTask>(2), 2);
+
+  std::optional<uint64_t> result = task_queue.EarliestTaskStartTime();
+  EXPECT_TRUE(result.has_value());
+  EXPECT_EQ(*result, 1);
+}
+
+TEST(TaskQueueTest, EarliestStartTimeAfterTaskRemoval) {
+  TaskQueue<FakeTask> task_queue;
+
+  task_queue.AddTask(std::make_unique<FakeTask>(1), 1);
+  task_queue.AddTask(std::make_unique<FakeTask>(2), 2);
+  task_queue.AddTask(std::make_unique<FakeTask>(3), 3);
+
+  std::optional<uint64_t> result = task_queue.EarliestTaskStartTime();
+  EXPECT_TRUE(result.has_value());
+  EXPECT_EQ(*result, 1);
+
+  EXPECT_THAT(task_queue.RemoveTask(3),
+              ElementsAre(Pointee(Property(&FakeTask::size, Eq(1))),
+                          Pointee(Property(&FakeTask::size, Eq(2)))));
+
+  result = task_queue.EarliestTaskStartTime();
+  EXPECT_TRUE(result.has_value());
+  EXPECT_EQ(*result, 3);
+}
+
 TEST(BatchTest, Basic) {
   Batch<FakeTask> batch;
 
diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
index d1d8551c250bea..7d0c2d66354e00 100644
--- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h
@@ -31,6 +31,7 @@ limitations under the License.
 
 #include "absl/log/check.h"
 #include "absl/status/status.h"
+#include "absl/strings/str_format.h"
 #include "absl/time/clock.h"
 #include "tensorflow/core/kernels/batching_util/batch_input_task.h"
 #include "tensorflow/core/kernels/batching_util/batch_scheduler.h"
@@ -49,6 +50,7 @@ limitations under the License.
 #include "tensorflow/core/profiler/lib/context_types.h"
 #include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/core/profiler/lib/traceme_encode.h"
+#include "tsl/platform/criticality.h"
 #include "tsl/platform/errors.h"
 
 namespace tensorflow {
@@ -252,8 +254,12 @@ class SharedBatchScheduler
       size_t input_batch_size_limit = 0;
       // See QueueOptions.max_enqueued_batches
       size_t max_enqueued_batches = 0;
+      // See QueueOptions.allowed_batch_sizes
+      std::vector<int32> allowed_batch_sizes;
     };
-    // A subset of queue options for high priority input.
+    // A subset of queue options for high priority input. These options are
+    // currently not being used in favor of the equivalents options at the
+    // QueueOptions level.
     PriorityQueueOptions high_priority_queue_options;
     // A subset of queue options for low priority input.
     PriorityQueueOptions low_priority_queue_options;
@@ -436,6 +442,15 @@ class Queue {
   // Same as IsEmpty(), but assumes the caller already holds a lock on 'mu_'.
   bool IsEmptyInternal() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
+  // Returns true iff the task is a low priority task based on the queue option.
+  bool IsLowPriorityTask(std::unique_ptr<TaskType>* task);
+
+  // Implementation of ScheduleWithoutOrEagerSplit above. Enqueues `task` as it
+  // is or split it inline (eagerly) to form batches to be processed by
+  // `Queue<TaskType>::ProcessBatch`
+  Status ScheduleWithoutOrEagerSplitImpl(std::unique_ptr<TaskType>* task)
+      TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
   // Closes the open batch residing at the back of std::deque, and inserts a
   // fresh open batch behind it.
   void StartNewBatch() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
@@ -455,6 +470,12 @@ class Queue {
   bool IsOpenBatchSchedulableAfterEagerSplit() const
       TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
+  // Determines whether the low priority tasks in `low_priority_tasks_` can form
+  // a batch on their own. If yes, returns a batch that is ready to be
+  // processed. Otherwise, returns an empty unique_ptr.
+  std::unique_ptr<Batch<TaskType>> ScheduleLowPriorityBatch()
+      TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
   // Same as SchedulingCapacity(), but assumes the caller already holds a
   // lock on 'mu_'.
   size_t SchedulingCapacityInternal() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
@@ -465,6 +486,15 @@ class Queue {
   Status ValidateBatchTaskQueueCapacity(TaskType* task) const
       TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
+  // Returns an error if the low priority task queue doesn't have capacity for
+  // this task using the low priority batch options. Since the low priority
+  // tasks are not batched until they get scheduled, it only checks that a
+  // single task does not it exceed input batch size limit and the total size of
+  // the tasks in the queue does not exceed the max batch size * max enqueued
+  // batch sizes.
+  Status ValidateLowPriorityTaskQueueCapacity(const TaskType& task) const
+      TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
   // The task size of the last batch in the queue.
   size_t tail_batch_task_size() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
@@ -874,11 +904,6 @@ Queue<TaskType>::~Queue() {
 
 template <typename TaskType>
 Status Queue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
-  if ((*task)->size() > options_.input_batch_size_limit) {
-    return errors::InvalidArgument("Task size ", (*task)->size(),
-                                   " is larger than maximum input batch size ",
-                                   options_.input_batch_size_limit);
-  }
   if (options_.enable_lazy_split) {
     return ScheduleWithLazySplit(std::move(task));
   }
@@ -949,6 +974,72 @@ Status Queue<TaskType>::ScheduleWithLazySplit(std::unique_ptr<TaskType>* task) {
   return absl::OkStatus();
 }
 
+template <typename TaskType>
+bool Queue<TaskType>::IsLowPriorityTask(std::unique_ptr<TaskType>* task) {
+  if (!options_.enable_priority_queue) {
+    return false;
+  }
+
+  // The criticality is defined only when the task is a derived class of
+  // BatchTask.
+  if constexpr (std::is_base_of_v<BatchTask, TaskType>) {
+    // TODO(b/316379576): Make the criticality and priority configurable.
+    return ((*task)->criticality() ==
+                tsl::criticality::Criticality::kSheddablePlus ||
+            (*task)->criticality() ==
+                tsl::criticality::Criticality::kSheddable);
+  }
+
+  // Otherwise, consider it a high priority task and return false.
+  return false;
+}
+
+template <typename TaskType>
+Status Queue<TaskType>::ScheduleWithoutOrEagerSplitImpl(
+    std::unique_ptr<TaskType>* task) {
+  // TODO(b/161857471):
+  // Add test coverage when when concurrent incoming batches arrives and
+  // use up all queue capacity.
+  TF_RETURN_IF_ERROR(ValidateBatchTaskQueueCapacity((*task).get()));
+
+  std::deque<std::unique_ptr<Batch<TaskType>>>& batches = GetBatches();
+
+  const int64_t open_batch_remaining_slot =
+      max_execution_batch_size() - batches.back()->size();
+
+  const int64_t input_task_size = (*task)->size();
+
+  std::vector<std::unique_ptr<TaskType>> output_tasks;
+
+  if (input_task_size <= open_batch_remaining_slot ||
+      !options_.enable_large_batch_splitting) {
+    // This is the fast path when input doesn't need to be split.
+    output_tasks.push_back(std::move(*task));
+  } else {
+    TF_RETURN_IF_ERROR(SplitInputBatchIntoSubtasks(task, &output_tasks));
+  }
+
+  for (int i = 0; i < output_tasks.size(); ++i) {
+    if (batches.back()->size() + output_tasks[i]->size() >
+        max_execution_batch_size()) {
+      StartNewBatch();
+    }
+    if (batches.back()->empty()) {
+      open_batch_start_time_micros_ = env_->NowMicros();
+    }
+    profiler::TraceMeProducer trace_me(
+        [&output_tasks, i] {
+          return profiler::TraceMeEncode("ScheduleOutputTask",
+                                         {{"size", output_tasks[i]->size()}});
+        },
+        profiler::ContextType::kSharedBatchScheduler,
+        batches.back()->traceme_context_id());
+    batches.back()->AddTask(std::move(output_tasks[i]));
+  }
+
+  return absl::OkStatus();
+}
+
 // TODO(b/194294263):
 // Merge `ScheduleWithoutOrEagerSplit` and `ScheduleWithLazySplit` into
 // `Schedule`.
@@ -969,48 +1060,19 @@ Status Queue<TaskType>::ScheduleWithoutOrEagerSplit(
 
     DCHECK(!closed_);
 
-    // TODO(b/161857471):
-    // Add test coverage when when concurrent incoming batches arrives and
-    // use up all queue capacity.
-    TF_RETURN_IF_ERROR(ValidateBatchTaskQueueCapacity((*task).get()));
-
-    std::deque<std::unique_ptr<Batch<TaskType>>>& batches = GetBatches();
-
-    const int64_t open_batch_remaining_slot =
-        max_execution_batch_size() - batches.back()->size();
-
-    const int64_t input_task_size = (*task)->size();
-
-    std::vector<std::unique_ptr<TaskType>> output_tasks;
-
-    if (input_task_size <= open_batch_remaining_slot ||
-        !large_batch_splitting) {
-      // This is the fast path when input doesn't need to be split.
-      output_tasks.push_back(std::move(*task));
+    if (IsLowPriorityTask(task)) {
+      // Insert the task to the low priority task queue instead of the high
+      // priority batch queue below.
+      TF_RETURN_IF_ERROR(ValidateLowPriorityTaskQueueCapacity(**task));
+      low_priority_tasks_.AddTask(std::move(*task), env_->NowMicros());
     } else {
-      TF_RETURN_IF_ERROR(SplitInputBatchIntoSubtasks(task, &output_tasks));
-    }
-
-    for (int i = 0; i < output_tasks.size(); ++i) {
-      if (batches.back()->size() + output_tasks[i]->size() >
-          max_execution_batch_size()) {
-        StartNewBatch();
-      }
-      if (batches.back()->empty()) {
-        open_batch_start_time_micros_ = env_->NowMicros();
-      }
-      profiler::TraceMeProducer trace_me(
-          [&output_tasks, i] {
-            return profiler::TraceMeEncode("ScheduleOutputTask",
-                                           {{"size", output_tasks[i]->size()}});
-          },
-          profiler::ContextType::kSharedBatchScheduler,
-          batches.back()->traceme_context_id());
-      batches.back()->AddTask(std::move(output_tasks[i]));
+      TF_RETURN_IF_ERROR(ScheduleWithoutOrEagerSplitImpl(task));
     }
 
+    // Check if the batch queue has a schedulable batch and mark it schedulable
+    // if it not already marked.
     if (!schedulable_batch_) {
-      if (batches.size() > 1 || IsOpenBatchSchedulable()) {
+      if (GetBatches().size() > 1 || IsOpenBatchSchedulable()) {
         schedulable_batch_ = true;
         notify_of_schedulable_batch = true;
       }
@@ -1038,7 +1100,7 @@ size_t Queue<TaskType>::NumEnqueuedTasks() const {
   for (const auto& batch : GetBatches()) {
     num_enqueued_tasks += batch->num_tasks();
   }
-  return num_enqueued_tasks;
+  return num_enqueued_tasks + low_priority_tasks_.num_tasks();
 }
 
 template <typename TaskType>
@@ -1063,6 +1125,14 @@ size_t Queue<TaskType>::SchedulingCapacityInternal() const {
 
 template <typename TaskType>
 Status Queue<TaskType>::ValidateBatchTaskQueueCapacity(TaskType* task) const {
+  // Check if the task size is larger than the batch size limit, regardless of
+  // the batch capacity.
+  if (task->size() > options_.input_batch_size_limit) {
+    return absl::InvalidArgumentError(absl::StrFormat(
+        "Task size %d is larger than maximum input batch size %d", task->size(),
+        options_.input_batch_size_limit));
+  }
+
   // Queue creation requires that `enable_large_batch_splitting` is true
   // when `enable_lazy_split` is true, so this covers both eager split and
   // lazy split.
@@ -1102,6 +1172,36 @@ Status Queue<TaskType>::ValidateBatchTaskQueueCapacity(TaskType* task) const {
   return absl::OkStatus();
 }
 
+template <typename TaskType>
+Status Queue<TaskType>::ValidateLowPriorityTaskQueueCapacity(
+    const TaskType& task) const {
+  // Unlike the high priority batch capacity validation where having only
+  // input_batch_size_limit without max_execution_batch_size is allowed, it
+  // doesn't have the backward compatibility check and always assume that
+  // max_execution_batch_size is present.
+  if (task.size() >
+      options_.low_priority_queue_options.max_execution_batch_size) {
+    return absl::UnavailableError(absl::StrFormat(
+        "The low priority task queue to which this task was submitted has "
+        "max_execution_batch_size=%d and the task size is %d",
+        options_.low_priority_queue_options.max_execution_batch_size,
+        task.size()));
+  }
+  if (low_priority_tasks_.size() + task.size() >
+      options_.low_priority_queue_options.max_enqueued_batches *
+          options_.low_priority_queue_options.max_execution_batch_size) {
+    return absl::UnavailableError(absl::StrFormat(
+        "The low priority task queue to which this task was submitted does not "
+        "have the capcity to handle this task; currently the low priority "
+        "queue has %d tasks enqueued and the submitted task size is %d while "
+        "max_enqueued_batches=%d and max_execution_batch_size=%d",
+        low_priority_tasks_.size(), task.size(),
+        options_.low_priority_queue_options.max_enqueued_batches,
+        options_.low_priority_queue_options.max_execution_batch_size));
+  }
+  return absl::OkStatus();
+}
+
 template <typename TaskType>
 std::unique_ptr<Batch<TaskType>>
 Queue<TaskType>::ScheduleBatchWithEagerSplit() {
@@ -1120,14 +1220,26 @@ Queue<TaskType>::ScheduleBatchWithEagerSplit() {
 
     if (batches.size() >= 2) {
       // There is at least one closed batch that is ready to be scheduled.
-      ++num_batches_being_processed_;
       batch_to_schedule = std::move(batches.front());
       batches.pop_front();
-    } else {
+    }
+
+    if (batch_to_schedule == nullptr) {
+      // If there was no schedulable batch in the batch queue, try to schedule
+      // from the low priority task queue.
+      batch_to_schedule = ScheduleLowPriorityBatch();
+    }
+
+    if (batch_to_schedule == nullptr) {
+      // There is neither high nor low priority batch that can be scheduled,
+      // mark the condition false and return the nullptr.
       schedulable_batch_ = false;
+      return batch_to_schedule;
     }
-  }
 
+    // Otherwise, increment the counter and return the batch.
+    ++num_batches_being_processed_;
+  }
   return batch_to_schedule;
 }
 
@@ -1239,7 +1351,7 @@ bool Queue<TaskType>::IsEmptyInternal() const {
   }
   const std::deque<std::unique_ptr<Batch<TaskType>>>& batches = GetBatches();
   return num_batches_being_processed_ == 0 && batches.size() == 1 &&
-         batches.back()->empty();
+         batches.back()->empty() && low_priority_tasks_.empty();
 }
 
 template <typename TaskType>
@@ -1292,6 +1404,38 @@ bool Queue<TaskType>::IsOpenBatchSchedulable() const {
              open_batch_start_time_micros_ + options_.batch_timeout_micros;
 }
 
+template <typename TaskType>
+std::unique_ptr<Batch<TaskType>> Queue<TaskType>::ScheduleLowPriorityBatch() {
+  std::unique_ptr<Batch<TaskType>> batch_to_schedule;
+  if (!options_.enable_priority_queue || low_priority_tasks_.empty()) {
+    // Return early if priority queue is disabled or there is no low priority
+    // task.
+    return batch_to_schedule;
+  }
+  if (env_->NowMicros() <
+          *low_priority_tasks_.EarliestTaskStartTime() +
+              options_.low_priority_queue_options.batch_timeout_micros &&
+      low_priority_tasks_.size() <
+          options_.low_priority_queue_options.max_execution_batch_size) {
+    // Return early if the low priority tasks can't fill up the max batch size
+    // and the earliest task didn't time out.
+    return batch_to_schedule;
+  }
+  if (!GetBatches().empty() && !GetBatches().front()->empty()) {
+    // Return early if there is a non-empty high priority batch in the queue.
+    return batch_to_schedule;
+  }
+
+  batch_to_schedule = std::make_unique<Batch<TaskType>>();
+  for (std::unique_ptr<TaskType>& task : low_priority_tasks_.RemoveTask(
+           options_.low_priority_queue_options.max_execution_batch_size)) {
+    batch_to_schedule->AddTask(std::move(task));
+  }
+  batch_to_schedule->Close();
+
+  return batch_to_schedule;
+}
+
 template <typename TaskType>
 size_t Queue<TaskType>::tail_batch_task_size() const {
   if (options_.enable_lazy_split) {
diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc
index a703028a5e6234..0e890a2931c92d 100644
--- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc
+++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/batching_util/shared_batch_scheduler.h"
 
+#include <cstddef>
 #include <memory>
 #include <string>
 #include <thread>  // NOLINT(build/c++11)
@@ -24,6 +25,7 @@ limitations under the License.
 
 #include "absl/base/call_once.h"
 #include "absl/container/fixed_array.h"
+#include "absl/status/status.h"
 #include "absl/time/time.h"
 #include "tensorflow/core/kernels/batching_util/batch_scheduler.h"
 #include "tensorflow/core/kernels/batching_util/fake_clock_env.h"
@@ -37,6 +39,8 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/protobuf/error_codes.pb.h"
+#include "tsl/lib/core/status_test_util.h"
+#include "tsl/platform/criticality.h"
 
 namespace tensorflow {
 namespace serving {
@@ -46,19 +50,43 @@ using ::testing::HasSubstr;
 
 class FakeTask : public BatchTask {
  public:
-  explicit FakeTask(size_t size) : size_(size) {}
+  explicit FakeTask(size_t size, tsl::criticality::Criticality criticality =
+                                     tsl::criticality::Criticality::kCritical)
+      : size_(size), criticality_(criticality) {}
 
   ~FakeTask() override = default;
 
   size_t size() const override { return size_; }
 
+  tsl::criticality::Criticality criticality() const override {
+    return criticality_;
+  }
+
  private:
   const size_t size_;
+  const tsl::criticality::Criticality criticality_;
 
   FakeTask(const FakeTask&) = delete;
   void operator=(const FakeTask&) = delete;
 };
 
+// Fake task taht doesn't inherit BatchTask and doesn't define criticality. The
+// shared batch scheduler should still work with this task.
+class FakeTaskWithoutCriticality {
+ public:
+  explicit FakeTaskWithoutCriticality(size_t size) : size_(size) {}
+
+  ~FakeTaskWithoutCriticality() = default;
+
+  size_t size() const { return size_; }
+
+ private:
+  const size_t size_;
+
+  FakeTaskWithoutCriticality(const FakeTaskWithoutCriticality&) = delete;
+  void operator=(const FakeTaskWithoutCriticality&) = delete;
+};
+
 using Queue = BatchScheduler<FakeTask>;
 using Scheduler = SharedBatchScheduler<FakeTask>;
 using QueueOptions = Scheduler::QueueOptions;
@@ -67,10 +95,26 @@ using SplitFunc =
                          int first_output_task_size, int input_batch_size_limit,
                          std::vector<std::unique_ptr<FakeTask>>* output_tasks)>;
 
-// Creates a FakeTask of size 'task_size', and calls 'scheduler->Schedule()' on
-// that task. Returns the resulting status.
-Status ScheduleTask(size_t task_size, BatchScheduler<FakeTask>* scheduler) {
-  std::unique_ptr<FakeTask> task(new FakeTask(task_size));
+// Creates a FakeTask of size 'task_size' and 'criticality', and calls
+// 'scheduler->Schedule()' on that task. Returns the resulting status.
+// 'criticality' defaults to kCritical.
+Status ScheduleTask(size_t task_size, BatchScheduler<FakeTask>* scheduler,
+                    tsl::criticality::Criticality criticality =
+                        tsl::criticality::Criticality::kCritical) {
+  std::unique_ptr<FakeTask> task(new FakeTask(task_size, criticality));
+  Status status = scheduler->Schedule(&task);
+  // Schedule() should have consumed 'task' iff it returned Status::OK.
+  CHECK_EQ(status.ok(), task == nullptr);
+  return status;
+}
+
+// Helper function similar to the function above. Creates a FakeTask of size
+// 'task_size' and calls 'scheduler->Schedule()' on that task. Returns the
+// resulting status.
+Status ScheduleTaskWithoutCriticality(
+    size_t task_size, BatchScheduler<FakeTaskWithoutCriticality>* scheduler) {
+  std::unique_ptr<FakeTaskWithoutCriticality> task(
+      new FakeTaskWithoutCriticality(task_size));
   Status status = scheduler->Schedule(&task);
   // Schedule() should have consumed 'task' iff it returned Status::OK.
   CHECK_EQ(status.ok(), task == nullptr);
@@ -349,6 +393,101 @@ TEST_P(SharedBatchSchedulerTest,
   EXPECT_TRUE(queue_1_callback_called);
 }
 
+// The task in the shared batch scheduler template parameter does not define
+// criticality priority queue. It should work as if the priority queue is
+// disabled.
+TEST_P(
+    SharedBatchSchedulerTest,
+    CallbackWithTaskVectorOkWithPriorityQueueEnabledWithCriticalitylessTask) {
+  bool queue_0_callback_called = false;
+  auto queue_0_callback =
+      [&queue_0_callback_called](
+          std::unique_ptr<Batch<FakeTaskWithoutCriticality>> batch,
+          std::vector<std::unique_ptr<FakeTaskWithoutCriticality>> tasks) {
+        queue_0_callback_called = true;
+        ASSERT_TRUE(batch->IsClosed());
+        ASSERT_EQ(3, batch->num_tasks());
+        EXPECT_EQ(1, batch->task(0).size());
+        EXPECT_EQ(3, batch->task(1).size());
+        EXPECT_EQ(5, batch->task(2).size());
+        EXPECT_EQ(0, tasks.size());
+      };
+  bool queue_1_callback_called = false;
+  auto queue_1_callback =
+      [&queue_1_callback_called](
+          std::unique_ptr<Batch<FakeTaskWithoutCriticality>> batch,
+          std::vector<std::unique_ptr<FakeTaskWithoutCriticality>> tasks) {
+        queue_1_callback_called = true;
+        ASSERT_TRUE(batch->IsClosed());
+        ASSERT_EQ(2, batch->num_tasks());
+        EXPECT_EQ(2, batch->task(0).size());
+        EXPECT_EQ(4, batch->task(1).size());
+        EXPECT_EQ(0, tasks.size());
+      };
+  {
+    SharedBatchScheduler<FakeTaskWithoutCriticality>::Options options;
+    options.num_batch_threads = 3;
+    options.env = Env::Default();
+
+    std::shared_ptr<SharedBatchScheduler<FakeTaskWithoutCriticality>>
+        shared_batch_scheduler;
+    TF_CHECK_OK(SharedBatchScheduler<FakeTaskWithoutCriticality>::Create(
+        options, &shared_batch_scheduler));
+
+    // Create two queues.
+
+    SharedBatchScheduler<FakeTaskWithoutCriticality>::QueueOptions
+        queue_options;
+    queue_options.input_batch_size_limit = 10;
+    queue_options.batch_timeout_micros = 1000 * 1000;
+    queue_options.max_enqueued_batches = 2;
+    queue_options.enable_large_batch_splitting = enable_input_batch_split();
+    queue_options.split_input_task_func =
+        [](std::unique_ptr<FakeTaskWithoutCriticality>* input_task,
+           int open_batch_remaining_slot, int max_batch_size,
+           std::vector<std::unique_ptr<FakeTaskWithoutCriticality>>*
+               output_tasks) -> Status {
+      std::unique_ptr<FakeTaskWithoutCriticality> owned_input_task =
+          std::move(*input_task);
+      const int input_task_size = owned_input_task->size();
+
+      const internal::InputSplitMetadata input_split_metadata(
+          input_task_size, open_batch_remaining_slot, max_batch_size);
+
+      const absl::FixedArray<int> task_sizes =
+          input_split_metadata.task_sizes();
+      const int num_batches = task_sizes.size();
+
+      output_tasks->resize(num_batches);
+      for (int i = 0; i < num_batches; i++) {
+        (*output_tasks)[i] =
+            std::make_unique<FakeTaskWithoutCriticality>(task_sizes[i]);
+      }
+
+      return absl::OkStatus();
+    };
+    queue_options.enable_lazy_split = enable_lazy_split();
+    queue_options.max_execution_batch_size = 10;
+    queue_options.enable_priority_queue = true;
+
+    std::unique_ptr<BatchScheduler<FakeTaskWithoutCriticality>> queue_0;
+    TF_CHECK_OK(shared_batch_scheduler->AddQueue(queue_options,
+                                                 queue_0_callback, &queue_0));
+    std::unique_ptr<BatchScheduler<FakeTaskWithoutCriticality>> queue_1;
+    TF_CHECK_OK(shared_batch_scheduler->AddQueue(queue_options,
+                                                 queue_1_callback, &queue_1));
+
+    // Submit tasks to the two queues.
+    TF_ASSERT_OK(ScheduleTaskWithoutCriticality(1, queue_0.get()));
+    TF_ASSERT_OK(ScheduleTaskWithoutCriticality(2, queue_1.get()));
+    TF_ASSERT_OK(ScheduleTaskWithoutCriticality(3, queue_0.get()));
+    TF_ASSERT_OK(ScheduleTaskWithoutCriticality(4, queue_1.get()));
+    TF_ASSERT_OK(ScheduleTaskWithoutCriticality(5, queue_0.get()));
+  }
+  EXPECT_TRUE(queue_0_callback_called);
+  EXPECT_TRUE(queue_1_callback_called);
+}
+
 TEST_P(SharedBatchSchedulerTest, ObeyBatchSizeConstraint) {
   // Set up a fake clock, which only advances when we explicitly tell it to.
   test_util::FakeClockEnv env(Env::Default());
@@ -912,6 +1051,279 @@ INSTANTIATE_TEST_SUITE_P(
                       std::make_tuple(/*enable_input_batch_split=*/false,
                                       /*enable_lazy_split=*/false)));
 
+using SharedBatchSchedulerPriorityTest = SharedBatchSchedulerTest;
+
+TEST_P(SharedBatchSchedulerPriorityTest,
+       InvalidLowPriorityTaskWithPriorityQueueEnabled) {
+  bool queue_callback_called = false;
+  auto queue_callback = [&queue_callback_called](
+                            std::unique_ptr<Batch<FakeTask>> batch,
+                            std::vector<std::unique_ptr<FakeTask>> tasks) {
+    queue_callback_called = true;
+  };
+
+  {
+    std::shared_ptr<Scheduler> scheduler =
+        CreateSharedBatchScheduler(/*num_batch_threads=*/3);
+
+    QueueOptions queue_options = CreateQueueOptions(
+        /*max_execution_batch_size=*/100, /*input_batch_size_limit=*/100,
+        /*batch_timeout_micros=*/1 * 1000 * 1000, /*max_enqueued_batches=*/2,
+        /*enable_priority_queue=*/true);
+    queue_options.low_priority_queue_options.max_execution_batch_size = 1;
+    queue_options.low_priority_queue_options.batch_timeout_micros =
+        1 * 1000 * 1000;
+    queue_options.low_priority_queue_options.input_batch_size_limit = 1;
+    queue_options.low_priority_queue_options.max_enqueued_batches = 2;
+    std::unique_ptr<Queue> queue =
+        CreateQueue(scheduler, queue_options, queue_callback);
+
+    EXPECT_THAT(
+        ScheduleTask(10, queue.get(),
+                     tsl::criticality::Criticality::kSheddablePlus),
+        testing::StatusIs(
+            absl::StatusCode::kUnavailable,
+            HasSubstr(
+                "The low priority task queue to which this task was submitted "
+                "has max_execution_batch_size=1 and the task size is 10")));
+  }
+  EXPECT_FALSE(queue_callback_called);
+}
+
+TEST_P(SharedBatchSchedulerPriorityTest,
+       InvalidLowPriorityTaskWithQueueFullWithPriorityQueueEnabled) {
+  Notification processing, proceed;
+  auto queue_callback = [&processing, &proceed](
+                            std::unique_ptr<Batch<FakeTask>> batch,
+                            std::vector<std::unique_ptr<FakeTask>> tasks) {
+    if (!processing.HasBeenNotified()) {
+      processing.Notify();
+    }
+    proceed.WaitForNotification();
+  };
+
+  std::shared_ptr<Scheduler> scheduler =
+      CreateSharedBatchScheduler(/*num_batch_threads=*/1);
+
+  QueueOptions queue_options = CreateQueueOptions(
+      /*max_execution_batch_size=*/100, /*input_batch_size_limit=*/100,
+      /*batch_timeout_micros=*/1 * 1000 * 1000, /*max_enqueued_batches=*/2,
+      /*enable_priority_queue=*/true);
+  queue_options.low_priority_queue_options.max_execution_batch_size = 10;
+  queue_options.low_priority_queue_options.batch_timeout_micros =
+      1 * 1000 * 1000;
+  queue_options.low_priority_queue_options.input_batch_size_limit = 10;
+  queue_options.low_priority_queue_options.max_enqueued_batches = 2;
+  std::unique_ptr<Queue> queue =
+      CreateQueue(scheduler, queue_options, queue_callback);
+
+  // Schedule one task and block the thread.
+  TF_ASSERT_OK(ScheduleTask(5, queue.get(),
+                            tsl::criticality::Criticality::kCriticalPlus));
+  TF_ASSERT_OK(ScheduleTask(5, queue.get(),
+                            tsl::criticality::Criticality::kSheddablePlus));
+  processing.WaitForNotification();
+  ASSERT_EQ(0, queue->NumEnqueuedTasks());
+
+  // Adding tasks up to size 20 should be fine.
+  TF_ASSERT_OK(ScheduleTask(10, queue.get(),
+                            tsl::criticality::Criticality::kSheddablePlus));
+  ASSERT_EQ(1, queue->NumEnqueuedTasks());
+  TF_ASSERT_OK(ScheduleTask(10, queue.get(),
+                            tsl::criticality::Criticality::kSheddablePlus));
+  ASSERT_EQ(2, queue->NumEnqueuedTasks());
+
+  // Adding one more task should result in an error.
+  EXPECT_THAT(
+      ScheduleTask(1, queue.get(),
+                   tsl::criticality::Criticality::kSheddablePlus),
+      testing::StatusIs(
+          absl::StatusCode::kUnavailable,
+          HasSubstr("The low priority task queue to which this task was "
+                    "submitted does not have the capcity to handle this task; "
+                    "currently the low priority queue has 20 tasks enqueued "
+                    "and the submitted task size is 1 while "
+                    "max_enqueued_batches=2 and max_execution_batch_size=10")));
+
+  // Unblock the thread.
+  proceed.Notify();
+}
+
+TEST_P(SharedBatchSchedulerPriorityTest,
+       CallbackWithTaskVectorOkWithPriorityQueueEnabledWithPrioritySet) {
+  bool queue_callback_called = false;
+  auto queue_callback = [&queue_callback_called](
+                            std::unique_ptr<Batch<FakeTask>> batch,
+                            std::vector<std::unique_ptr<FakeTask>> tasks) {
+    queue_callback_called = true;
+    ASSERT_TRUE(batch->IsClosed());
+    ASSERT_EQ(2, batch->num_tasks());
+    EXPECT_EQ(1, batch->task(0).size());
+    EXPECT_EQ(3, batch->task(1).size());
+    EXPECT_EQ(1, tasks.size());
+    EXPECT_EQ(5, tasks[0]->size());
+  };
+
+  {
+    std::shared_ptr<Scheduler> scheduler =
+        CreateSharedBatchScheduler(/*num_batch_threads=*/3);
+
+    // Create a queue with the priority queue enabled.
+    QueueOptions queue_options = CreateQueueOptions(
+        /*max_execution_batch_size=*/10, /*input_batch_size_limit=*/10,
+        /*batch_timeout_micros=*/1 * 1000 * 1000, /*max_enqueued_batches=*/2,
+        /*enable_priority_queue=*/true);
+    queue_options.low_priority_queue_options.max_execution_batch_size = 10;
+    queue_options.low_priority_queue_options.batch_timeout_micros =
+        1 * 1000 * 1000;
+    queue_options.low_priority_queue_options.input_batch_size_limit = 10;
+    queue_options.low_priority_queue_options.max_enqueued_batches = 2;
+    std::unique_ptr<Queue> queue =
+        CreateQueue(scheduler, queue_options, queue_callback);
+
+    // Submit tasks to the queue.
+    TF_ASSERT_OK(ScheduleTask(1, queue.get(),
+                              tsl::criticality::Criticality::kCriticalPlus));
+    TF_ASSERT_OK(ScheduleTask(3, queue.get(),
+                              tsl::criticality::Criticality::kCriticalPlus));
+    TF_ASSERT_OK(ScheduleTask(5, queue.get(),
+                              tsl::criticality::Criticality::kSheddable));
+  }
+  EXPECT_TRUE(queue_callback_called);
+}
+
+TEST_P(SharedBatchSchedulerPriorityTest,
+       CallbackWithTaskVectorOkWithPriorityQueueDisabledWithPrioritySet) {
+  bool queue_callback_called = false;
+  auto queue_callback = [&queue_callback_called](
+                            std::unique_ptr<Batch<FakeTask>> batch,
+                            std::vector<std::unique_ptr<FakeTask>> tasks) {
+    queue_callback_called = true;
+    ASSERT_TRUE(batch->IsClosed());
+    ASSERT_EQ(3, batch->num_tasks());
+    EXPECT_EQ(1, batch->task(0).size());
+    EXPECT_EQ(3, batch->task(1).size());
+    EXPECT_EQ(5, batch->task(2).size());
+    EXPECT_EQ(0, tasks.size());
+  };
+
+  {
+    std::shared_ptr<Scheduler> scheduler =
+        CreateSharedBatchScheduler(/*num_batch_threads=*/3);
+
+    // Create a queue with the priority queue disabled.
+    const QueueOptions queue_options = CreateQueueOptions(
+        /*max_execution_batch_size=*/10, /*input_batch_size_limit=*/10,
+        /*batch_timeout_micros=*/1 * 1000 * 1000, /*max_enqueued_batches=*/2,
+        /*enable_priority_queue=*/false);
+    std::unique_ptr<Queue> queue =
+        CreateQueue(scheduler, queue_options, queue_callback);
+
+    // Submit tasks to the queue.
+    TF_ASSERT_OK(ScheduleTask(1, queue.get(),
+                              tsl::criticality::Criticality::kCriticalPlus));
+    TF_ASSERT_OK(ScheduleTask(3, queue.get(),
+                              tsl::criticality::Criticality::kCriticalPlus));
+    TF_ASSERT_OK(ScheduleTask(5, queue.get(),
+                              tsl::criticality::Criticality::kSheddable));
+  }
+  EXPECT_TRUE(queue_callback_called);
+}
+
+TEST_P(SharedBatchSchedulerPriorityTest,
+       LowPriorityTaskOnlyAtMaxBatchSizeWithPriorityQueueEnabled) {
+  bool queue_callback_called = false;
+  auto queue_callback = [&queue_callback_called](
+                            std::unique_ptr<Batch<FakeTask>> batch,
+                            std::vector<std::unique_ptr<FakeTask>> tasks) {
+    queue_callback_called = true;
+    ASSERT_TRUE(batch->IsClosed());
+    ASSERT_EQ(3, batch->num_tasks());
+    EXPECT_EQ(1, batch->task(0).size());
+    EXPECT_EQ(3, batch->task(1).size());
+    EXPECT_EQ(5, batch->task(2).size());
+    EXPECT_TRUE(tasks.empty());
+  };
+
+  {
+    std::shared_ptr<Scheduler> scheduler =
+        CreateSharedBatchScheduler(/*num_batch_threads=*/3);
+
+    QueueOptions queue_options = CreateQueueOptions(
+        /*max_execution_batch_size=*/100, /*input_batch_size_limit=*/100,
+        /*batch_timeout_micros=*/1 * 1000 * 1000, /*max_enqueued_batches=*/2,
+        /*enable_priority_queue=*/true);
+    queue_options.low_priority_queue_options.max_execution_batch_size = 9;
+    queue_options.low_priority_queue_options.batch_timeout_micros =
+        1 * 1000 * 1000;
+    queue_options.low_priority_queue_options.input_batch_size_limit = 10;
+    queue_options.low_priority_queue_options.max_enqueued_batches = 2;
+    std::unique_ptr<Queue> queue =
+        CreateQueue(scheduler, queue_options, queue_callback);
+
+    // Submit low priority tasks to fill up the max batch size.
+    TF_ASSERT_OK(ScheduleTask(1, queue.get(),
+                              tsl::criticality::Criticality::kSheddablePlus));
+    TF_ASSERT_OK(ScheduleTask(3, queue.get(),
+                              tsl::criticality::Criticality::kSheddablePlus));
+    TF_ASSERT_OK(ScheduleTask(5, queue.get(),
+                              tsl::criticality::Criticality::kSheddable));
+  }
+  EXPECT_TRUE(queue_callback_called);
+}
+
+TEST_P(SharedBatchSchedulerPriorityTest,
+       LowPriorityTaskOnlyAtTimeoutWithPriorityQueueEnabled) {
+  bool queue_callback_called = false;
+  auto queue_callback = [&queue_callback_called](
+                            std::unique_ptr<Batch<FakeTask>> batch,
+                            std::vector<std::unique_ptr<FakeTask>> tasks) {
+    queue_callback_called = true;
+    ASSERT_TRUE(batch->IsClosed());
+    ASSERT_EQ(3, batch->num_tasks());
+    EXPECT_EQ(1, batch->task(0).size());
+    EXPECT_EQ(3, batch->task(1).size());
+    EXPECT_EQ(5, batch->task(2).size());
+    EXPECT_TRUE(tasks.empty());
+  };
+
+  {
+    std::shared_ptr<Scheduler> scheduler =
+        CreateSharedBatchScheduler(/*num_batch_threads=*/3);
+
+    QueueOptions queue_options = CreateQueueOptions(
+        /*max_execution_batch_size=*/100, /*input_batch_size_limit=*/100,
+        /*batch_timeout_micros=*/1 * 1000 * 1000, /*max_enqueued_batches=*/2,
+        /*enable_priority_queue=*/true);
+    queue_options.low_priority_queue_options.max_execution_batch_size = 20;
+    queue_options.low_priority_queue_options.batch_timeout_micros =
+        1 * 1000 * 1000;
+    queue_options.low_priority_queue_options.input_batch_size_limit = 10;
+    queue_options.low_priority_queue_options.max_enqueued_batches = 2;
+    std::unique_ptr<Queue> queue =
+        CreateQueue(scheduler, queue_options, queue_callback);
+
+    // Submit low priority tasks that wouldn't fill up the max batch size, but
+    // they should still be scheduled due to timeout.
+    TF_ASSERT_OK(ScheduleTask(1, queue.get(),
+                              tsl::criticality::Criticality::kSheddablePlus));
+    TF_ASSERT_OK(ScheduleTask(3, queue.get(),
+                              tsl::criticality::Criticality::kSheddablePlus));
+    TF_ASSERT_OK(ScheduleTask(5, queue.get(),
+                              tsl::criticality::Criticality::kSheddable));
+  }
+  EXPECT_TRUE(queue_callback_called);
+}
+
+// Lazy split is to be removed. The mixed priority batching is only supported
+// when the lazy split is not enabled.
+INSTANTIATE_TEST_SUITE_P(
+    Parameter, SharedBatchSchedulerPriorityTest,
+    ::testing::Values(std::make_tuple(/*enable_input_batch_split=*/true,
+                                      /*enable_lazy_split=*/false),
+                      std::make_tuple(/*enable_input_batch_split=*/false,
+                                      /*enable_lazy_split=*/false)));
+
 #ifdef PLATFORM_GOOGLE
 // This benchmark relies on https://github.com/google/benchmark features,
 // (in particular, `Benchmark::ThreadRange`) not available in open-sourced TF
diff --git a/tensorflow/core/kernels/bincount_op.cc b/tensorflow/core/kernels/bincount_op.cc
index a680a0e4e7a2e3..1a1e55ed067fd3 100644
--- a/tensorflow/core/kernels/bincount_op.cc
+++ b/tensorflow/core/kernels/bincount_op.cc
@@ -308,7 +308,7 @@ class DenseBincountOp : public OpKernel {
 
     Tensor* out_t;
     functor::SetZeroFunctor<Device, T> fill;
-    if (data.dims() == 1) {
+    if (data.dims() <= 1) {
       OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({size}), &out_t));
       auto out = out_t->flat<T>();
       fill(ctx->eigen_device<Device>(), out);
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 9509792dcd2450..fc902bf6360ade 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -26,6 +26,7 @@ tf_kernel_library(
         "//tensorflow/core/data:global_shuffle_utils",
         "//tensorflow/core/data:name_utils",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:mutex",
     ],
 )
@@ -1160,6 +1161,7 @@ tf_kernel_library(
         "//tensorflow/core/data:global_shuffle_utils",
         "//tensorflow/core/data:name_utils",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:thread_annotations",
     ],
 )
@@ -1228,9 +1230,11 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/data:global_shuffle_utils",
         "//tensorflow/core/data:name_utils",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
+        "@local_tsl//tsl/platform:errors",
     ],
 )
 
diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index 40021d00e41075..13db60592ab475 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <utility>
 
 #include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/data/dataset_utils.h"
 #include "tensorflow/core/data/global_shuffle_utils.h"
 #include "tensorflow/core/data/name_utils.h"
@@ -90,7 +91,11 @@ class BatchDatasetOp::Dataset : public DatasetBase {
     }
 
     random_indexing_compatible_ = absl::OkStatus();
-    if (input_ != nullptr) {
+    if (!drop_remainder_) {
+      random_indexing_compatible_ = absl::FailedPreconditionError(absl::StrCat(
+          type_string(),
+          " does not support global shuffling with `drop_remainder=False`."));
+    } else if (input_ != nullptr) {
       random_indexing_compatible_ = input_->RandomIndexingCompatible();
     }
   }
@@ -153,9 +158,8 @@ class BatchDatasetOp::Dataset : public DatasetBase {
       TF_RETURN_IF_ERROR(input_->Get(ctx, i, &batch_element_tuple));
       batch_elements.emplace_back(std::move(batch_element_tuple));
     }
-    TF_RETURN_IF_ERROR(CopyBatch(CopyBatchParams(ctx),
-                                 std::move(batch_elements), parallel_copy_,
-                                 out_tensors));
+    TF_RETURN_IF_ERROR(CopyBatch(AnyContext(ctx), std::move(batch_elements),
+                                 parallel_copy_, out_tensors));
     return absl::OkStatus();
   }
 
@@ -242,8 +246,7 @@ class BatchDatasetOp::Dataset : public DatasetBase {
       // respective slice locations. This would require a different GetNext()
       // overload that supports zero-copy, and might make sense in an
       // optimization pass.
-      TF_RETURN_IF_ERROR(CopyBatch(CopyBatchParams(ctx),
-                                   std::move(batch_elements),
+      TF_RETURN_IF_ERROR(CopyBatch(AnyContext(ctx), std::move(batch_elements),
                                    dataset()->parallel_copy_, out_tensors));
 
       *end_of_sequence = false;
diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD
index e7f3e2175af489..81f4abfbcb0bba 100644
--- a/tensorflow/core/kernels/data/experimental/BUILD
+++ b/tensorflow/core/kernels/data/experimental/BUILD
@@ -44,6 +44,7 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core/data:name_utils",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -461,8 +462,10 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
         "//tensorflow/core/data:dataset_utils",
+        "//tensorflow/core/data:global_shuffle_utils",
         "//tensorflow/core/data:name_utils",
         "//tensorflow/core/data:split_utils",
+        "@com_google_absl//absl/status",
     ],
 )
 
diff --git a/tensorflow/core/kernels/data/experimental/assert_cardinality_dataset_op.cc b/tensorflow/core/kernels/data/experimental/assert_cardinality_dataset_op.cc
index 655ae0d906a112..8760952578dcab 100644
--- a/tensorflow/core/kernels/data/experimental/assert_cardinality_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/assert_cardinality_dataset_op.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <map>
 
 #include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/data/name_utils.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
@@ -122,6 +123,11 @@ class AssertCardinalityDatasetOp::Dataset : public DatasetBase {
         num_elements_++;
       }
       if (*end_of_sequence && num_elements_ != dataset()->cardinality_) {
+        if (ctx->index_mapper()) {
+          return absl::FailedPreconditionError(
+              absl::StrCat("Input dataset was expected to contain ",
+                           ElementString(dataset()->cardinality_), "."));
+        }
         return errors::FailedPrecondition(
             "Input dataset was expected to contain ",
             ElementString(dataset()->cardinality_), " but contained only ",
diff --git a/tensorflow/core/kernels/data/experimental/global_shuffle_dataset_op.cc b/tensorflow/core/kernels/data/experimental/global_shuffle_dataset_op.cc
index e0cbd047bc945b..ad0006724bd5ef 100644
--- a/tensorflow/core/kernels/data/experimental/global_shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/global_shuffle_dataset_op.cc
@@ -47,6 +47,7 @@ namespace {
 
 constexpr int32_t kIndexShuffleRounds = 8;
 
+constexpr const char kDatasetType[] = "GlobalShuffle";
 constexpr const char kElementCount[] = "element_count";
 constexpr const char kGlobalShuffleDataset[] = "GlobalShuffleDataset";
 constexpr const char kReshuffleEachIteration[] = "reshuffle_each_iteration";
@@ -105,7 +106,7 @@ class GlobalShuffleDatasetOp::Dataset : public DatasetBase {
   }
 
   std::string DebugString() const override {
-    return name_utils::DatasetDebugString(kGlobalShuffleDataset);
+    return name_utils::DatasetDebugString(kDatasetType);
   }
 
   int64_t CardinalityInternal(CardinalityOptions options) const override {
@@ -340,8 +341,7 @@ std::unique_ptr<IteratorBase>
 GlobalShuffleDatasetOp::Dataset::MakeIteratorInternal(
     const std::string& prefix) const {
   return std::make_unique<GlobalShuffleDatasetOp::Dataset::Iterator>(
-      Iterator::Params{
-          this, name_utils::IteratorPrefix(kGlobalShuffleDataset, prefix)},
+      Iterator::Params{this, name_utils::IteratorPrefix(kDatasetType, prefix)},
       seed_generator_->get());
 }
 
diff --git a/tensorflow/core/kernels/data/experimental/list_dataset_op.cc b/tensorflow/core/kernels/data/experimental/list_dataset_op.cc
index 8678a687fa3637..c93ccce81b4474 100644
--- a/tensorflow/core/kernels/data/experimental/list_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/list_dataset_op.cc
@@ -14,14 +14,18 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/kernels/data/experimental/list_dataset_op.h"
 
+#include <cstdint>
 #include <memory>
 #include <string>
 #include <utility>
 #include <vector>
 
+#include "absl/status/status.h"
 #include "tensorflow/core/data/dataset_utils.h"
+#include "tensorflow/core/data/global_shuffle_utils.h"
 #include "tensorflow/core/data/name_utils.h"
 #include "tensorflow/core/data/split_utils.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_util.h"
@@ -87,8 +91,17 @@ class ListDatasetOp::Dataset : public DatasetBase {
 
   Status CheckExternalState() const override { return absl::OkStatus(); }
 
-  Status Get(OpKernelContext* ctx, int64 index,
-             std::vector<Tensor>* out_tensors) const override {
+  absl::Status RandomIndexingCompatible() const override {
+    return absl::OkStatus();
+  }
+
+  absl::Status Get(OpKernelContext* ctx, int64_t index,
+                   std::vector<Tensor>* out_tensors) const override {
+    return Get(AnyContext(ctx), index, out_tensors);
+  }
+
+  absl::Status Get(AnyContext ctx, int64_t index,
+                   std::vector<Tensor>* out_tensors) const override {
     TF_RETURN_IF_ERROR(CheckRandomAccessCompatible(index));
     out_tensors->clear();
     out_tensors->reserve(num_components_);
@@ -126,7 +139,8 @@ class ListDatasetOp::Dataset : public DatasetBase {
   class Iterator : public DatasetIterator<Dataset> {
    public:
     explicit Iterator(const Params& params)
-        : DatasetIterator<Dataset>(params) {}
+        : DatasetIterator<Dataset>(params),
+          global_shuffle_iterator_(dataset()) {}
 
     bool SymbolicCheckpointCompatible() const override { return true; }
 
@@ -144,6 +158,11 @@ class ListDatasetOp::Dataset : public DatasetBase {
     Status GetNextInternal(IteratorContext* ctx,
                            std::vector<Tensor>* out_tensors,
                            bool* end_of_sequence) override {
+      if (ctx->index_mapper() != nullptr) {
+        return global_shuffle_iterator_.GetNext(ctx, out_tensors,
+                                                end_of_sequence);
+      }
+
       Tensor split;
       TF_RETURN_IF_ERROR(split_provider_->GetNext(&split, end_of_sequence));
       if (*end_of_sequence) {
@@ -173,12 +192,16 @@ class ListDatasetOp::Dataset : public DatasetBase {
 
     Status RestoreInternal(IteratorContext* ctx,
                            IteratorStateReader* reader) override {
+      if (ctx->restored_element_count().has_value()) {
+        return global_shuffle_iterator_.Restore(ctx);
+      }
       return split_provider_->Restore(
           [this](const std::string& key) { return full_name(key); }, reader);
     }
 
    private:
     std::shared_ptr<SplitProvider> split_provider_;
+    GlobalShuffleIterator global_shuffle_iterator_;
   };
 
   const std::vector<Tensor> tensors_;
diff --git a/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
index 3d2ae0e7f7434a..5c73ed46cdfae0 100644
--- a/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
@@ -35,7 +35,7 @@ class StatsAggregatorWithTagAndPrefix : public StatsAggregator {
       const string& prefix)
       : wrapped_(stats_aggregator), tag_(tag), prefix_(prefix) {}
 
-  void AddToHistogram(const string& name, gtl::ArraySlice<double> values,
+  void AddToHistogram(const string& name, absl::Span<const double> values,
                       int64_t steps) override {
     wrapped_->AddToHistogram(TaggedName(name), values, steps);
   }
diff --git a/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc b/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
index 4bb95b087fcb5b..b34ddcf43c8a9b 100644
--- a/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
@@ -49,7 +49,7 @@ class StatsAggregatorImpl : public StatsAggregator {
  public:
   StatsAggregatorImpl() {}
 
-  void AddToHistogram(const string& name, gtl::ArraySlice<double> values,
+  void AddToHistogram(const string& name, absl::Span<const double> values,
                       const int64_t steps) override {
     mutex_lock l(mu_);
     histogram::Histogram& histogram = histograms_[name];
@@ -138,7 +138,7 @@ class StatsAggregatorImplV2 : public StatsAggregator {
     }
   }
 
-  void AddToHistogram(const string& name, gtl::ArraySlice<double> values,
+  void AddToHistogram(const string& name, absl::Span<const double> values,
                       const int64_t steps) override {
     mutex_lock l(mu_);
     histogram::Histogram& histogram = histograms_[name];
diff --git a/tensorflow/core/kernels/data/parallel_batch_dataset_op.cc b/tensorflow/core/kernels/data/parallel_batch_dataset_op.cc
index f3b7d661b40272..21b7769feb450a 100644
--- a/tensorflow/core/kernels/data/parallel_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_batch_dataset_op.cc
@@ -423,9 +423,8 @@ class ParallelBatchDatasetOp::Dataset : public DatasetBase {
         Status status;
         {
           mutex_lock l(result->mu);
-          status =
-              CopyBatch(CopyBatchParams(ctx.get()), std::move(batch_elements),
-                        dataset()->parallel_copy_, &result->output);
+          status = CopyBatch(AnyContext(ctx.get()), std::move(batch_elements),
+                             dataset()->parallel_copy_, &result->output);
           result->status.Update(status);
 
           if (result->status.ok()) {
diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc
index dc97850f758a66..bf71d672c86bef 100644
--- a/tensorflow/core/kernels/data/range_dataset_op.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "absl/memory/memory.h"
 #include "absl/status/status.h"
+#include "tensorflow/core/data/global_shuffle_utils.h"
 #include "tensorflow/core/data/name_utils.h"
 #include "tensorflow/core/data/split_utils.h"
 #include "tensorflow/core/framework/dataset.h"
@@ -236,6 +237,11 @@ class RangeDatasetOp::Dataset : public DatasetBase {
 
   Status Get(OpKernelContext* ctx, int64 index,
              std::vector<Tensor>* out_tensors) const override {
+    return Get(AnyContext(ctx), index, out_tensors);
+  }
+
+  Status Get(AnyContext ctx, int64 index,
+             std::vector<Tensor>* out_tensors) const override {
     TF_RETURN_IF_ERROR(CheckRandomAccessCompatible(index));
     return ConvertOutputTypes(output_dtypes(), out_tensors,
                               start_ + (index * step_));
@@ -265,7 +271,8 @@ class RangeDatasetOp::Dataset : public DatasetBase {
   class Iterator : public DatasetIterator<Dataset> {
    public:
     explicit Iterator(const Params& params)
-        : DatasetIterator<Dataset>(params) {}
+        : DatasetIterator<Dataset>(params),
+          global_shuffle_iterator_(dataset()) {}
 
     bool SymbolicCheckpointCompatible() const override { return true; }
 
@@ -284,7 +291,8 @@ class RangeDatasetOp::Dataset : public DatasetBase {
                            std::vector<Tensor>* out_tensors,
                            bool* end_of_sequence) override {
       if (ctx->index_mapper() != nullptr) {
-        return Get(ctx, out_tensors, end_of_sequence);
+        return global_shuffle_iterator_.GetNext(ctx, out_tensors,
+                                                end_of_sequence);
       }
       int64_t value;
       if (split_provider_ != nullptr) {
@@ -304,20 +312,6 @@ class RangeDatasetOp::Dataset : public DatasetBase {
       return ConvertOutputTypes(output_dtypes(), out_tensors, value);
     }
 
-    absl::Status Get(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
-                     bool* end_of_sequence) {
-      tsl::mutex_lock l(mu_);
-      if (element_count_ >=
-          (dataset()->stop_ - dataset()->start_) / dataset()->step_) {
-        *end_of_sequence = true;
-        return absl::OkStatus();
-      }
-      size_t output_index = ctx->index_mapper()(element_count_++);
-      int64_t value = dataset()->start_ + output_index * dataset()->step_;
-      *end_of_sequence = false;
-      return ConvertOutputTypes(output_dtypes(), out_tensors, value);
-    }
-
    protected:
     std::shared_ptr<model::Node> CreateNode(
         IteratorContext* ctx, model::Node::Args args) const override {
@@ -344,9 +338,7 @@ class RangeDatasetOp::Dataset : public DatasetBase {
     Status RestoreInternal(IteratorContext* ctx,
                            IteratorStateReader* reader) override {
       if (ctx->restored_element_count().has_value()) {
-        tsl::mutex_lock l(mu_);
-        element_count_ = *(ctx->restored_element_count());
-        return absl::OkStatus();
+        return global_shuffle_iterator_.Restore(ctx);
       }
       if (reader->Contains(prefix(), kHasSplitProvider)) {
         TF_RETURN_IF_ERROR(split_provider_->Restore(
@@ -369,11 +361,7 @@ class RangeDatasetOp::Dataset : public DatasetBase {
    private:
     std::unique_ptr<RangeCounter> counter_;
     std::shared_ptr<SplitProvider> split_provider_;
-
-    mutable tsl::mutex mu_;
-    // Count of elements produced by this iterator when it runs in the random
-    // access mode.
-    size_t element_count_ TF_GUARDED_BY(mu_) = 0;
+    GlobalShuffleIterator global_shuffle_iterator_;
   };
 
   const int64_t start_;
diff --git a/tensorflow/core/kernels/data/shard_dataset_op.cc b/tensorflow/core/kernels/data/shard_dataset_op.cc
index 25957f96ba5d94..bda0c9b6ed7297 100644
--- a/tensorflow/core/kernels/data/shard_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shard_dataset_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/data/dataset_utils.h"
 #include "tensorflow/core/data/global_shuffle_utils.h"
 #include "tensorflow/core/data/name_utils.h"
@@ -51,6 +52,12 @@ namespace data {
 
 constexpr char kInputImplEmpty[] = "input_impl_empty";
 constexpr char kNextIndex[] = "next_index";
+constexpr char kFileShardErrorMessage[] =
+    "If you are using datasets with distribution strategy, consider setting "
+    "the auto sharding policy to either DATA or OFF using the "
+    "`experimental_distribute.auto_shard_policy` option of `tf.data.Options()`."
+    " Or, split your input files into a larger number of small files such that "
+    "number of files is greater than number of shards/workers.";
 
 class ShardDatasetOp::Dataset : public DatasetBase {
  public:
@@ -145,7 +152,7 @@ class ShardDatasetOp::Dataset : public DatasetBase {
   class Iterator : public DatasetIterator<Dataset> {
    public:
     explicit Iterator(const Params& params)
-        : DatasetIterator<Dataset>(params), next_index_(0) {}
+        : DatasetIterator<Dataset>(params), next_index_(0), element_count_(0) {}
 
     bool SymbolicCheckpointCompatible() const override { return true; }
 
@@ -172,10 +179,9 @@ class ShardDatasetOp::Dataset : public DatasetBase {
         return absl::OkStatus();
       }
 
-      IteratorContextWithIndexMapper ctx_with_index_mapper(ctx, this);
-      auto merge_checkpoint = gtl::MakeCleanup([&ctx_with_index_mapper] {
-        ctx_with_index_mapper.MergeCheckpoint();
-      });
+      if (ctx->index_mapper() != nullptr) {
+        return Get(ctx, out_tensors, end_of_sequence);
+      }
 
       int num_to_skip =
           (dataset()->index_ - next_index_) % dataset()->num_shards_;
@@ -183,9 +189,8 @@ class ShardDatasetOp::Dataset : public DatasetBase {
         num_to_skip += dataset()->num_shards_;
       }
       int num_skipped;
-      TF_RETURN_IF_ERROR(input_impl_->Skip(ctx_with_index_mapper.Get(),
-                                           num_to_skip, end_of_sequence,
-                                           &num_skipped));
+      TF_RETURN_IF_ERROR(
+          input_impl_->Skip(ctx, num_to_skip, end_of_sequence, &num_skipped));
       next_index_ += num_skipped;
       if (*end_of_sequence) {
         input_impl_.reset();
@@ -193,8 +198,7 @@ class ShardDatasetOp::Dataset : public DatasetBase {
       }
 
       std::vector<Tensor> result;
-      TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx_with_index_mapper.Get(),
-                                              &result, end_of_sequence));
+      TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &result, end_of_sequence));
       if (*end_of_sequence) {
         input_impl_.reset();
         return absl::OkStatus();
@@ -204,24 +208,17 @@ class ShardDatasetOp::Dataset : public DatasetBase {
       if (dataset()->require_non_empty_ &&
           next_index_ < dataset()->num_shards_) {
         int num_skipped;
-        Status s = input_impl_->Skip(ctx_with_index_mapper.Get(),
-                                     dataset()->num_shards_ - next_index_,
+        Status s = input_impl_->Skip(ctx, dataset()->num_shards_ - next_index_,
                                      end_of_sequence, &num_skipped);
         if (*end_of_sequence || errors::IsOutOfRange(s)) {
           // `dataset()->require_non_empty_` implies that this transformation
           // was introduced by auto_sharding rewrite, so it's acceptable
           // produce an error message that assumes auto-sharding context.
-          return errors::InvalidArgument(
+          return absl::InvalidArgumentError(absl::StrCat(
               "Could not apply FILE based sharding: the dataset only has ",
               next_index_, " file(s), which is not enough for the required ",
-              dataset()->num_shards_,
-              " shards/workers."
-              "If you are using datasets with distribution strategy, "
-              "consider setting the auto sharding policy to either DATA or "
-              "OFF using the `experimental_distribute.auto_shard_policy` option"
-              "of `tf.data.Options()`. Or, split your input files into a "
-              "larger number of small files such that number of files is "
-              "greater than number of shards/workers.");
+              dataset()->num_shards_, " shards/workers. ",
+              kFileShardErrorMessage));
         } else if (!s.ok()) {
           return s;
         }
@@ -233,16 +230,37 @@ class ShardDatasetOp::Dataset : public DatasetBase {
       return absl::OkStatus();
     }
 
-    IndexMapperFn GetIndexMapper(IndexMapperFn parent_index_mapper)
-        const override TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    Status Get(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+               bool* end_of_sequence) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      IteratorContextWithIndexMapper ctx_with_index_mapper(ctx, this);
+      auto merge_checkpoint = gtl::MakeCleanup([&ctx_with_index_mapper] {
+        ctx_with_index_mapper.MergeCheckpoint();
+      });
+      TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx_with_index_mapper.Get(),
+                                              out_tensors, end_of_sequence));
+      if (*end_of_sequence && dataset()->require_non_empty_ &&
+          element_count_ == 0) {
+        // `dataset()->require_non_empty_` implies that this transformation
+        // was introduced by auto_sharding rewrite, so it's acceptable to
+        // produce an error message that assumes auto-sharding context.
+        return absl::InvalidArgumentError(absl::StrCat(
+            "Could not apply FILE based sharding: The dataset does not have "
+            "enough file(s) for the required ",
+            dataset()->num_shards_, " shards/workers. ",
+            kFileShardErrorMessage));
+      }
+      ++element_count_;
+      return absl::OkStatus();
+    }
+
+    IndexMapperFn GetIndexMapper(
+        IndexMapperFn parent_index_mapper) const override {
       int64_t num_shards = dataset()->num_shards_;
-      return [parent_index_mapper,
-              num_shards](size_t element_position) -> size_t {
-        size_t sharded_element_position = element_position / num_shards;
-        size_t input_element_offset = element_position % num_shards;
-        size_t shuffled_element_position =
-            parent_index_mapper(sharded_element_position);
-        return shuffled_element_position * num_shards + input_element_offset;
+      int64_t shard_index = dataset()->index_;
+      return [parent_index_mapper, num_shards,
+              shard_index](size_t element_position) -> size_t {
+        size_t output_index = parent_index_mapper(element_position);
+        return output_index * num_shards + shard_index;
       };
     }
 
@@ -270,13 +288,8 @@ class ShardDatasetOp::Dataset : public DatasetBase {
                            IteratorStateReader* reader) override {
       mutex_lock l(mu_);
       if (ctx->restored_element_count().has_value()) {
-        next_index_ = *ctx->restored_element_count() * dataset()->num_shards_;
-        IteratorContext::Params params(ctx);
-        params.restored_element_count =
-            *ctx->restored_element_count() * dataset()->num_shards_;
-        IteratorContext ctx_with_restored_element_count(params);
-        return RestoreInput(&ctx_with_restored_element_count, reader,
-                            input_impl_);
+        element_count_ = *ctx->restored_element_count();
+        return RestoreInput(ctx, reader, input_impl_);
       }
 
       int64_t input_empty;
@@ -300,6 +313,7 @@ class ShardDatasetOp::Dataset : public DatasetBase {
     mutex mu_;
     std::unique_ptr<IteratorBase> input_impl_ TF_GUARDED_BY(mu_);
     int64_t next_index_ TF_GUARDED_BY(mu_);
+    size_t element_count_ TF_GUARDED_BY(mu_);
   };
 
   const int64_t num_shards_;
diff --git a/tensorflow/core/kernels/data/skip_dataset_op.cc b/tensorflow/core/kernels/data/skip_dataset_op.cc
index 2a0c75f4c54b93..1e2d000865e844 100644
--- a/tensorflow/core/kernels/data/skip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/skip_dataset_op.cc
@@ -14,9 +14,17 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/kernels/data/skip_dataset_op.h"
 
+#include <cstddef>
+#include <cstdint>
+
+#include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/core/data/global_shuffle_utils.h"
 #include "tensorflow/core/data/name_utils.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tsl/platform/errors.h"
 
 namespace tensorflow {
 namespace data {
@@ -40,6 +48,14 @@ class SkipDatasetOp::Dataset : public DatasetBase {
   Dataset(OpKernelContext* ctx, int64_t count, const DatasetBase* input)
       : DatasetBase(DatasetContext(ctx)), count_(count), input_(input) {
     input_->Ref();
+    if (input_ != nullptr && count >= 0) {
+      random_indexing_compatible_ = input_->RandomIndexingCompatible();
+    } else {
+      random_indexing_compatible_ = absl::FailedPreconditionError(
+          absl::StrCat("Global shuffling does not support empty dataset or "
+                       "skipping the entire dataset. Got skip(",
+                       count, ")."));
+    }
   }
 
   ~Dataset() override { input_->Unref(); }
@@ -90,6 +106,10 @@ class SkipDatasetOp::Dataset : public DatasetBase {
     return input_->Get(ctx, index + count_, out_tensors);
   }
 
+  absl::Status RandomIndexingCompatible() const override {
+    return random_indexing_compatible_;
+  }
+
  protected:
   Status AsGraphDefInternal(SerializationContext* ctx,
                             DatasetGraphDefBuilder* b,
@@ -149,8 +169,11 @@ class SkipDatasetOp::Dataset : public DatasetBase {
     Status GetNextInternal(IteratorContext* ctx,
                            std::vector<Tensor>* out_tensors,
                            bool* end_of_sequence) override {
-      mutex_lock l(mu_);  // TODO(mrry): Make locking less conservative.
+      if (ctx->index_mapper() != nullptr) {
+        return Get(ctx, out_tensors, end_of_sequence);
+      }
 
+      mutex_lock l(mu_);  // TODO(mrry): Make locking less conservative.
       if (!input_impl_) {
         *end_of_sequence = true;
         return absl::OkStatus();
@@ -177,6 +200,30 @@ class SkipDatasetOp::Dataset : public DatasetBase {
       return absl::OkStatus();
     }
 
+    absl::Status Get(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                     bool* end_of_sequence) {
+      mutex_lock l(mu_);
+      if (!input_impl_) {
+        *end_of_sequence = true;
+        return absl::OkStatus();
+      }
+
+      IteratorContextWithIndexMapper ctx_with_index_mapper(ctx, this);
+      TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx_with_index_mapper.Get(),
+                                              out_tensors, end_of_sequence));
+      ctx_with_index_mapper.MergeCheckpoint();
+      return absl::OkStatus();
+    }
+
+    IndexMapperFn GetIndexMapper(
+        IndexMapperFn parent_index_mapper) const override {
+      int64_t skip_count = dataset()->count_;
+      return
+          [parent_index_mapper, skip_count](size_t element_position) -> size_t {
+            return parent_index_mapper(element_position) + skip_count;
+          };
+    }
+
    protected:
     std::shared_ptr<model::Node> CreateNode(
         IteratorContext* ctx, model::Node::Args args) const override {
@@ -198,6 +245,11 @@ class SkipDatasetOp::Dataset : public DatasetBase {
 
     Status RestoreInternal(IteratorContext* ctx,
                            IteratorStateReader* reader) override {
+      if (ctx->restored_element_count().has_value()) {
+        mutex_lock l(mu_);
+        return RestoreInput(ctx, reader, input_impl_);
+      }
+
       mutex_lock l(mu_);
       TF_RETURN_IF_ERROR(reader->ReadScalar(prefix(), kCurIndex, &i_));
       int64_t input_empty;
@@ -219,6 +271,7 @@ class SkipDatasetOp::Dataset : public DatasetBase {
 
   const int64_t count_;
   const DatasetBase* const input_;
+  absl::Status random_indexing_compatible_;
 };
 
 SkipDatasetOp::SkipDatasetOp(OpKernelConstruction* ctx)
diff --git a/tensorflow/core/kernels/data/tensor_dataset_op.cc b/tensorflow/core/kernels/data/tensor_dataset_op.cc
index 2f3dec08deebe1..02736e5a0ddd1a 100644
--- a/tensorflow/core/kernels/data/tensor_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_dataset_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/data/global_shuffle_utils.h"
 #include "tensorflow/core/data/name_utils.h"
 #include "tensorflow/core/data/split_utils.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph.h"
@@ -88,10 +89,11 @@ class TensorDatasetOp::Dataset : public DatasetBase {
 
   Status Get(OpKernelContext* ctx, int64 index,
              std::vector<Tensor>* out_tensors) const override {
-    return Get(index, out_tensors);
+    return Get(AnyContext(ctx), index, out_tensors);
   }
 
-  Status Get(int64 index, std::vector<Tensor>* out_tensors) const override {
+  Status Get(AnyContext ctx, int64 index,
+             std::vector<Tensor>* out_tensors) const override {
     TF_RETURN_IF_ERROR(CheckRandomAccessCompatible(index));
     *out_tensors = tensors_;
     return absl::OkStatus();
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index 9d3c4f3f1eb1f9..dad1e8ce9e9950 100644
--- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/data/global_shuffle_utils.h"
 #include "tensorflow/core/data/name_utils.h"
 #include "tensorflow/core/data/split_utils.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_util.h"
@@ -100,10 +101,11 @@ class TensorSliceDatasetOp::Dataset : public DatasetBase {
 
   Status Get(OpKernelContext* ctx, int64 index,
              std::vector<Tensor>* out_tensors) const override {
-    return Get(index, out_tensors);
+    return Get(AnyContext(ctx), index, out_tensors);
   }
 
-  Status Get(int64 index, std::vector<Tensor>* out_tensors) const override {
+  Status Get(AnyContext ctx, int64 index,
+             std::vector<Tensor>* out_tensors) const override {
     TF_RETURN_IF_ERROR(CheckRandomAccessCompatible(index));
     out_tensors->clear();
     out_tensors->reserve(tensors_.size());
diff --git a/tensorflow/core/kernels/encode_proto_op.cc b/tensorflow/core/kernels/encode_proto_op.cc
index 8a79d0817d4126..50329ad8db720f 100644
--- a/tensorflow/core/kernels/encode_proto_op.cc
+++ b/tensorflow/core/kernels/encode_proto_op.cc
@@ -252,7 +252,7 @@ Status WriteField(const FieldDescriptor& field_desc, const Tensor& input,
       WireFormatLite::FieldType(field_desc.type()));
 
   auto input_t = input.flat_inner_dims<TensorT>();
-  if (field_desc.options().packed()) {
+  if (field_desc.is_packed()) {
     // Write the tag for the packed field.
     WireFormatLite::WriteTag(field_desc.number(),
                              WireFormatLite::WIRETYPE_LENGTH_DELIMITED, output);
diff --git a/tensorflow/core/kernels/example_parsing_ops.cc b/tensorflow/core/kernels/example_parsing_ops.cc
index a4cfdd0f766ac4..b7ea4c9f9bf592 100644
--- a/tensorflow/core/kernels/example_parsing_ops.cc
+++ b/tensorflow/core/kernels/example_parsing_ops.cc
@@ -249,8 +249,8 @@ class ParseExampleOp : public OpKernel {
                             example::Result* result) const {
     auto serialized_t = serialized->flat<tstring>();
     auto names_t = names->flat<tstring>();
-    gtl::ArraySlice<tstring> slice(serialized_t.data(), serialized_t.size());
-    gtl::ArraySlice<tstring> names_slice(names_t.data(), names_t.size());
+    absl::Span<const tstring> slice(serialized_t.data(), serialized_t.size());
+    absl::Span<const tstring> names_slice(names_t.data(), names_t.size());
     return FastParseExample(
         config, slice, names_slice,
         ctx->device()->tensorflow_cpu_worker_threads()->workers, result);
@@ -470,9 +470,9 @@ class ParseSequenceExampleOp : public OpKernel {
     bool is_batch = TensorShapeUtils::IsVector(serialized->shape());
     auto serialized_t = serialized->flat<tstring>();
     auto debug_name_t = debug_name->flat<tstring>();
-    gtl::ArraySlice<tstring> slice(serialized_t.data(), serialized_t.size());
-    gtl::ArraySlice<tstring> names_slice(debug_name_t.data(),
-                                         debug_name_t.size());
+    absl::Span<const tstring> slice(serialized_t.data(), serialized_t.size());
+    absl::Span<const tstring> names_slice(debug_name_t.data(),
+                                          debug_name_t.size());
 
     example::Result context_result, feature_list_result;
     std::vector<Tensor> dense_feature_lengths;
@@ -575,21 +575,21 @@ class ParseSequenceExampleOp : public OpKernel {
       const OpInputList& context_dense_defaults) const {
     // Convert the tensors/attrs to ArraySlices once, instead of re-evaluating
     // them in each loop iteration.
-    gtl::ArraySlice<tstring> dense_keys_slice =
+    absl::Span<const tstring> dense_keys_slice =
         dense_keys
-            ? gtl::ArraySlice<tstring>(dense_keys->flat<tstring>().data(),
-                                       attrs_.num_context_dense)
+            ? absl::Span<const tstring>(dense_keys->flat<tstring>().data(),
+                                        attrs_.num_context_dense)
             : attrs_.context_dense_keys;
-    gtl::ArraySlice<tstring> sparse_keys_slice =
+    absl::Span<const tstring> sparse_keys_slice =
         sparse_keys
-            ? gtl::ArraySlice<tstring>(sparse_keys->flat<tstring>().data(),
-                                       attrs_.num_context_sparse)
+            ? absl::Span<const tstring>(sparse_keys->flat<tstring>().data(),
+                                        attrs_.num_context_sparse)
             : attrs_.context_sparse_keys;
-    gtl::ArraySlice<tstring> ragged_keys_slice =
+    absl::Span<const tstring> ragged_keys_slice =
         ragged_keys
-            ? gtl::ArraySlice<tstring>(ragged_keys->flat<tstring>().data(),
-                                       attrs_.num_context_ragged)
-            : gtl::ArraySlice<tstring>(nullptr, 0);
+            ? absl::Span<const tstring>(ragged_keys->flat<tstring>().data(),
+                                        attrs_.num_context_ragged)
+            : absl::Span<const tstring>(nullptr, 0);
 
     example::FastParseExampleConfig config;
     config.dense.reserve(attrs_.num_context_dense);
@@ -634,29 +634,29 @@ class ParseSequenceExampleOp : public OpKernel {
       const Tensor* feature_list_dense_missing_assumed_empty) const {
     // Convert the tensors/attrs to ArraySlices once, instead of re-evaluating
     // them in each loop iteration.
-    gtl::ArraySlice<tstring> dense_keys_slice =
+    absl::Span<const tstring> dense_keys_slice =
         dense_keys
-            ? gtl::ArraySlice<tstring>(dense_keys->flat<tstring>().data(),
-                                       attrs_.num_feature_list_dense)
+            ? absl::Span<const tstring>(dense_keys->flat<tstring>().data(),
+                                        attrs_.num_feature_list_dense)
             : attrs_.feature_list_dense_keys;
-    gtl::ArraySlice<tstring> sparse_keys_slice =
+    absl::Span<const tstring> sparse_keys_slice =
         sparse_keys
-            ? gtl::ArraySlice<tstring>(sparse_keys->flat<tstring>().data(),
-                                       attrs_.num_feature_list_sparse)
+            ? absl::Span<const tstring>(sparse_keys->flat<tstring>().data(),
+                                        attrs_.num_feature_list_sparse)
             : attrs_.feature_list_sparse_keys;
-    gtl::ArraySlice<tstring> ragged_keys_slice =
+    absl::Span<const tstring> ragged_keys_slice =
         ragged_keys
-            ? gtl::ArraySlice<tstring>(ragged_keys->flat<tstring>().data(),
-                                       attrs_.num_feature_list_ragged)
-            : gtl::ArraySlice<tstring>(nullptr, 0);
+            ? absl::Span<const tstring>(ragged_keys->flat<tstring>().data(),
+                                        attrs_.num_feature_list_ragged)
+            : absl::Span<const tstring>(nullptr, 0);
     // Use an empty slice to indicate that the map in attrs_ should be used
     // instead.
-    gtl::ArraySlice<bool> feature_list_dense_missing_assumed_empty_slice =
+    absl::Span<const bool> feature_list_dense_missing_assumed_empty_slice =
         feature_list_dense_missing_assumed_empty
-            ? gtl::ArraySlice<bool>(
+            ? absl::Span<const bool>(
                   feature_list_dense_missing_assumed_empty->flat<bool>().data(),
                   attrs_.num_feature_list_dense)
-            : gtl::ArraySlice<bool>(nullptr, 0);
+            : absl::Span<const bool>(nullptr, 0);
 
     example::FastParseExampleConfig config;
     config.dense.reserve(attrs_.num_feature_list_dense);
diff --git a/tensorflow/core/kernels/fake_quant_ops_test.cc b/tensorflow/core/kernels/fake_quant_ops_test.cc
index 9f832ce9915189..3d4024f48377e5 100644
--- a/tensorflow/core/kernels/fake_quant_ops_test.cc
+++ b/tensorflow/core/kernels/fake_quant_ops_test.cc
@@ -53,8 +53,8 @@ class QuantOpsTest : public OpsTestBase {
   void RunTestFakeQuantWithMinMaxArgs(const int num_bits,
                                       const bool narrow_range, const float min,
                                       const float max, const TensorShape& shape,
-                                      const gtl::ArraySlice<float> data,
-                                      gtl::ArraySlice<float> expected_data,
+                                      const absl::Span<const float> data,
+                                      absl::Span<const float> expected_data,
                                       const double atol = -1.0,
                                       const double rtol = -1.0,
                                       const DeviceType device = DEVICE_CPU) {
@@ -88,8 +88,8 @@ class QuantOpsTest : public OpsTestBase {
   void RunTestFakeQuantWithMinMaxVars(const int num_bits,
                                       const bool narrow_range, const float min,
                                       const float max, const TensorShape& shape,
-                                      const gtl::ArraySlice<float> data,
-                                      gtl::ArraySlice<float> expected_data,
+                                      const absl::Span<const float> data,
+                                      absl::Span<const float> expected_data,
                                       const double atol = -1.0,
                                       const double rtol = -1.0,
                                       const DeviceType device = DEVICE_CPU) {
@@ -125,9 +125,9 @@ class QuantOpsTest : public OpsTestBase {
 
   void RunTestFakeQuantWithMinMaxVarsPerChannel(
       const int num_bits, const bool narrow_range,
-      const TensorShape& minmax_shape, const gtl::ArraySlice<float> min,
-      const gtl::ArraySlice<float> max, const TensorShape& shape,
-      const gtl::ArraySlice<float> data, gtl::ArraySlice<float> expected_data,
+      const TensorShape& minmax_shape, const absl::Span<const float> min,
+      const absl::Span<const float> max, const TensorShape& shape,
+      const absl::Span<const float> data, absl::Span<const float> expected_data,
       const double atol = -1.0, const double rtol = -1.0,
       const DeviceType device = DEVICE_CPU) {
     if (device == DEVICE_GPU) {
diff --git a/tensorflow/core/kernels/fuzzing/BUILD b/tensorflow/core/kernels/fuzzing/BUILD
index 2a4d470c4136f0..23d6062a8519a3 100644
--- a/tensorflow/core/kernels/fuzzing/BUILD
+++ b/tensorflow/core/kernels/fuzzing/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow/core/kernels/fuzzing:tf_ops_fuzz_target_lib.bzl", "tf_ops_fuzz_target_lib", "tf_oss_fuzz_corpus", "tf_oss_fuzz_dict")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/kernels/image/BUILD b/tensorflow/core/kernels/image/BUILD
index 9ad554620faf34..38c741f9844df0 100644
--- a/tensorflow/core/kernels/image/BUILD
+++ b/tensorflow/core/kernels/image/BUILD
@@ -1,4 +1,3 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("@bazel_skylib//rules:build_test.bzl", "build_test")
 load(
     "//tensorflow:tensorflow.bzl",
@@ -8,11 +7,12 @@ load(
     "tf_cc_tests",
     "tf_copts",
 )
+load("//tensorflow:tensorflow.default.bzl", "filegroup", "tf_cuda_cc_test", "tf_disable_ptxas_warning_flags", "tf_kernel_library")
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "tf_cuda_tests_tags",
 )
-load("//tensorflow:tensorflow.default.bzl", "filegroup", "tf_cuda_cc_test", "tf_disable_ptxas_warning_flags", "tf_kernel_library")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 # TODO(rmlarsen): Remove ASAP.
 package_group(
@@ -206,7 +206,7 @@ tf_kernel_library(
     prefix = "decode_image_op",
     deps = IMAGE_DEPS + [
         "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/util:byte_swap_array",
+        "@local_xla//xla/tsl/util:byte_swap_array",
     ],
 )
 
@@ -455,7 +455,7 @@ cc_library(
         "//tensorflow/core/platform:byte_order",
         "//tensorflow/core/platform:errors",
         "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/util:byte_swap_array",
+        "@local_xla//xla/tsl/util:byte_swap_array",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/core/kernels/image/decode_image_op.cc b/tensorflow/core/kernels/image/decode_image_op.cc
index 2ca9f67e17aca1..afb653191e3e8a 100644
--- a/tensorflow/core/kernels/image/decode_image_op.cc
+++ b/tensorflow/core/kernels/image/decode_image_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "absl/strings/match.h"
+#include "xla/tsl/util/byte_swap_array.h"
 #include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/op_requires.h"
@@ -42,7 +43,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stringpiece.h"
 #include "tensorflow/core/platform/tstring.h"
-#include "tsl/util/byte_swap_array.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/core/kernels/linalg/BUILD b/tensorflow/core/kernels/linalg/BUILD
index 9538f08a7640bb..3814ca3213c93e 100644
--- a/tensorflow/core/kernels/linalg/BUILD
+++ b/tensorflow/core/kernels/linalg/BUILD
@@ -1,14 +1,14 @@
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load(
+    "@local_config_rocm//rocm:build_defs.bzl",
+    "if_rocm",
+)
 load(
     "//tensorflow:tensorflow.bzl",
     "if_cuda_or_rocm",
 )
 load("//tensorflow:tensorflow.default.bzl", "filegroup", "tf_cuda_cc_test", "tf_kernel_library")
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-load(
-    "@local_config_rocm//rocm:build_defs.bzl",
-    "if_rocm",
-)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index 90f03fe7bd7780..fec3ebab2aa27f 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -400,7 +400,7 @@ class TensorListConcat : public OpKernel {
       OP_REQUIRES(c, !dim_sizes.empty(),
                   errors::InvalidArgument("element_shape must not be empty"));
       element_shape_except_first_dim =
-          PartialTensorShape(gtl::ArraySlice<int64_t>(dim_sizes).subspan(1));
+          PartialTensorShape(absl::Span<const int64_t>(dim_sizes).subspan(1));
     }
     // Check that the input Variant tensor is indeed a TensorList and has the
     // correct element type.
@@ -459,7 +459,7 @@ class TensorListConcat : public OpKernel {
               errors::InvalidArgument("Concat saw a scalar shape at index ", i,
                                       " but requires at least vectors."));
           TensorShape shape_except_first_dim = TensorShape(
-              gtl::ArraySlice<int64_t>(t.shape().dim_sizes()).subspan(1));
+              absl::Span<const int64_t>(t.shape().dim_sizes()).subspan(1));
           OP_REQUIRES_OK(c, tmp.MergeWith(shape_except_first_dim,
                                           &element_shape_except_first_dim));
           OP_REQUIRES(c, first_dim == -1 || first_dim == t.shape().dim_size(0),
diff --git a/tensorflow/core/kernels/logging_ops_test.cc b/tensorflow/core/kernels/logging_ops_test.cc
index 8ba44782a194c0..fdb85fda2d70a0 100644
--- a/tensorflow/core/kernels/logging_ops_test.cc
+++ b/tensorflow/core/kernels/logging_ops_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <chrono>
 #include <thread>
 
+#include "xla/tsl/util/determinism_test_util.h"
 #include "tensorflow/core/framework/fake_input.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -27,7 +28,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/status_matchers.h"
-#include "tsl/util/determinism_test_util.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/core/kernels/merge_v2_checkpoints_op_test.cc b/tensorflow/core/kernels/merge_v2_checkpoints_op_test.cc
index 063c3d19619d8d..cc838aace88f33 100644
--- a/tensorflow/core/kernels/merge_v2_checkpoints_op_test.cc
+++ b/tensorflow/core/kernels/merge_v2_checkpoints_op_test.cc
@@ -34,8 +34,8 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-void WriteCheckpoint(const string& prefix, gtl::ArraySlice<string> names,
-                     gtl::ArraySlice<Tensor> tensors) {
+void WriteCheckpoint(const string& prefix, absl::Span<const string> names,
+                     absl::Span<const Tensor> tensors) {
   BundleWriter writer(Env::Default(), prefix);
   ASSERT_TRUE(names.size() == tensors.size());
   for (size_t i = 0; i < names.size(); ++i) {
diff --git a/tensorflow/core/kernels/mkl/BUILD b/tensorflow/core/kernels/mkl/BUILD
index e8a9fffec482fb..30852610ebe147 100644
--- a/tensorflow/core/kernels/mkl/BUILD
+++ b/tensorflow/core/kernels/mkl/BUILD
@@ -1,12 +1,12 @@
+load(
+    "@local_xla//xla/tsl/mkl:build_defs.bzl",
+    "mkl_deps",
+)
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test_mkl",
     "tf_mkl_kernel_library",
 )
-load(
-    "@local_xla//xla/tsl/mkl:build_defs.bzl",
-    "mkl_deps",
-)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/kernels/mkl/mkl_matmul_ops_common.h b/tensorflow/core/kernels/mkl/mkl_matmul_ops_common.h
index 7bfe75ec9baf76..922e6464663bb5 100644
--- a/tensorflow/core/kernels/mkl/mkl_matmul_ops_common.h
+++ b/tensorflow/core/kernels/mkl/mkl_matmul_ops_common.h
@@ -1,4 +1,4 @@
-/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -924,10 +924,12 @@ class MklMatMulPrimitive : public MklPrimitive {
     // Create MatMul descriptor and primitive descriptor.
     if constexpr (CSR) {
       // If it's a CSR matrix.
+#ifdef ENABLE_ONEDNN_V3
       const auto tmp = memory::desc::csr(
           params.a_dims, MklDnnType<Tlhs>(), params.a_nnz,
           dnnl::memory::data_type::s32, dnnl::memory::data_type::s32);
       context_.a_md.reset(new memory::desc(tmp));
+#endif  // ENABLE_ONEDNN_V3
     } else {
       context_.a_md.reset(new memory::desc({params.a_dims}, MklDnnType<Tlhs>(),
                                            params.a_strides));
diff --git a/tensorflow/core/kernels/mkl/mkl_sparse_matrix_matmul_op_benchmark.cc b/tensorflow/core/kernels/mkl/mkl_sparse_matrix_matmul_op_benchmark.cc
index 20ef908e9b9521..6f434c588bb4cf 100644
--- a/tensorflow/core/kernels/mkl/mkl_sparse_matrix_matmul_op_benchmark.cc
+++ b/tensorflow/core/kernels/mkl/mkl_sparse_matrix_matmul_op_benchmark.cc
@@ -1,6 +1,5 @@
 /* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
 
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -14,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifdef INTEL_MKL
+#if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)
 
 #include <random>
 
@@ -196,4 +195,4 @@ BM_SparseMatrixMatmul(3200, 8, 4000000, 100);
 }  // namespace
 }  // end namespace tensorflow
 
-#endif
+#endif  // INTEL_MKL && ENABLE_ONEDNN_V3
diff --git a/tensorflow/core/kernels/mlir_generated/build_defs.bzl b/tensorflow/core/kernels/mlir_generated/build_defs.bzl
index 89ff3313339315..a6bbec78a28712 100644
--- a/tensorflow/core/kernels/mlir_generated/build_defs.bzl
+++ b/tensorflow/core/kernels/mlir_generated/build_defs.bzl
@@ -1,15 +1,15 @@
 """Generates cubin headers for TF dialect ops."""
 
+load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain", "use_cpp_toolchain")
 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_gpu_architectures")
-load(
-    "@local_xla//xla/stream_executor:build_defs.bzl",
-    "if_gpu_is_configured",
-)
 load(
     "@local_config_rocm//rocm:build_defs.bzl",
     "rocm_gpu_architectures",
 )
-load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain", "use_cpp_toolchain")
+load(
+    "@local_xla//xla/stream_executor:build_defs.bzl",
+    "if_gpu_is_configured",
+)
 
 def _lookup_file(filegroup, path):
     """Extracts file at (relative) path in filegroup."""
diff --git a/tensorflow/core/kernels/numeric_options_utils.h b/tensorflow/core/kernels/numeric_options_utils.h
index d9ea889b443a87..eb1d50ae7f80bc 100644
--- a/tensorflow/core/kernels/numeric_options_utils.h
+++ b/tensorflow/core/kernels/numeric_options_utils.h
@@ -17,8 +17,8 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_NUMERIC_OPTIONS_UTILS_H_
 
 #include "xla/stream_executor/numeric_options.h"
+#include "xla/tsl/util/determinism.h"
 #include "tsl/platform/tensor_float_32_utils.h"
-#include "tsl/util/determinism.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/padding_fifo_queue.cc b/tensorflow/core/kernels/padding_fifo_queue.cc
index 57b46a0a06f48e..fb0dfd553430ac 100644
--- a/tensorflow/core/kernels/padding_fifo_queue.cc
+++ b/tensorflow/core/kernels/padding_fifo_queue.cc
@@ -394,7 +394,7 @@ Status PaddingFIFOQueue::SetElementZero(Tensor* element) {
 }
 
 std::vector<TensorShape> PaddingFIFOQueue::ConvertShapesPartialDimensionsToZero(
-    const gtl::ArraySlice<PartialTensorShape>& partial_shapes) {
+    const absl::Span<const PartialTensorShape>& partial_shapes) {
   std::vector<TensorShape> shapes(partial_shapes.size());
   for (size_t i = 0; i < shapes.size(); ++i) {
     const PartialTensorShape& partial = partial_shapes[i];
diff --git a/tensorflow/core/kernels/padding_fifo_queue.h b/tensorflow/core/kernels/padding_fifo_queue.h
index 939e04dd5e769b..15b25efbc6f87e 100644
--- a/tensorflow/core/kernels/padding_fifo_queue.h
+++ b/tensorflow/core/kernels/padding_fifo_queue.h
@@ -57,7 +57,7 @@ class PaddingFIFOQueue : public FIFOQueue {
   // Any unknown dimension sizes are converted to 0.
   // REQUIRED: All the input shapes have well defined rank.
   static std::vector<TensorShape> ConvertShapesPartialDimensionsToZero(
-      const gtl::ArraySlice<PartialTensorShape>& partial_shapes);
+      const absl::Span<const PartialTensorShape>& partial_shapes);
 
   // Sets the values in the given element to zero.
   static Status SetElementZero(Tensor* element);
diff --git a/tensorflow/core/kernels/quantization_utils_test.cc b/tensorflow/core/kernels/quantization_utils_test.cc
index 1b037b76d2b047..1a82a793f615c5 100644
--- a/tensorflow/core/kernels/quantization_utils_test.cc
+++ b/tensorflow/core/kernels/quantization_utils_test.cc
@@ -45,7 +45,7 @@ void TestRequantizeMany(Eigen::ThreadPoolDevice* eigen_device, float input_min,
   }
 
   Tensor i_tensor =
-      tensorflow::test::AsTensor(gtl::ArraySlice<qint32>(values_quantized));
+      tensorflow::test::AsTensor(absl::Span<const qint32>(values_quantized));
   Tensor o_tensor(DT_QUINT8, TensorShape{values_count});
   auto output_values = o_tensor.flat<quint8>();
 
@@ -87,7 +87,7 @@ void TestRequantizeMany8To32Bit(float input_min, float input_max,
   }
 
   const Tensor i_tensor =
-      tensorflow::test::AsTensor(gtl::ArraySlice<quint8>(values_quantized));
+      tensorflow::test::AsTensor(absl::Span<const quint8>(values_quantized));
   Tensor o_tensor(DT_QINT32, TensorShape{values_count});
   auto output_values = o_tensor.flat<qint32>();
 
diff --git a/tensorflow/core/kernels/queue_base.cc b/tensorflow/core/kernels/queue_base.cc
index 1b1d7f448ada46..f9f86e599a8b0f 100644
--- a/tensorflow/core/kernels/queue_base.cc
+++ b/tensorflow/core/kernels/queue_base.cc
@@ -78,7 +78,7 @@ Status QueueBase::ValidateTupleCommon(const Tuple& tuple) const {
 }
 
 // static
-string QueueBase::ShapeListString(const gtl::ArraySlice<TensorShape>& shapes) {
+string QueueBase::ShapeListString(const absl::Span<const TensorShape>& shapes) {
   string result = "[";
   bool first = true;
   for (const TensorShape& shape : shapes) {
diff --git a/tensorflow/core/kernels/queue_base.h b/tensorflow/core/kernels/queue_base.h
index 71884a9dff0af9..e47ddb4dd28894 100644
--- a/tensorflow/core/kernels/queue_base.h
+++ b/tensorflow/core/kernels/queue_base.h
@@ -135,7 +135,7 @@ class QueueBase : public QueueInterface {
   ~QueueBase() override;
 
   // Helpers for implementing MatchesNodeDef().
-  static string ShapeListString(const gtl::ArraySlice<TensorShape>& shapes);
+  static string ShapeListString(const absl::Span<const TensorShape>& shapes);
   Status MatchesNodeDefOp(const NodeDef& node_def, const string& op) const;
   Status MatchesNodeDefCapacity(const NodeDef& node_def,
                                 int32_t capacity) const;
diff --git a/tensorflow/core/kernels/ragged_tensor_to_tensor_op_test.cc b/tensorflow/core/kernels/ragged_tensor_to_tensor_op_test.cc
index cfc3a0812ec4c3..999fd44d7c0e92 100644
--- a/tensorflow/core/kernels/ragged_tensor_to_tensor_op_test.cc
+++ b/tensorflow/core/kernels/ragged_tensor_to_tensor_op_test.cc
@@ -508,7 +508,7 @@ class RaggedTensorToTensorOpUnknownShapeTest
     : public ::tensorflow::OpsTestBase {
  protected:
   std::unique_ptr<ShapeInferenceTestOp> op_;
-  void SetAttributes(const gtl::ArraySlice<string> row_partition_types,
+  void SetAttributes(const absl::Span<const string> row_partition_types,
                      int num_row_partition_tensors) {
     op_ = std::make_unique<ShapeInferenceTestOp>("RaggedTensorToTensor");
     SetAttrValue(row_partition_types,
@@ -519,7 +519,7 @@ class RaggedTensorToTensorOpUnknownShapeTest
 };
 
 TEST_F(RaggedTensorToTensorOpUnknownShapeTest, ValueRowIDs) {
-  SetAttributes(gtl::ArraySlice<string>{"FIRST_DIM_SIZE", "VALUE_ROWIDS"}, 2);
+  SetAttributes(absl::Span<const string>{"FIRST_DIM_SIZE", "VALUE_ROWIDS"}, 2);
 
   INFER_OK(*op_, "?;?;?;?;?", "?");
   INFER_OK(*op_, "?;[6];[];[];[6]", "[?,?]");
@@ -544,7 +544,7 @@ TEST_F(RaggedTensorToTensorOpUnknownShapeTest, ValueRowIDs) {
 TEST_F(RaggedTensorToTensorOpUnknownShapeTest, RowSplits) {
   // RaggedTensorToTensor(param_splits+, param_values, indices) -> [splits+,
   // values]
-  SetAttributes(gtl::ArraySlice<string>{"ROW_SPLITS"}, 1);
+  SetAttributes(absl::Span<const string>{"ROW_SPLITS"}, 1);
 
   // value, default_value, ROW_SPLITS
   INFER_OK(*op_, "?;?;?;?", "?");
diff --git a/tensorflow/core/kernels/range_sampler.cc b/tensorflow/core/kernels/range_sampler.cc
index eae756b89896e5..449e0ccb879253 100644
--- a/tensorflow/core/kernels/range_sampler.cc
+++ b/tensorflow/core/kernels/range_sampler.cc
@@ -36,21 +36,18 @@ using gtl::MutableArraySlice;
 RangeSampler::~RangeSampler() {}
 
 void RangeSampler::SampleBatch(random::SimplePhilox* rnd, bool unique,
-                               gtl::MutableArraySlice<int64_t> batch) const {
-  SampleBatchGetExpectedCount(
-      rnd, unique, batch, gtl::MutableArraySlice<float>(),
-      gtl::ArraySlice<int64_t>(), gtl::MutableArraySlice<float>());
+                               absl::Span<int64_t> batch) const {
+  SampleBatchGetExpectedCount(rnd, unique, batch, absl::Span<float>(),
+                              absl::Span<const int64_t>(), absl::Span<float>());
 }
 
 void RangeSampler::SampleBatchGetExpectedCount(
-    random::SimplePhilox* rnd, bool unique,
-    gtl::MutableArraySlice<int64_t> batch,
-    gtl::MutableArraySlice<float> batch_expected_count,
-    gtl::ArraySlice<int64_t> extras,
-    gtl::MutableArraySlice<float> extras_expected_count) const {
+    random::SimplePhilox* rnd, bool unique, absl::Span<int64_t> batch,
+    absl::Span<float> batch_expected_count, absl::Span<const int64_t> extras,
+    absl::Span<float> extras_expected_count) const {
   SampleBatchGetExpectedCountAvoid(rnd, unique, batch, batch_expected_count,
                                    extras, extras_expected_count,
-                                   gtl::ArraySlice<int64_t>());
+                                   absl::Span<const int64_t>());
 }
 
 namespace {
@@ -77,10 +74,10 @@ static float ExpectedCountHelper(float p, int batch_size, int num_tries) {
 }  // namespace
 
 void RangeSampler::SampleBatchGetExpectedCountAvoid(
-    random::SimplePhilox* rnd, bool unique, MutableArraySlice<int64_t> batch,
-    MutableArraySlice<float> batch_expected_count, ArraySlice<int64_t> extras,
-    MutableArraySlice<float> extras_expected_count,
-    ArraySlice<int64_t> avoided_values) const {
+    random::SimplePhilox* rnd, bool unique, absl::Span<int64_t> batch,
+    absl::Span<float> batch_expected_count, absl::Span<const int64_t> extras,
+    absl::Span<float> extras_expected_count,
+    absl::Span<const int64_t> avoided_values) const {
   const int batch_size = batch.size();
   int num_tries;
 
@@ -124,10 +121,10 @@ void RangeSampler::SampleBatchGetExpectedCountAvoid(
 AllSampler::AllSampler(int64_t range) : RangeSampler(range) {}
 
 void AllSampler::SampleBatchGetExpectedCountAvoid(
-    random::SimplePhilox* rnd, bool unique, MutableArraySlice<int64_t> batch,
-    MutableArraySlice<float> batch_expected_count, ArraySlice<int64_t> extras,
-    MutableArraySlice<float> extras_expected_count,
-    ArraySlice<int64_t> avoided_values) const {
+    random::SimplePhilox* rnd, bool unique, absl::Span<int64_t> batch,
+    absl::Span<float> batch_expected_count, absl::Span<const int64_t> extras,
+    absl::Span<float> extras_expected_count,
+    absl::Span<const int64_t> avoided_values) const {
   const int batch_size = batch.size();
   CHECK_EQ(range_, batch_size);
   for (int i = 0; i < batch_size; i++) {
@@ -191,7 +188,7 @@ float ThreadUnsafeUnigramSampler::Probability(int64_t value) const {
   return static_cast<float>(picker_.get_weight(value)) / picker_.total_weight();
 }
 
-void ThreadUnsafeUnigramSampler::Update(ArraySlice<int64_t> values) {
+void ThreadUnsafeUnigramSampler::Update(absl::Span<const int64_t> values) {
   int num_updates = std::min(static_cast<int>(values.size()),
                              kint32max - picker_.total_weight());
   for (int i = 0; i < num_updates; i++) {
@@ -218,17 +215,17 @@ float UnigramSampler::Probability(int64_t value) const {
 
 // Overriding at a high level results in far fewer lock acquisitions.
 void UnigramSampler::SampleBatchGetExpectedCountAvoid(
-    random::SimplePhilox* rnd, bool unique, MutableArraySlice<int64_t> batch,
-    MutableArraySlice<float> batch_expected_count, ArraySlice<int64_t> extras,
-    MutableArraySlice<float> extras_expected_count,
-    ArraySlice<int64_t> avoided_values) const {
+    random::SimplePhilox* rnd, bool unique, absl::Span<int64_t> batch,
+    absl::Span<float> batch_expected_count, absl::Span<const int64_t> extras,
+    absl::Span<float> extras_expected_count,
+    absl::Span<const int64_t> avoided_values) const {
   tf_shared_lock lock(mu_);
   unsafe_sampler_.SampleBatchGetExpectedCountAvoid(
       rnd, unique, batch, batch_expected_count, extras, extras_expected_count,
       avoided_values);
 }
 
-void UnigramSampler::Update(ArraySlice<int64_t> values) {
+void UnigramSampler::Update(absl::Span<const int64_t> values) {
   mutex_lock lock(mu_);
   unsafe_sampler_.Update(values);
 }
diff --git a/tensorflow/core/kernels/range_sampler.h b/tensorflow/core/kernels/range_sampler.h
index 8710e10977300c..94a0801a43be25 100644
--- a/tensorflow/core/kernels/range_sampler.h
+++ b/tensorflow/core/kernels/range_sampler.h
@@ -52,7 +52,7 @@ class RangeSampler {
   // If unique=true, then we re-pick each element until we get a
   // value distinct from all previously picked values in the batch.
   void SampleBatch(random::SimplePhilox* rnd, bool unique,
-                   gtl::MutableArraySlice<int64_t> batch) const;
+                   absl::Span<int64_t> batch) const;
 
   // Fill "batch" with samples from the distribution, and report
   // "expected counts".
@@ -74,29 +74,25 @@ class RangeSampler {
   // "batch_expected_count" must have size equal to 0 or to the size of "batch".
   // "extras" and "extras_expected_count" must have equal size.
   void SampleBatchGetExpectedCount(
-      random::SimplePhilox* rnd, bool unique,
-      gtl::MutableArraySlice<int64_t> batch,
-      gtl::MutableArraySlice<float> batch_expected_count,
-      gtl::ArraySlice<int64_t> extras,
-      gtl::MutableArraySlice<float> extras_expected_count) const;
+      random::SimplePhilox* rnd, bool unique, absl::Span<int64_t> batch,
+      absl::Span<float> batch_expected_count, absl::Span<const int64_t> extras,
+      absl::Span<float> extras_expected_count) const;
 
   // Same as SampleBatchGetExpectedCount (see above), but with avoided values.
   // We repick to avoid all of the values in "avoided_values".
   // "avoided_values" is only supported with unique=true.  If
   // unique=false, then avoided_values must be empty.
   virtual void SampleBatchGetExpectedCountAvoid(
-      random::SimplePhilox* rnd, bool unique,
-      gtl::MutableArraySlice<int64_t> batch,
-      gtl::MutableArraySlice<float> batch_expected_count,
-      gtl::ArraySlice<int64_t> extras,
-      gtl::MutableArraySlice<float> extras_expected_count,
-      gtl::ArraySlice<int64_t> avoided_values) const;
+      random::SimplePhilox* rnd, bool unique, absl::Span<int64_t> batch,
+      absl::Span<float> batch_expected_count, absl::Span<const int64_t> extras,
+      absl::Span<float> extras_expected_count,
+      absl::Span<const int64_t> avoided_values) const;
 
   // Does this sampler need to be updated with values, e.g. UnigramSampler
   virtual bool NeedsUpdates() const { return false; }
 
   // Updates the underlying distribution
-  virtual void Update(gtl::ArraySlice<int64_t> values) {
+  virtual void Update(absl::Span<const int64_t> values) {
     LOG(FATAL) << "Update not supported for this sampler type.";
   }
 
@@ -126,12 +122,10 @@ class AllSampler : public RangeSampler {
   }
 
   void SampleBatchGetExpectedCountAvoid(
-      random::SimplePhilox* rnd, bool unique,
-      gtl::MutableArraySlice<int64_t> batch,
-      gtl::MutableArraySlice<float> batch_expected_count,
-      gtl::ArraySlice<int64_t> extras,
-      gtl::MutableArraySlice<float> extras_expected_count,
-      gtl::ArraySlice<int64_t> avoided_values) const override;
+      random::SimplePhilox* rnd, bool unique, absl::Span<int64_t> batch,
+      absl::Span<float> batch_expected_count, absl::Span<const int64_t> extras,
+      absl::Span<float> extras_expected_count,
+      absl::Span<const int64_t> avoided_values) const override;
 };
 
 class UniformSampler : public RangeSampler {
@@ -173,7 +167,7 @@ class ThreadUnsafeUnigramSampler : public RangeSampler {
   float Probability(int64_t value) const override;
 
   bool NeedsUpdates() const override { return true; }
-  void Update(gtl::ArraySlice<int64_t> values) override;
+  void Update(absl::Span<const int64_t> values) override;
 
  private:
   random::WeightedPicker picker_;
@@ -191,15 +185,13 @@ class UnigramSampler : public RangeSampler {
 
   // Overriding at a high level results in far fewer lock acquisitions.
   void SampleBatchGetExpectedCountAvoid(
-      random::SimplePhilox* rnd, bool unique,
-      gtl::MutableArraySlice<int64_t> batch,
-      gtl::MutableArraySlice<float> batch_expected_count,
-      gtl::ArraySlice<int64_t> extras,
-      gtl::MutableArraySlice<float> extras_expected_count,
-      gtl::ArraySlice<int64_t> avoided_values) const override;
+      random::SimplePhilox* rnd, bool unique, absl::Span<int64_t> batch,
+      absl::Span<float> batch_expected_count, absl::Span<const int64_t> extras,
+      absl::Span<float> extras_expected_count,
+      absl::Span<const int64_t> avoided_values) const override;
 
   bool NeedsUpdates() const override { return true; }
-  void Update(gtl::ArraySlice<int64_t> values) override;
+  void Update(absl::Span<const int64_t> values) override;
 
  private:
   ThreadUnsafeUnigramSampler unsafe_sampler_ TF_GUARDED_BY(mu_);
diff --git a/tensorflow/core/kernels/range_sampler_test.cc b/tensorflow/core/kernels/range_sampler_test.cc
index aa1f398ed0cc01..32aed624a8926f 100644
--- a/tensorflow/core/kernels/range_sampler_test.cc
+++ b/tensorflow/core/kernels/range_sampler_test.cc
@@ -74,7 +74,7 @@ class RangeSamplerTest : public ::testing::Test {
       a[i] = i;
     }
     for (int64_t i = 1; i < 10; i++) {
-      sampler_->Update(ArraySlice<int64_t>(a + i, 10 - i));
+      sampler_->Update(absl::Span<const int64_t>(a + i, 10 - i));
     }
   }
   std::unique_ptr<RangeSampler> sampler_;
@@ -321,7 +321,7 @@ TEST_F(RangeSamplerTest, Unique) {
 
   // Sample one batch and get the expected counts of all values
   sampler_->SampleBatchGetExpectedCount(&rnd, true, absl::MakeSpan(batch),
-                                        MutableArraySlice<float>(), all_values,
+                                        absl::Span<float>(), all_values,
                                         absl::MakeSpan(expected));
   // Check that all elements are unique
   std::set<int64_t> s(batch.begin(), batch.end());
@@ -329,9 +329,9 @@ TEST_F(RangeSamplerTest, Unique) {
 
   for (int trial = 0; trial < num_batches; trial++) {
     std::vector<float> trial_expected(range);
-    sampler_->SampleBatchGetExpectedCount(
-        &rnd, true, absl::MakeSpan(batch), MutableArraySlice<float>(),
-        all_values, absl::MakeSpan(trial_expected));
+    sampler_->SampleBatchGetExpectedCount(&rnd, true, absl::MakeSpan(batch),
+                                          absl::Span<float>(), all_values,
+                                          absl::MakeSpan(trial_expected));
     for (int i = 0; i < range; i++) {
       EXPECT_NEAR(expected[i], trial_expected[i], expected[i] * 0.5);
     }
@@ -358,8 +358,8 @@ TEST_F(RangeSamplerTest, Avoid) {
 
   // We expect to pick all elements of [0, 100) except the avoided two.
   sampler_->SampleBatchGetExpectedCountAvoid(
-      &rnd, true, absl::MakeSpan(batch), MutableArraySlice<float>(),
-      ArraySlice<int64_t>(), MutableArraySlice<float>(), avoided);
+      &rnd, true, absl::MakeSpan(batch), absl::Span<float>(),
+      absl::Span<const int64_t>(), absl::Span<float>(), avoided);
 
   int sum = 0;
   for (auto val : batch) {
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index d43a4aabdd3981..6a5053460e4400 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -217,7 +217,7 @@ class ReverseOp : public OpKernel {
 
 template <typename Device, typename T, int NDIMS>
 void HandleReverseV2Case(OpKernelContext* context,
-                         const gtl::ArraySlice<bool> axes, Tensor* result) {
+                         const absl::Span<const bool> axes, Tensor* result) {
   const Tensor& input = context->input(0);
 
   // Use optimized reverse if possible.
diff --git a/tensorflow/core/kernels/rnn/BUILD b/tensorflow/core/kernels/rnn/BUILD
index 45dd448d1e9ca1..cf6defabe68bc9 100644
--- a/tensorflow/core/kernels/rnn/BUILD
+++ b/tensorflow/core/kernels/rnn/BUILD
@@ -2,18 +2,18 @@
 #   OpKernels for RNN ops.
 
 load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_gpu_library",
+    "@local_config_rocm//rocm:build_defs.bzl",
+    "if_rocm_is_configured",
 )
-load("//tensorflow:tensorflow.default.bzl", "tf_kernel_library")
 load(
     "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
     "if_cuda_is_configured",
 )
 load(
-    "@local_config_rocm//rocm:build_defs.bzl",
-    "if_rocm_is_configured",
+    "//tensorflow:tensorflow.bzl",
+    "tf_gpu_library",
 )
+load("//tensorflow:tensorflow.default.bzl", "tf_kernel_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc
index ae9169df96ac3a..269f1011acc61a 100644
--- a/tensorflow/core/kernels/roll_op.cc
+++ b/tensorflow/core/kernels/roll_op.cc
@@ -124,9 +124,9 @@ namespace functor {
 //    back to the front
 template <typename T>
 void DoRoll(const OpKernelContext* context, const int64_t num_elements,
-            const int num_dims, const gtl::ArraySlice<int32> dim_size,
-            const T* input, T* output, const gtl::ArraySlice<int32> threshold,
-            const gtl::ArraySlice<int64_t> dim_range) {
+            const int num_dims, const absl::Span<const int32> dim_size,
+            const T* input, T* output, const absl::Span<const int32> threshold,
+            const absl::Span<const int64_t> dim_range) {
   auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range](
                   int64_t start, int64_t end) {
     // array of indices for each dimension
@@ -188,9 +188,9 @@ template <typename T>
 // Use memcpy to copy memory in groups when the data type supports memcpy
 void DoRollWithMemcpy(const OpKernelContext* context,
                       const int64_t num_elements, const int num_dims,
-                      const gtl::ArraySlice<int32> dim_size, const T* input,
-                      T* output, const gtl::ArraySlice<int32> threshold,
-                      const gtl::ArraySlice<int64_t> dim_range,
+                      const absl::Span<const int32> dim_size, const T* input,
+                      T* output, const absl::Span<const int32> threshold,
+                      const absl::Span<const int64_t> dim_range,
                       const int64_t isd) {
   auto work = [input, output, num_dims, &dim_size, &threshold, &dim_range, isd](
                   int64_t start, int64_t end) {
@@ -311,10 +311,11 @@ void DoRollWithMemcpy(const OpKernelContext* context,
 template <typename T>
 struct Roll<CPUDevice, T> {
   void operator()(const OpKernelContext* context, const int64_t num_elements,
-                  const int num_dims, const gtl::ArraySlice<int32> dim_size,
+                  const int num_dims, const absl::Span<const int32> dim_size,
                   const T* input, T* output,
-                  const gtl::ArraySlice<int32> threshold,
-                  const gtl::ArraySlice<int64_t> dim_range, const int64_t isd) {
+                  const absl::Span<const int32> threshold,
+                  const absl::Span<const int64_t> dim_range,
+                  const int64_t isd) {
     if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
       // V2 copies memory in groups instead of element by element
       DoRollWithMemcpy<T>(context, num_elements, num_dims, dim_size, input,
diff --git a/tensorflow/core/kernels/roll_op.h b/tensorflow/core/kernels/roll_op.h
index 69fbe3c366b59a..7ae1d8f58bc57b 100644
--- a/tensorflow/core/kernels/roll_op.h
+++ b/tensorflow/core/kernels/roll_op.h
@@ -34,10 +34,10 @@ struct Roll {
   //    back to the front
   // isd - inner shift dimension
   void operator()(const OpKernelContext* context, const int64_t num_elements,
-                  const int num_dims, const gtl::ArraySlice<int32> dim_size,
+                  const int num_dims, const absl::Span<const int32> dim_size,
                   const T* input, T* output,
-                  const gtl::ArraySlice<int32> threshold,
-                  const gtl::ArraySlice<int64_t> dim_range, const int64_t isd);
+                  const absl::Span<const int32> threshold,
+                  const absl::Span<const int64_t> dim_range, const int64_t isd);
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc
index 8e0a0965a010ab..8cda48097cf9b8 100644
--- a/tensorflow/core/kernels/save_restore_tensor.cc
+++ b/tensorflow/core/kernels/save_restore_tensor.cc
@@ -365,7 +365,7 @@ struct RestoreOp {
 Status RestoreTensorsV2(OpKernelContext* context, const Tensor& prefix,
                         const Tensor& tensor_names,
                         const Tensor& shape_and_slices,
-                        gtl::ArraySlice<DataType> dtypes) {
+                        absl::Span<const DataType> dtypes) {
   const string& prefix_string = prefix.scalar<tstring>()();
 
   const auto& tensor_names_flat = tensor_names.flat<tstring>();
diff --git a/tensorflow/core/kernels/save_restore_tensor.h b/tensorflow/core/kernels/save_restore_tensor.h
index be7f4b889e78fd..6e58b90c6c4d87 100644
--- a/tensorflow/core/kernels/save_restore_tensor.h
+++ b/tensorflow/core/kernels/save_restore_tensor.h
@@ -66,7 +66,7 @@ void RestoreTensor(OpKernelContext* context,
 Status RestoreTensorsV2(OpKernelContext* context, const Tensor& prefix,
                         const Tensor& tensor_names,
                         const Tensor& shape_and_slices,
-                        gtl::ArraySlice<DataType> dtypes);
+                        absl::Span<const DataType> dtypes);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/save_restore_v2_ops.cc b/tensorflow/core/kernels/save_restore_v2_ops.cc
index eb329b9b90b616..53329049936d66 100644
--- a/tensorflow/core/kernels/save_restore_v2_ops.cc
+++ b/tensorflow/core/kernels/save_restore_v2_ops.cc
@@ -283,8 +283,8 @@ class MergeV2Checkpoints : public OpKernel {
                     "Input destination_prefix should be a scalar tensor, got ",
                     destination_prefix.shape().DebugString(), " instead."));
 
-    const gtl::ArraySlice<tstring> input_prefixes =
-        gtl::ArraySlice<tstring>(checkpoint_prefixes.flat<tstring>());
+    const absl::Span<const tstring> input_prefixes =
+        absl::Span<const tstring>(checkpoint_prefixes.flat<tstring>());
     Env* env = Env::Default();
     const string& merged_prefix = destination_prefix.scalar<tstring>()();
     OP_REQUIRES_OK(context,
diff --git a/tensorflow/core/kernels/scatter_nd_util.h b/tensorflow/core/kernels/scatter_nd_util.h
index ae78a6abad3a0d..f0530048ef699a 100644
--- a/tensorflow/core/kernels/scatter_nd_util.h
+++ b/tensorflow/core/kernels/scatter_nd_util.h
@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_SCATTER_ND_UTIL_H_
 #define TENSORFLOW_CORE_KERNELS_SCATTER_ND_UTIL_H_
 
+#include "xla/tsl/util/env_var.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tsl/util/env_var.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/scoped_allocator_ops_test.cc b/tensorflow/core/kernels/scoped_allocator_ops_test.cc
index 65eee00197e7f6..ec4571b91cea62 100644
--- a/tensorflow/core/kernels/scoped_allocator_ops_test.cc
+++ b/tensorflow/core/kernels/scoped_allocator_ops_test.cc
@@ -39,7 +39,7 @@ namespace tensorflow {
 class ScopedAllocatorOpTest : public OpsTestBase {
  protected:
   void MakeOp(const TensorShape& shape,
-              const gtl::ArraySlice<TensorShape> shapes, DataType dtype,
+              const absl::Span<const TensorShape> shapes, DataType dtype,
               const string& name, int32_t id, int32_t expected_call_count) {
     TF_EXPECT_OK(NodeDefBuilder("scoped_allocator_op", "_ScopedAllocator")
                      .Attr("T", dtype)
diff --git a/tensorflow/core/kernels/sequence_ops_test.cc b/tensorflow/core/kernels/sequence_ops_test.cc
index 7a319d51ec9e64..1985d631d23739 100644
--- a/tensorflow/core/kernels/sequence_ops_test.cc
+++ b/tensorflow/core/kernels/sequence_ops_test.cc
@@ -96,7 +96,7 @@ TEST_F(RangeOpTest, Large_Double) {
   Tensor expected(allocator(), DT_DOUBLE, TensorShape({20000}));
   std::vector<double> result;
   for (int32_t i = 0; i < 20000; ++i) result.push_back(i * 0.5);
-  test::FillValues<double>(&expected, gtl::ArraySlice<double>(result));
+  test::FillValues<double>(&expected, absl::Span<const double>(result));
   test::ExpectTensorEqual<double>(expected, *GetOutput(0));
 }
 
diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc
index e5c18e399518db..f3f73a4c718d71 100644
--- a/tensorflow/core/kernels/slice_op.cc
+++ b/tensorflow/core/kernels/slice_op.cc
@@ -216,8 +216,8 @@ class SliceOp : public OpKernel {
 
  private:
   template <int NDIM>
-  void HandleCase(OpKernelContext* context, gtl::ArraySlice<int64_t> begin,
-                  gtl::ArraySlice<int64_t> size, const Tensor& input,
+  void HandleCase(OpKernelContext* context, absl::Span<const int64_t> begin,
+                  absl::Span<const int64_t> size, const Tensor& input,
                   Tensor* result) {
     Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
     Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;
diff --git a/tensorflow/core/kernels/sparse/BUILD b/tensorflow/core/kernels/sparse/BUILD
index d99e8b6f79b3b3..4df115b0f47e9b 100644
--- a/tensorflow/core/kernels/sparse/BUILD
+++ b/tensorflow/core/kernels/sparse/BUILD
@@ -1,5 +1,9 @@
 # Description: Op kernels for sparse matrix operations.
 
+load(
+    "@local_config_rocm//rocm:build_defs.bzl",
+    "if_rocm",
+)
 load(
     "//tensorflow:tensorflow.bzl",
     "if_cuda_or_rocm",
@@ -7,10 +11,6 @@ load(
 )
 load("//tensorflow:tensorflow.default.bzl", "tf_kernel_library")
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-load(
-    "@local_config_rocm//rocm:build_defs.bzl",
-    "if_rocm",
-)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/kernels/sparse_add_op_test.cc b/tensorflow/core/kernels/sparse_add_op_test.cc
index 3c4eea8b279948..e2d19a62769fe7 100644
--- a/tensorflow/core/kernels/sparse_add_op_test.cc
+++ b/tensorflow/core/kernels/sparse_add_op_test.cc
@@ -63,9 +63,9 @@ TEST_F(SparseAddOpTest, TwoD_AddSparseTensorWithSelf) {
 
   const auto indices_shape = TensorShape({4, 2});
   std::initializer_list<int64_t> in{0, 1, 1, 0, 2, 0, 2, 1};
-  const gtl::ArraySlice<int64_t> indices(in);
+  const absl::Span<const int64_t> indices(in);
   std::initializer_list<int64_t> sh{3, 2};
-  const gtl::ArraySlice<int64_t> shape(sh);
+  const absl::Span<const int64_t> shape(sh);
 
 #define ADD_TENSOR_INPUT()                                  \
   AddInputFromArray<int64_t>(indices_shape, indices);       \
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
index 6b57a460c2be65..4007c69b3bd403 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
@@ -107,7 +107,8 @@ class SparseDenseBinaryOpShared : public OpKernel {
 
     // True iff (size(lhs) >= size(rhs)) and all dims in lhs is greater or equal
     // to dims in rhs (from right to left).
-    auto VecGreaterEq = [](ArraySlice<int64_t> lhs, ArraySlice<int64_t> rhs) {
+    auto VecGreaterEq = [](absl::Span<const int64_t> lhs,
+                           absl::Span<const int64_t> rhs) {
       if (lhs.size() < rhs.size()) return false;
       for (size_t i = 0; i < rhs.size(); ++i) {
         if (lhs[lhs.size() - 1 - i] < rhs[rhs.size() - 1 - i]) return false;
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
index 90d7a3c77b1c6d..27115f3153458f 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
@@ -99,9 +99,9 @@ TEST_F(SparseDenseCDivTest, SameShape) {
   // [3   4]
   const auto indices_shape = TensorShape({4, 2});
   std::initializer_list<int64_t> in{0, 1, 1, 0, 2, 0, 2, 1};
-  const gtl::ArraySlice<int64_t> indices(in);
+  const absl::Span<const int64_t> indices(in);
   std::initializer_list<int64_t> sh{3, 2};
-  const gtl::ArraySlice<int64_t> shape(sh);
+  const absl::Span<const int64_t> shape(sh);
 
   // Tensor dense(DT_FLOAT, TensorShape({3, 1}));
   Tensor dense(DT_FLOAT, TensorShape(shape));
@@ -128,9 +128,9 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseSameDims) {
   // [3   4]
   const auto indices_shape = TensorShape({4, 2});
   std::initializer_list<int64_t> in{0, 1, 1, 0, 2, 0, 2, 1};
-  const gtl::ArraySlice<int64_t> indices(in);
+  const absl::Span<const int64_t> indices(in);
   std::initializer_list<int64_t> sh{3, 2};
-  const gtl::ArraySlice<int64_t> shape(sh);
+  const absl::Span<const int64_t> shape(sh);
 
   Tensor dense(DT_FLOAT, TensorShape({3, 1}));
   auto dense_flat = dense.flat<float>();
@@ -155,9 +155,9 @@ TEST_F(SparseDenseCDivTest, BroadcastDenseFewerDims) {
   // [3   4]
   const auto indices_shape = TensorShape({4, 2});
   std::initializer_list<int64_t> in{0, 1, 1, 0, 2, 0, 2, 1};
-  const gtl::ArraySlice<int64_t> indices(in);
+  const absl::Span<const int64_t> indices(in);
   std::initializer_list<int64_t> sh{3, 2};
-  const gtl::ArraySlice<int64_t> shape(sh);
+  const absl::Span<const int64_t> shape(sh);
 
   Tensor dense(DT_FLOAT, TensorShape({2}));
   auto dense_flat = dense.flat<float>();
@@ -187,9 +187,9 @@ TEST_F(SparseDenseCMulTest, BroadcastDense) {
   // [1.5 0]
   const auto indices_shape = TensorShape({4, 2});
   std::initializer_list<int64_t> in{0, 1, 1, 0, 2, 0, 2, 1};
-  const gtl::ArraySlice<int64_t> indices(in);
+  const absl::Span<const int64_t> indices(in);
   std::initializer_list<int64_t> sh{3, 2};
-  const gtl::ArraySlice<int64_t> shape(sh);
+  const absl::Span<const int64_t> shape(sh);
 
   Tensor dense(DT_FLOAT, TensorShape({2}));
   auto dense_flat = dense.flat<float>();
@@ -243,7 +243,7 @@ static ST MakeSparseTensor(Graph* g, int B, int M, int N, int nnz_inner) {
   Tensor vals(DT_FLOAT, TensorShape({total_nnz}));
   Tensor shape(DT_INT64, TensorShape({kNumDims}));
   vals.flat<float>().setRandom();
-  test::FillValues(&shape, gtl::ArraySlice<int64_t>({B, M, N}));
+  test::FillValues(&shape, absl::Span<const int64_t>({B, M, N}));
   auto indices_mat = indices.matrix<int64_t>();
 
   int nnz_cnt = 0;
diff --git a/tensorflow/core/kernels/sparse_reduce_op.cc b/tensorflow/core/kernels/sparse_reduce_op.cc
index d64d7829a65fc1..348a73e0816280 100644
--- a/tensorflow/core/kernels/sparse_reduce_op.cc
+++ b/tensorflow/core/kernels/sparse_reduce_op.cc
@@ -54,7 +54,7 @@ struct ReduceDetails {
 // }
 // // Set output shape to reduction.reduced_shape.
 ReduceDetails SparseTensorReduceHelper(const SparseTensor &sp,
-                                       gtl::ArraySlice<int32> axes_slice,
+                                       absl::Span<const int32> axes_slice,
                                        bool keep_dims) {
   ReduceDetails reduction;
 
@@ -206,8 +206,8 @@ class SparseReduceOp : public OpKernel {
       }
     }
 
-    auto CoordinatesToFlatIndex = [](ArraySlice<int64_t> coords,
-                                     ArraySlice<int64_t> strides) -> int64 {
+    auto CoordinatesToFlatIndex = [](absl::Span<const int64_t> coords,
+                                     absl::Span<const int64_t> strides) -> int64 {
       if (strides.empty()) {  // Reduce all.
         return 0;
       }
diff --git a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
index 4aa89a31962329..9ec7e18cf3315e 100644
--- a/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
+++ b/tensorflow/core/kernels/sparse_reduce_sum_op_test.cc
@@ -53,9 +53,9 @@ TEST_F(SparseReduceSumOpTest, SimpleReduce) {
 
   const auto indices_shape = TensorShape({4, 2});
   std::initializer_list<int64_t> in{0, 1, 1, 0, 2, 0, 2, 1};
-  const gtl::ArraySlice<int64_t> indices(in);
+  const absl::Span<const int64_t> indices(in);
   std::initializer_list<int64_t> sh{3, 2};
-  const gtl::ArraySlice<int64_t> shape(sh);
+  const absl::Span<const int64_t> shape(sh);
 
   AddInputFromArray<int64_t>(indices_shape, indices);
   AddInputFromArray<float>(TensorShape({4}), {1, 2, 3, 4});
@@ -95,9 +95,9 @@ TEST_F(SparseReduceSumSparseOpTest, SimpleReduce) {
 
   const auto indices_shape = TensorShape({4, 2});
   std::initializer_list<int64_t> in{0, 1, 1, 0, 2, 0, 2, 1};
-  const gtl::ArraySlice<int64_t> indices(in);
+  const absl::Span<const int64_t> indices(in);
   std::initializer_list<int64_t> sh{3, 2};
-  const gtl::ArraySlice<int64_t> shape(sh);
+  const absl::Span<const int64_t> shape(sh);
 
   AddInputFromArray<int64_t>(indices_shape, indices);
   AddInputFromArray<float>(TensorShape({4}), {2, 2, 3, 4});
diff --git a/tensorflow/core/kernels/sparse_reorder_op.cc b/tensorflow/core/kernels/sparse_reorder_op.cc
index 0caeaf0a0e0699..c3f1787f1f2130 100644
--- a/tensorflow/core/kernels/sparse_reorder_op.cc
+++ b/tensorflow/core/kernels/sparse_reorder_op.cc
@@ -45,8 +45,8 @@ template <typename T>
 struct SparseReorderFunctor<CPUDevice, T> {
   void operator()(OpKernelContext* context, const Tensor& input_ind,
                   const Tensor& input_val, const Tensor& input_shape_in) {
-    gtl::ArraySlice<int64_t> input_shape(input_shape_in.vec<int64_t>().data(),
-                                         input_shape_in.NumElements());
+    absl::Span<const int64_t> input_shape(input_shape_in.vec<int64_t>().data(),
+                                          input_shape_in.NumElements());
 
     gtl::InlinedVector<int64_t, 8> std_order(input_shape.size());
     std::iota(std_order.begin(), std_order.end(), 0);
diff --git a/tensorflow/core/kernels/sparse_slice_op.cc b/tensorflow/core/kernels/sparse_slice_op.cc
index de179d087654ad..fdaba288f20168 100644
--- a/tensorflow/core/kernels/sparse_slice_op.cc
+++ b/tensorflow/core/kernels/sparse_slice_op.cc
@@ -50,10 +50,10 @@ struct SparseSliceFunctor<CPUDevice, T> {
                                 input_indices, input_values,
                                 sparse_tensor_shape, &sparse_tensor));
 
-    const gtl::ArraySlice<int64_t> start(input_start.flat<int64_t>().data(),
+    const absl::Span<const int64_t> start(input_start.flat<int64_t>().data(),
+                                          input_dims);
+    const absl::Span<const int64_t> size(input_size.flat<int64_t>().data(),
                                          input_dims);
-    const gtl::ArraySlice<int64_t> size(input_size.flat<int64_t>().data(),
-                                        input_dims);
 
     const absl::StatusOr<sparse::SparseTensor> output_or =
         sparse::SparseTensor::Slice<T>(sparse_tensor, start, size);
diff --git a/tensorflow/core/kernels/sparse_softmax_op.cc b/tensorflow/core/kernels/sparse_softmax_op.cc
index 9a073a12570ff1..47006f056e0356 100644
--- a/tensorflow/core/kernels/sparse_softmax_op.cc
+++ b/tensorflow/core/kernels/sparse_softmax_op.cc
@@ -87,9 +87,10 @@ class SparseSoftmaxOp : public OpKernel {
     gtl::InlinedVector<int64_t, 4> dims(rank);
     std::iota(dims.begin(), dims.end(), 0);
     // { 0, ..., rank-1 }.
-    const ArraySlice<int64_t> kReorderDims(dims);
+    const absl::Span<const int64_t> kReorderDims(dims);
     // All but the last dim -- the class dimension to be max-reduced along.
-    const ArraySlice<int64_t> kGroupByDims = kReorderDims.subspan(0, rank - 1);
+    const absl::Span<const int64_t> kGroupByDims =
+        kReorderDims.subspan(0, rank - 1);
     st.Reorder<T>(kReorderDims);
     int count = 0;
 
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index 23f3155392ef9a..5604f836c404b7 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -38,17 +38,17 @@ namespace tensorflow {
 
 template <typename Device, typename T, int NDIM>
 void HandleStridedSliceCase(OpKernelContext* context,
-                            const gtl::ArraySlice<int64_t>& begin,
-                            const gtl::ArraySlice<int64_t>& end,
-                            const gtl::ArraySlice<int64_t>& strides,
+                            const absl::Span<const int64_t>& begin,
+                            const absl::Span<const int64_t>& end,
+                            const absl::Span<const int64_t>& strides,
                             const TensorShape& processing_shape,
                             bool is_simple_slice, Tensor* result);
 
 template <typename Device, typename T, int NDIM>
 void HandleStridedSliceGradCase(OpKernelContext* context,
-                                const gtl::ArraySlice<int64_t>& begin,
-                                const gtl::ArraySlice<int64_t>& end,
-                                const gtl::ArraySlice<int64_t>& strides,
+                                const absl::Span<const int64_t>& begin,
+                                const absl::Span<const int64_t>& end,
+                                const absl::Span<const int64_t>& strides,
                                 const TensorShape& processing_shape,
                                 bool is_simple_slice, Tensor* result);
 
@@ -56,9 +56,9 @@ template <typename Device, typename T, int NDIM>
 class HandleStridedSliceAssignCase {
  public:
   void operator()(OpKernelContext* context,
-                  const gtl::ArraySlice<int64_t>& begin,
-                  const gtl::ArraySlice<int64_t>& end,
-                  const gtl::ArraySlice<int64_t>& strides,
+                  const absl::Span<const int64_t>& begin,
+                  const absl::Span<const int64_t>& end,
+                  const absl::Span<const int64_t>& strides,
                   const StridedSliceAssignBCast& bcast, Tensor* result);
 };
 }  // namespace tensorflow
@@ -76,9 +76,9 @@ namespace tensorflow {
 
 template <typename Device, typename T, int NDIM>
 void HandleStridedSliceCase(OpKernelContext* context,
-                            const gtl::ArraySlice<int64_t>& begin,
-                            const gtl::ArraySlice<int64_t>& end,
-                            const gtl::ArraySlice<int64_t>& strides,
+                            const absl::Span<const int64_t>& begin,
+                            const absl::Span<const int64_t>& end,
+                            const absl::Span<const int64_t>& strides,
                             const TensorShape& processing_shape,
                             bool is_simple_slice, Tensor* result) {
   typedef typename proxy_type<Device, T>::type Proxy;
@@ -114,9 +114,9 @@ void HandleStridedSliceCase(OpKernelContext* context,
 
 template <typename Device, typename T, int NDIM>
 void HandleStridedSliceGradCase(OpKernelContext* context,
-                                const gtl::ArraySlice<int64_t>& begin,
-                                const gtl::ArraySlice<int64_t>& end,
-                                const gtl::ArraySlice<int64_t>& strides,
+                                const absl::Span<const int64_t>& begin,
+                                const absl::Span<const int64_t>& end,
+                                const absl::Span<const int64_t>& strides,
                                 const TensorShape& processing_shape,
                                 bool is_simple_slice, Tensor* result) {
   gtl::InlinedVector<int64_t, 4> processing_dims = processing_shape.dim_sizes();
@@ -139,9 +139,9 @@ void HandleStridedSliceGradCase(OpKernelContext* context,
 
 template <typename Device, typename T, int NDIM>
 void HandleStridedSliceAssignCase<Device, T, NDIM>::operator()(
-    OpKernelContext* context, const gtl::ArraySlice<int64_t>& begin,
-    const gtl::ArraySlice<int64_t>& end,
-    const gtl::ArraySlice<int64_t>& strides,
+    OpKernelContext* context, const absl::Span<const int64_t>& begin,
+    const absl::Span<const int64_t>& end,
+    const absl::Span<const int64_t>& strides,
     const StridedSliceAssignBCast& bcast, Tensor* result) {
   typedef typename proxy_type<Device, T>::type Proxy;
   Eigen::DSizes<Eigen::DenseIndex, NDIM> begin_di;
@@ -166,9 +166,9 @@ class HandleStridedSliceAssignCase<Device, T, 0> {
  public:
   enum { NDIM_PROXY = 1 };
   void operator()(OpKernelContext* context,
-                  const gtl::ArraySlice<int64_t>& begin,
-                  const gtl::ArraySlice<int64_t>& end,
-                  const gtl::ArraySlice<int64_t>& strides,
+                  const absl::Span<const int64_t>& begin,
+                  const absl::Span<const int64_t>& end,
+                  const absl::Span<const int64_t>& strides,
                   const StridedSliceAssignBCast& bcast, Tensor* result) {
     gtl::InlinedVector<int64_t, 1> processing_dims(1);
     processing_dims[0] = 1;
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index 5d1322e6f4b7d6..7ccd9b5afe12a6 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -376,7 +376,7 @@ class TensorArrayGradOp : public TensorArrayCreationOp {
           dims.push_back(dim.size);
         }
         TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape(
-            gtl::ArraySlice<int64_t>(dims), &element_shape));
+            absl::Span<const int64_t>(dims), &element_shape));
       }
     } else {
       element_shape = tensor_array->ElemShape();
diff --git a/tensorflow/core/kernels/transpose_functor.h b/tensorflow/core/kernels/transpose_functor.h
index d640d051a40f4d..1dab98fb968d6d 100644
--- a/tensorflow/core/kernels/transpose_functor.h
+++ b/tensorflow/core/kernels/transpose_functor.h
@@ -34,7 +34,7 @@ namespace tensorflow {
 // REQUIRES: in.dim_size(perm[i]) == out->dim_size(i)
 template <typename Device>
 Status DoTranspose(const Device& device, const Tensor& in,
-                   const gtl::ArraySlice<int32> perm, Tensor* out);
+                   const absl::Span<const int32> perm, Tensor* out);
 
 // Conjugate and transpose tensor 'in' into tensor 'out' according to dimension
 // permutation 'perm'.
@@ -45,7 +45,7 @@ Status DoTranspose(const Device& device, const Tensor& in,
 // REQUIRES: in.dim_size(perm[i]) == out->dim_size(i)
 template <typename Device>
 Status DoConjugateTranspose(const Device& device, const Tensor& in,
-                            const gtl::ArraySlice<int32> perm, Tensor* out);
+                            const absl::Span<const int32> perm, Tensor* out);
 
 // Convenience versions of DoTranspose that only swap the last (inner) two
 // dimensions.
@@ -62,7 +62,7 @@ Status DoConjugateMatrixTranspose(const Device& device, const Tensor& in,
 template <typename Device, typename T, bool conjugate = false>
 struct Transpose {
   static void run(const Device& d, const Tensor& in,
-                  const gtl::ArraySlice<int32> perm, Tensor* out);
+                  const absl::Span<const int32> perm, Tensor* out);
 };
 
 // Implementation details.
@@ -77,7 +77,7 @@ typedef gtl::InlinedVector<int32, 8> TransposePermsVec;
 // Example: Tensor shape {2, 3, 4, 5, 120} and permutation {0, 4, 1, 2, 3} will
 // produce new shape {2, 60, 120} and new permutation {0, 2, 1}.
 inline void ReduceTransposeDimensions(const TensorShape& shape,
-                                      gtl::ArraySlice<int32> perm,
+                                      absl::Span<const int32> perm,
                                       TransposePermsVec* new_perm,
                                       TransposeDimsVec* new_dims) {
   CHECK_EQ(shape.dims(), perm.size());
@@ -146,7 +146,7 @@ inline bool NonSingletonDimensionsAlign(const TensorShape& input_shape,
 // Uses Eigen to transpose.
 template <typename Device, typename T, int NDIMS>
 void TransposeUsingEigen(const Device& d, const Tensor& in,
-                         const gtl::ArraySlice<int32> perm, bool conjugate,
+                         const absl::Span<const int32> perm, bool conjugate,
                          Tensor* out) {
   Eigen::array<int, NDIMS> p;
   for (int i = 0; i < NDIMS; ++i) p[i] = perm[i];
@@ -165,7 +165,7 @@ void TransposeUsingEigen(const Device& d, const Tensor& in,
 
 template <typename Device>
 Status DoTransposeImpl(const Device& d, const Tensor& in,
-                       const gtl::ArraySlice<int32> perm, bool conjugate,
+                       const absl::Span<const int32> perm, bool conjugate,
                        Tensor* out) {
   CHECK_EQ(in.dims(), out->dims());
   CHECK_EQ(in.dims(), perm.size());
diff --git a/tensorflow/core/kernels/transpose_functor_cpu.cc b/tensorflow/core/kernels/transpose_functor_cpu.cc
index 4a3e744b4ada36..084721f6290d07 100644
--- a/tensorflow/core/kernels/transpose_functor_cpu.cc
+++ b/tensorflow/core/kernels/transpose_functor_cpu.cc
@@ -33,7 +33,7 @@ namespace {
 
 template <typename T, bool conjugate>
 void TransposeSimple(const CPUDevice& device, const Tensor& in,
-                     const gtl::ArraySlice<int32> perm, Tensor* out) {
+                     const absl::Span<const int32> perm, Tensor* out) {
   const int ndims = in.dims();
   gtl::InlinedVector<int64_t, 8> in_strides =
       ComputeStride<int64_t>(in.shape());
@@ -73,7 +73,7 @@ void TransposeSimple(const CPUDevice& device, const Tensor& in,
 template <typename T, bool conjugate>
 struct Transpose<CPUDevice, T, conjugate> {
   static void run(const CPUDevice& d, const Tensor& in,
-                  const gtl::ArraySlice<int32> perm, Tensor* out) {
+                  const absl::Span<const int32> perm, Tensor* out) {
     switch (in.dims()) {
       case 2:
         internal::TransposeUsingEigen<CPUDevice, T, 2>(d, in, perm, conjugate,
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index e3719aab6c648e..a28b32e91c0187 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -188,7 +188,7 @@ void TransposeOp::Compute(OpKernelContext* ctx) {
 }
 
 Status TransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
-                                   gtl::ArraySlice<int32> perm, Tensor* out) {
+                                   absl::Span<const int32> perm, Tensor* out) {
   typedef Eigen::ThreadPoolDevice CPUDevice;
   return ::tensorflow::DoTranspose(ctx->eigen_device<CPUDevice>(), in, perm,
                                    out);
@@ -196,7 +196,7 @@ Status TransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
 
 Status ConjugateTransposeCpuOp::DoTranspose(OpKernelContext* ctx,
                                             const Tensor& in,
-                                            gtl::ArraySlice<int32> perm,
+                                            absl::Span<const int32> perm,
                                             Tensor* out) {
   typedef Eigen::ThreadPoolDevice CPUDevice;
   return ::tensorflow::DoConjugateTranspose(ctx->eigen_device<CPUDevice>(), in,
diff --git a/tensorflow/core/kernels/transpose_op.h b/tensorflow/core/kernels/transpose_op.h
index 840526468e96f6..f758c6ae66471f 100644
--- a/tensorflow/core/kernels/transpose_op.h
+++ b/tensorflow/core/kernels/transpose_op.h
@@ -29,7 +29,7 @@ class TransposeOp : public OpKernel {
 
  protected:
   virtual Status DoTranspose(OpKernelContext* ctx, const Tensor& in,
-                             gtl::ArraySlice<int32> perm, Tensor* out) = 0;
+                             absl::Span<const int32> perm, Tensor* out) = 0;
   virtual bool IsConjugate() const { return false; }
 };
 
@@ -39,7 +39,7 @@ class TransposeCpuOp : public TransposeOp {
 
  protected:
   Status DoTranspose(OpKernelContext* ctx, const Tensor& in,
-                     gtl::ArraySlice<int32> perm, Tensor* out) override;
+                     absl::Span<const int32> perm, Tensor* out) override;
 };
 
 #if defined(INTEL_MKL)
@@ -59,7 +59,7 @@ class TransposeGpuOp : public TransposeOp {
 
  protected:
   Status DoTranspose(OpKernelContext* ctx, const Tensor& in,
-                     gtl::ArraySlice<int32> perm, Tensor* out) override;
+                     absl::Span<const int32> perm, Tensor* out) override;
 };
 
 
@@ -71,7 +71,7 @@ class ConjugateTransposeCpuOp : public TransposeOp {
 
  protected:
   Status DoTranspose(OpKernelContext* ctx, const Tensor& in,
-                     gtl::ArraySlice<int32> perm, Tensor* out) override;
+                     absl::Span<const int32> perm, Tensor* out) override;
   bool IsConjugate() const override { return true; }
 };
 
@@ -95,7 +95,7 @@ class ConjugateTransposeGpuOp : public TransposeOp {
 
  protected:
   Status DoTranspose(OpKernelContext* ctx, const Tensor& in,
-                     gtl::ArraySlice<int32> perm, Tensor* out) override;
+                     absl::Span<const int32> perm, Tensor* out) override;
   bool IsConjugate() const override { return true; }
 };
 
diff --git a/tensorflow/core/kernels/transpose_util_test.cc b/tensorflow/core/kernels/transpose_util_test.cc
index a6e8d0898c1f09..caba50c268dd8d 100644
--- a/tensorflow/core/kernels/transpose_util_test.cc
+++ b/tensorflow/core/kernels/transpose_util_test.cc
@@ -23,15 +23,15 @@ namespace tensorflow {
 class TransposeUtilTest : public ::testing::Test {
  protected:
   void TestDimensionReduction(const TensorShape& shape,
-                              const gtl::ArraySlice<int32> perm,
-                              const gtl::ArraySlice<int32> expected_perm,
-                              const gtl::ArraySlice<int64_t> expected_dims) {
+                              const absl::Span<const int32> perm,
+                              const absl::Span<const int32> expected_perm,
+                              const absl::Span<const int64_t> expected_dims) {
     internal::TransposePermsVec new_perm;
     internal::TransposeDimsVec new_dims;
     internal::ReduceTransposeDimensions(shape, perm, &new_perm, &new_dims);
 
-    gtl::ArraySlice<int32> computed_perm(new_perm);
-    gtl::ArraySlice<int64_t> computed_dims(new_dims);
+    absl::Span<const int32> computed_perm(new_perm);
+    absl::Span<const int64_t> computed_dims(new_dims);
     EXPECT_EQ(computed_perm, expected_perm);
     EXPECT_EQ(computed_dims, expected_dims);
   }
diff --git a/tensorflow/core/kernels/unicode_ops.cc b/tensorflow/core/kernels/unicode_ops.cc
index 3d59cc034480b3..c18ca23791405e 100644
--- a/tensorflow/core/kernels/unicode_ops.cc
+++ b/tensorflow/core/kernels/unicode_ops.cc
@@ -532,6 +532,10 @@ class UnicodeEncodeOp : public OpKernel {
     const Tensor& input_splits = context->input(1);
     const auto input_splits_flat = input_splits.flat<SPLITS_TYPE>();
 
+    OP_REQUIRES(
+        context, input_tensor.dims() == 1 && input_splits.dims() == 1,
+        absl::InvalidArgumentError(
+            "Both the input_tensor and input_splits should be of rank 1. "));
     OP_REQUIRES(
         context, input_splits.NumElements() > 0,
         errors::InvalidArgument("Input_splits should contain elements, but "
diff --git a/tensorflow/core/lib/gtl/BUILD b/tensorflow/core/lib/gtl/BUILD
index d03c47b594f637..868d05f0912fc8 100644
--- a/tensorflow/core/lib/gtl/BUILD
+++ b/tensorflow/core/lib/gtl/BUILD
@@ -31,7 +31,8 @@ cc_library(
     name = "array_slice",
     hdrs = ["array_slice.h"],
     deps = [
-        "//tensorflow/core/lib/gtl:inlined_vector",
+        ":inlined_vector",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/types:span",
     ],
 )
diff --git a/tensorflow/core/lib/gtl/array_slice.h b/tensorflow/core/lib/gtl/array_slice.h
index 8f47faf89e4d01..ddacf4d2e3d5c3 100644
--- a/tensorflow/core/lib/gtl/array_slice.h
+++ b/tensorflow/core/lib/gtl/array_slice.h
@@ -16,19 +16,25 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_LIB_GTL_ARRAY_SLICE_H_
 #define TENSORFLOW_CORE_LIB_GTL_ARRAY_SLICE_H_
 
+#include "absl/base/macros.h"
 #include "absl/types/span.h"
 // TODO(timshen): This is kept only because lots of targets transitively depend
 // on it. Remove all targets' dependencies.
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 
+// TODO: b/323943471 - This macro should eventually be provided by Abseil.
+#ifndef ABSL_DEPRECATE_AND_INLINE
+#define ABSL_DEPRECATE_AND_INLINE()
+#endif
+
 namespace tensorflow {
 namespace gtl {
 
 template <typename T>
-using ArraySlice = absl::Span<const T>;
+using ArraySlice ABSL_DEPRECATE_AND_INLINE() = absl::Span<const T>;
 
 template <typename T>
-using MutableArraySlice = absl::Span<T>;
+using MutableArraySlice ABSL_DEPRECATE_AND_INLINE() = absl::Span<T>;
 
 }  // namespace gtl
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/png/png_io.h b/tensorflow/core/lib/png/png_io.h
index 8afce9385a10b7..f2d173ab3e82dd 100644
--- a/tensorflow/core/lib/png/png_io.h
+++ b/tensorflow/core/lib/png/png_io.h
@@ -43,7 +43,7 @@ limitations under the License.
 namespace tensorflow {
 namespace png {
 
-// Handy container for decoding informations and struct pointers
+// Handy container for decoding information and struct pointers
 struct DecodeContext {
   const uint8* data;
   int data_left;
diff --git a/tensorflow/core/lib/random/BUILD b/tensorflow/core/lib/random/BUILD
index cbb6fc6dd2bba0..db2c962671c3f0 100644
--- a/tensorflow/core/lib/random/BUILD
+++ b/tensorflow/core/lib/random/BUILD
@@ -40,9 +40,7 @@ cc_library(
     name = "random_distributions_utils",
     hdrs = ["random_distributions_utils.h"],
     compatible_with = get_compatible_with_portable(),
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
     deps = [
         ":philox_random",
         "@local_tsl//tsl/lib/random:random_distributions_utils",
diff --git a/tensorflow/core/nccl/BUILD b/tensorflow/core/nccl/BUILD
index 37cbb068dece35..0cf29f220066a1 100644
--- a/tensorflow/core/nccl/BUILD
+++ b/tensorflow/core/nccl/BUILD
@@ -3,6 +3,7 @@
 #   APIs are meant to change over time.
 
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load("//tensorflow:tensorflow.bzl", "if_cuda_or_rocm", "if_nccl", "tf_copts")
 load("//tensorflow:tensorflow.default.bzl", "filegroup", "tf_cuda_cc_test")
 load(
@@ -10,7 +11,6 @@ load(
     "tf_cuda_tests_tags",
 )
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/ops/BUILD b/tensorflow/core/ops/BUILD
index 548e0444bf2ce0..c1e0497969e7fc 100644
--- a/tensorflow/core/ops/BUILD
+++ b/tensorflow/core/ops/BUILD
@@ -1,6 +1,10 @@
 # Description:
 # Tensorflow default op definitions.
 
+load(
+    "@local_xla//xla/tsl/mkl:build_defs.bzl",
+    "if_mkl",
+)
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
@@ -10,10 +14,6 @@ load(
     "filegroup",
     "tf_gen_op_libs",
 )
-load(
-    "@local_xla//xla/tsl/mkl:build_defs.bzl",
-    "if_mkl",
-)
 load(
     "//tensorflow/core/platform:rules_cc.bzl",
     "cc_library",
@@ -84,7 +84,6 @@ tf_gen_op_libs(
         "set_ops",
         "script_ops",
         "sendrecv_ops",
-        "sparse_csr_matrix_ops",
         "sparse_ops",
         "spectral_ops",
         "state_ops",
@@ -102,6 +101,18 @@ tf_gen_op_libs(
     ],
 )
 
+tf_gen_op_libs(
+    is_external = False,
+    op_lib_names = ["sparse_csr_matrix_ops"],
+    sub_directory = "",
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 tf_gen_op_libs(
     is_external = False,
     op_lib_names = [
diff --git a/tensorflow/core/ops/compat/BUILD b/tensorflow/core/ops/compat/BUILD
index c9f574981969fe..db7225099edf8d 100644
--- a/tensorflow/core/ops/compat/BUILD
+++ b/tensorflow/core/ops/compat/BUILD
@@ -2,12 +2,12 @@
 # For keeping the history of OpDefs for every major version of TensorFlow,
 # to validate that we don't make backwards-incompatible changes.
 
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_binary",
     "tf_cc_test",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ComputeDedupDataSizeV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ComputeDedupDataSizeV2.pbtxt
new file mode 100644
index 00000000000000..2493251c1fddc6
--- /dev/null
+++ b/tensorflow/core/ops/compat/ops_history_v2/ComputeDedupDataSizeV2.pbtxt
@@ -0,0 +1,24 @@
+op {
+  name: "ComputeDedupDataSizeV2"
+  output_arg {
+    name: "num_elements"
+    type: DT_INT32
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  attr {
+    name: "embedding_partitions"
+    type: "string"
+  }
+  attr {
+    name: "hbm_buffers_config"
+    type: "string"
+  }
+  attr {
+    name: "tpu_topology"
+    type: "string"
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ComputeDedupDataTupleMaskV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ComputeDedupDataTupleMaskV2.pbtxt
new file mode 100644
index 00000000000000..87deca2a2daecd
--- /dev/null
+++ b/tensorflow/core/ops/compat/ops_history_v2/ComputeDedupDataTupleMaskV2.pbtxt
@@ -0,0 +1,24 @@
+op {
+  name: "ComputeDedupDataTupleMaskV2"
+  output_arg {
+    name: "output_shape"
+    type: DT_INT32
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  attr {
+    name: "embedding_partitions"
+    type: "string"
+  }
+  attr {
+    name: "hbm_buffers_config"
+    type: "string"
+  }
+  attr {
+    name: "tpu_topology"
+    type: "string"
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/FinalizeTPUEmbeddingV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/FinalizeTPUEmbeddingV2.pbtxt
new file mode 100644
index 00000000000000..63c69eaff3aaba
--- /dev/null
+++ b/tensorflow/core/ops/compat/ops_history_v2/FinalizeTPUEmbeddingV2.pbtxt
@@ -0,0 +1,20 @@
+op {
+  name: "FinalizeTPUEmbeddingV2"
+  input_arg {
+    name: "common_config"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "memory_config"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "embedding_partitions"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "hbm_buffers_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/XlaRecvTPUEmbeddingActivationsV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/XlaRecvTPUEmbeddingActivationsV2.pbtxt
new file mode 100644
index 00000000000000..2e8fb4d4f2530c
--- /dev/null
+++ b/tensorflow/core/ops/compat/ops_history_v2/XlaRecvTPUEmbeddingActivationsV2.pbtxt
@@ -0,0 +1,35 @@
+op {
+  name: "XlaRecvTPUEmbeddingActivationsV2"
+  input_arg {
+    name: "deduplication_data"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+    number_attr: "num_tables"
+  }
+  attr {
+    name: "num_tables"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  attr {
+    name: "embedding_partitions"
+    type: "string"
+  }
+  attr {
+    name: "hbm_buffers_config"
+    type: "string"
+  }
+  attr {
+    name: "tpu_topology"
+    type: "string"
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/XlaRecvTPUEmbeddingDeduplicationDataV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/XlaRecvTPUEmbeddingDeduplicationDataV2.pbtxt
new file mode 100644
index 00000000000000..d97710b91e46fb
--- /dev/null
+++ b/tensorflow/core/ops/compat/ops_history_v2/XlaRecvTPUEmbeddingDeduplicationDataV2.pbtxt
@@ -0,0 +1,24 @@
+op {
+  name: "XlaRecvTPUEmbeddingDeduplicationDataV2"
+  output_arg {
+    name: "output"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  attr {
+    name: "embedding_partitions"
+    type: "string"
+  }
+  attr {
+    name: "hbm_buffers_config"
+    type: "string"
+  }
+  attr {
+    name: "tpu_topology"
+    type: "string"
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/XlaSendTPUEmbeddingGradientsV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/XlaSendTPUEmbeddingGradientsV2.pbtxt
new file mode 100644
index 00000000000000..b416d0ad1a8f0c
--- /dev/null
+++ b/tensorflow/core/ops/compat/ops_history_v2/XlaSendTPUEmbeddingGradientsV2.pbtxt
@@ -0,0 +1,48 @@
+op {
+  name: "XlaSendTPUEmbeddingGradientsV2"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+    number_attr: "NumTables"
+  }
+  input_arg {
+    name: "learning_rates"
+    type: DT_FLOAT
+    number_attr: "NumLearningRateTags"
+  }
+  input_arg {
+    name: "deduplication_data"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "NumTables"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "NumLearningRateTags"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  attr {
+    name: "embedding_partitions"
+    type: "string"
+  }
+  attr {
+    name: "hbm_buffers_config"
+    type: "string"
+  }
+  attr {
+    name: "tpu_topology"
+    type: "string"
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index d1824cf1ef431d..b1d65fa9a2972e 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -2019,7 +2019,7 @@ REGISTER_OP("DenseBincount")
         return errors::InvalidArgument("size (", size_val,
                                        ") must be non-negative");
       }
-      if (c->Rank(c->input(0)) == 1) {
+      if (c->Rank(c->input(0)) == 1 || c->Rank(c->input(0)) == 0) {
         c->set_output(0, c->MakeShape({size_val}));
       } else if (c->Rank(c->input(0)) == 2) {
         c->set_output(0, c->MakeShape({c->Dim(c->input(0), 0), size_val}));
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 63973c87f531f9..81347a6dea8801 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -9384,6 +9384,30 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ComputeDedupDataSizeV2"
+  output_arg {
+    name: "num_elements"
+    type: DT_INT32
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  attr {
+    name: "embedding_partitions"
+    type: "string"
+  }
+  attr {
+    name: "hbm_buffers_config"
+    type: "string"
+  }
+  attr {
+    name: "tpu_topology"
+    type: "string"
+  }
+  is_stateful: true
+}
 op {
   name: "ComputeDedupDataTupleMask"
   output_arg {
@@ -9396,6 +9420,30 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ComputeDedupDataTupleMaskV2"
+  output_arg {
+    name: "output_shape"
+    type: DT_INT32
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  attr {
+    name: "embedding_partitions"
+    type: "string"
+  }
+  attr {
+    name: "hbm_buffers_config"
+    type: "string"
+  }
+  attr {
+    name: "tpu_topology"
+    type: "string"
+  }
+  is_stateful: true
+}
 op {
   name: "Concat"
   input_arg {
@@ -19935,6 +19983,26 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "FinalizeTPUEmbeddingV2"
+  input_arg {
+    name: "common_config"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "memory_config"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "embedding_partitions"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "hbm_buffers_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "Fingerprint"
   input_arg {
@@ -66309,6 +66377,41 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "XlaRecvTPUEmbeddingActivationsV2"
+  input_arg {
+    name: "deduplication_data"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+    number_attr: "num_tables"
+  }
+  attr {
+    name: "num_tables"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  attr {
+    name: "embedding_partitions"
+    type: "string"
+  }
+  attr {
+    name: "hbm_buffers_config"
+    type: "string"
+  }
+  attr {
+    name: "tpu_topology"
+    type: "string"
+  }
+  is_stateful: true
+}
 op {
   name: "XlaRecvTPUEmbeddingDeduplicationData"
   output_arg {
@@ -66321,6 +66424,30 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "XlaRecvTPUEmbeddingDeduplicationDataV2"
+  output_arg {
+    name: "output"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  attr {
+    name: "embedding_partitions"
+    type: "string"
+  }
+  attr {
+    name: "hbm_buffers_config"
+    type: "string"
+  }
+  attr {
+    name: "tpu_topology"
+    type: "string"
+  }
+  is_stateful: true
+}
 op {
   name: "XlaSendTPUEmbeddingGradients"
   input_arg {
@@ -66357,6 +66484,54 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "XlaSendTPUEmbeddingGradientsV2"
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+    number_attr: "NumTables"
+  }
+  input_arg {
+    name: "learning_rates"
+    type: DT_FLOAT
+    number_attr: "NumLearningRateTags"
+  }
+  input_arg {
+    name: "deduplication_data"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "NumTables"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "NumLearningRateTags"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  attr {
+    name: "embedding_partitions"
+    type: "string"
+  }
+  attr {
+    name: "hbm_buffers_config"
+    type: "string"
+  }
+  attr {
+    name: "tpu_topology"
+    type: "string"
+  }
+  is_stateful: true
+}
 op {
   name: "XlaSendToHost"
   input_arg {
diff --git a/tensorflow/core/ops/sparse_csr_matrix_ops.cc b/tensorflow/core/ops/sparse_csr_matrix_ops.cc
index 25b39fa079093f..0a80e10f6a5313 100644
--- a/tensorflow/core/ops/sparse_csr_matrix_ops.cc
+++ b/tensorflow/core/ops/sparse_csr_matrix_ops.cc
@@ -1,4 +1,4 @@
-/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,11 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <tuple>
+
+#include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
-#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/status.h"
+#include "tsl/platform/errors.h"
 
 namespace tensorflow {
 
@@ -295,7 +299,7 @@ REGISTER_OP("SparseMatrixMatMul")
       return absl::OkStatus();
     });
 
-#ifdef INTEL_MKL
+#if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)
 
 REGISTER_OP("_MklNativeSparseMatrixMatMul")
     .Input("a: variant")
@@ -373,7 +377,7 @@ REGISTER_OP("_MklNativeSparseMatrixMatMul")
       c->set_output(0, out);
       return OkStatus();
     });
-#endif
+#endif  // INTEL_MKL && ENABLE_ONEDNN_V3
 
 REGISTER_OP("SparseMatrixMul")
     .Input("a: variant")
diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD
index 9af3f143790b36..1e1e02fd64cf27 100644
--- a/tensorflow/core/platform/BUILD
+++ b/tensorflow/core/platform/BUILD
@@ -8,6 +8,15 @@
 #   The libraries in this package are not allowed to have ANY dependencies
 #   to any TensorFlow code outside this package.
 
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
+load(
+    "@local_config_rocm//rocm:build_defs.bzl",
+    "if_rocm_is_configured",
+)
+load(
+    "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
+    "if_cuda_is_configured",
+)
 load("//tensorflow:strict.default.bzl", "py_strict_test")
 load(
     "//tensorflow:tensorflow.bzl",
@@ -45,15 +54,6 @@ load(
     "cc_binary",
     "cc_library",
 )
-load(
-    "@local_config_rocm//rocm:build_defs.bzl",
-    "if_rocm_is_configured",
-)
-load(
-    "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
-    "if_cuda_is_configured",
-)
-load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/platform/build_config.bzl b/tensorflow/core/platform/build_config.bzl
index 3cd84637725d95..de0453b6deac98 100644
--- a/tensorflow/core/platform/build_config.bzl
+++ b/tensorflow/core/platform/build_config.bzl
@@ -1,12 +1,5 @@
 """Provides a redirection point for platform specific implementations of starlark utilities."""
 
-load(
-    "//tensorflow/core/platform:build_config.default.bzl",
-    _tf_additional_binary_deps = "tf_additional_binary_deps",
-    _tf_dtensor_tpu_dependencies = "tf_dtensor_tpu_dependencies",
-    _tf_protos_all = "tf_protos_all",
-    _tf_tpu_dependencies = "tf_tpu_dependencies",
-)
 load(
     "@local_tsl//tsl/platform:build_config.bzl",
     _pyx_library = "pyx_library",
@@ -42,6 +35,13 @@ load(
     _tf_stream_executor_deps = "tf_stream_executor_deps",
     _tf_windows_aware_platform_deps = "tf_windows_aware_platform_deps",
 )
+load(
+    "//tensorflow/core/platform:build_config.default.bzl",
+    _tf_additional_binary_deps = "tf_additional_binary_deps",
+    _tf_dtensor_tpu_dependencies = "tf_dtensor_tpu_dependencies",
+    _tf_protos_all = "tf_protos_all",
+    _tf_tpu_dependencies = "tf_tpu_dependencies",
+)
 
 pyx_library = _pyx_library
 tf_additional_all_protos = _tf_additional_all_protos
diff --git a/tensorflow/core/platform/build_config.default.bzl b/tensorflow/core/platform/build_config.default.bzl
index bb8c0f2982cb62..f3c7eba3fd03b6 100644
--- a/tensorflow/core/platform/build_config.default.bzl
+++ b/tensorflow/core/platform/build_config.default.bzl
@@ -1,6 +1,10 @@
 """OSS versions of Bazel macros that can't be migrated to TSL."""
 
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
+load(
+    "@local_tsl//tsl:tsl.bzl",
+    "if_libtpu",
+)
 load(
     "@local_xla//xla/tsl/mkl:build_defs.bzl",
     "if_mkl_ml",
@@ -9,10 +13,6 @@ load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "if_static",
 )
-load(
-    "@local_tsl//tsl:tsl.bzl",
-    "if_libtpu",
-)
 
 def tf_tpu_dependencies():
     return if_libtpu([Label("//tensorflow/core/tpu/kernels")])
diff --git a/tensorflow/core/platform/build_config_root.bzl b/tensorflow/core/platform/build_config_root.bzl
index e499b3aa49328f..c0513041898983 100644
--- a/tensorflow/core/platform/build_config_root.bzl
+++ b/tensorflow/core/platform/build_config_root.bzl
@@ -1,10 +1,5 @@
 """Provides a redirection point for platform specific implementations of starlark utilities."""
 
-load(
-    "//tensorflow/core/platform:build_config_root.default.bzl",
-    _if_dynamic_kernels = "if_dynamic_kernels",
-    _tf_additional_plugin_deps = "tf_additional_plugin_deps",
-)
 load(
     "@local_tsl//tsl/platform/default:build_config_root.bzl",
     _if_llvm_aarch32_available = "if_llvm_aarch32_available",
@@ -24,6 +19,11 @@ load(
     _tf_exec_properties = "tf_exec_properties",
     _tf_gpu_tests_tags = "tf_gpu_tests_tags",
 )
+load(
+    "//tensorflow/core/platform:build_config_root.default.bzl",
+    _if_dynamic_kernels = "if_dynamic_kernels",
+    _tf_additional_plugin_deps = "tf_additional_plugin_deps",
+)
 
 if_llvm_aarch32_available = _if_llvm_aarch32_available
 if_llvm_aarch64_available = _if_llvm_aarch64_available
diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD
index 6fda33d324f8dc..c1d6061917ceaa 100644
--- a/tensorflow/core/platform/cloud/BUILD
+++ b/tensorflow/core/platform/cloud/BUILD
@@ -1,11 +1,11 @@
 # Description:
 # Cloud file system implementation.
 
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "@local_tsl//tsl:tsl.bzl",
     "tsl_copts",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/platform/distribute.bzl b/tensorflow/core/platform/distribute.bzl
index edb55b5627e312..e05b3e06a79758 100644
--- a/tensorflow/core/platform/distribute.bzl
+++ b/tensorflow/core/platform/distribute.bzl
@@ -1,7 +1,7 @@
 """Build rules for tf.distribute testing."""
 
-load("//tensorflow/python/tpu:tpu.bzl", _tpu_py_test = "tpu_py_test")
 load("//tensorflow:tensorflow.default.bzl", "cuda_py_test")
+load("//tensorflow/python/tpu:tpu.bzl", _tpu_py_test = "tpu_py_test")
 
 def distribute_py_test(
         name,
diff --git a/tensorflow/core/platform/profile_utils/BUILD b/tensorflow/core/platform/profile_utils/BUILD
index 67d4177bd9c67b..4af8510840ef63 100644
--- a/tensorflow/core/platform/profile_utils/BUILD
+++ b/tensorflow/core/platform/profile_utils/BUILD
@@ -1,15 +1,15 @@
 # Description:
 # profile_utils targets.
 
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_copts",  # @unused
+)
 load("//tensorflow:tensorflow.default.bzl", "filegroup")
 load(
     "//tensorflow/core/platform:rules_cc.bzl",
     "cc_library",
 )
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_copts",  # @unused
-)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/profiler/backends/gpu/BUILD b/tensorflow/core/profiler/backends/gpu/BUILD
index 28cab9a90641cd..e83d4cae4c0501 100644
--- a/tensorflow/core/profiler/backends/gpu/BUILD
+++ b/tensorflow/core/profiler/backends/gpu/BUILD
@@ -1,9 +1,9 @@
-load("//tensorflow:tensorflow.default.bzl", "tf_cuda_cc_test")
-load("//tensorflow/core/platform:build_config_root.bzl", "tf_cuda_tests_tags")
 load(
     "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
     "if_cuda_is_configured",
 )
+load("//tensorflow:tensorflow.default.bzl", "tf_cuda_cc_test")
+load("//tensorflow/core/platform:build_config_root.bzl", "tf_cuda_tests_tags")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD
index 24fb84e9ae9549..40d967302cc34c 100644
--- a/tensorflow/core/profiler/convert/BUILD
+++ b/tensorflow/core/profiler/convert/BUILD
@@ -242,7 +242,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/profiler/utils:format_utils",
         "@local_tsl//tsl/profiler/utils:tf_op_utils",
-        "@local_tsl//tsl/util:stats_calculator_portable",
+        "@local_xla//xla/tsl/util:stats_calculator_portable",
     ],
 )
 
@@ -755,6 +755,7 @@ cc_library(
         "//tensorflow/core/profiler/protobuf:memory_viewer_preprocess_proto_cc",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
@@ -816,6 +817,7 @@ cc_library(
         "//tensorflow/core/profiler/utils:op_metrics_db_utils",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:node_hash_map",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/profiler/convert:xla_op_utils",
     ],
diff --git a/tensorflow/core/profiler/convert/hlo_proto_to_memory_visualization_utils.cc b/tensorflow/core/profiler/convert/hlo_proto_to_memory_visualization_utils.cc
index 6f63827b857d4b..d42235f4b723f8 100644
--- a/tensorflow/core/profiler/convert/hlo_proto_to_memory_visualization_utils.cc
+++ b/tensorflow/core/profiler/convert/hlo_proto_to_memory_visualization_utils.cc
@@ -28,6 +28,7 @@ limitations under the License.
 
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
+#include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
@@ -216,6 +217,10 @@ class HloProtoBufferWrapper {
 
   const BufferAllocationStruct& GetBufferAllocation(
       int64_t buffer_allocation_id) const {
+    if (!id_to_buffer_allocation_.contains(buffer_allocation_id)) {
+      LOG(DFATAL) << "buffer_allocation_id " << buffer_allocation_id
+                  << " not found.";
+    }
     return *id_to_buffer_allocation_.at(buffer_allocation_id);
   }
 
@@ -230,6 +235,9 @@ class HloProtoBufferWrapper {
   }
 
   LogicalBufferStruct& GetLogicalBuffer(int64_t logical_buffer_id) const {
+    if (!id_to_logical_buffer_.contains(logical_buffer_id)) {
+      LOG(DFATAL) << "logical_buffer_id " << logical_buffer_id << "not found.";
+    }
     return *id_to_logical_buffer_.at(logical_buffer_id);
   }
 
@@ -294,9 +302,17 @@ class HloProtoBufferWrapper {
           std::make_unique<BufferAllocationStruct>(buffer_allocation);
       for (const auto& assigned : buffer_allocation.assigned()) {
         const auto id = assigned.logical_buffer_id();
+        if (!id_to_logical_buffer_proto.contains(id)) {
+          LOG(DFATAL) << "logical_buffer_id " << id << " not found.";
+          continue;
+        }
         const auto* logical_buffer = id_to_logical_buffer_proto.at(id);
-        const auto* instruction =
-            unique_id_to_hlo.at(logical_buffer->defined_at().instruction_id());
+        int64_t inst_id = logical_buffer->defined_at().instruction_id();
+        if (!unique_id_to_hlo.contains(inst_id)) {
+          LOG(DFATAL) << "instruction_id " << inst_id << " not found.";
+          continue;
+        }
+        const auto* instruction = unique_id_to_hlo.at(inst_id);
         id_to_logical_buffer_[id] = std::make_unique<LogicalBufferStruct>(
             *logical_buffer, *buffer_allocation_s, *instruction,
             assigned.offset());
@@ -328,6 +344,10 @@ class HloProtoBufferWrapper {
           hlo_proto_.buffer_assignment().heap_simulator_traces(i);
       int64_t event_count = 0;
       for (const auto& event : heap_simulator_trace.events()) {
+        if (!id_to_logical_buffer_.contains(event.buffer_id())) {
+          LOG(DFATAL) << "buffer_id " << event.buffer_id() << "not found.";
+          continue;
+        }
         const auto& logical_buffer =
             id_to_logical_buffer_.at(event.buffer_id());
         if (logical_buffer->color() == memory_color) {
@@ -895,6 +915,11 @@ void ConvertAllocationTimeline(const HloProtoBufferWrapper& wrapper,
     if (!heap_simulator_trace_id) continue;
     buffer_allocation_offsets.push_back(total_y_size);
     total_y_size += buffer_allocation->size();
+    if (*heap_simulator_trace_id >= heap_simulator_traces.size()) {
+      LOG(DFATAL) << "heap_simulator_trace_id " << *heap_simulator_trace_id
+                  << " out of bounds.";
+      continue;
+    }
     total_x_size = std::max<size_t>(
         total_x_size,
         heap_simulator_traces.at(*heap_simulator_trace_id).events_size());
diff --git a/tensorflow/core/profiler/convert/op_profile_builder.cc b/tensorflow/core/profiler/convert/op_profile_builder.cc
index c6d45f70df29c7..4ee010aa50eb99 100644
--- a/tensorflow/core/profiler/convert/op_profile_builder.cc
+++ b/tensorflow/core/profiler/convert/op_profile_builder.cc
@@ -24,6 +24,7 @@ limitations under the License.
 // #include "perftools/accelerators/xprof/convert/device_type_utils.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/node_hash_map.h"
+#include "absl/log/log.h"
 #include "absl/strings/ascii.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/core/lib/gtl/top_n.h"
@@ -369,7 +370,10 @@ OpProfileBuilder::OpProfileBuilder(
     tensorflow::profiler::op_profile::Node* root,
     const tensorflow::protobuf::Map<uint64_t, std::string>* program_name_map)
     : options_(options), root_(root), program_name_map_(program_name_map) {
-  CHECK(root != nullptr);
+  if (root == nullptr) {
+    LOG(DFATAL) << "root is null.";
+    return;
+  }
   DCHECK(!options_.group_by_program || program_name_map_ != nullptr);
   root->set_name(options_.group_by_program ? "by_program" : "by_category");
 }
diff --git a/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc b/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc
index 120e22bab5d64d..268908b3f1cf28 100644
--- a/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc
+++ b/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
+#include "xla/tsl/util/stats_calculator.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -46,7 +47,6 @@ limitations under the License.
 #include "tensorflow/core/profiler/utils/op_metrics_db_utils.h"
 #include "tsl/profiler/utils/format_utils.h"
 #include "tsl/profiler/utils/tf_op_utils.h"
-#include "tsl/util/stats_calculator.h"
 
 namespace tensorflow {
 namespace profiler {
diff --git a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc
index 37017a70f7c472..621fe05afc9aa5 100644
--- a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc
+++ b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc
@@ -193,12 +193,12 @@ OverviewPageAnalysis ComputeAnalysisResult(const OpStats& op_stats) {
     OverviewTfOp* op = analysis.add_top_device_ops();
     op->set_name(metrics->name());
     op->set_category(metrics->category());
-    op->set_self_time_fraction(
-        SafeDivide(metrics->self_time_ps(), total_device_time_ps));
+    op->set_self_time_fraction(tsl::profiler::SafeDivide(
+        metrics->self_time_ps(), total_device_time_ps));
     device_cumulative_fraction += op->self_time_fraction();
     op->set_cumulative_time_fraction(device_cumulative_fraction);
-    op->set_flop_rate(
-        SafeDivide(metrics->flops(), PicoToNano(metrics->time_ps())));
+    op->set_flop_rate(tsl::profiler::SafeDivide(
+        metrics->flops(), PicoToNano(metrics->time_ps())));
     auto iter = kernel_stats_by_op_name.find(op->name());
     if (iter != kernel_stats_by_op_name.end()) {
       op->set_is_op_tensorcore_eligible(
@@ -211,12 +211,12 @@ OverviewPageAnalysis ComputeAnalysisResult(const OpStats& op_stats) {
       op_stats.device_op_metrics_db().precision_stats().compute_32bit_ps();
   analysis.set_device_compute_16bit_percent(
       100.0 *
-      SafeDivide(
+      tsl::profiler::SafeDivide(
           op_stats.device_op_metrics_db().precision_stats().compute_16bit_ps(),
           total_device_compute_ps));
   analysis.set_device_compute_32bit_percent(
       100.0 *
-      SafeDivide(
+      tsl::profiler::SafeDivide(
           op_stats.device_op_metrics_db().precision_stats().compute_32bit_ps(),
           total_device_compute_ps));
 
@@ -256,19 +256,19 @@ OverviewPageAnalysis ComputeAnalysisResult(const OpStats& op_stats) {
   }
   uint64 num_total_tf_ops = num_host_tf_ops + num_device_tf_ops;
   analysis.set_host_tf_op_percent(
-      100.0 * SafeDivide(num_host_tf_ops, num_total_tf_ops));
+      100.0 * tsl::profiler::SafeDivide(num_host_tf_ops, num_total_tf_ops));
   analysis.set_device_tf_op_percent(
-      100.0 * SafeDivide(num_device_tf_ops, num_total_tf_ops));
+      100.0 * tsl::profiler::SafeDivide(num_device_tf_ops, num_total_tf_ops));
   analysis.set_host_trace_level(op_stats.run_environment().host_trace_level());
   analysis.set_host_op_time_eager_percent(
-      100.0 *
-      SafeDivide(eager_host_op_time_ps, total_host_op_time_ps_exclude_idle));
+      100.0 * tsl::profiler::SafeDivide(eager_host_op_time_ps,
+                                        total_host_op_time_ps_exclude_idle));
   analysis.set_device_op_time_eager_percent(
-      100.0 * SafeDivide(eager_device_op_time_ps,
-                         total_device_op_time_ps_exclude_idle));
+      100.0 * tsl::profiler::SafeDivide(eager_device_op_time_ps,
+                                        total_device_op_time_ps_exclude_idle));
   analysis.set_device_op_time_outside_compilation_percent(
-      100.0 * SafeDivide(outside_compilation_device_op_time_ps,
-                         total_device_op_time_ps_exclude_idle));
+      100.0 * tsl::profiler::SafeDivide(outside_compilation_device_op_time_ps,
+                                        total_device_op_time_ps_exclude_idle));
   return analysis;
 }
 
diff --git a/tensorflow/core/profiler/convert/op_stats_to_tf_stats.cc b/tensorflow/core/profiler/convert/op_stats_to_tf_stats.cc
index bb5d6977045516..3f9cbb9510ea60 100644
--- a/tensorflow/core/profiler/convert/op_stats_to_tf_stats.cc
+++ b/tensorflow/core/profiler/convert/op_stats_to_tf_stats.cc
@@ -70,8 +70,8 @@ TfStatsTable GenerateTfStatsTable(
     auto iter = kernel_stats_by_op_name.find(record->op_name());
     if (iter != kernel_stats_by_op_name.end()) {
       record->set_gpu_tensorcore_utilization(
-          SafeDivide(iter->second.tensor_core_duration_ns,
-                     iter->second.total_duration_ns));
+          tsl::profiler::SafeDivide(iter->second.tensor_core_duration_ns,
+                                    iter->second.total_duration_ns));
     } else {
       record->set_gpu_tensorcore_utilization(0.0);
     }
diff --git a/tensorflow/core/profiler/convert/op_stats_to_tf_stats_test.cc b/tensorflow/core/profiler/convert/op_stats_to_tf_stats_test.cc
index 31bba4a868f4e0..a6d8d63369f006 100644
--- a/tensorflow/core/profiler/convert/op_stats_to_tf_stats_test.cc
+++ b/tensorflow/core/profiler/convert/op_stats_to_tf_stats_test.cc
@@ -139,15 +139,16 @@ occ_pct:100)MULTI";
   EXPECT_EQ(kTfOp1, record_0.op_name());
   EXPECT_EQ(kTfOp1, record_0.op_type());
   EXPECT_EQ(2, record_0.occurrences());
-  EXPECT_EQ(
-      NanoToMicro(kKernel1DurationNs) * 2 + NanoToMicro(kKernel2DurationNs) * 2,
-      record_0.total_self_time_in_us());
+  EXPECT_EQ(tsl::profiler::NanoToMicro(kKernel1DurationNs) * 2 +
+                tsl::profiler::NanoToMicro(kKernel2DurationNs) * 2,
+            record_0.total_self_time_in_us());
 
   const TfStatsRecord& record_1 = tf_stats.with_idle().tf_stats_record(1);
   EXPECT_EQ(kTfOp3, record_1.op_name());
   EXPECT_EQ(kTfOp3, record_1.op_type());
   EXPECT_EQ(1, record_1.occurrences());
-  EXPECT_EQ(NanoToMicro(kKernel4DurationNs) + NanoToMicro(kKernel5DurationNs),
+  EXPECT_EQ(tsl::profiler::NanoToMicro(kKernel4DurationNs) +
+                tsl::profiler::NanoToMicro(kKernel5DurationNs),
             record_1.total_self_time_in_us());
   // GPU TensorCore utilization is 0.5 because kernel4 is using TensorCore and
   // kernel5 is not using TensorCore, and they have the same duration.
@@ -157,7 +158,8 @@ occ_pct:100)MULTI";
   EXPECT_EQ(kTfOp2, record_2.op_name());
   EXPECT_EQ(kTfOp2, record_2.op_type());
   EXPECT_EQ(1, record_2.occurrences());
-  EXPECT_EQ(NanoToMicro(kKernel3DurationNs), record_2.total_self_time_in_us());
+  EXPECT_EQ(tsl::profiler::NanoToMicro(kKernel3DurationNs),
+            record_2.total_self_time_in_us());
 }
 
 }  // namespace
diff --git a/tensorflow/core/profiler/convert/trace_viewer/trace_events_to_json.h b/tensorflow/core/profiler/convert/trace_viewer/trace_events_to_json.h
index 2e52a6e590cd11..fcf0011409d681 100644
--- a/tensorflow/core/profiler/convert/trace_viewer/trace_events_to_json.h
+++ b/tensorflow/core/profiler/convert/trace_viewer/trace_events_to_json.h
@@ -181,7 +181,7 @@ class JsonEventWriter {
         if (event.has_flow_category()) {
           ContextType type = GetSafeContextType(event.flow_category());
           if (type != ContextType::kGeneric && type != ContextType::kLegacy) {
-            const char* category = GetContextTypeString(type);
+            const char* category = tsl::profiler::GetContextTypeString(type);
             output_->Append(R"(,"cat":")", category, R"(")");
           }
         }
@@ -210,7 +210,7 @@ class JsonEventWriter {
         output_->Append(R"(,"id":)", event.flow_id());
         if (event.has_flow_category()) {
           ContextType type = GetSafeContextType(event.flow_category());
-          const char* category = GetContextTypeString(type);
+          const char* category = tsl::profiler::GetContextTypeString(type);
           output_->Append(R"(,"cat":")", category, R"(")");
         }
         switch (event.flow_entry_type()) {
diff --git a/tensorflow/core/profiler/internal/BUILD b/tensorflow/core/profiler/internal/BUILD
index 7af5f06e6169f2..e9da7fa9f4f154 100644
--- a/tensorflow/core/profiler/internal/BUILD
+++ b/tensorflow/core/profiler/internal/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.bzl", "if_not_windows", "tf_cc_test")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/profiler/internal/advisor/BUILD b/tensorflow/core/profiler/internal/advisor/BUILD
index d8fe28c8b96e17..7dd7f547b09c82 100644
--- a/tensorflow/core/profiler/internal/advisor/BUILD
+++ b/tensorflow/core/profiler/internal/advisor/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD
index 968397cfb12b2f..d8bb6c98e1fda1 100644
--- a/tensorflow/core/profiler/lib/BUILD
+++ b/tensorflow/core/profiler/lib/BUILD
@@ -43,6 +43,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core/platform",
         "//tensorflow/core/profiler/protobuf:xplane_proto_cc",
+        "@com_google_absl//absl/base:core_headers",
         "@local_tsl//tsl/profiler/lib:profiler_session",
     ] + if_not_android([
         ":profiler_interface",
@@ -66,6 +67,7 @@ cc_library(
     ],
     deps = [
         ":profiler_interface",
+        "@com_google_absl//absl/base:core_headers",
         "@local_tsl//tsl/profiler/lib:profiler_factory",
         "@local_tsl//tsl/profiler/protobuf:profiler_options_proto_cc",
     ] + if_static([
@@ -83,9 +85,8 @@ cc_library(
         "//tensorflow/core/profiler:__pkg__",
     ],
     deps = [
-        ":profiler_controller",
         ":profiler_interface",
-        "//tensorflow/core:lib",
+        "@com_google_absl//absl/base:core_headers",
         "@local_tsl//tsl/profiler/lib:profiler_factory",
         "@local_tsl//tsl/profiler/lib:profiler_factory_impl",
         "@local_tsl//tsl/profiler/protobuf:profiler_options_proto_cc",
@@ -104,6 +105,7 @@ cc_library(
     deps = [
         "//tensorflow/core:lib",
         "//tensorflow/core/profiler/protobuf:xplane_proto_cc",
+        "@com_google_absl//absl/base:core_headers",
         "@local_tsl//tsl/profiler/lib:profiler_interface",
     ],
 )
@@ -115,6 +117,7 @@ cc_library(
         ":profiler_interface",
         "//tensorflow/core:lib",
         "//tensorflow/core/profiler/protobuf:xplane_proto_cc",
+        "@com_google_absl//absl/base:core_headers",
         "@local_tsl//tsl/profiler/lib:profiler_controller",
     ],
 )
@@ -138,6 +141,7 @@ cc_library(
         ":traceme_encode",
         "//tensorflow/core:lib",
         "//tensorflow/core/platform",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/profiler/lib:traceme",
         "@local_tsl//tsl/profiler/lib:traceme_encode",
@@ -153,6 +157,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/profiler/lib:traceme_encode",
     ],
@@ -177,7 +182,7 @@ cc_library(
         ":context_types_hdrs",
         ":traceme",
         ":traceme_encode",
-        "//tensorflow/core:lib",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
         "@local_tsl//tsl/profiler/lib:connected_traceme",
@@ -188,14 +193,20 @@ cc_library(
     name = "context_types_hdrs",
     hdrs = ["context_types.h"],
     visibility = ["//visibility:public"],
-    deps = ["@local_tsl//tsl/profiler/lib:context_types_hdrs"],
+    deps = [
+        "@com_google_absl//absl/base:core_headers",
+        "@local_tsl//tsl/profiler/lib:context_types_hdrs",
+    ],
 )
 
 cc_library(
     name = "context_types",
     hdrs = ["context_types.h"],
     visibility = ["//visibility:public"],
-    deps = ["@local_tsl//tsl/profiler/lib:context_types"],
+    deps = [
+        "@com_google_absl//absl/base:core_headers",
+        "@local_tsl//tsl/profiler/lib:context_types",
+    ],
 )
 
 cc_library(
@@ -203,6 +214,7 @@ cc_library(
     hdrs = ["scoped_memory_debug_annotation.h"],
     visibility = ["//visibility:public"],
     deps = [
+        "@com_google_absl//absl/base:core_headers",
         "@local_tsl//tsl/profiler/lib:scoped_memory_debug_annotation",
     ],
 )
@@ -215,6 +227,7 @@ cc_library(
     deps = [
         "//tensorflow/core:lib",
         "//tensorflow/core/platform",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/profiler/lib:scoped_annotation",
     ] + if_not_android([
@@ -230,6 +243,7 @@ cc_library(
     deps = [
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/base:core_headers",
         "@local_tsl//tsl/profiler/lib:profiler_lock",
     ],
 )
diff --git a/tensorflow/core/profiler/lib/connected_traceme.h b/tensorflow/core/profiler/lib/connected_traceme.h
index aa6d354420038b..e696cdaf4682d0 100644
--- a/tensorflow/core/profiler/lib/connected_traceme.h
+++ b/tensorflow/core/profiler/lib/connected_traceme.h
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/base/macros.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/optional.h"
 #include "tensorflow/core/profiler/lib/context_types.h"
@@ -25,11 +26,18 @@ limitations under the License.
 #include "tensorflow/core/profiler/lib/traceme_encode.h"
 #include "tsl/profiler/lib/connected_traceme.h"
 
+// TODO: b/323943471 - This macro should eventually be provided by Abseil.
+#ifndef ABSL_DEPRECATE_AND_INLINE
+#define ABSL_DEPRECATE_AND_INLINE()
+#endif
+
 namespace tensorflow {
 namespace profiler {
 
-using tsl::profiler::TraceMeConsumer;  // NOLINT
-using tsl::profiler::TraceMeProducer;  // NOLINT
+using TraceMeConsumer ABSL_DEPRECATE_AND_INLINE() =
+    tsl::profiler::TraceMeConsumer;  // NOLINT
+using TraceMeProducer ABSL_DEPRECATE_AND_INLINE() =
+    tsl::profiler::TraceMeProducer;  // NOLINT
 
 }  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/context_types.h b/tensorflow/core/profiler/lib/context_types.h
index 99b2029fd09f82..434a02c22dd912 100644
--- a/tensorflow/core/profiler/lib/context_types.h
+++ b/tensorflow/core/profiler/lib/context_types.h
@@ -15,14 +15,31 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_PROFILER_LIB_CONTEXT_TYPES_H_
 #define TENSORFLOW_CORE_PROFILER_LIB_CONTEXT_TYPES_H_
 
+#include <cstdint>
+
+#include "absl/base/macros.h"
 #include "tsl/profiler/lib/context_types.h"
 
+// TODO: b/323943471 - This macro should eventually be provided by Abseil.
+#ifndef ABSL_DEPRECATE_AND_INLINE
+#define ABSL_DEPRECATE_AND_INLINE()
+#endif
+
 namespace tensorflow {
 namespace profiler {
 
-using tsl::profiler::ContextType;           // NOLINT
-using tsl::profiler::GetContextTypeString;  // NOLINT
-using tsl::profiler::GetSafeContextType;    // NOLINT
+using ContextType ABSL_DEPRECATE_AND_INLINE() =
+    tsl::profiler::ContextType;  // NOLINT
+
+ABSL_DEPRECATE_AND_INLINE()
+inline const char* GetContextTypeString(ContextType context_type) {
+  return tsl::profiler::GetContextTypeString(context_type);
+}
+
+ABSL_DEPRECATE_AND_INLINE()
+inline ContextType GetSafeContextType(uint32_t context_type) {
+  return tsl::profiler::GetSafeContextType(context_type);
+}
 
 }  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/profiler_controller.h b/tensorflow/core/profiler/lib/profiler_controller.h
index 5993c6b42bec05..21936dcd46dc9c 100644
--- a/tensorflow/core/profiler/lib/profiler_controller.h
+++ b/tensorflow/core/profiler/lib/profiler_controller.h
@@ -17,15 +17,22 @@ limitations under the License.
 
 #include <memory>
 
+#include "absl/base/macros.h"
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/profiler/lib/profiler_interface.h"
 #include "tensorflow/core/profiler/protobuf/xplane.pb.h"
 #include "tsl/profiler/lib/profiler_controller.h"
 
+// TODO: b/323943471 - This macro should eventually be provided by Abseil.
+#ifndef ABSL_DEPRECATE_AND_INLINE
+#define ABSL_DEPRECATE_AND_INLINE()
+#endif
+
 namespace tensorflow {
 namespace profiler {
 
-using tsl::profiler::ProfilerController;  // NOLINT
+using ProfilerController ABSL_DEPRECATE_AND_INLINE() =
+    tsl::profiler::ProfilerController;  // NOLINT
 
 }  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/profiler_factory.h b/tensorflow/core/profiler/lib/profiler_factory.h
index 14d32919190d2d..54b6fb5577cf6a 100644
--- a/tensorflow/core/profiler/lib/profiler_factory.h
+++ b/tensorflow/core/profiler/lib/profiler_factory.h
@@ -19,26 +19,43 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "absl/base/macros.h"
 #include "tensorflow/core/profiler/lib/profiler_interface.h"
 #include "tsl/profiler/lib/profiler_factory.h"
 #include "tsl/profiler/protobuf/profiler_options.pb.h"
 
+// TODO: b/323943471 - This macro should eventually be provided by Abseil.
+#ifndef ABSL_DEPRECATE_AND_INLINE
+#define ABSL_DEPRECATE_AND_INLINE()
+#endif
+
 namespace tensorflow {
 namespace profiler {
 
 // A ProfilerFactory returns an instance of ProfilerInterface if ProfileOptions
 // require it. Otherwise, it might return nullptr.
-using tsl::profiler::ProfilerFactory;  // NOLINT
+using ProfilerFactor ABSL_DEPRECATE_AND_INLINE() =
+    tsl::profiler::ProfilerFactory;  // NOLINT
 
 // Registers a profiler factory. Should be invoked at most once per factory.
-using tsl::profiler::RegisterProfilerFactory;  // NOLINT
+ABSL_DEPRECATE_AND_INLINE()
+inline void RegisterProfilerFactory(ProfilerFactor factory) {
+  tsl::profiler::RegisterProfilerFactory(factory);
+}
 
 // Invokes all registered profiler factories with the given options, and
 // returns the instantiated (non-null) profiler interfaces.
-using tsl::profiler::CreateProfilers;  // NOLINT
+ABSL_DEPRECATE_AND_INLINE()
+inline std::vector<std::unique_ptr<profiler::ProfilerInterface>>
+CreateProfilers(const tensorflow::ProfileOptions& options) {
+  return tsl::profiler::CreateProfilers(options);
+}
 
 // For testing only.
-using tsl::profiler::ClearRegisteredProfilersForTest;  // NOLINT
+ABSL_DEPRECATE_AND_INLINE()
+inline void ClearRegisteredProfilersForTest() {
+  tsl::profiler::ClearRegisteredProfilersForTest();
+}
 
 }  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/profiler_interface.h b/tensorflow/core/profiler/lib/profiler_interface.h
index ee3f9de6e5a02b..11423c1ab212ff 100644
--- a/tensorflow/core/profiler/lib/profiler_interface.h
+++ b/tensorflow/core/profiler/lib/profiler_interface.h
@@ -15,14 +15,21 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_PROFILER_LIB_PROFILER_INTERFACE_H_
 #define TENSORFLOW_CORE_PROFILER_LIB_PROFILER_INTERFACE_H_
 
+#include "absl/base/macros.h"
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/profiler/protobuf/xplane.pb.h"
 #include "tsl/profiler/lib/profiler_interface.h"
 
+// TODO: b/323943471 - This macro should eventually be provided by Abseil.
+#ifndef ABSL_DEPRECATE_AND_INLINE
+#define ABSL_DEPRECATE_AND_INLINE()
+#endif
+
 namespace tensorflow {
 namespace profiler {
 
-using tsl::profiler::ProfilerInterface;  // NOLINT
+using ProfilerInterface ABSL_DEPRECATE_AND_INLINE() =
+    tsl::profiler::ProfilerInterface;  // NOLINT
 
 }  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/profiler_lock.h b/tensorflow/core/profiler/lib/profiler_lock.h
index 1bf378d6226446..7480df5887ec1f 100644
--- a/tensorflow/core/profiler/lib/profiler_lock.h
+++ b/tensorflow/core/profiler/lib/profiler_lock.h
@@ -15,13 +15,20 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_PROFILER_LIB_PROFILER_LOCK_H_
 #define TENSORFLOW_CORE_PROFILER_LIB_PROFILER_LOCK_H_
 
+#include "absl/base/macros.h"
 #include "tensorflow/core/platform/statusor.h"
 #include "tsl/profiler/lib/profiler_lock.h"
 
+// TODO: b/323943471 - This macro should eventually be provided by Abseil.
+#ifndef ABSL_DEPRECATE_AND_INLINE
+#define ABSL_DEPRECATE_AND_INLINE()
+#endif
+
 namespace tensorflow {
 namespace profiler {
 
-using tsl::profiler::ProfilerLock;  // NOLINT
+using ProfilerLock ABSL_DEPRECATE_AND_INLINE() =
+    tsl::profiler::ProfilerLock;  // NOLINT
 
 }  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/profiler_session.h b/tensorflow/core/profiler/lib/profiler_session.h
index 1260c19b4c4c12..76099cc1b07ce3 100644
--- a/tensorflow/core/profiler/lib/profiler_session.h
+++ b/tensorflow/core/profiler/lib/profiler_session.h
@@ -15,11 +15,18 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_PROFILER_LIB_PROFILER_SESSION_H_
 #define TENSORFLOW_CORE_PROFILER_LIB_PROFILER_SESSION_H_
 
+#include "absl/base/macros.h"
 #include "tsl/profiler/lib/profiler_session.h"
 
+// TODO: b/323943471 - This macro should eventually be provided by Abseil.
+#ifndef ABSL_DEPRECATE_AND_INLINE
+#define ABSL_DEPRECATE_AND_INLINE()
+#endif
+
 namespace tensorflow {
 
-using tsl::ProfilerSession;  // NOLINT
+using ProfilerSession ABSL_DEPRECATE_AND_INLINE() =
+    tsl::ProfilerSession;  // NOLINT
 
 }  // namespace tensorflow
 #endif  // TENSORFLOW_CORE_PROFILER_LIB_PROFILER_SESSION_H_
diff --git a/tensorflow/core/profiler/lib/scoped_annotation.h b/tensorflow/core/profiler/lib/scoped_annotation.h
index b3336ba6397b14..fc92c39e676e52 100644
--- a/tensorflow/core/profiler/lib/scoped_annotation.h
+++ b/tensorflow/core/profiler/lib/scoped_annotation.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <string_view>
 #include <utility>
 
+#include "absl/base/macros.h"
 #include "absl/strings/string_view.h"
 #include "tensorflow/core/platform/types.h"
 #include "tsl/profiler/lib/scoped_annotation.h"
@@ -30,10 +31,16 @@ limitations under the License.
 #include "tsl/profiler/backends/cpu/annotation_stack.h"
 #endif
 
+// TODO: b/323943471 - This macro should eventually be provided by Abseil.
+#ifndef ABSL_DEPRECATE_AND_INLINE
+#define ABSL_DEPRECATE_AND_INLINE()
+#endif
+
 namespace tensorflow {
 namespace profiler {
 
-using tsl::profiler::ScopedAnnotation;  // NOLINT
+using ScopedAnnotation ABSL_DEPRECATE_AND_INLINE() =
+    tsl::profiler::ScopedAnnotation;  // NOLINT
 
 }  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/scoped_memory_debug_annotation.h b/tensorflow/core/profiler/lib/scoped_memory_debug_annotation.h
index 1156aecc359511..e44cdb3c487ec4 100644
--- a/tensorflow/core/profiler/lib/scoped_memory_debug_annotation.h
+++ b/tensorflow/core/profiler/lib/scoped_memory_debug_annotation.h
@@ -20,13 +20,21 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/base/macros.h"
 #include "tsl/profiler/lib/scoped_memory_debug_annotation.h"
 
+// TODO: b/323943471 - This macro should eventually be provided by Abseil.
+#ifndef ABSL_DEPRECATE_AND_INLINE
+#define ABSL_DEPRECATE_AND_INLINE()
+#endif
+
 namespace tensorflow {
 namespace profiler {
 
-using tsl::profiler::MemoryDebugAnnotation;        // NOLINT
-using tsl::profiler::ScopedMemoryDebugAnnotation;  // NOLINT
+using MemoryDebugAnnotation ABSL_DEPRECATE_AND_INLINE() =
+    tsl::profiler::MemoryDebugAnnotation;  // NOLINT
+using ScopedMemoryDebugAnnotation ABSL_DEPRECATE_AND_INLINE() =
+    tsl::profiler::ScopedMemoryDebugAnnotation;  // NOLINT
 
 }  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/traceme.h b/tensorflow/core/profiler/lib/traceme.h
index c5bf36e2b82be3..a5209317d62317 100644
--- a/tensorflow/core/profiler/lib/traceme.h
+++ b/tensorflow/core/profiler/lib/traceme.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/base/macros.h"
 #include "absl/strings/string_view.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
@@ -31,16 +32,26 @@ limitations under the License.
 #include "tsl/profiler/utils/time_utils.h"
 #endif
 
+// TODO: b/323943471 - This macro should eventually be provided by Abseil.
+#ifndef ABSL_DEPRECATE_AND_INLINE
+#define ABSL_DEPRECATE_AND_INLINE()
+#endif
+
 namespace tensorflow {
 namespace profiler {
 
-using tsl::profiler::GetTFTraceMeLevel;   // NOLINT
-using tsl::profiler::kCritical;           // NOLINT
-using tsl::profiler::kInfo;               // NOLINT
-using tsl::profiler::kVerbose;            // NOLINT
-using tsl::profiler::TfOpDetailsEnabled;  // NOLINT
-using tsl::profiler::TraceMe;             // NOLINT
-using tsl::profiler::TraceMeLevel;        // NOLINT
+using tsl::profiler::kInfo;                                          // NOLINT
+using TraceMe ABSL_DEPRECATE_AND_INLINE() = tsl::profiler::TraceMe;  // NOLINT
+using TraceMeLevel ABSL_DEPRECATE_AND_INLINE() =
+    tsl::profiler::TraceMeLevel;  // NOLINT
+
+ABSL_DEPRECATE_AND_INLINE()
+inline int GetTFTraceMeLevel(bool is_expensive) {
+  return tsl::profiler::GetTFTraceMeLevel(is_expensive);
+}
+
+ABSL_DEPRECATE_AND_INLINE()
+inline bool TfOpDetailsEnabled() { return tsl::profiler::TfOpDetailsEnabled(); }
 
 }  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/traceme_encode.h b/tensorflow/core/profiler/lib/traceme_encode.h
index dd0e87f7f09f43..5f1bb253ccb611 100644
--- a/tensorflow/core/profiler/lib/traceme_encode.h
+++ b/tensorflow/core/profiler/lib/traceme_encode.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <initializer_list>
 #include <string>
 
+#include "absl/base/macros.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
@@ -27,13 +28,68 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tsl/profiler/lib/traceme_encode.h"
 
+// TODO: b/323943471 - This macro should eventually be provided by Abseil.
+#ifndef ABSL_DEPRECATE_AND_INLINE
+#define ABSL_DEPRECATE_AND_INLINE()
+#endif
+
 namespace tensorflow {
 namespace profiler {
 
-using tsl::profiler::TraceMeArg;         // NOLINT
-using tsl::profiler::TraceMeEncode;      // NOLINT
-using tsl::profiler::TraceMeOp;          // NOLINT
-using tsl::profiler::TraceMeOpOverride;  // NOLINT
+using TraceMeArg ABSL_DEPRECATE_AND_INLINE() =
+    tsl::profiler::TraceMeArg;  // NOLINT
+
+ABSL_DEPRECATE_AND_INLINE()
+inline std::string TraceMeEncode(std::string name,
+                                 std::initializer_list<TraceMeArg> args) {
+  return tsl::profiler::TraceMeEncode(name, args);
+}
+
+ABSL_DEPRECATE_AND_INLINE()
+inline std::string TraceMeEncode(absl::string_view name,
+                                 std::initializer_list<TraceMeArg> args) {
+  return tsl::profiler::TraceMeEncode(name, args);
+}
+
+ABSL_DEPRECATE_AND_INLINE()
+inline std::string TraceMeEncode(const char* name,
+                                 std::initializer_list<TraceMeArg> args) {
+  return tsl::profiler::TraceMeEncode(name, args);
+}
+
+ABSL_DEPRECATE_AND_INLINE()
+inline std::string TraceMeEncode(std::initializer_list<TraceMeArg> args) {
+  return tsl::profiler::TraceMeEncode(args);
+}
+
+ABSL_DEPRECATE_AND_INLINE()
+// Concatenates op_name and op_type.
+inline std::string TraceMeOp(absl::string_view op_name,
+                             absl::string_view op_type) {
+  return tsl::profiler::TraceMeOp(op_name, op_type);
+}
+
+ABSL_DEPRECATE_AND_INLINE()
+inline std::string TraceMeOp(const char* op_name, const char* op_type) {
+  return tsl::profiler::TraceMeOp(op_name, op_type);
+}
+
+ABSL_DEPRECATE_AND_INLINE()
+inline std::string TraceMeOp(std::string&& op_name, absl::string_view op_type) {
+  return tsl::profiler::TraceMeOp(op_name, op_type);
+}
+
+ABSL_DEPRECATE_AND_INLINE()
+// Concatenates op_name and op_type.
+inline std::string TraceMeOpOverride(absl::string_view op_name,
+                                     absl::string_view op_type) {
+  return tsl::profiler::TraceMeOpOverride(op_name, op_type);
+}
+
+ABSL_DEPRECATE_AND_INLINE()
+inline std::string TraceMeOpOverride(const char* op_name, const char* op_type) {
+  return tsl::profiler::TraceMeOpOverride(op_name, op_type);
+}
 
 }  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/rpc/BUILD b/tensorflow/core/profiler/rpc/BUILD
index 03cc7b8f1e1e0c..068d599d5b5f1b 100644
--- a/tensorflow/core/profiler/rpc/BUILD
+++ b/tensorflow/core/profiler/rpc/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.default.bzl", "tf_external_workspace_visible", "tf_grpc_cc_dependencies")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow/core/profiler/builds:build_config.bzl",
     "tf_profiler_alias",
diff --git a/tensorflow/core/profiler/rpc/client/BUILD b/tensorflow/core/profiler/rpc/client/BUILD
index ce85502e5a3273..82e68a58529c2c 100644
--- a/tensorflow/core/profiler/rpc/client/BUILD
+++ b/tensorflow/core/profiler/rpc/client/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.default.bzl", "tf_grpc_cc_dependencies")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow/core/profiler/builds:build_config.bzl",
     "tf_profiler_copts",
diff --git a/tensorflow/core/profiler/rpc/oss/BUILD b/tensorflow/core/profiler/rpc/oss/BUILD
index b104bffff89690..c84227b59f7468 100644
--- a/tensorflow/core/profiler/rpc/oss/BUILD
+++ b/tensorflow/core/profiler/rpc/oss/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.default.bzl", "tf_grpc_cc_dependencies")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow/core/profiler/builds:build_config.bzl", "tf_profiler_copts")
 
 package(
diff --git a/tensorflow/core/profiler/utils/BUILD b/tensorflow/core/profiler/utils/BUILD
index b3b363375f1bf4..a3f7d16a834cef 100644
--- a/tensorflow/core/profiler/utils/BUILD
+++ b/tensorflow/core/profiler/utils/BUILD
@@ -239,7 +239,7 @@ cc_library(
         "@local_tsl//tsl/profiler/utils:tf_xplane_visitor",
         "@local_tsl//tsl/profiler/utils:timespan",
         "@local_tsl//tsl/profiler/utils:tpu_xplane_utils",
-        "@local_tsl//tsl/util:stats_calculator_portable",
+        "@local_xla//xla/tsl/util:stats_calculator_portable",
     ],
 )
 
diff --git a/tensorflow/core/profiler/utils/derived_timeline.cc b/tensorflow/core/profiler/utils/derived_timeline.cc
index 3e28bac0c766cf..981b6a0c54e8de 100644
--- a/tensorflow/core/profiler/utils/derived_timeline.cc
+++ b/tensorflow/core/profiler/utils/derived_timeline.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/optional.h"
+#include "xla/tsl/util/stats_calculator.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/profiler/protobuf/xplane.pb.h"
 #include "tensorflow/core/profiler/utils/gpu_event_stats.h"
@@ -43,7 +44,6 @@ limitations under the License.
 #include "tsl/profiler/utils/tf_xplane_visitor.h"
 #include "tsl/profiler/utils/timespan.h"
 #include "tsl/profiler/utils/tpu_xplane_utils.h"
-#include "tsl/util/stats_calculator.h"
 
 namespace tensorflow {
 namespace profiler {
diff --git a/tensorflow/core/profiler/utils/op_metrics_db_utils.h b/tensorflow/core/profiler/utils/op_metrics_db_utils.h
index 585676b690908c..c75bd71fcefc38 100644
--- a/tensorflow/core/profiler/utils/op_metrics_db_utils.h
+++ b/tensorflow/core/profiler/utils/op_metrics_db_utils.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_CORE_PROFILER_UTILS_OP_METRICS_DB_UTILS_H_
 
 #include <algorithm>
+#include <optional>
 #include <string>
 
 #include "absl/container/flat_hash_map.h"
@@ -121,7 +122,7 @@ inline uint64_t ChildrenTimePs(const OpMetrics& metrics) {
 
 // Returns the ratio of time spent sending data from the host to the device
 // relative to the total time the host was active.
-absl::optional<double> HostInfeedEnqueueRatio(const OpMetricsDb& db);
+std::optional<double> HostInfeedEnqueueRatio(const OpMetricsDb& db);
 
 // Converts from the device op metrics to Tf-op metrics.
 OpMetricsDb CreateTfMetricsDbFromDeviceOpMetricsDb(
diff --git a/tensorflow/core/protobuf/BUILD b/tensorflow/core/protobuf/BUILD
index 76f6c6551f8d74..86bf0017f3cfda 100644
--- a/tensorflow/core/protobuf/BUILD
+++ b/tensorflow/core/protobuf/BUILD
@@ -1,6 +1,6 @@
 # copybara:uncomment_begin(oss-unused)
-# load("//net/grpc/go/build_defs:go_grpc_library.bzl", "go_grpc_library")
 # load("//net/grpc:cc_grpc_library.bzl", "cc_grpc_library")
+# load("//net/grpc/go/build_defs:go_grpc_library.bzl", "go_grpc_library")
 # copybara:uncomment_end
 
 # For platform specific build config
@@ -216,6 +216,7 @@ tf_proto_library(
         "@local_tsl//tsl/protobuf:status_proto",
     ],
     tags = ["alt_dep=//third_party/tensorflow/core:protos_all"],
+    visibility = ["//visibility:public"],
     exports = [
         "@local_tsl//tsl/protobuf:bfc_memory_map_proto",
         "@local_tsl//tsl/protobuf:rpc_options_proto",
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 2f5e5127f2fc60..377b53c48d04b2 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -265,6 +265,14 @@ message GPUOptions {
     // system memory size for better resource estimation of multi-tenancy(one
     // gpu with multiple model) use case.
     int32 gpu_system_memory_size_in_mb = 16;
+
+    // If true, save information needed for created a PjRt GPU client for
+    // creating a client with remote devices.
+    bool populate_pjrt_gpu_client_creation_info = 17;
+
+    // node_id for use when creating a PjRt GPU client with remote devices,
+    // which enumerates jobs*tasks from a ServerDef.
+    int32 node_id = 18;
   }
 
   // Everything inside experimental is subject to change and is not subject
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index 9f4042e6f8be9b..f98d1928d9e156 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -102,8 +102,8 @@ message RewriterConfig {
   // Enable the swap of kernel implementations based on the device placement
   // (default is ON).
   Toggle implementation_selector = 22;
-  // Optimize data types for CUDA (default is OFF).
-  // This will try to use float16 on GPU which is faster.
+  // Optimize data types for CUDA/oneDNN (default is OFF).
+  // This will try to use float16 on GPU/CPU which is faster.
   // Note that this can change the numerical stability of the graph and may
   // require the use of loss scaling to maintain model convergence.
   Toggle auto_mixed_precision = 23;
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 8fa4e8122aab1e..bb798e7845959e 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 1803  // Updated: 2024/3/16
+#define TF_GRAPH_DEF_VERSION 1819  // Updated: 2024/4/1
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //
diff --git a/tensorflow/core/runtime_fallback/BUILD b/tensorflow/core/runtime_fallback/BUILD
index da33d97329ad0b..40b841ce0033ea 100644
--- a/tensorflow/core/runtime_fallback/BUILD
+++ b/tensorflow/core/runtime_fallback/BUILD
@@ -60,7 +60,7 @@ cc_library(
     testonly = True,
     srcs = ["bef_executor_flags.cc"],
     hdrs = ["bef_executor_flags.h"],
-    visibility = ["//third_party/tf_runtime_google:__pkg__"],
+    visibility = ["//visibility:private"],
     deps = [
         "@com_google_absl//absl/flags:flag",
         "@com_google_absl//absl/strings",
diff --git a/tensorflow/core/runtime_fallback/util/BUILD b/tensorflow/core/runtime_fallback/util/BUILD
index 92db3499ec3977..fb8acb0b423e45 100644
--- a/tensorflow/core/runtime_fallback/util/BUILD
+++ b/tensorflow/core/runtime_fallback/util/BUILD
@@ -207,6 +207,7 @@ tf_cc_test(
         "//tensorflow/core/framework:tensor",
         "//tensorflow/core/framework:types_proto_cc",
         "//tensorflow/core/platform:errors",
+        "@com_google_googletest//:gtest_main",
         "@tf_runtime//:bef",
         "@tf_runtime//:bef_attr_encoder",
         "@tf_runtime//:core_runtime",
diff --git a/tensorflow/core/runtime_fallback/util/attr_util_test.cc b/tensorflow/core/runtime_fallback/util/attr_util_test.cc
index 85bbfeb9cc0907..2beeb0848600a7 100644
--- a/tensorflow/core/runtime_fallback/util/attr_util_test.cc
+++ b/tensorflow/core/runtime_fallback/util/attr_util_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include <gmock/gmock.h>
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -280,10 +281,14 @@ TEST(UtilsTest, IsUnusedAttributeOk) {
 TEST(UtilsTest, FillAttrValueMapOk) {
   tfrt::OpAttrs attrs;
   attrs.SetArray("shape", tfrt::ArrayRef<int64_t>{2, 2});
-  attrs.SetArray("values", tfrt::ArrayRef<int64_t>{2});
+  attrs.SetArray("values", tfrt::ArrayRef<float>{2});
+  attrs.SetArray("flags", tfrt::ArrayRef<bool>{false, true});
+
   attrs.Set<bool>("transpose_a", false);
   attrs.Set<bool>("transpose_b", true);
-  attrs.Set<int64_t>("result_segment_sizes", 1);
+  attrs.Set<int64_t>("result_segment_sizes", 2);  // unused
+  attrs.Set<float>("foo", 2);
+  attrs.Set<int64_t>("bar", 2);
 
   AttrValueMap map;
   auto host_context = CreateTestHostContext();
@@ -292,12 +297,16 @@ TEST(UtilsTest, FillAttrValueMapOk) {
   ASSERT_FALSE(llvm::errorToBool(
       FillAttrValueMap(attrs.freeze(), host_context.get(), &map)));
 
-  EXPECT_THAT(map,
-              UnorderedElementsAre(
-                  Pair(Eq("shape"), EqualsProto(R"pb(list { i: 2 i: 2 })pb")),
-                  Pair(Eq("values"), EqualsProto(R"pb(list { i: 2 })pb")),
-                  Pair(Eq("transpose_a"), EqualsProto(R"pb(b: false)pb")),
-                  Pair(Eq("transpose_b"), EqualsProto(R"pb(b: true)pb"))));
+  EXPECT_THAT(
+      map,
+      UnorderedElementsAre(
+          Pair(Eq("shape"), EqualsProto(R"pb(list { i: 2 i: 2 })pb")),
+          Pair(Eq("values"), EqualsProto(R"pb(list { f: 2 })pb")),
+          Pair(Eq("flags"), EqualsProto(R"pb(list { b: false b: true })pb")),
+          Pair(Eq("transpose_a"), EqualsProto(R"pb(b: false)pb")),
+          Pair(Eq("transpose_b"), EqualsProto(R"pb(b: true)pb")),
+          Pair(Eq("foo"), EqualsProto(R"pb(f: 2)pb")),
+          Pair(Eq("bar"), EqualsProto(R"pb(i: 2)pb"))));
 }
 
 }  // namespace
diff --git a/tensorflow/core/summary/BUILD b/tensorflow/core/summary/BUILD
index 5f66bb91816858..81b600f036716c 100644
--- a/tensorflow/core/summary/BUILD
+++ b/tensorflow/core/summary/BUILD
@@ -1,13 +1,13 @@
 # Description:
 #   C++ implementation code for the summary writing APIs.
 
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_binary",
     "tf_cc_test",
     "tf_copts",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/tfrt/common/BUILD b/tensorflow/core/tfrt/common/BUILD
index 48ad12ffda86f0..2de9b7fcf904b3 100644
--- a/tensorflow/core/tfrt/common/BUILD
+++ b/tensorflow/core/tfrt/common/BUILD
@@ -1,9 +1,9 @@
-load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_cc_test")
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "@local_tsl//tsl/platform:build_config_root.bzl",
     "tf_cuda_tests_tags",
 )
+load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_cc_test")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -85,8 +85,12 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core/platform:errors",
         "//tensorflow/core/platform:status",
+        "@com_google_absl//absl/status",
+        "@local_xla//xla/client:local_client",
+        "@local_xla//xla/pjrt:local_device_state",
         "@local_xla//xla/pjrt:pjrt_client",
         "@local_xla//xla/pjrt:tf_pjrt_client",
+        "@local_xla//xla/stream_executor/integrations:tf_allocator_adapter",
     ],
 )
 
@@ -107,6 +111,8 @@ cc_library(
         "//tensorflow/core:framework_types_hdr",
         "//tensorflow/core/platform:errors",
         "//tensorflow/core/platform:status",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@local_xla//xla/pjrt:pjrt_client",
     ],
 )
diff --git a/tensorflow/core/tfrt/common/pjrt_state.cc b/tensorflow/core/tfrt/common/pjrt_state.cc
index 86d2a3ffeaf377..c7fb3304d42bc6 100644
--- a/tensorflow/core/tfrt/common/pjrt_state.cc
+++ b/tensorflow/core/tfrt/common/pjrt_state.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
+#include "absl/status/status.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/tf_pjrt_client.h"
 #include "tensorflow/core/platform/errors.h"
@@ -80,6 +81,18 @@ Status PjRtState::MovePjRtClientToUnused(const DeviceType& device_type) {
                           device_type);
 }
 
+Status PjRtState::SetPjRtGpuClientCreationInfo(
+    std::unique_ptr<PjRtGpuClientCreationInfo> info) {
+  absl::MutexLock lock(&mu_);
+  pjrt_gpu_client_creation_info_ = std::move(info);
+  return absl::OkStatus();
+}
+
+PjRtGpuClientCreationInfo* PjRtState::GetPjRtGpuClientCreationInfo() {
+  absl::MutexLock lock(&mu_);
+  return pjrt_gpu_client_creation_info_.get();
+}
+
 string PjRtState::DebugString() const { return "PjRtState"; }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/tfrt/common/pjrt_state.h b/tensorflow/core/tfrt/common/pjrt_state.h
index 0fd411f2cd683c..180163376b4cd2 100644
--- a/tensorflow/core/tfrt/common/pjrt_state.h
+++ b/tensorflow/core/tfrt/common/pjrt_state.h
@@ -19,7 +19,10 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "xla/client/local_client.h"
+#include "xla/pjrt/local_device_state.h"
 #include "xla/pjrt/pjrt_client.h"
+#include "xla/stream_executor/integrations/tf_allocator_adapter.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/status.h"
 
@@ -28,6 +31,16 @@ namespace tensorflow {
 const char kPjRtStateResourceName[] = "pjrt_state";
 using PjRtClientsMap = std::map<DeviceType, std::unique_ptr<xla::PjRtClient>>;
 
+// Information needed to create a PjRt GPU Client which is used when creating
+// a client after after information about remote devices is available.
+struct PjRtGpuClientCreationInfo {
+  std::set<int> allowed_devices;
+  std::unique_ptr<se::MultiDeviceAdapter> allocator;
+  std::unique_ptr<tsl::Allocator> host_memory_allocator;
+  std::map<int, std::unique_ptr<xla::LocalDeviceState>> local_device_states;
+  xla::LocalClient* local_client;
+};
+
 // The class for the state related to PjRt. It contains a map from `DeviceType`
 // to `PjRtClient`. It will be stored in the global `ResourceManager`.
 class PjRtState : public ResourceBase {
@@ -43,6 +56,15 @@ class PjRtState : public ResourceBase {
   Status MovePjRtClientToUnused(const DeviceType& device_type);
   string DebugString() const override;
 
+  // Saves information needed to create a PJRT client (to enable creating a
+  // client with remote devices).
+  absl::Status SetPjRtGpuClientCreationInfo(
+      std::unique_ptr<PjRtGpuClientCreationInfo> info);
+
+  // Retrieves information needed to create a PJRT client (for creating a
+  // client with remote devices).
+  PjRtGpuClientCreationInfo* GetPjRtGpuClientCreationInfo();
+
  private:
   explicit PjRtState() {}
   absl::Mutex mu_;
@@ -50,6 +72,9 @@ class PjRtState : public ResourceBase {
   // Store the PJRT clients that are no longer used to guarantee that PJRT
   // clients outlive PJRT buffers.
   std::vector<std::unique_ptr<xla::PjRtClient>> unused_ ABSL_GUARDED_BY(mu_);
+
+  std::unique_ptr<PjRtGpuClientCreationInfo> pjrt_gpu_client_creation_info_
+      ABSL_GUARDED_BY(mu_);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/tfrt/common/pjrt_util.cc b/tensorflow/core/tfrt/common/pjrt_util.cc
index 4a4832a81d649f..d679e92eed3f3b 100644
--- a/tensorflow/core/tfrt/common/pjrt_util.cc
+++ b/tensorflow/core/tfrt/common/pjrt_util.cc
@@ -17,7 +17,10 @@ limitations under the License.
 #include <memory>
 #include <optional>
 #include <set>
+#include <utility>
 
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/status.h"
@@ -57,4 +60,34 @@ absl::StatusOr<xla::PjRtClient*> GetPjRtClient(const DeviceType& device_type) {
   return pjrt_state->GetPjRtClient(device_type);
 }
 
+absl::Status SetPjRtGpuClientCreationInfoInTFGlobalResourceManager(
+    std::unique_ptr<PjRtGpuClientCreationInfo> info) {
+  ResourceMgr* rmgr = tfrt_global::GetTFGlobalResourceMgr();
+  PjRtState* pjrt_state;
+  TF_RETURN_IF_ERROR(rmgr->LookupOrCreate<PjRtState>(
+      rmgr->default_container(), kPjRtStateResourceName, &pjrt_state,
+      [&](PjRtState** ret) {
+        *ret = PjRtState::Create();
+        return absl::OkStatus();
+      }));
+  core::ScopedUnref pjrt_state_ref(pjrt_state);
+  if (info == nullptr) {
+    return absl::InvalidArgumentError("PJRT client creation info is nullptr.");
+  }
+  TF_RETURN_IF_ERROR(pjrt_state->SetPjRtGpuClientCreationInfo(std::move(info)));
+  return absl::OkStatus();
+}
+
+absl::StatusOr<PjRtGpuClientCreationInfo*> GetPjRtGpuClientCreationInfo() {
+  ResourceMgr* rmgr = tfrt_global::GetTFGlobalResourceMgr();
+  PjRtState* pjrt_state;
+  TF_RETURN_IF_ERROR(rmgr->LookupOrCreate<PjRtState>(
+      rmgr->default_container(), kPjRtStateResourceName, &pjrt_state,
+      [&](PjRtState** ret) {
+        *ret = PjRtState::Create();
+        return absl::OkStatus();
+      }));
+  core::ScopedUnref pjrt_state_ref(pjrt_state);
+  return pjrt_state->GetPjRtGpuClientCreationInfo();
+}
 }  // namespace tensorflow
diff --git a/tensorflow/core/tfrt/common/pjrt_util.h b/tensorflow/core/tfrt/common/pjrt_util.h
index 75ce2c94b01ae6..ce9cbc1d11c287 100644
--- a/tensorflow/core/tfrt/common/pjrt_util.h
+++ b/tensorflow/core/tfrt/common/pjrt_util.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "xla/pjrt/pjrt_client.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/status.h"
+#include "tensorflow/core/tfrt/common/pjrt_state.h"
 
 namespace tensorflow {
 
@@ -36,6 +37,10 @@ Status SetPjRtClientInTFGlobalResourceManager(
 // TFGlobalResourceManager.
 absl::StatusOr<xla::PjRtClient*> GetPjRtClient(const DeviceType& device_type);
 
+Status SetPjRtGpuClientCreationInfoInTFGlobalResourceManager(
+    std::unique_ptr<PjRtGpuClientCreationInfo> info);
+absl::StatusOr<PjRtGpuClientCreationInfo*> GetPjRtGpuClientCreationInfo();
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_TFRT_COMMON_PJRT_UTIL_H_
diff --git a/tensorflow/core/tfrt/fallback/BUILD b/tensorflow/core/tfrt/fallback/BUILD
index 9ec53499c285b4..454c102deec081 100644
--- a/tensorflow/core/tfrt/fallback/BUILD
+++ b/tensorflow/core/tfrt/fallback/BUILD
@@ -1,4 +1,3 @@
-load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "if_mobile",
@@ -6,13 +5,14 @@ load(
     "tf_cc_test",
     "tf_features_nolayering_check_if_ios",
 )
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
+load("//tensorflow:tensorflow.default.bzl", "tf_cuda_cc_test")
+load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library")
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "if_static",
     "tf_cuda_tests_tags",
 )
-load("//tensorflow:tensorflow.default.bzl", "tf_cuda_cc_test")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/tfrt/graph_executor/graph_executor.cc b/tensorflow/core/tfrt/graph_executor/graph_executor.cc
index d363360bd78353..366508e8719ba0 100644
--- a/tensorflow/core/tfrt/graph_executor/graph_executor.cc
+++ b/tensorflow/core/tfrt/graph_executor/graph_executor.cc
@@ -61,6 +61,7 @@ limitations under the License.
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/monitoring/gauge.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/status.h"
@@ -132,6 +133,12 @@ StepId GetNextStepId() {
   return gen.GetNextStepId();
 }
 
+auto* graph_executor_mode = monitoring::Gauge<std::string, 2>::New(
+    "/tfrt/graph_executor/mode",
+    "Record the total number of imported savedmodel using different graph "
+    "executor modes (BEF vs MLRT interpreter)",
+    "model_name", "model_version");
+
 }  // namespace
 
 tensorflow::Status RunMlrtFunction(
@@ -496,14 +503,16 @@ absl::StatusOr<std::unique_ptr<GraphExecutor>> GraphExecutor::Create(
     // Overrides cost_analysis_options.
     options.cost_analysis_options.version = Options::CostAnalysisOptions::kOnce;
   }
-
   TfrtGraphExecutionState::Options graph_execution_state_options;
   graph_execution_state_options.run_placer_grappler_on_functions =
       options.run_placer_grappler_on_functions;
 
   options.compile_options.fuse_get_resource_ops_in_hoisting =
       !options.enable_mlrt;
-
+  graph_executor_mode
+      ->GetCell(options.model_metadata.name(),
+                absl::StrCat(options.model_metadata.version()))
+      ->Set(options.enable_mlrt ? "mlrt" : "bef");
   TF_ASSIGN_OR_RETURN(
       auto graph_execution_state,
       TfrtGraphExecutionState::Create(graph_execution_state_options,
diff --git a/tensorflow/core/tfrt/ifrt/BUILD b/tensorflow/core/tfrt/ifrt/BUILD
index e0ecfe019f9994..8e9e7c060b2fa6 100644
--- a/tensorflow/core/tfrt/ifrt/BUILD
+++ b/tensorflow/core/tfrt/ifrt/BUILD
@@ -32,6 +32,7 @@ cc_library(
         ":ifrt_loaded_variable_registry",
         ":ifrt_tensor_utils",
         ":sharding_utils",
+        "//tensorflow/compiler/mlir/tfrt/transforms/ifrt:ifrt_types",
         "//tensorflow/compiler/mlir/tfrt/transforms/ifrt:tf2hlo",
         "//tensorflow/compiler/tf2xla:xla_helpers",
         "//tensorflow/core:framework",
@@ -99,6 +100,7 @@ cc_library(
     srcs = ["ifrt_loaded_variable_registry.cc"],
     hdrs = ["ifrt_loaded_variable_registry.h"],
     deps = [
+        "//tensorflow/compiler/mlir/tfrt/transforms/ifrt:ifrt_types",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/log",
diff --git a/tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_registry.cc b/tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_registry.cc
index 213d75431df6c9..cdc8d669df989e 100644
--- a/tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_registry.cc
+++ b/tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_registry.cc
@@ -24,8 +24,6 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
-#include "xla/python/ifrt/array.h"
-#include "tsl/concurrency/ref_count.h"
 #include "tsl/platform/statusor.h"
 
 namespace tensorflow {
@@ -36,7 +34,7 @@ absl::Status IfrtLoadedVariableRegistry::TryRegisterLoadedVariable(
     LoadedVariableConstructor&& loaded_variable_constructor) {
   absl::MutexLock lock(&mutex_);
   auto& variable = loaded_variable_map_[name];
-  if (variable != nullptr) {
+  if (variable.array.IsValid()) {
     // Already registered. This is rare.
     VLOG(1) << "Variable '" << name << "' already registered.";
     return absl::OkStatus();
@@ -45,7 +43,7 @@ absl::Status IfrtLoadedVariableRegistry::TryRegisterLoadedVariable(
   return absl::OkStatus();
 }
 
-absl::StatusOr<tsl::RCReference<xla::ifrt::Array>>
+absl::StatusOr<IfrtLoadedVariableRegistry::LoadedVariable>
 IfrtLoadedVariableRegistry::GetLoadedVariable(absl::string_view name) const {
   absl::MutexLock lock(&mutex_);
   auto it = loaded_variable_map_.find(name);
diff --git a/tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_registry.h b/tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_registry.h
index ccfc4aa3d46a1b..5cb885598174bd 100644
--- a/tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_registry.h
+++ b/tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_registry.h
@@ -22,7 +22,9 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
+#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h"
 #include "xla/python/ifrt/array.h"
+#include "xla/python/ifrt/future.h"
 #include "tsl/concurrency/ref_count.h"
 
 namespace tensorflow {
@@ -31,9 +33,12 @@ namespace ifrt_serving {
 // This class is thread safe.
 class IfrtLoadedVariableRegistry {
  public:
+  struct LoadedVariable {
+    DtypeAndShape dtype_and_shape;
+    xla::ifrt::Future<absl::StatusOr<tsl::RCReference<xla::ifrt::Array>>> array;
+  };
   using LoadedVariableConstructor =
-      absl::AnyInvocable<absl::StatusOr<tsl::RCReference<xla::ifrt::Array>>()
-                             const>;
+      absl::AnyInvocable<absl::StatusOr<LoadedVariable>() const>;
 
   // Tries to register a loaded variable with the given name.
   // Returns an error if the named array does not already exists and
@@ -45,13 +50,13 @@ class IfrtLoadedVariableRegistry {
       LoadedVariableConstructor&& loaded_variable_constructor)
       ABSL_LOCKS_EXCLUDED(mutex_);
 
-  absl::StatusOr<tsl::RCReference<xla::ifrt::Array>> GetLoadedVariable(
-      absl::string_view name) const ABSL_LOCKS_EXCLUDED(mutex_);
+  absl::StatusOr<LoadedVariable> GetLoadedVariable(absl::string_view name) const
+      ABSL_LOCKS_EXCLUDED(mutex_);
 
  private:
   mutable absl::Mutex mutex_;
-  absl::flat_hash_map<std::string, tsl::RCReference<xla::ifrt::Array>>
-      loaded_variable_map_ ABSL_GUARDED_BY(mutex_);
+  absl::flat_hash_map<std::string, LoadedVariable> loaded_variable_map_
+      ABSL_GUARDED_BY(mutex_);
 };
 
 }  // namespace ifrt_serving
diff --git a/tensorflow/core/tfrt/ifrt/ifrt_model_context.h b/tensorflow/core/tfrt/ifrt/ifrt_model_context.h
index dd1ef572bb5e4d..067ccfe2365ea2 100644
--- a/tensorflow/core/tfrt/ifrt/ifrt_model_context.h
+++ b/tensorflow/core/tfrt/ifrt/ifrt_model_context.h
@@ -91,9 +91,11 @@ class IfrtModelContext {
     return restore_tensor_registry_;
   }
 
-  tfrt::ConcurrentWorkQueue* work_queue() const { return work_queue_; }
-  void set_work_queue(tfrt::ConcurrentWorkQueue* work_queue) {
-    work_queue_ = work_queue;
+  tfrt::ConcurrentWorkQueue* checkpoint_loader_queue() const {
+    return checkpoint_loader_queue_;
+  }
+  void set_checkpoint_loader_queue(tfrt::ConcurrentWorkQueue* work_queue) {
+    checkpoint_loader_queue_ = work_queue;
   }
 
  private:
@@ -103,7 +105,7 @@ class IfrtModelContext {
       tensorflow::IdentityShapeRepresentationFn();
 
   // Dedicated work queue for heavy task such as variable tensor restoration.
-  tfrt::ConcurrentWorkQueue* work_queue_ = nullptr;
+  tfrt::ConcurrentWorkQueue* checkpoint_loader_queue_ = nullptr;
 
   std::vector<ServingExecutableRegistry::Handle> handles_;
 
diff --git a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc
index 847b6ea5bbb88c..bb9176c3b3de6f 100644
--- a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc
+++ b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include <memory>
 #include <optional>
-#include <string>
 #include <utility>
 #include <vector>
 
@@ -30,6 +29,7 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
 #include "mlir/IR/OwningOpRef.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h"
 #include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "xla/hlo/ir/hlo_sharding.h"
@@ -37,10 +37,8 @@ limitations under the License.
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/client.h"
 #include "xla/python/ifrt/device.h"
-#include "xla/python/ifrt/dtype.h"
 #include "xla/python/ifrt/executable.h"
 #include "xla/python/ifrt/future.h"
-#include "xla/python/ifrt/memory.h"
 #include "xla/python/ifrt/shape.h"
 #include "xla/python/ifrt/sharding.h"
 #include "xla/python/pjrt_ifrt/xla_compiler.h"
@@ -74,12 +72,10 @@ absl::StatusOr<std::vector<DtypeAndShape>> BuildDtypeAndShape(
     if (variable_index < variable_arg_indices.size() &&
         i == variable_arg_indices[variable_index]) {
       // Get already loaded variable tensor.
-      TF_ASSIGN_OR_RETURN(auto single_array,
+      TF_ASSIGN_OR_RETURN(auto loaded_variable,
                           ifrt_loaded_variable_registry.GetLoadedVariable(
                               inputs[i].scalar<tsl::tstring>()()));
-      TF_ASSIGN_OR_RETURN(auto dtype, ToTensorDataType(single_array->dtype()));
-      dtypes_and_shapes.push_back(DtypeAndShape{
-          .dtype = dtype, .shape = ToTensorShape(single_array->shape())});
+      dtypes_and_shapes.push_back(loaded_variable.dtype_and_shape);
 
       variable_index++;
     } else {
@@ -91,37 +87,21 @@ absl::StatusOr<std::vector<DtypeAndShape>> BuildDtypeAndShape(
 }
 
 absl::StatusOr<xla::DeviceAssignment> GetXlaDeviceAssignment(
-    const xla::ifrt::Client& ifrt_client,
     const tensorflow::tpu::TPUCompileMetadataProto& compile_metadata) {
-  int num_replicas = compile_metadata.num_replicas();
-  int num_partitions = compile_metadata.num_cores_per_replica();
-
-  VLOG(2) << " Number of replcas is " << num_replicas
-          << " and num_partitions is " << num_partitions;
-
-  if (num_replicas > 1) {
-    return absl::UnimplementedError(
-        absl::StrCat("Only support single replica, but replica number is ",
-                     num_replicas, " and num_partitions is ", num_partitions));
-  }
-
-  if (compile_metadata.has_device_assignment()) {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::DeviceAssignment> da,
-                        xla::DeviceAssignment::Deserialize(
-                            compile_metadata.device_assignment()));
-
-    return *std::move(da);
-  } else {
-    // TODO(b/316068010): integrate core selection.
-    return ifrt_client.GetDefaultDeviceAssignment(num_replicas, num_partitions);
+  if (!compile_metadata.has_device_assignment()) {
+    return absl::InternalError("No device assignment found.");
   }
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<xla::DeviceAssignment> da,
+      xla::DeviceAssignment::Deserialize(compile_metadata.device_assignment()));
+  return *da;
 }
 
 absl::StatusOr<std::vector<xla::ifrt::Device*>> GetAssignedDevices(
     const xla::ifrt::Client& ifrt_client,
     const tensorflow::tpu::TPUCompileMetadataProto& compile_metadata) {
   TF_ASSIGN_OR_RETURN(auto device_assignment,
-                      GetXlaDeviceAssignment(ifrt_client, compile_metadata));
+                      GetXlaDeviceAssignment(compile_metadata));
 
   const int num_devices =
       device_assignment.replica_count() * device_assignment.computation_count();
@@ -177,9 +157,8 @@ IfrtServingExecutable::CreateExecutableSynchronously(
                      num_replicas, " and num_partitions is ", num_partitions));
   }
 
-  TF_ASSIGN_OR_RETURN(
-      xla::DeviceAssignment da,
-      GetXlaDeviceAssignment(*ifrt_client_, tf2hlo_result.compile_metadata));
+  TF_ASSIGN_OR_RETURN(xla::DeviceAssignment da,
+                      GetXlaDeviceAssignment(tf2hlo_result.compile_metadata));
 
   VLOG(2) << "Device assignment :" << da.ToString();
 
@@ -304,9 +283,11 @@ absl::StatusOr<std::vector<tensorflow::Tensor>> IfrtServingExecutable::Execute(
   for (int i = 0; i < inputs.size(); i++) {
     if (variable_index < variable_arg_indices.size() &&
         i == variable_arg_indices[variable_index]) {
-      TF_ASSIGN_OR_RETURN(auto single_array,
+      TF_ASSIGN_OR_RETURN(auto loaded_variable,
                           ifrt_loaded_variable_registry_.GetLoadedVariable(
                               inputs[i].scalar<tsl::tstring>()()));
+      TF_ASSIGN_OR_RETURN(tsl::RCReference<xla::ifrt::Array> single_array,
+                          loaded_variable.array.Await());
       args.push_back(single_array);
       variable_index++;
     } else {
diff --git a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable_test.cc b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable_test.cc
index 94c67be2260972..5d225f4c8db598 100644
--- a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable_test.cc
+++ b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable_test.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_sharding.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/client.h"
+#include "xla/python/ifrt/future.h"
 #include "xla/python/ifrt/test_util.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_matcher.h"
@@ -348,15 +349,27 @@ TEST_P(VariableInputTest, InterleaveVariable) {
       std::string variable_name = absl::StrCat("variable_", i);
       ASSERT_OK(ifrt_loaded_variable_registry.TryRegisterLoadedVariable(
           variable_name,
-          [&]() -> absl::StatusOr<tsl::RCReference<xla::ifrt::Array>> {
+          [&]() -> absl::StatusOr<IfrtLoadedVariableRegistry::LoadedVariable> {
+            tensorflow::Tensor in_tensor = GetParam().in_tensors[i];
             TF_ASSIGN_OR_RETURN(
                 tsl::RCReference<xla::ifrt::Array> array,
-                MakeArrayFromTensor(*client, GetParam().in_tensors[i],
+                MakeArrayFromTensor(*client, in_tensor,
                                     /*device_ids=*/{0},
                                     xla::HloSharding::Replicate(),
                                     GetThreadPool()));
 
-            return array;
+            auto promise = xla::ifrt::Future<absl::StatusOr<
+                tsl::RCReference<xla::ifrt::Array>>>::CreatePromise();
+            auto future = xla::ifrt::Future<
+                absl::StatusOr<tsl::RCReference<xla::ifrt::Array>>>(promise);
+            promise.Set(array);
+
+            IfrtLoadedVariableRegistry::LoadedVariable loaded_variable;
+            loaded_variable.array = future;
+            loaded_variable.dtype_and_shape.dtype = in_tensor.dtype();
+            loaded_variable.dtype_and_shape.shape = in_tensor.shape();
+
+            return loaded_variable;
           }));
       loaded_variable_indices.push_back(i);
 
diff --git a/tensorflow/core/tfrt/mlrt/kernel/BUILD b/tensorflow/core/tfrt/mlrt/kernel/BUILD
index d7084120b3e4a5..304f637952aec3 100644
--- a/tensorflow/core/tfrt/mlrt/kernel/BUILD
+++ b/tensorflow/core/tfrt/mlrt/kernel/BUILD
@@ -23,6 +23,7 @@ cc_library(
         ":context",
         ":kernel_runner_utils",
         "//tensorflow/core:framework",
+        "//tensorflow/core/framework:tensor_proto_cc",
         "//tensorflow/core/tfrt/mlrt/bytecode:function",
         "//tensorflow/core/tfrt/mlrt/interpreter:async_handle",
         "//tensorflow/core/tfrt/mlrt/interpreter:attribute_span",
@@ -38,6 +39,8 @@ cc_library(
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/profiler/lib:traceme",
         "@tf_runtime//:hostcontext",
@@ -51,6 +54,7 @@ cc_library(
         ":context",
         ":kernel",
         ":kernel_runner_utils",
+        "//tensorflow/compiler/mlir/tfrt/transforms/ifrt:ifrt_types",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core/framework:attr_value_proto_cc",
@@ -59,6 +63,7 @@ cc_library(
         "//tensorflow/core/platform:refcount",
         "//tensorflow/core/tfrt/fallback:op_kernel_runner",
         "//tensorflow/core/tfrt/ifrt:ifrt_config_proto_cc",
+        "//tensorflow/core/tfrt/ifrt:ifrt_loaded_variable_registry",
         "//tensorflow/core/tfrt/ifrt:ifrt_model_context",
         "//tensorflow/core/tfrt/ifrt:ifrt_restore_tensor_registry",
         "//tensorflow/core/tfrt/ifrt:sharding_utils",
diff --git a/tensorflow/core/tfrt/mlrt/kernel/ifrt_ops_kernel.cc b/tensorflow/core/tfrt/mlrt/kernel/ifrt_ops_kernel.cc
index 99f6ec4727b9d1..47d1bd03e22758 100644
--- a/tensorflow/core/tfrt/mlrt/kernel/ifrt_ops_kernel.cc
+++ b/tensorflow/core/tfrt/mlrt/kernel/ifrt_ops_kernel.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
+#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_types.h"
 #include "xla/hlo/ir/hlo_sharding.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/future.h"
@@ -34,14 +35,13 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_handle.h"
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/framework/resource_var.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/platform/protobuf.h"  // IWYU pragma: keep
-#include "tensorflow/core/platform/refcount.h"
 #include "tensorflow/core/tfrt/fallback/op_kernel_runner.h"
 #include "tensorflow/core/tfrt/ifrt/ifrt_config.pb.h"
+#include "tensorflow/core/tfrt/ifrt/ifrt_loaded_variable_registry.h"
 #include "tensorflow/core/tfrt/ifrt/ifrt_model_context.h"
 #include "tensorflow/core/tfrt/ifrt/ifrt_restore_tensor_registry.h"
 #include "tensorflow/core/tfrt/ifrt/sharding_utils.h"
@@ -90,6 +90,27 @@ std::string GetRuntimeNameFromVarHandle(const ResourceHandle& handle) {
   return absl::StrCat(handle.container(), "__", handle.name());
 }
 
+absl::StatusOr<ifrt_serving::DtypeAndShape> GetDtypeAndShape(
+    const ResourceHandle& variable) {
+  std::vector<DtypeAndPartialTensorShape> dtype_and_partial_shapes =
+      variable.dtypes_and_shapes();
+
+  if (dtype_and_partial_shapes.size() != 1) {
+    return absl::InvalidArgumentError(absl::StrCat(
+        "Expected 1 dtype and shape, got ", dtype_and_partial_shapes.size()));
+  }
+  ifrt_serving::DtypeAndShape dtype_and_shape;
+  if (!dtype_and_partial_shapes.front().shape.AsTensorShape(
+          &dtype_and_shape.shape)) {
+    return absl::InvalidArgumentError(
+        absl::StrCat("Failed to convert partial shape to full tensor shape: ",
+                     dtype_and_partial_shapes.front().shape.DebugString()));
+  }
+
+  dtype_and_shape.dtype = dtype_and_partial_shapes.front().dtype;
+  return dtype_and_shape;
+}
+
 struct MlrtIfrtRestoreVariableKernel : mlrt::KernelFrame {
   using KernelFrame::KernelFrame;
 
@@ -218,9 +239,9 @@ void MlrtIfrtRestoreVariableKernel::Invoke() {
   }
 
   // Use dedicated work queue for restore operation.
-  DCHECK((*ifrt_model_context)->work_queue() != nullptr);
+  DCHECK((*ifrt_model_context)->checkpoint_loader_queue() != nullptr);
   (*ifrt_model_context)
-      ->work_queue()
+      ->checkpoint_loader_queue()
       ->AddTask(
           [runner = std::move(runner), async_state = std::move(async_state)]() {
             auto* op_kernel_context_ptr = &async_state->context;
@@ -241,7 +262,8 @@ void MlrtIfrtRestoreVariableKernel::Invoke() {
           });
 }
 
-struct MlrtIfrtLoadVariableKernel : mlrt::KernelFrame {
+class MlrtIfrtLoadVariableKernel : public mlrt::KernelFrame {
+ public:
   using KernelFrame::KernelFrame;
 
   static constexpr char kName[] = "tf_mlrt.ifrt_load_variable";
@@ -258,16 +280,23 @@ struct MlrtIfrtLoadVariableKernel : mlrt::KernelFrame {
     DCHECK_EQ(attributes().size(), 2);
     return attributes().GetAs<mlrt::bc::String>(0).Get();
   }
-  absl::string_view name() const {
-    DCHECK_EQ(attributes().size(), 2);
-    return attributes().GetAs<mlrt::bc::String>(1).Get();
-  }
 
   Context& context() { return execution_context().GetUserContext<Context>(); }
   void Invoke();
+
+ private:
+  absl::Status InvokeHelper();
 };
 
 void MlrtIfrtLoadVariableKernel::Invoke() {
+  absl::Status status = InvokeHelper();
+  if (!status.ok()) {
+    execution_context().Fail(std::move(status));
+    return;
+  }
+}
+
+absl::Status MlrtIfrtLoadVariableKernel::InvokeHelper() {
   DCHECK_EQ(1, results().size());
   std::optional<tensorflow::ifrt_serving::IfrtModelContext*>
       ifrt_model_context =
@@ -276,36 +305,74 @@ void MlrtIfrtLoadVariableKernel::Invoke() {
               .GetResource<tensorflow::ifrt_serving::IfrtModelContext>(
                   "IfrtModelContext");
   if (!ifrt_model_context.has_value()) {
-    execution_context().Fail(absl::FailedPreconditionError(
-        "LoadVariableOp: failed to fetch IfrtModelContext: "));
-    return;
+    return absl::FailedPreconditionError(
+        "LoadVariableOp: failed to fetch IfrtModelContext: ");
   }
 
-  auto status =
+  // TODO(b/319045348): remove name() attribute. we now gets name from variable
+  // handle.
+  std::string runtime_name = GetRuntimeNameFromVarHandle(variable());
+  xla::ifrt::Future<absl::StatusOr<tensorflow::Tensor>> restored_tensor_future =
+      (*ifrt_model_context)->GetRestoreTensorRegistry().Get(runtime_name);
+  if (!restored_tensor_future.IsValid()) {
+    return absl::InternalError(absl::StrCat(
+        "LoadVariableOp: failed to fetch variable tensor: ", runtime_name));
+  }
+
+  auto loaded_variable_promise = xla::ifrt::Future<
+      absl::StatusOr<tsl::RCReference<xla::ifrt::Array>>>::CreatePromise();
+  auto loaded_variable_future =
+      xla::ifrt::Future<absl::StatusOr<tsl::RCReference<xla::ifrt::Array>>>(
+          loaded_variable_promise);
+
+  TF_ASSIGN_OR_RETURN(ifrt_serving::DtypeAndShape dtype_and_shape,
+                      GetDtypeAndShape(variable()));
+
+  TF_RETURN_IF_ERROR(
       (*ifrt_model_context)
           ->GetLoadedVariableRegistry()
           .TryRegisterLoadedVariable(
-              name(),
-              [&]() -> absl::StatusOr<tsl::RCReference<xla::ifrt::Array>> {
-                core::RefCountPtr<Var> variable_resource;
-                TF_RETURN_IF_ERROR(
-                    LookupResource(&context().op_kernel_context(), variable(),
-                                   &variable_resource));
-
-                return LoadIfrtVariable(**ifrt_model_context,
-                                        *(variable_resource->tensor()),
-                                        sharding_config_proto_text(), name());
-              });
-  if (!status.ok()) {
-    execution_context().Fail(std::move(status));
-    return;
-  }
-
+              runtime_name,
+              [&]() -> absl::StatusOr<ifrt_serving::IfrtLoadedVariableRegistry::
+                                          LoadedVariable> {
+                return ifrt_serving::IfrtLoadedVariableRegistry::LoadedVariable(
+                    {.dtype_and_shape = dtype_and_shape,
+                     .array = loaded_variable_future});
+              }));
+
+  restored_tensor_future.OnReady(
+      [ifrt_model_context = *ifrt_model_context,
+       sharding_config = std::string(sharding_config_proto_text()),
+       runtime_name = runtime_name,
+       loaded_variable_promise = std::move(loaded_variable_promise)](
+          absl::StatusOr<tensorflow::Tensor> restored_tensor) mutable {
+        if (!restored_tensor.ok()) {
+          loaded_variable_promise.Set(std::move(restored_tensor).status());
+          return;
+        }
+
+        // Transfer tensor to array in a separate thread.
+        ifrt_model_context->checkpoint_loader_queue()->AddTask(
+            [ifrt_model_context, runtime_name = std::move(runtime_name),
+             sharding_config = std::move(sharding_config),
+             restored_tensor = std::move(*restored_tensor),
+             loaded_variable_promise =
+                 std::move(loaded_variable_promise)]() mutable {
+              absl::StatusOr<tsl::RCReference<xla::ifrt::Array>>
+                  variable_array =
+                      LoadIfrtVariable(*ifrt_model_context, restored_tensor,
+                                       sharding_config, runtime_name);
+              loaded_variable_promise.Set(std::move(variable_array));
+            });
+      });
   // Return the name as the key
   tensorflow::Tensor key_tensor(tensorflow::DT_STRING, {});
-  key_tensor.scalar<tsl::tstring>()() = std::string(name());
+  key_tensor.scalar<tsl::tstring>()() = runtime_name;
   results()[0].Set(tensorflow::tfrt_stub::FallbackTensor(key_tensor));
+
+  return absl::OkStatus();
 }
+
 void RegisterTfMlrtIfrtKernels(mlrt::KernelRegistry& registry) {
   registry.Register<MlrtIfrtLoadVariableKernel>();
   registry.Register<MlrtIfrtRestoreVariableKernel>();
diff --git a/tensorflow/core/tfrt/mlrt/kernel/ifrt_ops_kernel_test.cc b/tensorflow/core/tfrt/mlrt/kernel/ifrt_ops_kernel_test.cc
index 69aec242d5d346..c76deb3bfb62c3 100644
--- a/tensorflow/core/tfrt/mlrt/kernel/ifrt_ops_kernel_test.cc
+++ b/tensorflow/core/tfrt/mlrt/kernel/ifrt_ops_kernel_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "absl/log/check.h"
 #include "absl/status/status.h"
 #include "absl/strings/string_view.h"
+#include "absl/strings/substitute.h"
 #include "absl/synchronization/notification.h"
 #include "absl/types/span.h"
 #include "xla/python/ifrt/client.h"
@@ -70,7 +71,9 @@ using tensorflow::test::AsTensor;
 using tensorflow::test::ExpectEqual;
 using tensorflow::test::TensorEq;
 
-static absl::string_view kVariableName = "test_variable";
+constexpr absl::string_view kContainer = "test";
+constexpr absl::string_view kSharedName = "y";
+constexpr absl::string_view kVariableRuntimeName = "test__y";
 
 tsl::thread::ThreadPool& GetThreadPool() {
   constexpr int kMaxParallelism = 16;
@@ -115,26 +118,28 @@ mlrt::bc::Buffer CreateExecutableForIfrtRestoreVariableOp() {
   attributes.Add("restore_dtypes", restore_dtypes);
 
   attributes.Add("var_handle_op_node_def",
-                 R"pb(name: "VarHandleOp"
-                      op: "VarHandleOp"
-                      device: "/job:localhost/replica:0/task:0/device:CPU:0"
-                      attr {
-                        key: "container"
-                        value { s: "test" }
-                      }
-                      attr {
-                        key: "shared_name"
-                        value { s: "y" }
-                      }
-                      attr {
-                        key: "dtype"
-                        value { type: DT_INT32 }
-                      }
-                      attr {
-                        key: "shape"
-                        value { shape { dim { size: 1 } } }
-                      }
-                 )pb");
+                 absl::Substitute(
+                     R"pb(name: "VarHandleOp"
+                          op: "VarHandleOp"
+                          device: "/job:localhost/replica:0/task:0/device:CPU:0"
+                          attr {
+                            key: "container"
+                            value { s: "$0" }
+                          }
+                          attr {
+                            key: "shared_name"
+                            value { s: "$1" }
+                          }
+                          attr {
+                            key: "dtype"
+                            value { type: DT_INT32 }
+                          }
+                          attr {
+                            key: "shape"
+                            value { shape { dim { size: 1 } } }
+                          }
+                     )pb",
+                     kContainer, kSharedName));
 
   attributes.Add("var_handle_op_key", 0);
 
@@ -216,7 +221,7 @@ mlrt::bc::Buffer CreateExecutableForIfrtLoadVariableOp(
   kernels.Def(kernel_names);
 
   mlrt::testing::AttributeTable attributes(
-      executable_ctor.construct_attributes(6));
+      executable_ctor.construct_attributes(4));
 
   tensorflow::ifrt_serving::VariableDeviceShardingConfigProto sharding_config;
   sharding_config.add_device_ids(0);
@@ -226,46 +231,34 @@ mlrt::bc::Buffer CreateExecutableForIfrtLoadVariableOp(
   printer.PrintToString(sharding_config, &serialized_sharding_config);
 
   attributes.Add("sharding_config", serialized_sharding_config);
-  attributes.Add("variable_name", kVariableName);
+  attributes.Add("variable_name", kVariableRuntimeName);
 
   attributes.Add("var_handle_op_node_def",
-                 R"pb(name: "VarHandleOp"
-                      op: "VarHandleOp"
-                      device: "/job:localhost/replica:0/task:0/device:CPU:0"
-                      attr {
-                        key: "container"
-                        value { s: "test" }
-                      }
-                      attr {
-                        key: "shared_name"
-                        value { s: "y" }
-                      }
-                      attr {
-                        key: "dtype"
-                        value { type: DT_INT32 }
-                      }
-                      attr {
-                        key: "shape"
-                        value { shape { dim { size: 1 } } }
-                      }
-                 )pb");
+                 absl::Substitute(
+                     R"pb(name: "VarHandleOp"
+                          op: "VarHandleOp"
+                          device: "/job:localhost/replica:0/task:0/device:CPU:0"
+                          attr {
+                            key: "container"
+                            value { s: "$0" }
+                          }
+                          attr {
+                            key: "shared_name"
+                            value { s: "$1" }
+                          }
+                          attr {
+                            key: "dtype"
+                            value { type: DT_INT32 }
+                          }
+                          attr {
+                            key: "shape"
+                            value { shape { dim { size: 1 } } }
+                          }
+                     )pb",
+                     kContainer, kSharedName));
 
   attributes.Add("var_handle_op_key", 0);
 
-  attributes.Add("assign_variable_op_node_def",
-                 R"pb(name: "AssignVariableOp"
-                      op: "AssignVariableOp"
-                      input: "dummy_resource"
-                      input: "dummy_tensor"
-                      device: "/job:localhost/replica:0/task:0/device:CPU:0"
-                      attr {
-                        key: "dtype"
-                        value { type: DT_INT32 }
-                      }
-                 )pb");
-
-  attributes.Add("assign_variable_op_key", 1);
-
   auto functions_ctor = executable_ctor.construct_functions(1);
 
   {
@@ -274,10 +267,9 @@ mlrt::bc::Buffer CreateExecutableForIfrtLoadVariableOp(
 
     mlrt::testing::SymbolTable regs;
 
-    function_ctor.construct_input_regs(1).Assign({regs.Def("input_tensor")});
     function_ctor.construct_output_regs(1).Assign({regs.Def("output_tensor")});
 
-    const int kNumKernels = 6 + (redundant_ifrt_load_variable_op ? 1 : 0);
+    const int kNumKernels = 4 + (redundant_ifrt_load_variable_op ? 1 : 0);
     auto kernels_ctor = function_ctor.construct_kernels(kNumKernels);
     int kernel_index = 0;
 
@@ -292,17 +284,6 @@ mlrt::bc::Buffer CreateExecutableForIfrtLoadVariableOp(
            attributes.GetHandle("var_handle_op_key")});
       kernel_index++;
     }
-    {
-      // Create AssignVariableOp
-      auto createop_ctor = kernels_ctor.ConstructAt(kernel_index);
-      createop_ctor.set_code(kernels.Use("tf_mlrt.createop"));
-      createop_ctor.construct_arguments(0);
-      createop_ctor.construct_results(0);
-      createop_ctor.construct_attributes(2).Assign(
-          {attributes.GetHandle("assign_variable_op_node_def"),
-           attributes.GetHandle("assign_variable_op_key")});
-      kernel_index++;
-    }
     {
       // Execute VarHandleOp
       auto executeop_ctor = kernels_ctor.ConstructAt(kernel_index);
@@ -312,24 +293,8 @@ mlrt::bc::Buffer CreateExecutableForIfrtLoadVariableOp(
       executeop_ctor.construct_attributes(2).Assign(
           {attributes.GetHandle("var_handle_op_node_def"),
            attributes.GetHandle("var_handle_op_key")});
-      executeop_ctor.construct_last_uses(1).Assign({0});
       kernel_index++;
     }
-
-    {
-      // Execute AssignVariableOp
-      auto executeop_ctor = kernels_ctor.ConstructAt(kernel_index);
-      executeop_ctor.set_code(kernels.Use("tf_mlrt.executeop"));
-      executeop_ctor.construct_arguments(2).Assign(
-          regs.Use({"variable_handle", "input_tensor"}));
-      executeop_ctor.construct_results(0);
-      executeop_ctor.construct_attributes(2).Assign(
-          {attributes.GetHandle("assign_variable_op_node_def"),
-           attributes.GetHandle("assign_variable_op_key")});
-      executeop_ctor.construct_last_uses(2).Assign({0, 0});
-      kernel_index++;
-    }
-
     {
       auto kernel_ctor = kernels_ctor.ConstructAt(kernel_index);
       kernel_ctor.set_code(kernels.Use("tf_mlrt.ifrt_load_variable"));
@@ -338,7 +303,8 @@ mlrt::bc::Buffer CreateExecutableForIfrtLoadVariableOp(
       kernel_ctor.construct_attributes(2).Assign(
           {attributes.GetHandle("sharding_config"),
            attributes.GetHandle("variable_name")});
-      kernel_ctor.construct_last_uses(1).Assign({1});
+      kernel_ctor.construct_last_uses(1).Assign(
+          {redundant_ifrt_load_variable_op ? 0 : 1});
       kernel_index++;
     }
     if (redundant_ifrt_load_variable_op) {
@@ -348,7 +314,7 @@ mlrt::bc::Buffer CreateExecutableForIfrtLoadVariableOp(
       kernel_ctor.construct_attributes(2).Assign(
           {attributes.GetHandle("sharding_config"),
            attributes.GetHandle("variable_name")});
-      kernel_ctor.construct_arguments(1).Assign({regs.Use("input_tensor")});
+      kernel_ctor.construct_arguments(1).Assign({regs.Use("variable_handle")});
       kernel_ctor.construct_last_uses(1).Assign({1});
       kernel_index++;
     }
@@ -417,18 +383,29 @@ TEST(KernelTest, IfrtLoadVariableOp) {
   ASSERT_TRUE(ifrt_model_context.has_value());
   EXPECT_THAT((*ifrt_model_context)
                   ->GetLoadedVariableRegistry()
-                  .GetLoadedVariable(kVariableName)
+                  .GetLoadedVariable(kVariableRuntimeName)
                   .status(),
               ::tsl::testing::StatusIs(absl::StatusCode::kNotFound));
 
-  std::vector<mlrt::Value> args;
-  args.resize(1);
+  auto restore_work_queue = tfrt::CreateMultiThreadedWorkQueue(
+      /*num_threads=*/4, /*num_blocking_threads=*/4);
+  (*ifrt_model_context)->set_checkpoint_loader_queue(restore_work_queue.get());
+
   tensorflow::Tensor input_tensor;
   TF_CHECK_OK(tensorflow::Tensor::BuildTensor(DT_INT32, {}, &input_tensor));
   input_tensor.scalar<int32_t>()() = 1234;
-  args.at(0).Set(tfrt_stub::FallbackTensor(std::move(input_tensor)));
+  auto input_tensor_promise =
+      xla::ifrt::Future<absl::StatusOr<tensorflow::Tensor>>::CreatePromise();
+  auto input_tensor_future =
+      xla::ifrt::Future<absl::StatusOr<tensorflow::Tensor>>(
+          input_tensor_promise);
+  input_tensor_promise.Set(input_tensor);
+  TF_ASSERT_OK((*ifrt_model_context)
+                   ->GetRestoreTensorRegistry()
+                   .TryRegister(kVariableRuntimeName, input_tensor_future));
 
-  std::vector<uint8_t> last_uses = {true};
+  std::vector<mlrt::Value> args;
+  std::vector<uint8_t> last_uses;
   std::vector<mlrt::Value> results;
   results.resize(1);
 
@@ -446,11 +423,11 @@ TEST(KernelTest, IfrtLoadVariableOp) {
 
   TF_ASSERT_OK((*ifrt_model_context)
                    ->GetLoadedVariableRegistry()
-                   .GetLoadedVariable(kVariableName)
+                   .GetLoadedVariable(kVariableRuntimeName)
                    .status());
 
   ExpectEqual(results[0].Get<tfrt_stub::FallbackTensor>().tensor(),
-              AsScalar(tsl::tstring(kVariableName)));
+              AsScalar(tsl::tstring(kVariableRuntimeName)));
 }
 
 TEST(KernelTest, DuplicateIfrtLoadVariableOpShallSucceed) {
@@ -505,18 +482,29 @@ TEST(KernelTest, DuplicateIfrtLoadVariableOpShallSucceed) {
   ASSERT_TRUE(ifrt_model_context.has_value());
   EXPECT_THAT((*ifrt_model_context)
                   ->GetLoadedVariableRegistry()
-                  .GetLoadedVariable(kVariableName)
+                  .GetLoadedVariable(kVariableRuntimeName)
                   .status(),
               ::tsl::testing::StatusIs(absl::StatusCode::kNotFound));
 
-  std::vector<mlrt::Value> args;
-  args.resize(1);
+  auto restore_work_queue = tfrt::CreateMultiThreadedWorkQueue(
+      /*num_threads=*/4, /*num_blocking_threads=*/4);
+  (*ifrt_model_context)->set_checkpoint_loader_queue(restore_work_queue.get());
+
   tensorflow::Tensor input_tensor;
   TF_CHECK_OK(tensorflow::Tensor::BuildTensor(DT_INT32, {}, &input_tensor));
   input_tensor.scalar<int32_t>()() = 1234;
-  args.at(0).Set(tfrt_stub::FallbackTensor(std::move(input_tensor)));
+  auto input_tensor_promise =
+      xla::ifrt::Future<absl::StatusOr<tensorflow::Tensor>>::CreatePromise();
+  auto input_tensor_future =
+      xla::ifrt::Future<absl::StatusOr<tensorflow::Tensor>>(
+          input_tensor_promise);
+  input_tensor_promise.Set(input_tensor);
+  TF_ASSERT_OK((*ifrt_model_context)
+                   ->GetRestoreTensorRegistry()
+                   .TryRegister(kVariableRuntimeName, input_tensor_future));
 
-  std::vector<uint8_t> last_uses = {true};
+  std::vector<mlrt::Value> args;
+  std::vector<uint8_t> last_uses;
   std::vector<mlrt::Value> results;
   results.resize(1);
 
@@ -534,17 +522,14 @@ TEST(KernelTest, DuplicateIfrtLoadVariableOpShallSucceed) {
 
   TF_ASSERT_OK((*ifrt_model_context)
                    ->GetLoadedVariableRegistry()
-                   .GetLoadedVariable(kVariableName)
+                   .GetLoadedVariable(kVariableRuntimeName)
                    .status());
 
   ExpectEqual(results[0].Get<tfrt_stub::FallbackTensor>().tensor(),
-              AsScalar(tsl::tstring(kVariableName)));
+              AsScalar(tsl::tstring(kVariableRuntimeName)));
 }
 
 TEST(KernelTest, IfrtRestoreVariableOp) {
-  // runtime name = container_name + shared_name
-  constexpr absl::string_view kVariableRuntimeName = "test__y";
-
   std::string checkpoint_prefix =
       tensorflow::GetDataDependencyFilepath(
           "tensorflow/core/tfrt/mlrt/kernel/testdata/"
@@ -609,7 +594,7 @@ TEST(KernelTest, IfrtRestoreVariableOp) {
 
   auto restore_work_queue = tfrt::CreateMultiThreadedWorkQueue(
       /*num_threads=*/4, /*num_blocking_threads=*/4);
-  (*ifrt_model_context)->set_work_queue(restore_work_queue.get());
+  (*ifrt_model_context)->set_checkpoint_loader_queue(restore_work_queue.get());
 
   std::vector<mlrt::Value> args;
   args.resize(3);
diff --git a/tensorflow/core/tfrt/mlrt/kernel/kernel.cc b/tensorflow/core/tfrt/mlrt/kernel/kernel.cc
index 9bf87142e775c5..323ee9fccf6503 100644
--- a/tensorflow/core/tfrt/mlrt/kernel/kernel.cc
+++ b/tensorflow/core/tfrt/mlrt/kernel/kernel.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <cstdint>
 #include <functional>
 #include <memory>
+#include <string>
 #include <utility>
 #include <vector>
 
@@ -25,7 +26,10 @@ limitations under the License.
 #include "absl/log/check.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 #include "absl/types/span.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/tfrt/mlrt/bytecode/function.h"
@@ -626,6 +630,42 @@ void CancelOp::Invoke() {
   }
 }
 
+struct ConstOp : mlrt::KernelFrame {
+  using KernelFrame::KernelFrame;
+
+  static constexpr char kName[] = "tf_mlrt.constop";
+
+  absl::string_view tensor_proto() const {
+    return attributes().GetAs<mlrt::bc::String>(0).Get();
+  }
+
+  Context& context() { return execution_context().GetUserContext<Context>(); }
+
+  void Invoke();
+};
+
+void ConstOp::Invoke() {
+  tensorflow::TensorProto proto;
+  // TODO(b/330806453): Remove the std::string conversion once ParseFromString()
+  // in OSS accepets absl::string_view.
+  // NOLINTNEXTLINE: readability-redundant-string-conversions
+  if (!proto.ParseFromString(std::string(tensor_proto()))) {
+    execution_context().Fail(
+        absl::InternalError("Failed to parse const tensor proto"));
+    return;
+  }
+
+  tensorflow::Tensor tensor;
+  if (!tensor.FromProto(proto)) {
+    execution_context().Fail(
+        absl::InternalError("Failed to create tensor from tensor proto"));
+    return;
+  }
+
+  results()[0].Emplace<tensorflow::tfrt_stub::FallbackTensor>(
+      std::move(tensor));
+}
+
 struct CreateOp : mlrt::KernelFrame {
   using KernelFrame::KernelFrame;
 
@@ -985,6 +1025,7 @@ void RegisterTfMlrtKernels(mlrt::KernelRegistry& registry) {
   // TODO(chky,rohitju): These kernels should be unified with the corresponding
   // tfrt_fallback_sync kernels, e.g. tfrt_fallback_sync.executeop.
   registry.Register<CancelOp>();
+  registry.Register<ConstOp>();
   registry.Register<CreateOp>();
   registry.Register<CreateOp>("tfrt_fallback_sync.createop");
   registry.Register<ExecuteOp>();
diff --git a/tensorflow/core/tfrt/run_handler_thread_pool/BUILD b/tensorflow/core/tfrt/run_handler_thread_pool/BUILD
index 3acb0edd074e96..a7c5fb24092b17 100644
--- a/tensorflow/core/tfrt/run_handler_thread_pool/BUILD
+++ b/tensorflow/core/tfrt/run_handler_thread_pool/BUILD
@@ -76,6 +76,7 @@ tf_cc_test(
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:matmul_op",
         "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
         "@com_google_googletest//:gtest",
         "@eigen_archive//:eigen3",
diff --git a/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_concurrent_work_queue_test.cc b/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_concurrent_work_queue_test.cc
index f01f92b67c981f..f290637521a74a 100644
--- a/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_concurrent_work_queue_test.cc
+++ b/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_concurrent_work_queue_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 
 #include <gtest/gtest.h>
+#include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "absl/time/time.h"
 #include "tensorflow/core/platform/errors.h"
@@ -161,16 +162,16 @@ TEST_F(RunHandlerThreadWorkQueueTest, RunningMixedTask) {
 }
 
 TEST_F(RunHandlerThreadWorkQueueTest, NameReturnsValidString) {
-  EXPECT_EQ(queue_->name(), "run_handler");
+  EXPECT_TRUE(absl::StrContains(pool_->name(), "RunHandlerThreadWorkQueue"));
 }
 
 TEST_F(RunHandlerThreadWorkQueueTest, GetParallelismLevelOk) {
-  EXPECT_EQ(queue_->GetParallelismLevel(),
+  EXPECT_EQ(pool_->GetParallelismLevel(),
             kNumComplementaryThreads + kNumMainThreads);
 }
 
 TEST_F(RunHandlerThreadWorkQueueTest, IsWorkerThreadOk) {
-  EXPECT_TRUE(queue_->IsInWorkerThread());
+  EXPECT_TRUE(pool_->IsInWorkerThread());
 }
 
 TEST_F(RunHandlerThreadWorkQueueTest, NoHandlerReturnsError) {
diff --git a/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_test.cc b/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_test.cc
index fd839ef5709513..07ae2030434a2d 100644
--- a/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_test.cc
+++ b/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
+#include "absl/strings/match.h"
 #define EIGEN_USE_THREADS
 
 #include "tensorflow/core/tfrt/run_handler_thread_pool/run_handler.h"
@@ -188,6 +189,7 @@ TEST_P(RunHandlerThreadPoolTest, EnqueueTask) {
   EXPECT_EQ(result, 1);
   tws.PopNonBlockingTask(0, true).f->f();
   EXPECT_EQ(result, 2);
+  EXPECT_TRUE(absl::StrContains(tws.ToString(), "traceme_id = 0"));
 }
 
 TEST_P(RunHandlerThreadPoolTest, FindTask) {
@@ -208,6 +210,9 @@ TEST_P(RunHandlerThreadPoolTest, FindTask) {
       tensorflow::Env::Default(), tensorflow::ThreadOptions(),
       "tf_run_handler_pool", &waiters_mu, &waiters);
 
+  EXPECT_EQ(run_handler_thread_pool.NumBlockingThreads(), 1);
+  EXPECT_EQ(run_handler_thread_pool.NumNonBlockingThreads(), 0);
+
   Eigen::MaxSizeVector<internal::ThreadWorkSource*> thread_work_sources(5);
   thread_work_sources.resize(5);
   for (int i = 0; i < 5; ++i) {
diff --git a/tensorflow/core/tfrt/saved_model/BUILD b/tensorflow/core/tfrt/saved_model/BUILD
index 32b1da670b69fd..03976ee0048ca7 100644
--- a/tensorflow/core/tfrt/saved_model/BUILD
+++ b/tensorflow/core/tfrt/saved_model/BUILD
@@ -105,6 +105,7 @@ cc_library(
         "//tensorflow/compiler/jit:flags_headers",
         "//tensorflow/compiler/mlir/tensorflow",
         "//tensorflow/compiler/mlir/tensorflow:import_model",
+        "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags",
         "//tensorflow/compiler/mlir/tensorflow:serialize_mlir_module_utils",
         "//tensorflow/compiler/mlir/tensorflow:translate_lib",
         "//tensorflow/compiler/mlir/tensorflow:upgrade_graph",
@@ -148,6 +149,7 @@ cc_library(
         "//tensorflow/core/tfrt/utils:tfrt_graph_execution_state",
         "@com_google_absl//absl/cleanup",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
diff --git a/tensorflow/core/tfrt/saved_model/saved_model.cc b/tensorflow/core/tfrt/saved_model/saved_model.cc
index 0d20c9da9a30e0..56cec2b5821ae3 100644
--- a/tensorflow/core/tfrt/saved_model/saved_model.cc
+++ b/tensorflow/core/tfrt/saved_model/saved_model.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <cstddef>
+#include <cstdint>
 #include <functional>
 #include <iterator>
 #include <memory>
@@ -26,6 +27,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/cleanup/cleanup.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
@@ -39,10 +41,9 @@ limitations under the License.
 #include "mlir/IR/DialectRegistry.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/IR/OwningOpRef.h"  // from @llvm-project
-#include "tensorflow/cc/saved_model/reader.h"
 #include "tensorflow/compiler/jit/flags.h"
-#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h"
 #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h"
+#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h"
 #include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h"
 #include "tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h"
 #include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.h"
@@ -53,21 +54,19 @@ limitations under the License.
 #include "tensorflow/core/framework/metrics.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor.pb.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/monitoring/counter.h"
 #include "tensorflow/core/lib/monitoring/gauge.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/path.h"
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/platform/statusor.h"
-#include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 #include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h"
 #include "tensorflow/core/tfrt/fallback/fallback_state.h"
+#include "tensorflow/core/tfrt/fallback/op_kernel_runner.h"
 #include "tensorflow/core/tfrt/graph_executor/export_mlir.h"
 #include "tensorflow/core/tfrt/graph_executor/graph_execution_options.h"
 #include "tensorflow/core/tfrt/graph_executor/graph_executor.h"
@@ -77,21 +76,15 @@ limitations under the License.
 #include "tensorflow/core/tfrt/mlrt/kernel/batch_kernel.h"
 #include "tensorflow/core/tfrt/mlrt/kernel/kernel.h"
 #include "tensorflow/core/tfrt/runtime/runtime.h"
-#include "tensorflow/core/tfrt/runtime/work_queue_interface.h"
 #include "tensorflow/core/tfrt/saved_model/saved_model_util.h"
 #include "tensorflow/core/tfrt/saved_model/utils/serialize_utils.h"
 #include "tensorflow/core/tfrt/stubs/model_config_stub.h"
-#include "tensorflow/core/tfrt/utils/error_util.h"
-#include "tensorflow/core/tfrt/utils/fallback_tensor.h"
 #include "tensorflow/core/tfrt/utils/utils.h"
-#include "tsl/platform/env.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/statusor.h"
 #include "tfrt/bef/bef_buffer.h"  // from @tf_runtime
 #include "tfrt/bef_executor/bef_file.h"  // from @tf_runtime
 #include "tfrt/core_runtime/core_runtime.h"  // from @tf_runtime
-#include "tfrt/host_context/async_value.h"  // from @tf_runtime
-#include "tfrt/host_context/chain.h"  // from @tf_runtime
 #include "tfrt/host_context/execution_context.h"  // from @tf_runtime
 #include "tfrt/host_context/function.h"  // from @tf_runtime
 #include "tfrt/host_context/host_context.h"  // from @tf_runtime
@@ -111,12 +104,6 @@ auto* lazy_loading_count = monitoring::Counter<3>::New(
     "/tensorflow/tfrt/lazy_loading_count", "The total number of lazy loadings.",
     "model_name", "model_version", "use_graph_executor");
 
-auto* saved_model_graph_executor_mode = monitoring::Counter<3>::New(
-    "/tensorflow/tfrt/saved_model/graph_executor_mode",
-    "Record the total number of imported savedmodel using different graph "
-    "executor modes (BEF vs MLRT interpreter)",
-    "model_name", "model_version", "mode");
-
 auto* saved_model_import_time_seconds =
     tensorflow::monitoring::Gauge<int64_t, 1>::New(
         "/tensorflow/tfrt/saved_model/import_time",
@@ -609,25 +596,11 @@ SavedModelImpl::LoadSavedModel(Options options,
     tensorflow::tf_mlrt::RegisterTfMlrtBatchKernels(*kernel_registry);
 
     if (options.graph_execution_options.enable_mlrt) {
-      saved_model_graph_executor_mode
-          ->GetCell(
-              options.graph_execution_options.model_metadata.name(),
-              absl::StrCat(
-                  options.graph_execution_options.model_metadata.version()),
-              "mlrt")
-          ->IncrementBy(1);
       ASSIGN_OR_RETURN_IN_COMPILE(
           bytecode, tensorflow::mlrt_compiler::ConvertTfMlirToBytecode(
                         options.graph_execution_options.compile_options,
                         *fallback_state, mlir_module.get(), model_context));
     } else {
-      saved_model_graph_executor_mode
-          ->GetCell(
-              options.graph_execution_options.model_metadata.name(),
-              absl::StrCat(
-                  options.graph_execution_options.model_metadata.version()),
-              "bef")
-          ->IncrementBy(1);
       RETURN_IF_ERROR_IN_COMPILE(tensorflow::ConvertTfMlirToBef(
           options.graph_execution_options.compile_options, mlir_module.get(),
           &bef, model_context, fallback_state.get()));
diff --git a/tensorflow/core/tfrt/saved_model/tests/BUILD b/tensorflow/core/tfrt/saved_model/tests/BUILD
index 09f1e7deaa8d10..71a181205f6a33 100644
--- a/tensorflow/core/tfrt/saved_model/tests/BUILD
+++ b/tensorflow/core/tfrt/saved_model/tests/BUILD
@@ -1,8 +1,8 @@
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 load("//tensorflow:pytype.default.bzl", "pytype_strict_binary")
 load("//tensorflow:strict.default.bzl", "py_strict_library")
 load("//tensorflow:tensorflow.bzl", "if_google", "tf_cc_test")
 load("//tensorflow:tensorflow.default.bzl", "tf_cuda_cc_test")
-load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 load(":gen_saved_model.bzl", "gen_saved_model")
 
 package(
diff --git a/tensorflow/core/tfrt/saved_model/tests/saved_model_ifrt_test.cc b/tensorflow/core/tfrt/saved_model/tests/saved_model_ifrt_test.cc
index 2cee765fa56ea4..ca4f42ce64da92 100644
--- a/tensorflow/core/tfrt/saved_model/tests/saved_model_ifrt_test.cc
+++ b/tensorflow/core/tfrt/saved_model/tests/saved_model_ifrt_test.cc
@@ -49,7 +49,8 @@ tsl::thread::ThreadPool& GetThreadPool() {
   return *thread_pool;
 }
 
-TEST(SavedModelIfrt, Basic) {
+// TODO(b/319045348): replace with a variableless model.
+TEST(SavedModelIfrt, DISABLED_Basic) {
   std::string saved_model_dir = tensorflow::GetDataDependencyFilepath(
       "tensorflow/core/tfrt/saved_model/tests/toy_v2");
 
diff --git a/tensorflow/core/tfrt/tfrt_session/BUILD b/tensorflow/core/tfrt/tfrt_session/BUILD
index b4157cacbd7785..ed2289b31cba02 100644
--- a/tensorflow/core/tfrt/tfrt_session/BUILD
+++ b/tensorflow/core/tfrt/tfrt_session/BUILD
@@ -128,6 +128,10 @@ tf_cc_shared_test(
 # tf_py_strict_test(
 #     name = "tfrt_session_python_test",
 #     srcs = ["tfrt_session_python_test.py"],
+#     exec_properties = select({
+#         "//tools/cpp:asan_build": {"cpp_link.mem": "16g"},
+#         "//conditions:default": None,
+#     }),
 #     python_version = "PY3",
 #     deps = [
 #         ":tfrt_session_py",
diff --git a/tensorflow/core/tfrt/tfrt_session/tfrt_session.cc b/tensorflow/core/tfrt/tfrt_session/tfrt_session.cc
index 6d1843770a53c0..a664a14d58ffe1 100644
--- a/tensorflow/core/tfrt/tfrt_session/tfrt_session.cc
+++ b/tensorflow/core/tfrt/tfrt_session/tfrt_session.cc
@@ -464,6 +464,10 @@ class TfrtSession : public tensorflow::Session {
   Status ListDevices(std::vector<DeviceAttributes>* response) override {
     return errors::Unimplemented("TfrtSession::ListDevices is Unimplemented.");
   }
+  Status LocalDeviceManager(const DeviceMgr** output) override {
+    *output = &graph_executor_->fallback_state().device_manager();
+    return absl::OkStatus();
+  }
 
  private:
   tfrt::HostContext* GetHostContext() {
@@ -604,15 +608,6 @@ class TfrtSessionFactory::ThreadPoolManager {
         auto pool_index = it.index();
         auto num_threads = pool_options.num_threads();
 
-        // For the current use cases the first thread pool is always the default
-        // thread pool. We add this check here to verify the assumption. We can
-        // remove this check once the code stablizes, since it is semantically
-        // meaningful to use non-default thread pool as the first thread pool.
-        if (pool_index == 0 && num_threads != 0) {
-          return errors::InvalidArgument(
-              "The first thread pool must have num_threads = 0");
-        }
-
         if (num_threads != 0) {
           TF_ASSIGN_OR_RETURN(
               auto* thread_pool,
diff --git a/tensorflow/core/tpu/BUILD b/tensorflow/core/tpu/BUILD
index eade2efb96f75c..6299ce67840684 100644
--- a/tensorflow/core/tpu/BUILD
+++ b/tensorflow/core/tpu/BUILD
@@ -1,5 +1,6 @@
 # Description: Utilities for TPU Operations
 
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "if_libtpu",
@@ -7,7 +8,6 @@ load(
     "tf_cc_test",
 )
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/tpu/graph_rewrite/BUILD b/tensorflow/core/tpu/graph_rewrite/BUILD
index 0140362078487d..df0dfe1d1010e6 100644
--- a/tensorflow/core/tpu/graph_rewrite/BUILD
+++ b/tensorflow/core/tpu/graph_rewrite/BUILD
@@ -1,10 +1,10 @@
 # Contains graph rewrites for TPU runtimes and optimizations.
 
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "if_static",
 )
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/tpu/graph_rewrite/distributed_tpu_rewrite_pass.cc b/tensorflow/core/tpu/graph_rewrite/distributed_tpu_rewrite_pass.cc
index d13eb1dbdfc901..9008591c2800db 100644
--- a/tensorflow/core/tpu/graph_rewrite/distributed_tpu_rewrite_pass.cc
+++ b/tensorflow/core/tpu/graph_rewrite/distributed_tpu_rewrite_pass.cc
@@ -650,28 +650,6 @@ Status GetStepMarkerLocation(const Node& replicate_node,
   return absl::OkStatus();
 }
 
-// Extracts a map of dimension and number of splits for tiled input from xla
-// sharding attribute.
-Status GetDimensionIndicesAndNumSplitsFromSharding(
-    const xla::OpSharding& sharding, std::map<int, int>* split_dimension_map) {
-  int64_t tensor_tile_rank = sharding.tile_assignment_dimensions_size();
-  if (sharding.replicate_on_last_tile_dim()) {
-    tensor_tile_rank--;
-  }
-  for (int dim_index = 0; dim_index < tensor_tile_rank; dim_index++) {
-    if (sharding.tile_assignment_dimensions(dim_index) > 1) {
-      split_dimension_map->emplace(
-          dim_index, sharding.tile_assignment_dimensions(dim_index));
-    }
-  }
-
-  if (split_dimension_map->empty()) {
-    return absl::InvalidArgumentError(absl::StrCat(
-        "Arg has unnecessary tiled sharding: ", sharding.DebugString()));
-  }
-  return absl::OkStatus();
-}
-
 // Updates contents of the function with `function_name` in function library
 // definition `flib_def` to `new_graph`. This is required when graph
 // transformation happens inside a function call body.
@@ -861,9 +839,8 @@ StatusOr<ShardedInputInfo> CreateOrGetSplitNodesForInputSharding(
   }
   // Maps input dimension and number of splits with which the
   // dimension sharded.
-  std::map<int, int> split_dimension_map;
-  TF_RETURN_IF_ERROR(GetDimensionIndicesAndNumSplitsFromSharding(
-      sharding, &split_dimension_map));
+  TF_ASSIGN_OR_RETURN(auto split_dimension_map,
+                      GetDimensionIndicesAndNumSplitsFromSharding(sharding));
   TF_RET_CHECK(!split_dimension_map.empty())
       << "Unnecessary sharding attribute found.";
 
@@ -1280,9 +1257,8 @@ StatusOr<Node*> CreateConcatNodesForRetval(
     const PartialTensorShape& inferred_shape, int replica_id,
     const std::vector<NodeOut>& orig_inputs, Graph* graph,
     absl::string_view device) {
-  std::map<int, int> split_dimension_map;
-  TF_RETURN_IF_ERROR(GetDimensionIndicesAndNumSplitsFromSharding(
-      sharding, &split_dimension_map));
+  TF_ASSIGN_OR_RETURN(auto split_dimension_map,
+                      GetDimensionIndicesAndNumSplitsFromSharding(sharding));
   std::vector<NodeOut> inputs_to_sharded_retval = orig_inputs;
   bool has_paddings = false;
 
diff --git a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
index 1a3deec7b09ed5..785367ff5cefdc 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
@@ -70,17 +70,22 @@ Status ValidateInputs(const Tensor& indices_or_row_splits, const Tensor& values,
           "Weights input should have dimension 0 or 1. But got dimension ",
           weights.dims(), "."));
   }
-  if (indices_or_row_splits.NumElements() == 0) {
-    // Dense tensor.
-    if (values.NumElements() != sample_count) {
+  // The indices_or_row_splits input for dense tensor is strictly 0 element
+  // with dimension 1.
+  if (indices_or_row_splits.NumElements() == 0 &&
+      indices_or_row_splits.dims() == 1) {
+    // Dense tensor with 0 element is also valid.
+    if (values.NumElements() != 0 && values.NumElements() != sample_count) {
       return absl::InvalidArgumentError(absl::StrCat(
           "Dense tensor input should have values elements number the same as "
           "the sample count. But got ",
           values.NumElements(), " elements for values and sample count as ",
           sample_count, "."));
     }
+    // 0 element indices with dimension as 2 is also valid for empty sparse
+    // tensor.
   } else if (indices_or_row_splits.dims() == 2 &&
-             indices_or_row_splits.NumElements() > 0) {
+             indices_or_row_splits.NumElements() >= 0) {
     // TODO(pineapplejuice233): Add checking logic for sparse tensor input.
   } else if (indices_or_row_splits.dims() == 1 &&
              indices_or_row_splits.NumElements() > 0) {
diff --git a/tensorflow/core/tpu/kernels/tpu_embedding_ops.cc b/tensorflow/core/tpu/kernels/tpu_embedding_ops.cc
index c6a7bcaa3b9780..9202b40dfc9370 100644
--- a/tensorflow/core/tpu/kernels/tpu_embedding_ops.cc
+++ b/tensorflow/core/tpu/kernels/tpu_embedding_ops.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/op_requires.h"
 #include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/protobuf/tpu/tpu_embedding_configuration.pb.h"
 #include "tensorflow/core/tpu/tpu_embedding_spmd_sharding_utils.h"
@@ -49,6 +50,297 @@ using xla::LiteralUtil;
 
 namespace {
 
+void CompileRecvTPUEmbeddingActivations(
+    XlaOpKernelContext* ctx, const std::string& config_string,
+    const tensorflow::tpu::TPUEmbeddingConfiguration& tpu_embedding_config,
+    const std::string& embedding_partitions_string,
+    const std::string& hbm_buffers_config_string,
+    const std::string& tpu_topology_string) {
+  xla::XlaOp deduplication_data = ctx->Input("deduplication_data");
+  TpuEmbeddingEngine_RecvActivationsComputation_Params params;
+  params.tpu_embedding_config.bytes = config_string.c_str();
+  params.tpu_embedding_config.size = config_string.size();
+  params.embedding_partitions.bytes = embedding_partitions_string.c_str();
+  params.embedding_partitions.size = embedding_partitions_string.size();
+  params.hbm_buffers_config.bytes = hbm_buffers_config_string.c_str();
+  params.hbm_buffers_config.size = hbm_buffers_config_string.size();
+  params.tpu_topology.bytes = tpu_topology_string.c_str();
+  params.tpu_topology.size = tpu_topology_string.size();
+  StatusHelper status;
+  params.status = status.c_status;
+  auto builder = ctx->builder();
+  OP_REQUIRES_VALUE(auto shape, ctx, builder->GetShape(deduplication_data));
+  TpuSerializedProto xla_computation_serialized;
+  auto proto_cleanup = absl::MakeCleanup([&xla_computation_serialized] {
+    StreamExecutor_Tpu_FreeSerializedProto(&xla_computation_serialized);
+  });
+  params.xla_computation = &xla_computation_serialized;
+  XLA_Shape c_shape;
+  ApiConverter::ToC(shape, &c_shape);
+  auto c_shape_cleanup =
+      absl::MakeCleanup([&c_shape] { ApiConverter::Destroy(&c_shape); });
+  params.deduplication_data_shape = &c_shape;
+
+  TpuSerializedProto op_sharding_proto_serialized;
+  if (ctx->builder()->sharding().has_value()) {
+    stream_executor::tpu::SerializeProto(ctx->builder()->sharding().value(),
+                                         &op_sharding_proto_serialized);
+    params.op_sharding = &op_sharding_proto_serialized;
+  } else {
+    params.op_sharding = nullptr;
+  }
+  auto op_sharding_cleanup = absl::MakeCleanup([&] {
+    if (params.op_sharding) {
+      StreamExecutor_Tpu_FreeSerializedProto(&op_sharding_proto_serialized);
+    }
+  });
+
+  stream_executor::tpu::OpsApiFn()
+      ->TpuEmbeddingEngine_RecvActivationsComputationFn(&params);
+  OP_REQUIRES_OK(ctx, status.status());
+  auto xla_computation =
+      stream_executor::tpu::DeserializeProto<xla::HloModuleProto>(
+          xla_computation_serialized);
+  auto final_activations =
+      xla::Call(builder, xla_computation, {deduplication_data});
+
+  // Ensure that the number of outputs is the same as the number of user
+  // tables.
+  const int32_t output_count =
+      (tpu_embedding_config.feature_descriptor_size() == 0)
+          ? tpu_embedding_config.table_descriptor_size()
+          : tpu_embedding_config.feature_descriptor_size();
+  OP_REQUIRES(ctx, ctx->num_outputs() == output_count,
+              errors::InvalidArgument(
+                  "Kernel has %d outputs but configuration expects %d outputs.",
+                  ctx->num_outputs(), output_count));
+
+  for (int32_t output_id = 0; output_id < output_count; ++output_id) {
+    ctx->SetOutput(output_id,
+                   xla::GetTupleElement(final_activations, output_id));
+  }
+}
+
+void CompileRecvTPUEmbeddingDeduplicationData(
+    XlaOpKernelContext* ctx, const std::string& config_string,
+    const std::string& embedding_partitions_string,
+    const std::string& hbm_buffers_config_string,
+    const std::string& tpu_topology_string) {
+  TpuEmbeddingEngine_RecvTPUEmbeddingDeduplicationDataComputation_Params params;
+
+  params.tpu_embedding_config.bytes = config_string.c_str();
+  params.tpu_embedding_config.size = config_string.size();
+  params.embedding_partitions.bytes = embedding_partitions_string.c_str();
+  params.embedding_partitions.size = embedding_partitions_string.size();
+  params.hbm_buffers_config.bytes = hbm_buffers_config_string.c_str();
+  params.hbm_buffers_config.size = hbm_buffers_config_string.size();
+  params.tpu_topology.bytes = tpu_topology_string.c_str();
+  params.tpu_topology.size = tpu_topology_string.size();
+  TpuSerializedProto xla_computation_serialized;
+  auto proto_cleanup = absl::MakeCleanup([&xla_computation_serialized] {
+    StreamExecutor_Tpu_FreeSerializedProto(&xla_computation_serialized);
+  });
+  params.xla_computation = &xla_computation_serialized;
+  StatusHelper status;
+  params.status = status.c_status;
+
+  TpuSerializedProto op_sharding_proto_serialized;
+  if (ctx->builder()->sharding().has_value()) {
+    stream_executor::tpu::SerializeProto(ctx->builder()->sharding().value(),
+                                         &op_sharding_proto_serialized);
+    params.op_sharding = &op_sharding_proto_serialized;
+  } else {
+    params.op_sharding = nullptr;
+  }
+  auto op_sharding_cleanup = absl::MakeCleanup([&] {
+    if (params.op_sharding) {
+      StreamExecutor_Tpu_FreeSerializedProto(&op_sharding_proto_serialized);
+    }
+  });
+
+  stream_executor::tpu::OpsApiFn()
+      ->TpuEmbeddingEngine_RecvTPUEmbeddingDeduplicationDataComputationFn(
+          &params);
+  OP_REQUIRES_OK(ctx, status.status());
+
+  auto xla_computation =
+      stream_executor::tpu::DeserializeProto<xla::HloModuleProto>(
+          xla_computation_serialized);
+
+  const xla::XlaOp deduplication_data =
+      xla::Call(ctx->builder(), xla_computation, {});
+
+  // Ensure that the number of outputs is equal to 1 (for deduplication data).
+  OP_REQUIRES(ctx, ctx->num_outputs() == 1,
+              errors::InvalidArgument(
+                  "Kernel has %d outputs but configuration expects 1 output.",
+                  ctx->num_outputs()));
+
+  ctx->SetOutput(0, deduplication_data);
+}
+
+void CompileSendTPUEmbeddingGradients(
+    XlaOpKernelContext* ctx, const std::string& config_string,
+    const std::string& embedding_partitions_string,
+    const std::string& hbm_buffers_config_string,
+    const std::string& tpu_topology_string) {
+  std::vector<xla::XlaOp> gradients;
+  std::vector<TensorShape> tf_gradient_shapes;
+  OP_REQUIRES_OK(ctx,
+                 ctx->InputList("gradients", &gradients, &tf_gradient_shapes));
+  std::vector<xla::Shape> gradient_shapes;
+  auto builder = ctx->builder();
+  gradient_shapes.reserve(gradients.size());
+  for (xla::XlaOp op : gradients) {
+    // Gradient layout information is added by XLA, so we can just create
+    // default layout information.
+    xla::Shape gradient_shape = builder->GetShape(op).value();
+    xla::LayoutUtil::SetToDefaultLayout(&gradient_shape);
+    gradient_shapes.push_back(gradient_shape);
+  }
+
+  std::vector<xla::XlaOp> learning_rates;
+  std::vector<TensorShape> tf_learning_rate_shapes;
+  OP_REQUIRES_OK(ctx, ctx->InputList("learning_rates", &learning_rates,
+                                     &tf_learning_rate_shapes));
+  std::vector<xla::Shape> learning_rate_shapes;
+  learning_rate_shapes.reserve(learning_rates.size());
+  for (xla::XlaOp op : learning_rates) {
+    learning_rate_shapes.push_back(builder->GetShape(op).value());
+  }
+
+  xla::XlaOp deduplication_data = ctx->Input("deduplication_data");
+
+  TpuEmbeddingEngine_SendTPUEmbeddingGradientsComputation_Params params;
+  params.tpu_embedding_config.bytes = config_string.c_str();
+  params.tpu_embedding_config.size = config_string.size();
+  params.embedding_partitions.bytes = embedding_partitions_string.c_str();
+  params.embedding_partitions.size = embedding_partitions_string.size();
+  params.hbm_buffers_config.bytes = hbm_buffers_config_string.c_str();
+  params.hbm_buffers_config.size = hbm_buffers_config_string.size();
+  params.tpu_topology.bytes = tpu_topology_string.c_str();
+  params.tpu_topology.size = tpu_topology_string.size();
+  TpuSerializedProto xla_computation_serialized;
+  auto proto_cleanup = absl::MakeCleanup([&xla_computation_serialized] {
+    StreamExecutor_Tpu_FreeSerializedProto(&xla_computation_serialized);
+  });
+  params.xla_computation = &xla_computation_serialized;
+  StatusHelper status;
+  params.status = status.c_status;
+  OP_REQUIRES_VALUE(auto deduplication_shape, ctx,
+                    builder->GetShape(deduplication_data));
+  XLA_Shape gradient_tuple_c_shape;
+  params.gradient_tuple_shape = &gradient_tuple_c_shape;
+  ApiConverter::ToC(xla::ShapeUtil::MakeTupleShape(gradient_shapes),
+                    &gradient_tuple_c_shape);
+  XLA_Shape learning_rate_tuple_c_shape;
+  params.learning_rate_tuple_shape = &learning_rate_tuple_c_shape;
+  ApiConverter::ToC(xla::ShapeUtil::MakeTupleShape(learning_rate_shapes),
+                    &learning_rate_tuple_c_shape);
+  XLA_Shape deduplication_c_shape;
+  params.deduplication_data_shape = &deduplication_c_shape;
+  ApiConverter::ToC(deduplication_shape, &deduplication_c_shape);
+
+  auto c_shape_cleanup =
+      absl::MakeCleanup([&gradient_tuple_c_shape, &learning_rate_tuple_c_shape,
+                         &deduplication_c_shape] {
+        ApiConverter::Destroy(&gradient_tuple_c_shape);
+        ApiConverter::Destroy(&learning_rate_tuple_c_shape);
+        ApiConverter::Destroy(&deduplication_c_shape);
+      });
+  params.num_inputs = ctx->num_inputs();
+
+  TpuSerializedProto op_sharding_proto_serialized;
+  if (ctx->builder()->sharding().has_value()) {
+    stream_executor::tpu::SerializeProto(ctx->builder()->sharding().value(),
+                                         &op_sharding_proto_serialized);
+    params.op_sharding = &op_sharding_proto_serialized;
+  } else {
+    params.op_sharding = nullptr;
+  }
+  auto op_sharding_cleanup = absl::MakeCleanup([&] {
+    if (params.op_sharding) {
+      StreamExecutor_Tpu_FreeSerializedProto(&op_sharding_proto_serialized);
+    }
+  });
+
+  stream_executor::tpu::OpsApiFn()
+      ->TpuEmbeddingEngine_SendTPUEmbeddingGradientsComputationFn(&params);
+  OP_REQUIRES_OK(ctx, status.status());
+
+  auto xla_computation =
+      stream_executor::tpu::DeserializeProto<xla::HloModuleProto>(
+          xla_computation_serialized);
+
+  xla::Call(builder, xla_computation,
+            {xla::Tuple(builder, gradients),
+             xla::Tuple(builder, learning_rates), deduplication_data});
+}
+
+void CompileComputeDedupDataSize(XlaOpKernelContext* ctx,
+                                 const std::string& config_string,
+                                 const std::string& embedding_partitions_string,
+                                 const std::string& hbm_buffers_config_string,
+                                 const std::string& tpu_topology_string) {
+  TpuEmbeddingEngine_DedupDataSizeComputation_Params params;
+  params.tpu_embedding_config.bytes = config_string.c_str();
+  params.tpu_embedding_config.size = config_string.size();
+  params.embedding_partitions.bytes = embedding_partitions_string.c_str();
+  params.embedding_partitions.size = embedding_partitions_string.size();
+  params.hbm_buffers_config.bytes = hbm_buffers_config_string.c_str();
+  params.hbm_buffers_config.size = hbm_buffers_config_string.size();
+  params.tpu_topology.bytes = tpu_topology_string.c_str();
+  params.tpu_topology.size = tpu_topology_string.size();
+  int num_elements = -1;
+  params.num_elements = &num_elements;
+  StatusHelper status;
+  params.status = status.c_status;
+
+  stream_executor::tpu::OpsApiFn()
+      ->TpuEmbeddingEngine_DedupDataSizeComputationFn(&params);
+  OP_REQUIRES_OK(ctx, status.status());
+
+  auto output = xla::ConstantLiteral(
+      ctx->builder(), LiteralUtil::CreateR0<int32_t>(num_elements));
+  ctx->SetOutput(0, output);
+}
+
+void CompileComputeDedupDataTupleMask(
+    XlaOpKernelContext* ctx, const std::string& config_string,
+    const std::string& embedding_partitions_string,
+    const std::string& hbm_buffers_config_string,
+    const std::string& tpu_topology_string) {
+  TpuEmbeddingEngine_DedupDataTupleMaskComputation_Params params;
+  params.tpu_embedding_config.bytes = config_string.c_str();
+  params.tpu_embedding_config.size = config_string.size();
+  params.embedding_partitions.bytes = embedding_partitions_string.c_str();
+  params.embedding_partitions.size = embedding_partitions_string.size();
+  params.hbm_buffers_config.bytes = hbm_buffers_config_string.c_str();
+  params.hbm_buffers_config.size = hbm_buffers_config_string.size();
+  params.tpu_topology.bytes = tpu_topology_string.c_str();
+  params.tpu_topology.size = tpu_topology_string.size();
+
+  TpuSerializedProto xla_computation_serialized;
+  auto proto_cleanup = absl::MakeCleanup([&xla_computation_serialized] {
+    StreamExecutor_Tpu_FreeSerializedProto(&xla_computation_serialized);
+  });
+
+  params.xla_computation = &xla_computation_serialized;
+  StatusHelper status;
+  params.status = status.c_status;
+
+  stream_executor::tpu::OpsApiFn()
+      ->TpuEmbeddingEngine_DedupDataTupleMaskComputationFn(&params);
+  OP_REQUIRES_OK(ctx, status.status());
+
+  auto xla_computation =
+      stream_executor::tpu::DeserializeProto<xla::HloModuleProto>(
+          xla_computation_serialized);
+  const xla::XlaOp deduplication_data_tuple_mask =
+      xla::Call(ctx->builder(), xla_computation, {});
+  ctx->SetOutput(0, deduplication_data_tuple_mask);
+}
+
 // This TensorFlow op receives a batch of activations from the
 // TpuEmbeddingEngine.
 class RecvTPUEmbeddingActivationsOp : public XlaOpKernel {
@@ -70,66 +362,8 @@ class RecvTPUEmbeddingActivationsOp : public XlaOpKernel {
         ctx, ctx->num_inputs() == 1,
         errors::Internal("Kernel has ", ctx->num_inputs(),
                          " inputs but configuration expects one input"));
-
-    xla::XlaOp deduplication_data = ctx->Input("deduplication_data");
-
-    TpuEmbeddingEngine_RecvActivationsComputation_Params params;
-    params.tpu_embedding_config.bytes = config_string_.c_str();
-    params.tpu_embedding_config.size = config_string_.size();
-    StatusHelper status;
-    params.status = status.c_status;
-    auto builder = ctx->builder();
-    OP_REQUIRES_VALUE(auto shape, ctx, builder->GetShape(deduplication_data));
-    TpuSerializedProto xla_computation_serialized;
-    auto proto_cleanup = absl::MakeCleanup([&xla_computation_serialized] {
-      StreamExecutor_Tpu_FreeSerializedProto(&xla_computation_serialized);
-    });
-    params.xla_computation = &xla_computation_serialized;
-    XLA_Shape c_shape;
-    ApiConverter::ToC(shape, &c_shape);
-    auto c_shape_cleanup =
-        absl::MakeCleanup([&c_shape] { ApiConverter::Destroy(&c_shape); });
-    params.deduplication_data_shape = &c_shape;
-
-    TpuSerializedProto op_sharding_proto_serialized;
-    if (ctx->builder()->sharding().has_value()) {
-      stream_executor::tpu::SerializeProto(ctx->builder()->sharding().value(),
-                                           &op_sharding_proto_serialized);
-      params.op_sharding = &op_sharding_proto_serialized;
-    } else {
-      params.op_sharding = nullptr;
-    }
-    auto op_sharding_cleanup = absl::MakeCleanup([&] {
-      if (params.op_sharding) {
-        StreamExecutor_Tpu_FreeSerializedProto(&op_sharding_proto_serialized);
-      }
-    });
-
-    stream_executor::tpu::OpsApiFn()
-        ->TpuEmbeddingEngine_RecvActivationsComputationFn(&params);
-    OP_REQUIRES_OK(ctx, status.status());
-    auto xla_computation =
-        stream_executor::tpu::DeserializeProto<xla::HloModuleProto>(
-            xla_computation_serialized);
-    auto final_activations =
-        xla::Call(builder, xla_computation, {deduplication_data});
-
-    // Ensure that the number of outputs is the same as the number of user
-    // tables.
-    const int32 output_count =
-        (tpu_embedding_config_.feature_descriptor_size() == 0)
-            ? tpu_embedding_config_.table_descriptor_size()
-            : tpu_embedding_config_.feature_descriptor_size();
-    OP_REQUIRES(
-        ctx, ctx->num_outputs() == output_count,
-        errors::InvalidArgument(
-            "Kernel has %d outputs but configuration expects %d outputs.",
-            ctx->num_outputs(), output_count));
-
-    for (int32 output_id = 0; output_id < output_count; ++output_id) {
-      ctx->SetOutput(output_id,
-                     xla::GetTupleElement(final_activations, output_id));
-    }
+    CompileRecvTPUEmbeddingActivations(ctx, config_string_,
+                                       tpu_embedding_config_, "", "", "");
   }
 
  private:
@@ -164,52 +398,8 @@ class RecvTPUEmbeddingDeduplicationDataOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     VLOG(1) << "Compile RecvTPUEmbeddingDeduplicationDataOp";
 
-    TpuEmbeddingEngine_RecvTPUEmbeddingDeduplicationDataComputation_Params
-        params;
-
-    params.tpu_embedding_config.bytes = config_string_.c_str();
-    params.tpu_embedding_config.size = config_string_.size();
-    TpuSerializedProto xla_computation_serialized;
-    auto proto_cleanup = absl::MakeCleanup([&xla_computation_serialized] {
-      StreamExecutor_Tpu_FreeSerializedProto(&xla_computation_serialized);
-    });
-    params.xla_computation = &xla_computation_serialized;
-    StatusHelper status;
-    params.status = status.c_status;
-
-    TpuSerializedProto op_sharding_proto_serialized;
-    if (ctx->builder()->sharding().has_value()) {
-      stream_executor::tpu::SerializeProto(ctx->builder()->sharding().value(),
-                                           &op_sharding_proto_serialized);
-      params.op_sharding = &op_sharding_proto_serialized;
-    } else {
-      params.op_sharding = nullptr;
-    }
-    auto op_sharding_cleanup = absl::MakeCleanup([&] {
-      if (params.op_sharding) {
-        StreamExecutor_Tpu_FreeSerializedProto(&op_sharding_proto_serialized);
-      }
-    });
-
-    stream_executor::tpu::OpsApiFn()
-        ->TpuEmbeddingEngine_RecvTPUEmbeddingDeduplicationDataComputationFn(
-            &params);
-    OP_REQUIRES_OK(ctx, status.status());
-
-    auto xla_computation =
-        stream_executor::tpu::DeserializeProto<xla::HloModuleProto>(
-            xla_computation_serialized);
-
-    const xla::XlaOp deduplication_data =
-        xla::Call(ctx->builder(), xla_computation, {});
+    CompileRecvTPUEmbeddingDeduplicationData(ctx, config_string_, "", "", "");
 
-    // Ensure that the number of outputs is equal to 1 (for deduplication data).
-    OP_REQUIRES(ctx, ctx->num_outputs() == 1,
-                errors::InvalidArgument(
-                    "Kernel has %d outputs but configuration expects 1 output.",
-                    ctx->num_outputs()));
-
-    ctx->SetOutput(0, deduplication_data);
     VLOG(1) << "Compile RecvTPUDeduplicationDataOp done";
   }
 
@@ -246,91 +436,7 @@ class SendTPUEmbeddingGradientsOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     VLOG(1) << "Compile SendTPUEmbeddingGradientsOp";
 
-    std::vector<xla::XlaOp> gradients;
-    std::vector<TensorShape> tf_gradient_shapes;
-    OP_REQUIRES_OK(
-        ctx, ctx->InputList("gradients", &gradients, &tf_gradient_shapes));
-    std::vector<xla::Shape> gradient_shapes;
-    auto builder = ctx->builder();
-    gradient_shapes.reserve(gradients.size());
-    for (xla::XlaOp op : gradients) {
-      // Gradient layout information is added by XLA, so we can just create
-      // default layout information.
-      xla::Shape gradient_shape = builder->GetShape(op).value();
-      xla::LayoutUtil::SetToDefaultLayout(&gradient_shape);
-      gradient_shapes.push_back(gradient_shape);
-    }
-
-    std::vector<xla::XlaOp> learning_rates;
-    std::vector<TensorShape> tf_learning_rate_shapes;
-    OP_REQUIRES_OK(ctx, ctx->InputList("learning_rates", &learning_rates,
-                                       &tf_learning_rate_shapes));
-    std::vector<xla::Shape> learning_rate_shapes;
-    learning_rate_shapes.reserve(learning_rates.size());
-    for (xla::XlaOp op : learning_rates) {
-      learning_rate_shapes.push_back(builder->GetShape(op).value());
-    }
-
-    xla::XlaOp deduplication_data = ctx->Input("deduplication_data");
-
-    TpuEmbeddingEngine_SendTPUEmbeddingGradientsComputation_Params params;
-    params.tpu_embedding_config.bytes = config_string_.c_str();
-    params.tpu_embedding_config.size = config_string_.size();
-    TpuSerializedProto xla_computation_serialized;
-    auto proto_cleanup = absl::MakeCleanup([&xla_computation_serialized] {
-      StreamExecutor_Tpu_FreeSerializedProto(&xla_computation_serialized);
-    });
-    params.xla_computation = &xla_computation_serialized;
-    StatusHelper status;
-    params.status = status.c_status;
-    OP_REQUIRES_VALUE(auto deduplication_shape, ctx,
-                      builder->GetShape(deduplication_data));
-    XLA_Shape gradient_tuple_c_shape;
-    params.gradient_tuple_shape = &gradient_tuple_c_shape;
-    ApiConverter::ToC(xla::ShapeUtil::MakeTupleShape(gradient_shapes),
-                      &gradient_tuple_c_shape);
-    XLA_Shape learning_rate_tuple_c_shape;
-    params.learning_rate_tuple_shape = &learning_rate_tuple_c_shape;
-    ApiConverter::ToC(xla::ShapeUtil::MakeTupleShape(learning_rate_shapes),
-                      &learning_rate_tuple_c_shape);
-    XLA_Shape deduplication_c_shape;
-    params.deduplication_data_shape = &deduplication_c_shape;
-    ApiConverter::ToC(deduplication_shape, &deduplication_c_shape);
-
-    auto c_shape_cleanup = absl::MakeCleanup([&gradient_tuple_c_shape,
-                                              &learning_rate_tuple_c_shape,
-                                              &deduplication_c_shape] {
-      ApiConverter::Destroy(&gradient_tuple_c_shape);
-      ApiConverter::Destroy(&learning_rate_tuple_c_shape);
-      ApiConverter::Destroy(&deduplication_c_shape);
-    });
-    params.num_inputs = ctx->num_inputs();
-
-    TpuSerializedProto op_sharding_proto_serialized;
-    if (ctx->builder()->sharding().has_value()) {
-      stream_executor::tpu::SerializeProto(ctx->builder()->sharding().value(),
-                                           &op_sharding_proto_serialized);
-      params.op_sharding = &op_sharding_proto_serialized;
-    } else {
-      params.op_sharding = nullptr;
-    }
-    auto op_sharding_cleanup = absl::MakeCleanup([&] {
-      if (params.op_sharding) {
-        StreamExecutor_Tpu_FreeSerializedProto(&op_sharding_proto_serialized);
-      }
-    });
-
-    stream_executor::tpu::OpsApiFn()
-        ->TpuEmbeddingEngine_SendTPUEmbeddingGradientsComputationFn(&params);
-    OP_REQUIRES_OK(ctx, status.status());
-
-    auto xla_computation =
-        stream_executor::tpu::DeserializeProto<xla::HloModuleProto>(
-            xla_computation_serialized);
-
-    xla::Call(builder, xla_computation,
-              {xla::Tuple(builder, gradients),
-               xla::Tuple(builder, learning_rates), deduplication_data});
+    CompileSendTPUEmbeddingGradients(ctx, config_string_, "", "", "");
 
     VLOG(1) << "Compile SendTPUEmbeddingGradientsOp done";
   }
@@ -719,21 +825,7 @@ class ComputeDedupDataSizeOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     VLOG(1) << "Compile ComputeDedupDataSizeOp";
 
-    TpuEmbeddingEngine_DedupDataSizeComputation_Params params;
-    params.tpu_embedding_config.bytes = config_string_.c_str();
-    params.tpu_embedding_config.size = config_string_.size();
-    int num_elements = -1;
-    params.num_elements = &num_elements;
-    StatusHelper status;
-    params.status = status.c_status;
-
-    stream_executor::tpu::OpsApiFn()
-        ->TpuEmbeddingEngine_DedupDataSizeComputationFn(&params);
-    OP_REQUIRES_OK(ctx, status.status());
-
-    auto output = xla::ConstantLiteral(
-        ctx->builder(), LiteralUtil::CreateR0<int32_t>(num_elements));
-    ctx->SetOutput(0, output);
+    CompileComputeDedupDataSize(ctx, config_string_, "", "", "");
 
     VLOG(1) << "Compile ComputeDedupDataSizeOp done";
   }
@@ -765,29 +857,8 @@ class ComputeDedupDataTupleMaskOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     VLOG(1) << "Compile ComputeDedupDataTupleMaskOp";
 
-    TpuEmbeddingEngine_DedupDataTupleMaskComputation_Params params;
-    params.tpu_embedding_config.bytes = config_string_.c_str();
-    params.tpu_embedding_config.size = config_string_.size();
-
-    TpuSerializedProto xla_computation_serialized;
-    auto proto_cleanup = absl::MakeCleanup([&xla_computation_serialized] {
-      StreamExecutor_Tpu_FreeSerializedProto(&xla_computation_serialized);
-    });
-
-    params.xla_computation = &xla_computation_serialized;
-    StatusHelper status;
-    params.status = status.c_status;
-
-    stream_executor::tpu::OpsApiFn()
-        ->TpuEmbeddingEngine_DedupDataTupleMaskComputationFn(&params);
-    OP_REQUIRES_OK(ctx, status.status());
-
-    auto xla_computation =
-        stream_executor::tpu::DeserializeProto<xla::HloModuleProto>(
-            xla_computation_serialized);
-    const xla::XlaOp deduplication_data_tuple_mask =
-        xla::Call(ctx->builder(), xla_computation, {});
-    ctx->SetOutput(0, deduplication_data_tuple_mask);
+    CompileComputeDedupDataTupleMask(ctx, config_string_, "", "", "");
+
     VLOG(1) << "Compile ComputeDedupDataTupleMaskOp done";
   }
 
@@ -802,5 +873,230 @@ class ComputeDedupDataTupleMaskOp : public XlaOpKernel {
 REGISTER_XLA_OP(Name("ComputeDedupDataTupleMask").AllowVariantTypes(),
                 ComputeDedupDataTupleMaskOp);
 
+// This Op has the same functionality as `XlaRecvTPUEmbeddingActivations`, but
+// it accepts `embedding_partitions` and `hbm_buffers_config` (which can be
+// obtained from `FinalizeTPUEmbeddingV2`). This is meaningful for use cases
+// where the kernel runs in a different address space from where
+// `embedding_partitions` and `hbm_buffers_config` are stored.
+// The same principle applies to all the other V2 Ops here.
+class RecvTPUEmbeddingActivationsV2Op : public XlaOpKernel {
+ public:
+  explicit RecvTPUEmbeddingActivationsV2Op(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("config", &config_string_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("embedding_partitions",
+                                     &embedding_partitions_string_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("hbm_buffers_config", &hbm_buffers_config_string_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("tpu_topology", &tpu_topology_string_));
+
+    OP_REQUIRES(
+        ctx, tpu_embedding_config_.ParseFromString(config_string_),
+        errors::InvalidArgument("Failed to parse TPUEmbeddingConfiguration "
+                                "proto from config attr"));
+  }
+
+  ~RecvTPUEmbeddingActivationsV2Op() override = default;
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    OP_REQUIRES(
+        ctx, ctx->num_inputs() == 1,
+        errors::Internal("Kernel has ", ctx->num_inputs(),
+                         " inputs but configuration expects one input"));
+
+    CompileRecvTPUEmbeddingActivations(
+        ctx, config_string_, tpu_embedding_config_,
+        embedding_partitions_string_, hbm_buffers_config_string_,
+        tpu_topology_string_);
+  }
+
+ private:
+  tensorflow::tpu::TPUEmbeddingConfiguration tpu_embedding_config_;
+  std::string config_string_;
+  std::string embedding_partitions_string_;
+  std::string hbm_buffers_config_string_;
+  std::string tpu_topology_string_;
+
+  RecvTPUEmbeddingActivationsV2Op(const RecvTPUEmbeddingActivationsV2Op&) =
+      delete;
+  void operator=(const RecvTPUEmbeddingActivationsV2Op&) = delete;
+};
+
+REGISTER_XLA_OP(Name("XlaRecvTPUEmbeddingActivationsV2").AllowVariantTypes(),
+                RecvTPUEmbeddingActivationsV2Op);
+
+class RecvTPUEmbeddingDeduplicationDataV2Op : public XlaOpKernel {
+ public:
+  explicit RecvTPUEmbeddingDeduplicationDataV2Op(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("config", &config_string_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("embedding_partitions",
+                                     &embedding_partitions_string_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("hbm_buffers_config", &hbm_buffers_config_string_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("tpu_topology", &tpu_topology_string_));
+    OP_REQUIRES(
+        ctx,
+        tensorflow::tpu::TPUEmbeddingConfiguration().ParseFromString(
+            config_string_),
+        errors::InvalidArgument("Failed to parse TPUEmbeddingConfiguration "
+                                "proto from config attr"));
+  }
+
+  ~RecvTPUEmbeddingDeduplicationDataV2Op() override = default;
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    VLOG(1) << "Compile RecvTPUEmbeddingDeduplicationDataV2Op";
+
+    CompileRecvTPUEmbeddingDeduplicationData(
+        ctx, config_string_, embedding_partitions_string_,
+        hbm_buffers_config_string_, tpu_topology_string_);
+
+    VLOG(1) << "Compile RecvTPUDeduplicationDataV2Op done";
+  }
+
+ private:
+  // TPU Embedding config string.
+  std::string config_string_;
+  std::string embedding_partitions_string_;
+  std::string hbm_buffers_config_string_;
+  std::string tpu_topology_string_;
+
+  RecvTPUEmbeddingDeduplicationDataV2Op(
+      const RecvTPUEmbeddingDeduplicationDataV2Op&) = delete;
+  void operator=(const RecvTPUEmbeddingDeduplicationDataV2Op&) = delete;
+};
+
+REGISTER_XLA_OP(
+    Name("XlaRecvTPUEmbeddingDeduplicationDataV2").AllowVariantTypes(),
+    RecvTPUEmbeddingDeduplicationDataV2Op);
+
+class SendTPUEmbeddingGradientsV2Op : public XlaOpKernel {
+ public:
+  explicit SendTPUEmbeddingGradientsV2Op(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("config", &config_string_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("embedding_partitions",
+                                     &embedding_partitions_string_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("hbm_buffers_config", &hbm_buffers_config_string_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("tpu_topology", &tpu_topology_string_));
+    OP_REQUIRES(
+        ctx,
+        tensorflow::tpu::TPUEmbeddingConfiguration().ParseFromString(
+            config_string_),
+        errors::InvalidArgument("Failed to parse TPUEmbeddingConfiguration "
+                                "proto from config attr"));
+  }
+
+  ~SendTPUEmbeddingGradientsV2Op() override = default;
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    VLOG(1) << "Compile SendTPUEmbeddingGradientsV2Op";
+
+    CompileSendTPUEmbeddingGradients(
+        ctx, config_string_, embedding_partitions_string_,
+        hbm_buffers_config_string_, tpu_topology_string_);
+
+    VLOG(1) << "Compile SendTPUEmbeddingGradientsV2Op done";
+  }
+
+ private:
+  // TPU Embedding config string.
+  std::string config_string_;
+  std::string embedding_partitions_string_;
+  std::string hbm_buffers_config_string_;
+  std::string tpu_topology_string_;
+
+  SendTPUEmbeddingGradientsV2Op(const SendTPUEmbeddingGradientsV2Op&) = delete;
+  void operator=(const SendTPUEmbeddingGradientsV2Op&) = delete;
+};
+
+REGISTER_XLA_OP(Name("XlaSendTPUEmbeddingGradientsV2").AllowVariantTypes(),
+                SendTPUEmbeddingGradientsV2Op);
+
+class ComputeDedupDataSizeV2Op : public XlaOpKernel {
+ public:
+  explicit ComputeDedupDataSizeV2Op(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("config", &config_string_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("embedding_partitions",
+                                     &embedding_partitions_string_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("hbm_buffers_config", &hbm_buffers_config_string_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("tpu_topology", &tpu_topology_string_));
+    OP_REQUIRES(
+        ctx,
+        tensorflow::tpu::TPUEmbeddingConfiguration().ParseFromString(
+            config_string_),
+        absl::InvalidArgumentError("Failed to parse TPUEmbeddingConfiguration "
+                                   "proto from config attr."));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    VLOG(1) << "Compile ComputeDedupDataSizeV2Op";
+
+    CompileComputeDedupDataSize(
+        ctx, config_string_, embedding_partitions_string_,
+        hbm_buffers_config_string_, tpu_topology_string_);
+
+    VLOG(1) << "Compile ComputeDedupDataSizeV2Op done";
+  }
+
+ private:
+  // TPU Embedding config string.
+  std::string config_string_;
+  std::string embedding_partitions_string_;
+  std::string hbm_buffers_config_string_;
+  std::string tpu_topology_string_;
+
+  ComputeDedupDataSizeV2Op(const ComputeDedupDataSizeV2Op&) = delete;
+  void operator=(const ComputeDedupDataSizeV2Op&) = delete;
+};
+
+REGISTER_XLA_OP(Name("ComputeDedupDataSizeV2"), ComputeDedupDataSizeV2Op);
+
+class ComputeDedupDataTupleMaskV2Op : public XlaOpKernel {
+ public:
+  explicit ComputeDedupDataTupleMaskV2Op(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("config", &config_string_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("embedding_partitions",
+                                     &embedding_partitions_string_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("hbm_buffers_config", &hbm_buffers_config_string_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("tpu_topology", &tpu_topology_string_));
+    OP_REQUIRES(
+        ctx,
+        tensorflow::tpu::TPUEmbeddingConfiguration().ParseFromString(
+            config_string_),
+        errors::InvalidArgument("Failed to parse TPUEmbeddingConfiguration "
+                                "proto from config attr"));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    VLOG(1) << "Compile ComputeDedupDataTupleMaskV2Op";
+
+    CompileComputeDedupDataTupleMask(
+        ctx, config_string_, embedding_partitions_string_,
+        hbm_buffers_config_string_, tpu_topology_string_);
+
+    VLOG(1) << "Compile ComputeDedupDataTupleMaskV2Op done";
+  }
+
+ private:
+  // TPU Embedding config string.
+  std::string config_string_;
+  std::string embedding_partitions_string_;
+  std::string hbm_buffers_config_string_;
+  std::string tpu_topology_string_;
+
+  ComputeDedupDataTupleMaskV2Op(const ComputeDedupDataTupleMaskV2Op&) = delete;
+  void operator=(const ComputeDedupDataTupleMaskV2Op&) = delete;
+};
+
+REGISTER_XLA_OP(Name("ComputeDedupDataTupleMaskV2").AllowVariantTypes(),
+                ComputeDedupDataTupleMaskV2Op);
+
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/tpu/ops/tpu_embedding_ops.cc b/tensorflow/core/tpu/ops/tpu_embedding_ops.cc
index e973552d55e7f4..5d15a34bb14ba5 100644
--- a/tensorflow/core/tpu/ops/tpu_embedding_ops.cc
+++ b/tensorflow/core/tpu/ops/tpu_embedding_ops.cc
@@ -567,4 +567,133 @@ REGISTER_OP("ComputeDedupDataTupleMask")
       return absl::OkStatus();
     });
 
+REGISTER_OP("XlaRecvTPUEmbeddingActivationsV2")
+    .Input("deduplication_data: variant")
+    .Output("outputs: num_tables * float32")
+    .Attr("num_tables: int >= 1")
+    .Attr("config: string")
+    .Attr("embedding_partitions: string")
+    .Attr("hbm_buffers_config: string")
+    .Attr("tpu_topology: string")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* c) -> absl::Status {
+      int num_tables;
+      TF_RETURN_IF_ERROR(c->GetAttr("num_tables", &num_tables));
+      if (c->num_outputs() != num_tables) {
+        return absl::InvalidArgumentError(
+            absl::StrFormat("Number of outputs: %d of the "
+                            "XlaRecvTPUEmbeddingActivationsV2 node "
+                            "does not match the num_tables attribute: %d.",
+                            c->num_outputs(), num_tables));
+      }
+      std::string config_string;
+      TF_RETURN_IF_ERROR(c->GetAttr("config", &config_string));
+      tpu::TPUEmbeddingConfiguration config;
+      if (!config.ParseFromString(config_string)) {
+        return absl::InvalidArgumentError(
+            "Malformed config attribute in the "
+            "XlaRecvTPUEmbeddingActivationsV2 "
+            "node.");
+      }
+      std::string embedding_partitions_string;
+      TF_RETURN_IF_ERROR(
+          c->GetAttr("embedding_partitions", &embedding_partitions_string));
+      std::string hbm_buffers_config_string;
+      TF_RETURN_IF_ERROR(
+          c->GetAttr("hbm_buffers_config", &hbm_buffers_config_string));
+      std::string tpu_topology_string;
+      TF_RETURN_IF_ERROR(c->GetAttr("tpu_topology", &tpu_topology_string));
+      std::vector<TensorShapeProto> output_shapes;
+      TF_RETURN_IF_ERROR(ComputeOutputTensorShapes(config, &output_shapes));
+      if (c->num_outputs() != output_shapes.size()) {
+        return absl::InvalidArgumentError(absl::StrFormat(
+            "Number of outputs: %d of the XlaRecvTPUEmbeddingActivationsV2 "
+            "node "
+            "does not match the number of tables or features in the TPU "
+            "embedding config: %d.",
+            c->num_outputs(), output_shapes.size()));
+      }
+      for (int i = 0; i < c->num_outputs(); ++i) {
+        shape_inference::ShapeHandle output_shape;
+        TF_RETURN_IF_ERROR(
+            c->MakeShapeFromShapeProto(output_shapes[i], &output_shape));
+        c->set_output(i, output_shape);
+      }
+      return absl::OkStatus();
+    });
+
+REGISTER_OP("XlaRecvTPUEmbeddingDeduplicationDataV2")
+    .Output("output: variant")
+    .Attr("config: string")
+    .Attr("embedding_partitions: string")
+    .Attr("hbm_buffers_config: string")
+    .Attr("tpu_topology: string")
+    .SetIsStateful()
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape);
+
+REGISTER_OP("XlaSendTPUEmbeddingGradientsV2")
+    .Input("gradients: NumTables * float32")
+    .Input("learning_rates: NumLearningRateTags * float32")
+    .Input("deduplication_data: variant")
+    .Attr("NumTables: int >= 1")
+    .Attr("NumLearningRateTags: int >= 0 = 0")
+    .Attr("config: string")
+    .Attr("embedding_partitions: string")
+    .Attr("hbm_buffers_config: string")
+    .Attr("tpu_topology: string")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* c) -> absl::Status {
+      int learning_rate_tag_count;
+      TF_RETURN_IF_ERROR(
+          c->GetAttr("NumLearningRateTags", &learning_rate_tag_count));
+      std::vector<shape_inference::ShapeHandle> learning_rates;
+      TF_RETURN_IF_ERROR(c->input("learning_rates", &learning_rates));
+      for (int i = 0; i < learning_rate_tag_count; ++i) {
+        // Verify that each learning_rates element is scalar
+        shape_inference::ShapeHandle learning_rates_shape;
+        TF_RETURN_IF_ERROR(
+            c->WithRank(learning_rates[i], 0, &learning_rates_shape));
+      }
+
+      return absl::OkStatus();
+    });
+
+REGISTER_OP("ComputeDedupDataSizeV2")
+    .Output("num_elements: int32")
+    .Attr("config: string")
+    .Attr("embedding_partitions: string")
+    .Attr("hbm_buffers_config: string")
+    .Attr("tpu_topology: string")
+    .SetIsStateful()
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape);
+
+REGISTER_OP("ComputeDedupDataTupleMaskV2")
+    .Output("output_shape: int32")
+    .Attr("config: string")
+    .Attr("embedding_partitions: string")
+    .Attr("hbm_buffers_config: string")
+    .Attr("tpu_topology: string")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->UnknownShapeOfRank(2));
+      return absl::OkStatus();
+    });
+
+REGISTER_OP("FinalizeTPUEmbeddingV2")
+    .Input("common_config: string")
+    .Input("memory_config: string")
+    .Output("embedding_partitions: string")
+    .Output("hbm_buffers_config: string")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) -> absl::Status {
+      // Validate that all the inputs are compatible with the correct
+      // vector shape.
+      TF_RET_CHECK(c->num_inputs() == 2);
+      ShapeHandle input(c->Scalar());
+      TF_RETURN_IF_ERROR(c->Merge(c->input(0), input, &input));
+      TF_RETURN_IF_ERROR(c->Merge(c->input(1), input, &input));
+      TF_RET_CHECK(c->num_outputs() == 2);
+      return absl::OkStatus();
+    });
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/transforms/constant_folding/BUILD b/tensorflow/core/transforms/constant_folding/BUILD
index e64e9d868f2677..ba1bc56b1749d2 100644
--- a/tensorflow/core/transforms/constant_folding/BUILD
+++ b/tensorflow/core/transforms/constant_folding/BUILD
@@ -30,6 +30,7 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
 )
diff --git a/tensorflow/core/transforms/remapper/BUILD b/tensorflow/core/transforms/remapper/BUILD
index a75461d412b23b..38b09b8bf601cb 100644
--- a/tensorflow/core/transforms/remapper/BUILD
+++ b/tensorflow/core/transforms/remapper/BUILD
@@ -1,6 +1,6 @@
+load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
-load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -48,6 +48,7 @@ cc_library(
         "@llvm-project//mlir:PDLInterpDialect",
         "@llvm-project//mlir:Parser",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
 )
diff --git a/tensorflow/core/user_ops/BUILD b/tensorflow/core/user_ops/BUILD
index abb6310822893d..7b562a05d12679 100644
--- a/tensorflow/core/user_ops/BUILD
+++ b/tensorflow/core/user_ops/BUILD
@@ -1,13 +1,13 @@
 # User ops.
 
-load(
-    "//tensorflow/core/platform:rules_cc.bzl",
-    "cc_library",
-)
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_copts",
 )
+load(
+    "//tensorflow/core/platform:rules_cc.bzl",
+    "cc_library",
+)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/util/BUILD b/tensorflow/core/util/BUILD
index aabf50cd39109c..c2b913a17a6a8f 100644
--- a/tensorflow/core/util/BUILD
+++ b/tensorflow/core/util/BUILD
@@ -1,4 +1,8 @@
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load(
+    "@local_config_rocm//rocm:build_defs.bzl",
+    "if_rocm",
+)
 load(
     "//tensorflow:tensorflow.bzl",
     "check_deps",
@@ -24,10 +28,6 @@ load(
     "//tensorflow/core/platform:rules_cc.bzl",
     "cc_library",
 )
-load(
-    "@local_config_rocm//rocm:build_defs.bzl",
-    "if_rocm",
-)
 
 default_package_visibility = [
     "//tensorflow/core:__subpackages__",
@@ -66,7 +66,7 @@ filegroup(
         "padding.h",
         "tensor_format.cc",
         "tensor_format.h",
-        "@local_tsl//tsl/util:mobile_srcs_no_runtime",
+        "@local_xla//xla/tsl/util:mobile_srcs_no_runtime",
     ],
 )
 
@@ -132,7 +132,7 @@ filegroup(
         "work_sharder.h",
         "//tensorflow/core/config:mobile_srcs_only_runtime",
         "//tensorflow/core/util/quantization:mobile_srcs_only_runtime",
-        "@local_tsl//tsl/util:mobile_srcs_only_runtime",
+        "@local_xla//xla/tsl/util:mobile_srcs_only_runtime",
     ],
 )
 
@@ -189,7 +189,7 @@ filegroup(
         "util.h",
         "work_sharder.h",
         "xla_config_registry.h",
-        "@local_tsl//tsl/util:framework_internal_private_hdrs",
+        "@local_xla//xla/tsl/util:framework_internal_private_hdrs",
     ],
 )
 
@@ -231,7 +231,7 @@ filegroup(
         "util.cc",
         "work_sharder.cc",
         "xla_config_registry.cc",
-        "@local_tsl//tsl/util:framework_internal_impl_srcs",
+        "@local_xla//xla/tsl/util:framework_internal_impl_srcs",
     ],
 )
 
@@ -240,7 +240,7 @@ filegroup(
     srcs = [
         "env_var.h",
         "use_cudnn.h",
-        "@local_tsl//tsl/util:lib_internal_public_hdrs",
+        "@local_xla//xla/tsl/util:lib_internal_public_hdrs",
     ],
     visibility = ["//tensorflow/core:__pkg__"],
 )
@@ -286,7 +286,7 @@ filegroup(
     testonly = 1,
     srcs = [
         "reporter.h",
-        "@local_tsl//tsl/util:test_hdrs",
+        "@local_xla//xla/tsl/util:test_hdrs",
     ],
     visibility = ["//tensorflow/core:__pkg__"],
 )
@@ -297,7 +297,7 @@ filegroup(
         "mkl_heuristics.h",
         "mkl_util.h",
         "onednn_env_vars.h",
-        "@local_tsl//tsl/util:onednn_util_hdrs",
+        "@local_xla//xla/tsl/util:onednn_util_hdrs",
     ],
     visibility = ["//tensorflow/core:__pkg__"],
 )
@@ -316,7 +316,7 @@ filegroup(
     testonly = 1,
     srcs = [
         "reporter.h",
-        "@local_tsl//tsl/util:android_test_hdrs",
+        "@local_xla//xla/tsl/util:android_test_hdrs",
     ],
     visibility = ["//tensorflow/core:__pkg__"],
 )
@@ -326,7 +326,7 @@ filegroup(
     testonly = 1,
     srcs = [
         ":android_test_hdrs",
-        "@local_tsl//tsl/util:android_test_srcs",
+        "@local_xla//xla/tsl/util:android_test_srcs",
     ],
     visibility = ["//tensorflow/core:__pkg__"],
 )
@@ -367,7 +367,7 @@ filegroup(
         "use_cudnn.h",
         "util.h",
         "work_sharder.h",
-        "@local_tsl//tsl/util:framework_srcs",
+        "@local_xla//xla/tsl/util:framework_srcs",
     ],
 )
 
@@ -422,7 +422,7 @@ cc_library(
         "//tensorflow:internal",
     ],
     deps = [
-        "@local_tsl//tsl/util:stats_calculator_portable",
+        "@local_xla//xla/tsl/util:stats_calculator_portable",
     ],
 )
 
@@ -510,7 +510,7 @@ cc_library(
         "//tensorflow/core/platform:mutex",
         "//tensorflow/core/platform:str_util",
         "//tensorflow/core/platform:types",
-        "@local_tsl//tsl/util:reporter",
+        "@local_xla//xla/tsl/util:reporter",
     ],
 )
 
@@ -641,7 +641,7 @@ cc_library(
         "//tensorflow/core/platform:strcat",
         "//tensorflow/core/platform:stringpiece",
         "//tensorflow/core/platform:types",
-        "@local_tsl//tsl/util:env_var",
+        "@local_xla//xla/tsl/util:env_var",
     ],
 )
 
@@ -666,7 +666,7 @@ cc_library(
     deps = [
         ":env_var",
         "//tensorflow/core/platform:mutex",
-        "@local_tsl//tsl/util:determinism",
+        "@local_xla//xla/tsl/util:determinism",
     ],
     alwayslink = 1,
 )
@@ -675,7 +675,7 @@ filegroup(
     name = "determinism_hdr",
     srcs = [
         "determinism.h",
-        "@local_tsl//tsl/util:determinism_hdr",
+        "@local_xla//xla/tsl/util:determinism_hdr",
     ],
     compatible_with = get_compatible_with_portable(),
     visibility = ["//tensorflow:__subpackages__"],
@@ -688,7 +688,7 @@ cc_library(
     # TODO(b/298501506): narrow this in a way that won't break TFRT
     visibility = ["//visibility:public"],
     deps = [
-        "@local_tsl//tsl/util:determinism_hdr_lib",
+        "@local_xla//xla/tsl/util:determinism_hdr_lib",
     ],
 )
 
diff --git a/tensorflow/core/util/command_line_flags.h b/tensorflow/core/util/command_line_flags.h
index cc8ca1b8f119b4..ebc58f7ee476ab 100644
--- a/tensorflow/core/util/command_line_flags.h
+++ b/tensorflow/core/util/command_line_flags.h
@@ -20,8 +20,8 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "xla/tsl/util/command_line_flags.h"
 #include "tensorflow/core/platform/types.h"
-#include "tsl/util/command_line_flags.h"
 
 namespace tensorflow {
 using tsl::Flag;   // NOLINT
diff --git a/tensorflow/core/util/ctc/BUILD b/tensorflow/core/util/ctc/BUILD
index 93ddc91b361440..46c561e0ff0402 100644
--- a/tensorflow/core/util/ctc/BUILD
+++ b/tensorflow/core/util/ctc/BUILD
@@ -2,9 +2,9 @@
 # is a type of seq2seq loss.  The libraries in this directory
 # implement the CTC loss and a number of CTC decoders.
 
+load("//tensorflow:tensorflow.bzl", "tf_cc_tests")
 load("//tensorflow:tensorflow.default.bzl", "filegroup")
 load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
-load("//tensorflow:tensorflow.bzl", "tf_cc_tests")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/util/determinism.h b/tensorflow/core/util/determinism.h
index e42fb71d42b0bc..136534ea828570 100644
--- a/tensorflow/core/util/determinism.h
+++ b/tensorflow/core/util/determinism.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_UTIL_DETERMINISM_H_
 #define TENSORFLOW_CORE_UTIL_DETERMINISM_H_
 
-#include "tsl/util/determinism.h"
+#include "xla/tsl/util/determinism.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/util/device_name_utils.h b/tensorflow/core/util/device_name_utils.h
index 20b1f21786b2f0..28b5b0f1b6e764 100644
--- a/tensorflow/core/util/device_name_utils.h
+++ b/tensorflow/core/util/device_name_utils.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_UTIL_DEVICE_NAME_UTILS_H_
 #define TENSORFLOW_CORE_UTIL_DEVICE_NAME_UTILS_H_
 
-#include "tsl/util/device_name_utils.h"
+#include "xla/tsl/util/device_name_utils.h"
 
 namespace tensorflow {
 // NOLINTBEGIN(misc-unused-using-decls)
diff --git a/tensorflow/core/util/env_var.h b/tensorflow/core/util/env_var.h
index fac0e2373ad145..faad61533d648a 100644
--- a/tensorflow/core/util/env_var.h
+++ b/tensorflow/core/util/env_var.h
@@ -16,10 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_UTIL_ENV_VAR_H_
 #define TENSORFLOW_CORE_UTIL_ENV_VAR_H_
 
+#include "xla/tsl/util/env_var.h"
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/platform/stringpiece.h"
 #include "tensorflow/core/platform/types.h"
-#include "tsl/util/env_var.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 7d6b1d76d78b2a..a6164cc0264518 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -42,7 +42,7 @@ limitations under the License.
 #if defined(DNNL_AARCH64_USE_ACL) && defined(ENABLE_ONEDNN_OPENMP)
 #include "tensorflow/core/platform/mutex.h"
 #endif
-#include "tsl/util/onednn_threadpool.h"
+#include "xla/tsl/util/onednn_threadpool.h"
 
 using dnnl::engine;
 using dnnl::memory;
diff --git a/tensorflow/core/util/onednn_env_vars.cc b/tensorflow/core/util/onednn_env_vars.cc
index 1b73ef8e862fd7..e8d0e293d4ccf9 100644
--- a/tensorflow/core/util/onednn_env_vars.cc
+++ b/tensorflow/core/util/onednn_env_vars.cc
@@ -1,4 +1,4 @@
-/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/core/util/onednn_env_vars.h b/tensorflow/core/util/onednn_env_vars.h
index e2cb27ccfc8115..d7debd22976d02 100644
--- a/tensorflow/core/util/onednn_env_vars.h
+++ b/tensorflow/core/util/onednn_env_vars.h
@@ -1,4 +1,4 @@
-/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/core/util/proto/BUILD b/tensorflow/core/util/proto/BUILD
index 41e7b7e29d762b..1d84749d6bf523 100644
--- a/tensorflow/core/util/proto/BUILD
+++ b/tensorflow/core/util/proto/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -72,7 +72,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:platform_base",
         "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/util/proto:proto_utils",
+        "@local_xla//xla/tsl/util/proto:proto_utils",
     ],
 )
 
diff --git a/tensorflow/core/util/proto/proto_utils.h b/tensorflow/core/util/proto/proto_utils.h
index 43f8c918299c65..f0347a84cbe429 100644
--- a/tensorflow/core/util/proto/proto_utils.h
+++ b/tensorflow/core/util/proto/proto_utils.h
@@ -17,10 +17,10 @@ limitations under the License.
 #define TENSORFLOW_CORE_UTIL_PROTO_PROTO_UTILS_H_
 
 #include "absl/strings/string_view.h"
+#include "xla/tsl/util/proto/proto_utils.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/protobuf.h"
-#include "tsl/util/proto/proto_utils.h"
 
 namespace tensorflow {
 namespace proto_utils {
diff --git a/tensorflow/core/util/quantization/BUILD b/tensorflow/core/util/quantization/BUILD
index 532f43044f8ca5..51314b42cd494d 100644
--- a/tensorflow/core/util/quantization/BUILD
+++ b/tensorflow/core/util/quantization/BUILD
@@ -1,5 +1,10 @@
 # Description: Utils for Tensorflow quantization.
 
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+)
+load("//tensorflow:tensorflow.default.bzl", "filegroup")
 load(
     "//tensorflow/core/platform:build_config.bzl",
     "tf_proto_library",
@@ -8,11 +13,6 @@ load(
     "//tensorflow/core/platform:rules_cc.bzl",
     "cc_library",
 )
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_cc_test",
-)
-load("//tensorflow:tensorflow.default.bzl", "filegroup")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/util/reporter.h b/tensorflow/core/util/reporter.h
index f36b7c72bfb275..2db7a6f827dc22 100644
--- a/tensorflow/core/util/reporter.h
+++ b/tensorflow/core/util/reporter.h
@@ -21,11 +21,11 @@ limitations under the License.
 #include <string>
 #include <unordered_set>
 
+#include "xla/tsl/util/reporter.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
-#include "tsl/util/reporter.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/util/sparse/BUILD b/tensorflow/core/util/sparse/BUILD
index 630cd02a690d85..5f140a988e1294 100644
--- a/tensorflow/core/util/sparse/BUILD
+++ b/tensorflow/core/util/sparse/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow:tensorflow.default.bzl", "filegroup")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.default.bzl", "filegroup")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/core/util/stat_summarizer_options.h b/tensorflow/core/util/stat_summarizer_options.h
index daf3d3b3a9c2c9..71f9bf372454f7 100644
--- a/tensorflow/core/util/stat_summarizer_options.h
+++ b/tensorflow/core/util/stat_summarizer_options.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_UTIL_STAT_SUMMARIZER_OPTIONS_H_
 #define TENSORFLOW_CORE_UTIL_STAT_SUMMARIZER_OPTIONS_H_
 
-#include "tsl/util/stat_summarizer_options.h"
+#include "xla/tsl/util/stat_summarizer_options.h"
 
 namespace tensorflow {
 using tsl::StatSummarizerOptions;
diff --git a/tensorflow/core/util/stats_calculator.h b/tensorflow/core/util/stats_calculator.h
index 90605aaa9da6d7..20c997ced374a7 100644
--- a/tensorflow/core/util/stats_calculator.h
+++ b/tensorflow/core/util/stats_calculator.h
@@ -26,8 +26,8 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "xla/tsl/util/stats_calculator.h"
 #include "tensorflow/core/util/stat_summarizer_options.h"
-#include "tsl/util/stats_calculator.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/util/tensor_bundle/BUILD b/tensorflow/core/util/tensor_bundle/BUILD
index 76a0e41669d42e..4ca9b222fb114b 100644
--- a/tensorflow/core/util/tensor_bundle/BUILD
+++ b/tensorflow/core/util/tensor_bundle/BUILD
@@ -62,7 +62,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
         "@local_tsl//tsl/lib/io:buffered_file",
-        "@local_tsl//tsl/util:byte_swap_array",
+        "@local_xla//xla/tsl/util:byte_swap_array",
     ],
 )
 
@@ -86,7 +86,7 @@ cc_library(
         "//tensorflow/core/platform:byte_order",
         "//tensorflow/core/platform:errors",
         "//tensorflow/core/platform:status",
-        "@local_tsl//tsl/util:byte_swap_array",
+        "@local_xla//xla/tsl/util:byte_swap_array",
     ],
 )
 
diff --git a/tensorflow/core/util/tensor_bundle/byte_swap_array.h b/tensorflow/core/util/tensor_bundle/byte_swap_array.h
index ed3b6e1445eabb..97315b12917744 100644
--- a/tensorflow/core/util/tensor_bundle/byte_swap_array.h
+++ b/tensorflow/core/util/tensor_bundle/byte_swap_array.h
@@ -16,10 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_UTIL_TENSOR_BUNDLE_BYTE_SWAP_ARRAY_H_
 #define TENSORFLOW_CORE_UTIL_TENSOR_BUNDLE_BYTE_SWAP_ARRAY_H_
 
+#include "xla/tsl/util/byte_swap_array.h"
 #include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/status.h"
-#include "tsl/util/byte_swap_array.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
index f2db274c497c8a..8f58c0bdbcb6d9 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "absl/base/call_once.h"
 #include "absl/synchronization/mutex.h"
+#include "xla/tsl/util/byte_swap_array.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
@@ -54,7 +55,6 @@ limitations under the License.
 #include "tensorflow/core/util/tensor_bundle/naming.h"
 #include "tensorflow/core/util/tensor_slice_util.h"
 #include "tsl/lib/io/buffered_file.h"
-#include "tsl/util/byte_swap_array.h"
 
 #ifdef PLATFORM_WINDOWS
 #undef DeleteFile
diff --git a/tensorflow/core/util/use_cudnn.h b/tensorflow/core/util/use_cudnn.h
index ac9f918b494b30..ba13b74016ce7e 100644
--- a/tensorflow/core/util/use_cudnn.h
+++ b/tensorflow/core/util/use_cudnn.h
@@ -20,7 +20,7 @@ limitations under the License.
 
 #include <cstdint>
 
-#include "tsl/util/use_cudnn.h"
+#include "xla/tsl/util/use_cudnn.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/util/work_sharder.cc b/tensorflow/core/util/work_sharder.cc
index 6f039c85b948ff..0d48191d69b9e6 100644
--- a/tensorflow/core/util/work_sharder.cc
+++ b/tensorflow/core/util/work_sharder.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #include <algorithm>
 #include <functional>
 
+#include "xla/tsl/util/env_var.h"
 #include "tensorflow/core/platform/blocking_counter.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tsl/profiler/lib/traceme.h"
-#include "tsl/util/env_var.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/distribute/experimental/rpc/kernels/BUILD b/tensorflow/distribute/experimental/rpc/kernels/BUILD
index 60fa55c7702ee4..611339854ec09c 100644
--- a/tensorflow/distribute/experimental/rpc/kernels/BUILD
+++ b/tensorflow/distribute/experimental/rpc/kernels/BUILD
@@ -1,7 +1,7 @@
-load("//tensorflow:strict.default.bzl", "py_strict_library")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
-load("//tensorflow:tensorflow.default.bzl", "tf_kernel_library")
+load("//tensorflow:strict.default.bzl", "py_strict_library")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+load("//tensorflow:tensorflow.default.bzl", "tf_kernel_library")
 load("//tensorflow/distribute/experimental/rpc/kernels/oss:defs.bzl", "grpc_credentials_dependency")
 
 package(
diff --git a/tensorflow/dtensor/cc/BUILD b/tensorflow/dtensor/cc/BUILD
index e54625d1a7cac5..3d28e474d680ea 100644
--- a/tensorflow/dtensor/cc/BUILD
+++ b/tensorflow/dtensor/cc/BUILD
@@ -1,13 +1,13 @@
 #include "third_party/absl/strings/str_cat.h"
 #DTensor C++ runtime and libraries.
 
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
+load("//tensorflow:tensorflow.bzl", "if_google", "if_libtpu")
+load("//tensorflow:tensorflow.default.bzl", "tf_kernel_library")
 load(
     "//tensorflow/core/platform:build_config.bzl",
     "tf_dtensor_tpu_dependencies",
 )
-load("//tensorflow:tensorflow.default.bzl", "tf_kernel_library")
-load("//tensorflow:tensorflow.bzl", "if_google", "if_libtpu")
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 
 default_visibility = [
     "//tensorflow/dtensor:dtensor-internal",
@@ -43,7 +43,7 @@ cc_library(
     deps = [
         "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/util:env_var",
+        "@local_xla//xla/tsl/util:env_var",
     ],
 )
 
@@ -373,11 +373,11 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
         "@local_tsl//tsl/platform:status",
-        "@local_tsl//tsl/util:env_var",
         "@local_xla//xla:status_macros",
         "@local_xla//xla/stream_executor/tpu:c_api_decl",
         "@local_xla//xla/stream_executor/tpu:tpu_platform_interface",
         "@local_xla//xla/stream_executor/tpu:tpu_topology_external",
+        "@local_xla//xla/tsl/util:env_var",
     ] + tf_dtensor_tpu_dependencies(),
 )
 
diff --git a/tensorflow/dtensor/cc/dtensor_device.cc b/tensorflow/dtensor/cc/dtensor_device.cc
index cc3a92ec8d1c97..d169ba4c8595b6 100644
--- a/tensorflow/dtensor/cc/dtensor_device.cc
+++ b/tensorflow/dtensor/cc/dtensor_device.cc
@@ -54,6 +54,7 @@ limitations under the License.
 #include "xla/stream_executor/tpu/c_api_decl.h"
 #include "xla/stream_executor/tpu/tpu_platform_interface.h"
 #include "xla/stream_executor/tpu/tpu_topology.h"
+#include "xla/tsl/util/env_var.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 #include "tensorflow/core/common_runtime/eager/eager_operation.h"
@@ -93,7 +94,6 @@ limitations under the License.
 #include "tensorflow/dtensor/proto/layout.pb.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
-#include "tsl/util/env_var.h"
 
 using tensorflow::EagerExecutor;
 
diff --git a/tensorflow/dtensor/cc/dtensor_utils.cc b/tensorflow/dtensor/cc/dtensor_utils.cc
index dc5f1f827befaa..f5261a719a82e6 100644
--- a/tensorflow/dtensor/cc/dtensor_utils.cc
+++ b/tensorflow/dtensor/cc/dtensor_utils.cc
@@ -23,8 +23,8 @@ limitations under the License.
 #include "absl/strings/ascii.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_split.h"
+#include "xla/tsl/util/env_var.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tsl/util/env_var.h"
 
 namespace tensorflow {
 namespace dtensor {
diff --git a/tensorflow/dtensor/mlir/BUILD b/tensorflow/dtensor/mlir/BUILD
index c9501c52a58f00..18a1057808e981 100644
--- a/tensorflow/dtensor/mlir/BUILD
+++ b/tensorflow/dtensor/mlir/BUILD
@@ -264,8 +264,10 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:SparseTensorDialect",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@local_tsl//tsl/platform:status",
         "@local_xla//xla:xla_data_proto_cc",
diff --git a/tensorflow/dtensor/mlir/dtensor_dialect/BUILD b/tensorflow/dtensor/mlir/dtensor_dialect/BUILD
index feb41e226c21c0..d484322d42fea4 100644
--- a/tensorflow/dtensor/mlir/dtensor_dialect/BUILD
+++ b/tensorflow/dtensor/mlir/dtensor_dialect/BUILD
@@ -1,8 +1,9 @@
-# DTensor MLIR dialect.
-load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 
+# DTensor MLIR dialect.
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
+
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
     default_visibility = [
diff --git a/tensorflow/dtensor/mlir/tests/BUILD b/tensorflow/dtensor/mlir/tests/BUILD
index 6cee3359c3d980..675792ac142ef8 100644
--- a/tensorflow/dtensor/mlir/tests/BUILD
+++ b/tensorflow/dtensor/mlir/tests/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/dtensor/mlir/utils/BUILD b/tensorflow/dtensor/mlir/utils/BUILD
index f53ca53420b72a..56620400ec950e 100644
--- a/tensorflow/dtensor/mlir/utils/BUILD
+++ b/tensorflow/dtensor/mlir/utils/BUILD
@@ -51,7 +51,7 @@ cc_library(
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:Transforms",
-        "@local_tsl//tsl/util:env_var",
         "@local_xla//xla:xla_data_proto_cc",
+        "@local_xla//xla/tsl/util:env_var",
     ],
 )
diff --git a/tensorflow/dtensor/mlir/utils/collective_lowering.cc b/tensorflow/dtensor/mlir/utils/collective_lowering.cc
index 07e545f5851a83..5a12d4de95dcc0 100644
--- a/tensorflow/dtensor/mlir/utils/collective_lowering.cc
+++ b/tensorflow/dtensor/mlir/utils/collective_lowering.cc
@@ -43,6 +43,7 @@ limitations under the License.
 #include "mlir/Pass/Pass.h"  // from @llvm-project
 #include "mlir/Support/DebugStringHelper.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.h"
+#include "xla/tsl/util/env_var.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/dtensor/cc/constants.h"
@@ -59,7 +60,6 @@ limitations under the License.
 #include "tensorflow/dtensor/mlir/layout_parsing.h"
 #include "tensorflow/dtensor/mlir/spmd_expander_common.h"
 #include "tensorflow/dtensor/mlir/value_utils.h"
-#include "tsl/util/env_var.h"
 
 namespace tensorflow {
 namespace dtensor {
diff --git a/tensorflow/examples/adding_an_op/BUILD b/tensorflow/examples/adding_an_op/BUILD
index 3e4feb2ce619e5..2f0bf583d2f095 100644
--- a/tensorflow/examples/adding_an_op/BUILD
+++ b/tensorflow/examples/adding_an_op/BUILD
@@ -2,12 +2,12 @@
 # Code examples referenced by adding_an_op
 
 load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_custom_op_library")
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "tf_cuda_tests_tags",
     "tf_exec_properties",
 )
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_custom_op_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/examples/speech_commands/BUILD b/tensorflow/examples/speech_commands/BUILD
index dcccad17807ab1..ae630e0247b246 100644
--- a/tensorflow/examples/speech_commands/BUILD
+++ b/tensorflow/examples/speech_commands/BUILD
@@ -1,6 +1,6 @@
-load("//tensorflow:tensorflow.default.bzl", "tf_py_strict_test")
 load("//tensorflow:strict.default.bzl", "py_strict_binary", "py_strict_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
+load("//tensorflow:tensorflow.default.bzl", "tf_py_strict_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/go/example_inception_inference_test.go b/tensorflow/go/example_inception_inference_test.go
index 475619c55a5472..c33a7a07b0013b 100644
--- a/tensorflow/go/example_inception_inference_test.go
+++ b/tensorflow/go/example_inception_inference_test.go
@@ -22,7 +22,6 @@ import (
 	"flag"
 	"fmt"
 	"io"
-	"io/ioutil"
 	"log"
 	"net/http"
 	"os"
@@ -88,7 +87,7 @@ func Example() {
 		log.Fatal(err)
 	}
 
-	model, err := ioutil.ReadFile(modelfile)
+	model, err := os.ReadFile(modelfile)
 	if err != nil {
 		log.Fatal(err)
 	}
@@ -145,7 +144,7 @@ func printBestLabel(probabilities []float32, labels []string) {
 
 // Convert the image in filename to a Tensor suitable as input to the Inception model.
 func makeTensorFromImage(filename string) (*tf.Tensor, error) {
-	bytes, err := ioutil.ReadFile(filename)
+	bytes, err := os.ReadFile(filename)
 	if err != nil {
 		return nil, err
 	}
diff --git a/tensorflow/go/genop/internal/genop.go b/tensorflow/go/genop/internal/genop.go
index 2b72b236a813a6..0c92d7e309aaca 100644
--- a/tensorflow/go/genop/internal/genop.go
+++ b/tensorflow/go/genop/internal/genop.go
@@ -39,7 +39,7 @@ import "C"
 import (
 	"fmt"
 	"io"
-	"io/ioutil"
+	"os"
 	"path"
 	"reflect"
 	"sort"
@@ -96,7 +96,7 @@ func registeredOps() (*odpb.OpList, *apiDefMap, error) {
 }
 
 func updateAPIDefs(m *apiDefMap, dir string) error {
-	files, err := ioutil.ReadDir(dir)
+	files, err := os.ReadDir(dir)
 	if err != nil {
 		return err
 	}
@@ -104,7 +104,7 @@ func updateAPIDefs(m *apiDefMap, dir string) error {
 		if file.IsDir() || !strings.HasSuffix(file.Name(), ".pbtxt") {
 			continue
 		}
-		data, err := ioutil.ReadFile(path.Join(dir, file.Name()))
+		data, err := os.ReadFile(path.Join(dir, file.Name()))
 		if err != nil {
 			return fmt.Errorf("failed to read %q: %v", file.Name(), err)
 		}
diff --git a/tensorflow/go/genop/main.go b/tensorflow/go/genop/main.go
index 87c1d27c3b53d7..370a9aaec10a80 100644
--- a/tensorflow/go/genop/main.go
+++ b/tensorflow/go/genop/main.go
@@ -21,7 +21,6 @@ import (
 	"bytes"
 	"flag"
 	"go/format"
-	"io/ioutil"
 	"log"
 	"os"
 	"path/filepath"
@@ -42,7 +41,7 @@ func main() {
 		log.Fatal("-outfile must be set")
 	}
 	if *header != "" {
-		hdr, err := ioutil.ReadFile(*header)
+		hdr, err := os.ReadFile(*header)
 		if err != nil {
 			log.Fatalf("Unable to read %s: %v", *header, err)
 		}
@@ -64,7 +63,7 @@ func main() {
 	if err != nil {
 		log.Fatalf("Failed to generate valid source? 'go fmt' failed: %v", err)
 	}
-	if err := ioutil.WriteFile(*filename, formatted, 0644); err != nil {
+	if err := os.WriteFile(*filename, formatted, 0644); err != nil {
 		log.Fatalf("Failed to write to %q: %v", *filename, err)
 	}
 }
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 3c41534f2338f8..8a825b310ca47c 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -6733,6 +6733,34 @@ func ComputeDedupDataSize(scope *Scope, config string) (num_elements tf.Output)
 	return op.Output(0)
 }
 
+// An op computes the size of the deduplication data from embedding core and returns the updated config.
+//
+// This op is to compute size of the deduplication data so to provide this
+// information to the op that computes the tuple mask of deduplication data can
+// have static output shape.
+//
+// Arguments:
+//
+//	config: Serialized TPUEmbeddingConfiguration proto.
+//	embedding_partitions: Serialized EmbeddingPartitionsProto proto.
+//	hbm_buffers_config: Serialized HbmBuffersConfig proto.
+//	tpu_topology: Serialized TpuTopologyArgsProto proto.
+//
+// Returns The size of the deduplicated data from infeed.
+func ComputeDedupDataSizeV2(scope *Scope, config string, embedding_partitions string, hbm_buffers_config string, tpu_topology string) (num_elements tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"config": config, "embedding_partitions": embedding_partitions, "hbm_buffers_config": hbm_buffers_config, "tpu_topology": tpu_topology}
+	opspec := tf.OpSpec{
+		Type: "ComputeDedupDataSizeV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // An op computes tuple mask of deduplication data from embedding core.
 //
 // The deduplication data receiving from embedding core is a Tensor with
@@ -6763,6 +6791,39 @@ func ComputeDedupDataTupleMask(scope *Scope, config string) (output_shape tf.Out
 	return op.Output(0)
 }
 
+// An op computes tuple mask of deduplication data from embedding core.
+//
+// The deduplication data receiving from embedding core is a Tensor with
+// type=DT_VARIANT. The tensor itself is an XLA nested tuple, whose elements are
+// rank 1 tensors. This op is to represents types and length of these elements.
+//
+// Arguments:
+//
+//	config: Serialized TPUEmbeddingConfiguration proto.
+//	embedding_partitions: Serialized EmbeddingPartitionsProto proto.
+//	hbm_buffers_config: Serialized HbmBuffersConfig proto.
+//	tpu_topology: Serialized TpuTopologyArgsProto proto.
+//
+// Returns A 2-D int tensor represent mask of deduplication data tuple generated by
+// `XlaRecvTPUEmbeddingDeduplicationData`. The tuple has several integer and float
+// type 1-D tensor tuple elements. The first dimenion of this output_shape 2-D
+// tensor is tensor type of tuple elements, `0` represents integer tensor, `1`
+// represents float tensor. The second dimension of `output_shape` gives length of
+// each tuple element.
+func ComputeDedupDataTupleMaskV2(scope *Scope, config string, embedding_partitions string, hbm_buffers_config string, tpu_topology string) (output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"config": config, "embedding_partitions": embedding_partitions, "hbm_buffers_config": hbm_buffers_config, "tpu_topology": tpu_topology}
+	opspec := tf.OpSpec{
+		Type: "ComputeDedupDataTupleMaskV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Concatenates tensors along one dimension.
 //
 // Arguments:
@@ -17614,6 +17675,42 @@ func FinalizeTPUEmbedding(scope *Scope, common_config tf.Output, memory_config t
 	return scope.AddOperation(opspec)
 }
 
+// An op that finalizes the TPUEmbedding configuration.
+//
+// Arguments:
+//
+//	common_config: A string-encoded common configuration proto containing metadata
+//
+// about the TPUEmbedding partitioner output and the HBM size (in bytes) required
+// for operation.
+//
+//	memory_config: A string-encoded memory config proto containing metadata about
+//
+// the memory allocations reserved for TPUEmbedding.
+//
+// Returns:
+//
+//	embedding_partitions: A string-encoded embedding partitions proto describing how embedding tables are
+//
+// partitioned along their feature and ID.
+//
+//	hbm_buffers_config: A string-encoded HBM buffers config proto specifies where HBM buffers are
+//
+// located.
+func FinalizeTPUEmbeddingV2(scope *Scope, common_config tf.Output, memory_config tf.Output) (embedding_partitions tf.Output, hbm_buffers_config tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "FinalizeTPUEmbeddingV2",
+		Input: []tf.Input{
+			common_config, memory_config,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
 // Generates fingerprint values.
 //
 // Generates fingerprint values of `data`.
@@ -59057,6 +59154,60 @@ func XlaRecvTPUEmbeddingActivations(scope *Scope, deduplication_data tf.Output,
 	return outputs
 }
 
+// An op that receives embedding activations on the TPU.
+//
+// The TPU system performs the embedding lookups and aggregations. The results of
+// these aggregations are visible to the Tensorflow Graph as the outputs of a
+// XlaRecvTPUEmbeddingActivations Op. This op returns a list containing one
+// Tensor of activations per table specified in the model.
+//
+// Arguments:
+//
+//	deduplication_data: A Tensor with type=DT_VARIANT containing the deduplication
+//
+// data. The tensor is an XLA nested tuple containing N elements (where N is
+// the ratio of the number of embedding to tensor cores per TPU chip). Each
+// element of the nested tuple is a tuple of rank 1 tensors. Each tensor either
+// contains indices (DT_UINT32) for embedding lookup on the TensorCore or
+// weights (DT_FLOAT) to apply to the output of the embedding lookup operation.
+//
+//	num_tables: The number of output activation tensors. If feature descriptor is
+//
+// present in the tpu embedding config, it is equal to the number of features
+// otherwise equal to number of embedding tables in the model.
+//
+//	config: Serialized TPUEmbeddingConfiguration proto.
+//	embedding_partitions: Serialized EmbeddingPartitionsProto proto.
+//	hbm_buffers_config: Serialized HbmBuffersConfig proto.
+//	tpu_topology: Serialized TpuTopologyArgsProto proto.
+//
+// Returns A TensorList of embedding activations containing one Tensor per
+// embedding table in the model.
+func XlaRecvTPUEmbeddingActivationsV2(scope *Scope, deduplication_data tf.Output, num_tables int64, config string, embedding_partitions string, hbm_buffers_config string, tpu_topology string) (outputs []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_tables": num_tables, "config": config, "embedding_partitions": embedding_partitions, "hbm_buffers_config": hbm_buffers_config, "tpu_topology": tpu_topology}
+	opspec := tf.OpSpec{
+		Type: "XlaRecvTPUEmbeddingActivationsV2",
+		Input: []tf.Input{
+			deduplication_data,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("XlaRecvTPUEmbeddingActivationsV2", err)
+		return
+	}
+	return outputs
+}
+
 // Receives deduplication data (indices and weights) from the embedding core.
 //
 // The deduplication data is a Tensor with type=DT_VARIANT. The tensor itself is an
@@ -59083,6 +59234,35 @@ func XlaRecvTPUEmbeddingDeduplicationData(scope *Scope, config string) (output t
 	return op.Output(0)
 }
 
+// Receives deduplication data (indices and weights) from the embedding core.
+//
+// The deduplication data is a Tensor with type=DT_VARIANT. The tensor itself is an
+// XLA nested tuple containing N elements (where N is the ratio of the number of
+// embedding to tensor cores per TPU chip). Each element of the nested tuple is a
+// tuple of rank 1 tensors. Each tensor either contains indices (DT_UINT32) for
+// embedding lookup on the TensorCore or weights (DT_FLOAT) to apply to the output
+// of the embedding lookup operation.
+//
+// Arguments:
+//
+//	config: Serialized TPUEmbeddingConfiguration proto.
+//	embedding_partitions: Serialized EmbeddingPartitionsProto proto.
+//	hbm_buffers_config: Serialized HbmBuffersConfig proto.
+//	tpu_topology: Serialized TpuTopologyArgsProto proto.
+func XlaRecvTPUEmbeddingDeduplicationDataV2(scope *Scope, config string, embedding_partitions string, hbm_buffers_config string, tpu_topology string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"config": config, "embedding_partitions": embedding_partitions, "hbm_buffers_config": hbm_buffers_config, "tpu_topology": tpu_topology}
+	opspec := tf.OpSpec{
+		Type: "XlaRecvTPUEmbeddingDeduplicationDataV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Wraps the XLA ReducePrecision operator
 //
 //	documented at https://www.tensorflow.org/xla/operation_semantics#reduceprecision.
@@ -59331,6 +59511,52 @@ func XlaSendTPUEmbeddingGradients(scope *Scope, gradients []tf.Output, learning_
 	return scope.AddOperation(opspec)
 }
 
+// An op that performs gradient updates of embedding tables.
+//
+// The gradients argument is a TensorList having the same length and shapes as the
+// return value of XlaRecvTPUEmbeddingActivations, but contains gradients of the
+// model's loss with respect to the embedding activations. The embedding tables are
+// updated from these gradients via the optimizer specified in the
+// TPUEmbeddingConfiguration proto given to tpu.initialize_system.
+//
+// Arguments:
+//
+//	gradients: A TensorList of gradients with which to update embedding tables.
+//	learning_rates: A TensorList of learning rates used for updating the embedding
+//
+// tables via the optimizer. The length of the TensorList must be equal to the
+// number of dynamic learning rate tags specified in the
+// TPUEmbeddingConfiguration proto.
+//
+//	deduplication_data: A Tensor with type=DT_VARIANT containing the deduplication
+//
+// data. The tensor is an XLA nested tuple containing N elements (where N is
+// the ratio of the number of embedding to tensor cores per TPU chip). Each
+// element of the nested tuple is a tuple of rank 1 tensors. Each tensor either
+// contains indices (DT_UINT32) for embedding lookup on the TensorCore or
+// weights (DT_FLOAT) to apply to the output of the embedding lookup operation.
+//
+//	config: Serialized TPUEmbeddingConfiguration proto.
+//	embedding_partitions: Serialized EmbeddingPartitionsProto proto.
+//	hbm_buffers_config: Serialized HbmBuffersConfig proto.
+//	tpu_topology: Serialized TpuTopologyArgsProto proto.
+//
+// Returns the created operation.
+func XlaSendTPUEmbeddingGradientsV2(scope *Scope, gradients []tf.Output, learning_rates []tf.Output, deduplication_data tf.Output, config string, embedding_partitions string, hbm_buffers_config string, tpu_topology string) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"config": config, "embedding_partitions": embedding_partitions, "hbm_buffers_config": hbm_buffers_config, "tpu_topology": tpu_topology}
+	opspec := tf.OpSpec{
+		Type: "XlaSendTPUEmbeddingGradientsV2",
+		Input: []tf.Input{
+			tf.OutputList(gradients), tf.OutputList(learning_rates), deduplication_data,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // An op to send a tensor to the host.
 //
 // input: the tensor that will be sent to the host.
@@ -59517,17 +59743,13 @@ func XlaSplitNDPaddings(value []int64) XlaSplitNDAttr {
 //
 // Arguments:
 //
-//		input: Input tensor to split across all dimensions.
-//	  }
-//	  out_arg {
-//	    name: "outputs"
-//	    description: <<END
-//
-// Output slices based on input and num_splits defined, in row-major order.
+//	input: Input tensor to split across all dimensions.
 //
 //	num_splits: Number of ways to split per dimension. Shape dimensions must be evenly
 //
 // divisible.
+//
+// Returns Output slices based on input and num_splits defined, in row-major order.
 func XlaSplitND(scope *Scope, input tf.Output, N int64, num_splits []int64, optional ...XlaSplitNDAttr) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index c529b2bcffa1f2..cab2e7254ee940 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -1,8 +1,6 @@
 # Description:
 # TensorFlow Java API.
 
-load(":build_defs.bzl", "JAVACOPTS")
-load(":src/gen/gen_ops.bzl", "tf_java_op_gen_srcjar")
 load(
     "//tensorflow:tensorflow.bzl",
     "VERSION",
@@ -13,6 +11,8 @@ load(
     "tf_custom_op_library",
     "tf_java_test",
 )
+load(":build_defs.bzl", "JAVACOPTS")
+load(":src/gen/gen_ops.bzl", "tf_java_op_gen_srcjar")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/java/src/main/native/BUILD b/tensorflow/java/src/main/native/BUILD
index 2bc604973a7b38..527288dc5348ad 100644
--- a/tensorflow/java/src/main/native/BUILD
+++ b/tensorflow/java/src/main/native/BUILD
@@ -2,6 +2,8 @@
 # Java Native Interface (JNI) library intended for implementing the
 # TensorFlow Java API using the TensorFlow C library.
 
+load("//tensorflow:tensorflow.bzl", "tf_copts", "tf_cuda_library")
+
 package(default_visibility = [
     "//tensorflow/java:__pkg__",
     "//tensorflow/tools/android/inference_interface:__pkg__",
@@ -9,8 +11,6 @@ package(default_visibility = [
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_copts", "tf_cuda_library")
-
 tf_cuda_library(
     name = "native",
     srcs = glob(["*.cc"]) + select({
diff --git a/tensorflow/js/BUILD b/tensorflow/js/BUILD
index b24aef63f9c9c5..5e34184fe327dc 100644
--- a/tensorflow/js/BUILD
+++ b/tensorflow/js/BUILD
@@ -1,15 +1,15 @@
 # Description:
 # JavaScript/TypeScript code generation for TensorFlow.js
 
-visibility = [
-    "//tensorflow:internal",
-]
-
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
 )
 
+visibility = [
+    "//tensorflow:internal",
+]
+
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
     default_visibility = visibility,
diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD
index 5c7d40745baca8..d97faeaaaeca08 100644
--- a/tensorflow/lite/BUILD
+++ b/tensorflow/lite/BUILD
@@ -1,3 +1,4 @@
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 load("@bazel_skylib//lib:selects.bzl", "selects")
 load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
 load("//tensorflow:tensorflow.bzl", "if_google", "if_not_windows", "if_oss", "tf_cc_test")
@@ -5,7 +6,6 @@ load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow/lite:build_def.bzl", "tflite_cc_shared_object", "tflite_copts", "tflite_copts_warnings", "tflite_linkopts_no_undefined", "tflite_self_contained_libs_test_suite")
 load("//tensorflow/lite:special_rules.bzl", "SPECIAL_RULES_DEPS", "internal_visibility_allowlist", "tflite_internal_cc_3p_api_deps_src_all_visibility_allowlist", "tflite_portable_test_suite")
 load("//tensorflow/lite/core/shims:cc_library_with_tflite.bzl", "alias_with_tflite", "cc_library_with_tflite", "cc_test_with_tflite")
-load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:LICENSE"],
@@ -1166,6 +1166,7 @@ cc_test(
 cc_test_with_tflite(
     name = "mutable_op_resolver_utils_test",
     srcs = ["mutable_op_resolver_utils_test.cc"],
+    tags = ["tflite_disable_mobile_test"],  # Due to b/144306101 and b/329899620.
     tflite_deps = [
         ":mutable_op_resolver_utils",
         ":test_util",
diff --git a/tensorflow/lite/CMakeLists.txt b/tensorflow/lite/CMakeLists.txt
index a61f537365a86e..7fa2ede1f4066d 100644
--- a/tensorflow/lite/CMakeLists.txt
+++ b/tensorflow/lite/CMakeLists.txt
@@ -50,6 +50,7 @@ if(NOT TENSORFLOW_SOURCE_DIR)
 endif()
 set(TF_SOURCE_DIR "${TENSORFLOW_SOURCE_DIR}/tensorflow")
 set(TSL_SOURCE_DIR "${TENSORFLOW_SOURCE_DIR}/third_party/xla/third_party/tsl")
+set(XLA_SOURCE_DIR "${TENSORFLOW_SOURCE_DIR}/third_party/xla/")
 set(TFLITE_SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}")
 set(CMAKE_MODULE_PATH
   "${TFLITE_SOURCE_DIR}/tools/cmake/modules"
@@ -161,6 +162,7 @@ find_package(ruy REQUIRED)
 # Include TSL, which is in tensorflow/third_party
 include_directories(
   ${TSL_SOURCE_DIR}
+  ${XLA_SOURCE_DIR}
 )
 # Download necessary dependencies.
 # Download pthreadpool source package if it doesn't exist.
diff --git a/tensorflow/lite/acceleration/configuration/BUILD b/tensorflow/lite/acceleration/configuration/BUILD
index 375d73fa2c23a4..a8e13de931c3b0 100644
--- a/tensorflow/lite/acceleration/configuration/BUILD
+++ b/tensorflow/lite/acceleration/configuration/BUILD
@@ -13,7 +13,14 @@
 # limitations under the License.
 # ==============================================================================
 
+# buildifier: disable=out-of-order-load
+
 load("@flatbuffers//:build_defs.bzl", "DEFAULT_FLATC_ARGS", "flatbuffer_android_library", "flatbuffer_cc_library", "flatbuffer_java_library", "flatc_path")
+
+# copybara:comment_begin(oss-only)
+load("@local_tsl//tsl/platform/default:build_config.bzl", "tf_proto_library_py")
+# copybara:comment_end
+
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_copts_warnings")
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
@@ -22,10 +29,6 @@ load("//tensorflow/lite/core/shims:cc_library_with_tflite.bzl", "cc_library_with
 # copybara:uncomment load("//tools/build_defs/proto/cpp:cc_proto_library.bzl", "cc_proto_library")
 load(":build_defs.bzl", "flatbuffer_schema_compat_test")
 
-# copybara:comment_begin(oss-only)
-load("@local_tsl//tsl/platform/default:build_config.bzl", "tf_proto_library_py")
-# copybara:comment_end
-
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
     default_visibility = [
diff --git a/tensorflow/lite/acceleration/configuration/c/BUILD b/tensorflow/lite/acceleration/configuration/c/BUILD
index b441d77fb1e729..df9cdb4d9d621c 100644
--- a/tensorflow/lite/acceleration/configuration/c/BUILD
+++ b/tensorflow/lite/acceleration/configuration/c/BUILD
@@ -16,11 +16,11 @@
 # C API for delegate plugins.
 
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_copts_warnings")
 load(
     "//tensorflow/lite/core/shims:cc_library_with_tflite.bzl",
     "cc_library_with_tflite_with_c_headers_test",
 )
-load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_copts_warnings")
 
 # LINT.IfChange(tflite_acceleration_exported_headers)
 exports_files([
diff --git a/tensorflow/lite/async/backend_async_kernel_interface.cc b/tensorflow/lite/async/backend_async_kernel_interface.cc
index ef7bc0563018a2..a3e0234bbc1fd9 100644
--- a/tensorflow/lite/async/backend_async_kernel_interface.cc
+++ b/tensorflow/lite/async/backend_async_kernel_interface.cc
@@ -152,6 +152,22 @@ TfLiteStatus Finish(TfLiteAsyncKernel* async_kernel,
       ->Finish(context, task);
 }
 
+TfLiteStatus SetBufferAttributes(TfLiteAsyncKernel* async_kernel,
+                                 const TfLiteBackendBuffer* buffer,
+                                 const TfLiteAttributeMap* attrs) {
+  return reinterpret_cast<BackendAsyncKernelInterface*>(
+             TfLiteAsyncKernelGetKernelData(async_kernel))
+      ->SetBufferAttributes(buffer, attrs);
+}
+
+TfLiteStatus GetBufferAttributes(TfLiteAsyncKernel* async_kernel,
+                                 const TfLiteBackendBuffer* buffer,
+                                 TfLiteAttributeMap* attrs) {
+  return reinterpret_cast<BackendAsyncKernelInterface*>(
+             TfLiteAsyncKernelGetKernelData(async_kernel))
+      ->GetBufferAttributes(buffer, attrs);
+}
+
 }  // namespace internal
 
 BackendAsyncKernelInterface::BackendAsyncKernelInterface() {
@@ -167,6 +183,10 @@ BackendAsyncKernelInterface::BackendAsyncKernelInterface() {
   TfLiteAsyncKernelSetReconcileRestrictions(kernel_,
                                             internal::ReconcileRestrictions);
   TfLiteAsyncKernelSetSetAttributes(kernel_, internal::SetAttributes);
+  TfLiteAsyncKernelSetSetBufferAttributes(kernel_,
+                                          internal::SetBufferAttributes);
+  TfLiteAsyncKernelSetGetBufferAttributes(kernel_,
+                                          internal::GetBufferAttributes);
   TfLiteAsyncKernelSetPrepare(kernel_, internal::Prepare);
   TfLiteAsyncKernelSetEval(kernel_, internal::Eval);
   TfLiteAsyncKernelSetWait(kernel_, internal::Wait);
diff --git a/tensorflow/lite/async/backend_async_kernel_interface.h b/tensorflow/lite/async/backend_async_kernel_interface.h
index 2849c229c61cc9..c8d94341c417ad 100644
--- a/tensorflow/lite/async/backend_async_kernel_interface.h
+++ b/tensorflow/lite/async/backend_async_kernel_interface.h
@@ -130,6 +130,20 @@ class BackendAsyncKernelInterface {
                                      TfLiteOpaqueNode* node, int tensor_index,
                                      const TfLiteAttributeMap* attrs) = 0;
 
+  // Set buffer's attributes. Backend will check if the buffer has been
+  // registered. And return TfLiteOk if the `attrs` for the `buffer` could be
+  // set in the corresponding async kernel.
+  virtual TfLiteStatus SetBufferAttributes(const TfLiteBackendBuffer* buffer,
+                                           const TfLiteAttributeMap* attrs) = 0;
+
+  // Get buffer's attributes. Backend will check if the buffer has been
+  // registered. And return TfLiteOk if provided `attrs` for the `buffer` could
+  // be found in the registration pool in corresponding async kernel. If `attrs`
+  // is a non-empty map, it will be overwritten by the attributes of the
+  // `buffer`.
+  virtual TfLiteStatus GetBufferAttributes(const TfLiteBackendBuffer* buffer,
+                                           TfLiteAttributeMap* attrs) = 0;
+
   // Prepares the kernel using the information from Set[In|Out]putAttributes
   // call above.
   virtual TfLiteStatus Prepare(TfLiteOpaqueContext* context,
diff --git a/tensorflow/lite/async/backend_async_kernel_interface_test.cc b/tensorflow/lite/async/backend_async_kernel_interface_test.cc
index 2620cc707a1ad0..e4f68bb32167b6 100644
--- a/tensorflow/lite/async/backend_async_kernel_interface_test.cc
+++ b/tensorflow/lite/async/backend_async_kernel_interface_test.cc
@@ -35,6 +35,8 @@ TEST(BackendAsyncKernelInterfaceTest, BasicTest) {
   EXPECT_CALL(kernel, UnregisterBuffer(_, _));
   EXPECT_CALL(kernel, ReconcileRestrictions(_, _, _, _, _, _));
   EXPECT_CALL(kernel, SetAttributes(_, _, _, _));
+  EXPECT_CALL(kernel, SetBufferAttributes(_, _));
+  EXPECT_CALL(kernel, GetBufferAttributes(_, _));
   EXPECT_CALL(kernel, Prepare(_, _));
   EXPECT_CALL(kernel, Eval(_, _, _));
   EXPECT_CALL(kernel, Wait(_, _));
@@ -49,6 +51,8 @@ TEST(BackendAsyncKernelInterfaceTest, BasicTest) {
   tflite_kernel->reconcile_restrictions(tflite_kernel, nullptr, nullptr, 0,
                                         nullptr, nullptr, nullptr);
   tflite_kernel->set_attributes(tflite_kernel, nullptr, nullptr, 0, nullptr);
+  tflite_kernel->set_buffer_attributes(tflite_kernel, nullptr, nullptr);
+  tflite_kernel->get_buffer_attributes(tflite_kernel, nullptr, nullptr);
   tflite_kernel->prepare(tflite_kernel, nullptr, nullptr);
   tflite_kernel->eval(tflite_kernel, nullptr, nullptr, nullptr);
   tflite_kernel->wait(tflite_kernel, nullptr, nullptr);
diff --git a/tensorflow/lite/async/testing/mock_async_kernel.h b/tensorflow/lite/async/testing/mock_async_kernel.h
index a3297f849b12f6..be31a2a71a843c 100644
--- a/tensorflow/lite/async/testing/mock_async_kernel.h
+++ b/tensorflow/lite/async/testing/mock_async_kernel.h
@@ -48,6 +48,11 @@ class MockAsyncKernel : public delegates::BackendAsyncKernelInterface {
               (TfLiteOpaqueContext*, TfLiteOpaqueNode*, int,
                const TfLiteAttributeMap*),
               (override));
+  MOCK_METHOD(TfLiteStatus, SetBufferAttributes,
+              (const TfLiteBackendBuffer*, const TfLiteAttributeMap*),
+              (override));
+  MOCK_METHOD(TfLiteStatus, GetBufferAttributes,
+              (const TfLiteBackendBuffer*, TfLiteAttributeMap*), (override));
   MOCK_METHOD(TfLiteStatus, Prepare, (TfLiteOpaqueContext*, TfLiteOpaqueNode*),
               (override));
   MOCK_METHOD(TfLiteStatus, Eval,
diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl
index d207680c10ea09..2814d0da630889 100644
--- a/tensorflow/lite/build_def.bzl
+++ b/tensorflow/lite/build_def.bzl
@@ -170,10 +170,7 @@ def tflite_linkopts_no_undefined():
             # since undefined symbols in shared libraries (references to symbols
             # that will be defined in the main executable) are normal and
             # expected in those cases.
-            "//tools/cpp:asan_build": [],
-            "//tools/cpp:hwasan_build": [],
-            "//tools/cpp:msan_build": [],
-            "//tools/cpp:tsan_build": [],
+            "//tools/cpp:sanitizer_build": [],
             "//tensorflow:ios": [
                 "-Wl,-undefined,error",
             ],
diff --git a/tensorflow/lite/c/BUILD b/tensorflow/lite/c/BUILD
index 22efc0feb27940..19cdd37ed4f549 100644
--- a/tensorflow/lite/c/BUILD
+++ b/tensorflow/lite/c/BUILD
@@ -356,7 +356,7 @@ cc_library(
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/core/api:op_resolver",
         "//tensorflow/lite/core/c:common",
-        "//tensorflow/lite/core/c:registration_external",
+        "//tensorflow/lite/core/c:operator",
     ],
 )
 
@@ -393,7 +393,7 @@ cc_library(
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/core/api:op_resolver",
         "//tensorflow/lite/core/c:common",
-        "//tensorflow/lite/core/c:registration_external_without_alwayslink",
+        "//tensorflow/lite/core/c:operator_without_alwayslink",
     ],
 )
 
diff --git a/tensorflow/lite/c/CMakeLists.txt b/tensorflow/lite/c/CMakeLists.txt
index 3fb086ec91a878..b589d1ab8dee91 100644
--- a/tensorflow/lite/c/CMakeLists.txt
+++ b/tensorflow/lite/c/CMakeLists.txt
@@ -66,7 +66,7 @@ add_library(tensorflowlite_c ${TFLITE_C_LIBTYPE}
   ${TFLITE_SOURCE_DIR}/core/c/c_api.cc
   ${TFLITE_SOURCE_DIR}/core/c/c_api_experimental.cc
   ${TFLITE_SOURCE_DIR}/core/c/common.cc
-  ${TFLITE_SOURCE_DIR}/core/c/registration_external.cc
+  ${TFLITE_SOURCE_DIR}/core/c/operator.cc
   builtin_op_data.h
   c_api.h
   c_api_experimental.h
diff --git a/tensorflow/lite/c/c_api.h b/tensorflow/lite/c/c_api.h
index 4b09cf88440bcf..01938c8141294b 100644
--- a/tensorflow/lite/c/c_api.h
+++ b/tensorflow/lite/c/c_api.h
@@ -23,4 +23,36 @@ limitations under the License.
 
 #include "tensorflow/lite/core/c/c_api.h"
 
+#ifndef DOYXGEN_SKIP
+// For backwards compatibility.
+// Deprecated. Use the names starting with TfLiteOperator instead.
+#ifdef __cplusplus
+using TfLiteRegistrationExternal = TfLiteOperator;
+// NOLINTBEGIN
+const auto TfLiteRegistrationExternalCreate = TfLiteOperatorCreate;
+const auto TfLiteRegistrationExternalGetBuiltInCode =
+    TfLiteOperatorGetBuiltInCode;
+const auto TfLiteRegistrationExternalGetVersion = TfLiteOperatorGetVersion;
+const auto TfLiteRegistrationExternalDelete = TfLiteOperatorDelete;
+const auto TfLiteRegistrationExternalSetInit = TfLiteOperatorSetInit;
+const auto TfLiteRegistrationExternalSetFree = TfLiteOperatorSetFree;
+const auto TfLiteRegistrationExternalSetPrepare = TfLiteOperatorSetPrepare;
+const auto TfLiteRegistrationExternalSetInvoke = TfLiteOperatorSetInvoke;
+const auto TfLiteRegistrationExternalGetCustomName =
+    TfLiteOperatorGetCustomName;
+// NOLINTEND
+#else
+typedef TfLiteOperator TfLiteRegistrationExternal;
+#define TfLiteRegistrationExternalCreate TfLiteOperatorCreate
+#define TfLiteRegistrationExternalGetBuiltInCode TfLiteOperatorGetBuiltInCode
+#define TfLiteRegistrationExternalGetVersion TfLiteOperatorGetVersion
+#define TfLiteRegistrationExternalDelete TfLiteOperatorDelete
+#define TfLiteRegistrationExternalSetInit TfLiteOperatorSetInit
+#define TfLiteRegistrationExternalSetFree TfLiteOperatorSetFree
+#define TfLiteRegistrationExternalSetPrepare TfLiteOperatorSetPrepare
+#define TfLiteRegistrationExternalSetInvoke TfLiteOperatorSetInvoke
+#define TfLiteRegistrationExternalGetCustomName TfLiteOperatorGetCustomName
+#endif  // __cplusplus
+#endif  // DOYXGEN_SKIP
+
 #endif  // TENSORFLOW_LITE_C_C_API_H_
diff --git a/tensorflow/lite/c/c_api_internal.h b/tensorflow/lite/c/c_api_internal.h
index 60e11798f78e5d..8249f22eaaa463 100644
--- a/tensorflow/lite/c/c_api_internal.h
+++ b/tensorflow/lite/c/c_api_internal.h
@@ -93,13 +93,13 @@ struct TfLiteOpResolverCallbacks {
                                              int version)>
       find_custom_op_v1;
 
-  // Variant of `find_builtin_op` which returns `TfLiteRegistrationExternal`.
-  std::function<const TfLiteRegistrationExternal*(
-      void* user_data, TfLiteBuiltinOperator op, int version)>
+  // Variant of `find_builtin_op` which returns `TfLiteOperator`.
+  std::function<const TfLiteOperator*(void* user_data, TfLiteBuiltinOperator op,
+                                      int version)>
       find_builtin_op_external;
-  // Variant of `find_custom_op` which returns `TfLiteRegistrationExternal`.
-  std::function<const TfLiteRegistrationExternal*(void* user_data,
-                                                  const char* op, int version)>
+  // Variant of `find_custom_op` which returns `TfLiteOperator`.
+  std::function<const TfLiteOperator*(void* user_data, const char* op,
+                                      int version)>
       find_custom_op_external;
 };
 
@@ -135,9 +135,9 @@ struct TfLiteInterpreterOptions {
   // automatically retried without delegates.
   bool enable_delegate_fallback = false;
 
-  // TfLiteRegistrationExternal objects owned by caller of
-  // `TfLiteInterpreterOptionsAddRegistrationExternal` API.
-  std::vector<TfLiteRegistrationExternal*> op_registrations;
+  // TfLiteOperator objects owned by caller of
+  // `TfLiteInterpreterOptionsAddOperator` API.
+  std::vector<TfLiteOperator*> op_registrations;
 
   // Determines whether to allow to cancel invocations with
   // `Interpreter::Cancel` or `SignatureRunner::Cancel`.
diff --git a/tensorflow/lite/c/c_api_opaque_internal.cc b/tensorflow/lite/c/c_api_opaque_internal.cc
index 2f7f1113cb5bb2..cead720c0c7dcc 100644
--- a/tensorflow/lite/c/c_api_opaque_internal.cc
+++ b/tensorflow/lite/c/c_api_opaque_internal.cc
@@ -18,8 +18,9 @@ limitations under the License.
 #include <unordered_map>
 #include <utility>
 
+#include "tensorflow/lite/core/api/op_resolver.h"
 #include "tensorflow/lite/core/c/common.h"
-#include "tensorflow/lite/core/c/registration_external.h"
+#include "tensorflow/lite/core/c/operator.h"
 #include "tensorflow/lite/core/subgraph.h"
 
 namespace tflite {
@@ -28,13 +29,13 @@ namespace internal {
 namespace {
 
 // Returns a dynamically allocated object; the caller is responsible for
-// deallocating it using TfLiteRegistrationExternalDelete.
-TfLiteRegistrationExternal* MakeRegistrationExternal(
-    const TfLiteRegistration* registration, int node_index) {
-  // We need to allocate a new TfLiteRegistrationExternal object and then
+// deallocating it using TfLiteOperatorDelete.
+TfLiteOperator* MakeOperator(const TfLiteRegistration* registration,
+                             int node_index) {
+  // We need to allocate a new TfLiteOperator object and then
   // populate its state correctly, based on the contents in 'registration'.
 
-  auto* registration_external = TfLiteRegistrationExternalCreate(
+  auto* registration_external = TfLiteOperatorCreate(
       static_cast<TfLiteBuiltinOperator>(registration->builtin_code),
       registration->custom_name, registration->version);
 
@@ -45,9 +46,8 @@ TfLiteRegistrationExternal* MakeRegistrationExternal(
 
 }  // anonymous namespace
 
-TfLiteRegistrationExternal*
-CommonOpaqueConversionUtil::CachedObtainRegistrationExternal(
-    RegistrationExternalsCache* registration_externals_cache,
+TfLiteOperator* CommonOpaqueConversionUtil::CachedObtainOperator(
+    OperatorsCache* registration_externals_cache,
     const TfLiteRegistration* registration, int node_index) {
   OpResolver::OpId op_id{registration->builtin_code, registration->custom_name,
                          registration->version};
@@ -55,25 +55,22 @@ CommonOpaqueConversionUtil::CachedObtainRegistrationExternal(
   if (it != registration_externals_cache->end()) {
     return it->second.get();
   }
-  auto* registration_external =
-      MakeRegistrationExternal(registration, node_index);
+  auto* registration_external = MakeOperator(registration, node_index);
   registration_externals_cache->insert(
       it, std::make_pair(op_id, registration_external));
 
   return registration_external;
 }
 
-TfLiteRegistrationExternal*
-CommonOpaqueConversionUtil::ObtainRegistrationExternal(
+TfLiteOperator* CommonOpaqueConversionUtil::ObtainOperator(
     TfLiteContext* context, const TfLiteRegistration* registration,
     int node_index) {
   auto* subgraph = static_cast<tflite::Subgraph*>(context->impl_);
   if (!subgraph->registration_externals_) {
-    subgraph->registration_externals_ =
-        std::make_shared<RegistrationExternalsCache>();
+    subgraph->registration_externals_ = std::make_shared<OperatorsCache>();
   }
-  return CachedObtainRegistrationExternal(
-      subgraph->registration_externals_.get(), registration, node_index);
+  return CachedObtainOperator(subgraph->registration_externals_.get(),
+                              registration, node_index);
 }
 
 }  // namespace internal
diff --git a/tensorflow/lite/c/c_api_opaque_internal.h b/tensorflow/lite/c/c_api_opaque_internal.h
index 08989145b2fb95..54647b676b3736 100644
--- a/tensorflow/lite/c/c_api_opaque_internal.h
+++ b/tensorflow/lite/c/c_api_opaque_internal.h
@@ -31,32 +31,31 @@ class CommonOpaqueConversionUtil {
  public:
   CommonOpaqueConversionUtil() = delete;
 
-  // Obtain (or create) a 'TfLiteRegistrationExternal' object that corresponds
+  // Obtain (or create) a 'TfLiteOperator' object that corresponds
   // to the provided 'registration' argument, and return the address of the
   // external registration.  We loosely define that a
-  // 'TfLiteRegistrationExternal' object "corresponds" to a 'TfLiteRegistration'
+  // 'TfLiteOperator' object "corresponds" to a 'TfLiteRegistration'
   // object when calling any function pointer (like 'prepare') on the
-  // 'TfLiteRegistrationExternal' object calls into the corresponding function
+  // 'TfLiteOperator' object calls into the corresponding function
   // pointer of the 'TfLiteRegistration' object.
   //
   // The specified 'context' or 'op_resolver' object is used to store the
-  // 'TfLiteRegistrationExternal*' pointers. The 'TfLiteRegistrationExternal*'
+  // 'TfLiteOperator*' pointers. The 'TfLiteOperator*'
   // pointer will be deallocated when that object gets destroyed.  I.e., the
   // caller of this function should not deallocate the object pointed to by the
-  // return value of 'ObtainRegistrationExternal'.
+  // return value of 'ObtainOperator'.
   //
   // We also need to provide the 'node_index' that the 'registration'
-  // corresponds to, so that the 'TfLiteRegistrationExternal' can store that
+  // corresponds to, so that the 'TfLiteOperator' can store that
   // index within its fields.  If the registration does not yet correspond
   // to a specific node index, then 'node_index' should be -1.
-  static TfLiteRegistrationExternal* ObtainRegistrationExternal(
-      TfLiteContext* context, const TfLiteRegistration* registration,
-      int node_index);
+  static TfLiteOperator* ObtainOperator(TfLiteContext* context,
+                                        const TfLiteRegistration* registration,
+                                        int node_index);
 
  private:
-  static TfLiteRegistrationExternal* CachedObtainRegistrationExternal(
-      ::tflite::internal::RegistrationExternalsCache*
-          registration_externals_cache,
+  static TfLiteOperator* CachedObtainOperator(
+      ::tflite::internal::OperatorsCache* registration_externals_cache,
       const TfLiteRegistration* registration, int node_index);
 };
 
diff --git a/tensorflow/lite/c/c_api_opaque_internal_test.cc b/tensorflow/lite/c/c_api_opaque_internal_test.cc
index 0965b85f173c60..8e63b045de44bb 100644
--- a/tensorflow/lite/c/c_api_opaque_internal_test.cc
+++ b/tensorflow/lite/c/c_api_opaque_internal_test.cc
@@ -43,9 +43,8 @@ TEST(ObtainRegistrationFromContext, ProducesValidResult) {
   TfLiteContext* context = interpreter->primary_subgraph().context();
   const TfLiteRegistration* registration = tflite::ops::builtin::Register_ADD();
 
-  TfLiteRegistrationExternal* registration_external =
-      CommonOpaqueConversionUtil::ObtainRegistrationExternal(context,
-                                                             registration, 42);
+  TfLiteOperator* registration_external =
+      CommonOpaqueConversionUtil::ObtainOperator(context, registration, 42);
 
   ASSERT_EQ(registration_external->builtin_code, kTfLiteBuiltinAdd);
   ASSERT_EQ(registration_external->version, registration->version);
@@ -66,11 +65,9 @@ TEST(ObtainRegistrationFromContext, CachingWorks) {
   const TfLiteRegistration* registration = tflite::ops::builtin::Register_ADD();
 
   // Call it twice, and verify that we get the same result back.
-  TfLiteRegistrationExternal* registration_external1 =
-      CommonOpaqueConversionUtil::ObtainRegistrationExternal(context,
-                                                             registration, 0);
-  TfLiteRegistrationExternal* registration_external2 =
-      CommonOpaqueConversionUtil::ObtainRegistrationExternal(context,
-                                                             registration, 1);
+  TfLiteOperator* registration_external1 =
+      CommonOpaqueConversionUtil::ObtainOperator(context, registration, 0);
+  TfLiteOperator* registration_external2 =
+      CommonOpaqueConversionUtil::ObtainOperator(context, registration, 1);
   ASSERT_EQ(registration_external1, registration_external2);
 }
diff --git a/tensorflow/lite/c/common_internal.h b/tensorflow/lite/c/common_internal.h
index 3a933707d50a63..b47c38b6922e5a 100644
--- a/tensorflow/lite/c/common_internal.h
+++ b/tensorflow/lite/c/common_internal.h
@@ -24,11 +24,11 @@ limitations under the License.
 // NOTE: This header does not follow C conventions and does not define a C API.
 // It is effectively an (internal) implementation detail of the C API.
 
-// `TfLiteRegistrationExternal` is an external version of `TfLiteRegistration`
+// `TfLiteOperator` is an external version of `TfLiteRegistration`
 // for C API which doesn't use internal types (such as `TfLiteContext`) but only
 // uses stable API types (such as `TfLiteOpaqueContext`). The purpose of each
 // field is the exactly the same as with `TfLiteRegistration`.
-typedef struct TfLiteRegistrationExternal {
+typedef struct TfLiteOperator {
   // Custom op name.  This should be non-null iff the op is a custom op,
   // i.e. iff builtin_code is kTfLiteBuiltinCustom.
   const char* custom_name;
@@ -81,7 +81,7 @@ typedef struct TfLiteRegistrationExternal {
   // Indicates if an operator's output can safely overwrite its input.
   // See the comments in `TfLiteInPlaceOp`.
   uint64_t inplace_operator;
-} TfLiteRegistrationExternal;
+} TfLiteOperator;
 
 // Returns true iff it's safe to dereference
 // 'delegate->opaque_delegate_builder'.
diff --git a/tensorflow/lite/core/BUILD b/tensorflow/lite/core/BUILD
index e5ace050bb9a10..4b8859346ad71f 100644
--- a/tensorflow/lite/core/BUILD
+++ b/tensorflow/lite/core/BUILD
@@ -1,8 +1,8 @@
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_copts_warnings", "tflite_self_contained_libs_test_suite")
 load("//tensorflow/lite:special_rules.bzl", "internal_visibility_allowlist", "tflite_portable_test_suite")
 load("//tensorflow/lite/core:special_rules.bzl", "macros_visibility_allowlist")
-load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/core/api/BUILD b/tensorflow/lite/core/api/BUILD
index a0e28f1ccaaf8b..1d6e1ca1eed47a 100644
--- a/tensorflow/lite/core/api/BUILD
+++ b/tensorflow/lite/core/api/BUILD
@@ -1,6 +1,6 @@
+load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 load("//tensorflow/lite:special_rules.bzl", "op_resolver_internal_visibility_allowlist")
-load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -154,6 +154,7 @@ cc_test(
         ":api",
         "//tensorflow/lite:string",
         "//tensorflow/lite/c:c_api_types",
+        "//tensorflow/lite/core/c:c_api_types",
         "//tensorflow/lite/core/c:common",
         "//tensorflow/lite/schema:schema_fbs",
         "@com_google_googletest//:gtest_main",
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc
index 10feeb3fc2c7dd..d36c2b69f4058a 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc
@@ -1017,6 +1017,9 @@ TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type,
     case TensorType_FLOAT16:
       *type = kTfLiteFloat16;
       return kTfLiteOk;
+    case TensorType_BFLOAT16:
+      *type = kTfLiteBFloat16;
+      return kTfLiteOk;
     case TensorType_FLOAT32:
       *type = kTfLiteFloat32;
       return kTfLiteOk;
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions_test.cc b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc
index 6e08e6880e5522..87c897dfc0928e 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions_test.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "flatbuffers/flatbuffer_builder.h"  // from @flatbuffers
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/core/c/builtin_op_data.h"
+#include "tensorflow/lite/core/c/c_api_types.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_type.h"
 
@@ -189,6 +190,13 @@ TEST_F(FlatbufferConversionsTest, TestConvertTensorTypeFloat16) {
   EXPECT_EQ(kTfLiteFloat16, type);
 }
 
+TEST_F(FlatbufferConversionsTest, TestConvertTensorTypeBFloat16) {
+  TfLiteType type;
+  EXPECT_EQ(kTfLiteOk,
+            ConvertTensorType(TensorType_BFLOAT16, &type, &mock_reporter_));
+  EXPECT_EQ(kTfLiteBFloat16, type);
+}
+
 TEST_F(FlatbufferConversionsTest, TestConvertTensorTypeInt4) {
   TfLiteType type;
   EXPECT_EQ(kTfLiteOk,
diff --git a/tensorflow/lite/core/api/op_resolver.h b/tensorflow/lite/core/api/op_resolver.h
index b43f1adc664ada..7aff7cafea1783 100644
--- a/tensorflow/lite/core/api/op_resolver.h
+++ b/tensorflow/lite/core/api/op_resolver.h
@@ -34,7 +34,7 @@ class Subgraph;            // For friend declaration below.
 
 namespace internal {
 class CommonOpaqueConversionUtil;  // For friend declaration below.
-class RegistrationExternalsCache;  // Forward decl.
+class OperatorsCache;              // Forward decl.
 }  // namespace internal
 #endif
 
@@ -45,7 +45,7 @@ class RegistrationExternalsCache;  // Forward decl.
 /// The lifetime of the TfLiteRegistration object whose address is
 /// returned by FindOp must exceed the lifetime of any InterpreterBuilder or
 /// Interpreter created with this OpResolver.
-/// Likewise the lifetime of the TfLiteRegistrationExternal object referenced
+/// Likewise the lifetime of the TfLiteOperator object referenced
 /// from the TfLiteRegistration object, if any, must exceed the lifetime of
 /// any InterpreterBuilder or Interpreter created with this OpResolver.
 class OpResolver {
@@ -139,11 +139,11 @@ class OpResolver {
   friend class OpResolverInternal;
   friend class Subgraph;  // For OpId.
   friend class tflite::internal::CommonOpaqueConversionUtil;
-  friend class tflite::internal::RegistrationExternalsCache;
+  friend class tflite::internal::OperatorsCache;
 #endif
 
   // This holds the identity of an operator.
-  // Ths is used as the key for the RegistrationExternalsCache below.
+  // Ths is used as the key for the OperatorsCache below.
   struct OpId {
     int builtin_code;
     const char* custom_name;
@@ -176,33 +176,33 @@ class OpResolver {
     };
   };
 
-  // A set of 'TfLiteRegistrationExternal' objects whose lifetimes need to
+  // A set of 'TfLiteOperator' objects whose lifetimes need to
   // last at least as long as the lifetime of the OpResolver.
   // We use shared_ptr rather than unique_ptr here, to allow the
-  // RegistrationExternalsCache to be shared with other classes such as the
+  // OperatorsCache to be shared with other classes such as the
   // InterpreterBuilder and Interpreter. This is so that the
-  // TfLiteRegistrationExternal objects allocated by an OpResolver,
+  // TfLiteOperator objects allocated by an OpResolver,
   // which may be referenced by a Subgraph in an Interpreter, can remain live
   // even if the OpResolver is destroyed, while also allowing the same
   // OpResolver to be used with multiple InterpreterBuilders and multiple
   // Interpreters.
-  mutable std::shared_ptr<internal::RegistrationExternalsCache>
+  mutable std::shared_ptr<internal::OperatorsCache>
       registration_externals_cache_;
 };
 
 #ifndef DOXYGEN_SKIP
-// Type for a set of owned 'TfLiteRegistrationExternal' objects.
+// Type for a set of owned 'TfLiteOperator' objects.
 // This is needed when converting TfLiteRegistration to
-// TfLiteRegistrationExternal, to ensure that the number of
-// TfLiteRegistrationExternal objects that we allocate is bounded, and to
+// TfLiteOperator, to ensure that the number of
+// TfLiteOperator objects that we allocate is bounded, and to
 // ensure that those objects get deallocated at the appropriate time.
 // We use a public class rather than a typedef or using declaration here,
 // to ensure that the class can be forward-declared.
 // WARNING: Experimental interface, subject to change.
 namespace internal {
-class RegistrationExternalsCache
+class OperatorsCache
     : private std::unordered_map<OpResolver::OpId,
-                                 std::unique_ptr<TfLiteRegistrationExternal>,
+                                 std::unique_ptr<TfLiteOperator>,
                                  OpResolver::OpId::Hasher> {
   friend class ::tflite::Subgraph;
   friend class ::tflite::internal::CommonOpaqueConversionUtil;
diff --git a/tensorflow/lite/core/api/op_resolver_internal.h b/tensorflow/lite/core/api/op_resolver_internal.h
index 9f6b893e5a6940..b30debe944eea9 100644
--- a/tensorflow/lite/core/api/op_resolver_internal.h
+++ b/tensorflow/lite/core/api/op_resolver_internal.h
@@ -35,13 +35,12 @@ class OpResolverInternal {
     return op_resolver.MayContainUserDefinedOps();
   }
 
-  // Get a shared_ptr to the RegistrationExternalsCache from an OpResolver.
+  // Get a shared_ptr to the OperatorsCache from an OpResolver.
   // This is used to allow the InterpreterBuilder and OpResolver to share
-  // the same RegistrationExternalsCache, so that the RegistrationExternal
-  // objects in it can persist for the lifetimes of both the InterpreterBuilder
-  // and OpResolver.
-  static std::shared_ptr<::tflite::internal::RegistrationExternalsCache>
-  GetSharedCache(const ::tflite::OpResolver& op_resolver) {
+  // the same OperatorsCache, so that the Operator objects in it can persist
+  // for the lifetimes of both the InterpreterBuilder and OpResolver.
+  static std::shared_ptr<::tflite::internal::OperatorsCache> GetSharedCache(
+      const ::tflite::OpResolver& op_resolver) {
     return op_resolver.registration_externals_cache_;
   }
 };
diff --git a/tensorflow/lite/core/async/async_kernel_internal.h b/tensorflow/lite/core/async/async_kernel_internal.h
index 2ce473a029e516..efc341be8b3f0c 100644
--- a/tensorflow/lite/core/async/async_kernel_internal.h
+++ b/tensorflow/lite/core/async/async_kernel_internal.h
@@ -15,6 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_CORE_ASYNC_ASYNC_KERNEL_INTERNAL_H_
 #define TENSORFLOW_LITE_CORE_ASYNC_ASYNC_KERNEL_INTERNAL_H_
 
+#include <cstddef>
 #include <cstdint>
 #include <vector>
 
@@ -115,6 +116,16 @@ struct TfLiteAsyncKernel {
                                  TfLiteOpaqueNode* node, int tensor_index,
                                  const TfLiteAttributeMap* attrs) = nullptr;
 
+  // Set attributes to the buffer, backend kernel will validate the buffer.
+  TfLiteStatus (*set_buffer_attributes)(
+      TfLiteAsyncKernel* async_kernel, const TfLiteBackendBuffer* buffer,
+      const TfLiteAttributeMap* attrs) = nullptr;
+
+  // Get attributes from the buffer, backend kernel will validate the buffer.
+  TfLiteStatus (*get_buffer_attributes)(TfLiteAsyncKernel* async_kernel,
+                                        const TfLiteBackendBuffer* buffer,
+                                        TfLiteAttributeMap* attrs) = nullptr;
+
   // Prepares the kernel using the information from Set[In|Out]putAttributes
   // call above.
   TfLiteStatus (*prepare)(TfLiteAsyncKernel* async_kernel,
diff --git a/tensorflow/lite/core/async/async_signature_runner.cc b/tensorflow/lite/core/async/async_signature_runner.cc
index d87f9f40683ab1..ad1ee15a601bdb 100644
--- a/tensorflow/lite/core/async/async_signature_runner.cc
+++ b/tensorflow/lite/core/async/async_signature_runner.cc
@@ -137,6 +137,16 @@ TfLiteStatus AsyncSignatureRunner::SetAttributes(
   return async_subgraph_->SetAttributes(tensor_index, attrs);
 }
 
+TfLiteStatus AsyncSignatureRunner::SetBufferAttributes(
+    const TfLiteBackendBuffer* buffer, const TfLiteAttributeMap* attrs) {
+  return async_subgraph_->SetBufferAttributes(buffer, attrs);
+}
+
+TfLiteStatus AsyncSignatureRunner::GetBufferAttributes(
+    const TfLiteBackendBuffer* buffer, TfLiteAttributeMap* attrs) {
+  return async_subgraph_->GetBufferAttributes(buffer, attrs);
+}
+
 TfLiteStatus AsyncSignatureRunner::PrepareBackends() {
   return async_subgraph_->Prepare();
 }
diff --git a/tensorflow/lite/core/async/async_signature_runner.h b/tensorflow/lite/core/async/async_signature_runner.h
index b23a460debea29..d0a85a6c7eb038 100644
--- a/tensorflow/lite/core/async/async_signature_runner.h
+++ b/tensorflow/lite/core/async/async_signature_runner.h
@@ -122,6 +122,17 @@ class AsyncSignatureRunner {
   // Returns true if all backends accept the `attrs`.
   TfLiteStatus SetAttributes(int tensor_index, const TfLiteAttributeMap* attrs);
 
+  // Set the attributes of a specific buffer. Returns
+  // kTfLiteDelegateError if the buffer is not registered.
+  TfLiteStatus SetBufferAttributes(const TfLiteBackendBuffer* buffer,
+                                   const TfLiteAttributeMap* attrs);
+
+  // Get the attributes from a specific buffer. Returns
+  // kTfLiteDelegateError if the buffer has not been found in the
+  // backends.
+  TfLiteStatus GetBufferAttributes(const TfLiteBackendBuffer* buffer,
+                                   TfLiteAttributeMap* attrs);
+
   // Prepares delegate backends for execution.
   // Must be called after calling `SetAttributes`.
   TfLiteStatus PrepareBackends();
diff --git a/tensorflow/lite/core/async/async_subgraph.cc b/tensorflow/lite/core/async/async_subgraph.cc
index 7a575372049a07..11fcef091be9ff 100644
--- a/tensorflow/lite/core/async/async_subgraph.cc
+++ b/tensorflow/lite/core/async/async_subgraph.cc
@@ -179,6 +179,16 @@ TfLiteStatus AsyncSubgraph::SetAttributes(int tensor_index,
                                           opaque_node_, tensor_index, attrs);
 }
 
+TfLiteStatus AsyncSubgraph::SetBufferAttributes(
+    const TfLiteBackendBuffer* buffer, const TfLiteAttributeMap* attrs) {
+  return (*async_kernel_->set_buffer_attributes)(async_kernel_, buffer, attrs);
+}
+
+TfLiteStatus AsyncSubgraph::GetBufferAttributes(
+    const TfLiteBackendBuffer* buffer, TfLiteAttributeMap* attrs) {
+  return (*async_kernel_->get_buffer_attributes)(async_kernel_, buffer, attrs);
+}
+
 TfLiteStatus AsyncSubgraph::Prepare() {
   if (async_kernel() == nullptr) return kTfLiteError;
   return (*async_kernel_->prepare)(async_kernel_, opaque_context(),
diff --git a/tensorflow/lite/core/async/async_subgraph.h b/tensorflow/lite/core/async/async_subgraph.h
index edf87ecaae72d1..cf4f3c905ca381 100644
--- a/tensorflow/lite/core/async/async_subgraph.h
+++ b/tensorflow/lite/core/async/async_subgraph.h
@@ -109,6 +109,24 @@ class AsyncSubgraph {
   // Returns true if all backends accept the `attrs`.
   TfLiteStatus SetAttributes(int tensor_index, const TfLiteAttributeMap* attrs);
 
+  // Set the attributes for a specific buffer. `attrs` should be initialized
+  // before calling this function and could be constructed by calling
+  // TfLiteAttributeMapCreate(). The attributes will be sent to backend kernels
+  // and stored in the map with the buffer. `buffer` and `attrs` should not be
+  // nullptr. The buffer needs to be registered before calling this function.
+  TfLiteStatus SetBufferAttributes(const TfLiteBackendBuffer* buffer,
+                                   const TfLiteAttributeMap* attrs);
+
+  // Get the attributes for a specific buffer. `attrs` should be initialized
+  // before calling this function and could be constructed by calling
+  // TfLiteAttributeMapCreate(). `attrs` will be used to store the attributes
+  // obtained from the backend kernel. If `attrs` is a non-empty map, it will be
+  // overwritten by the attributes of the buffer. `buffer` and `attrs` should
+  // not be nullptr. The buffer needs to be registered before calling this
+  // function.
+  TfLiteStatus GetBufferAttributes(const TfLiteBackendBuffer* buffer,
+                                   TfLiteAttributeMap* attrs);
+
   // Prepares delegate backends for execution.
   // Must be called after calling `SetAttributes`.
   TfLiteStatus Prepare();
diff --git a/tensorflow/lite/core/async/c/BUILD b/tensorflow/lite/core/async/c/BUILD
index af618aaa22300c..e9a8bf9ae6c7cc 100644
--- a/tensorflow/lite/core/async/c/BUILD
+++ b/tensorflow/lite/core/async/c/BUILD
@@ -2,13 +2,13 @@
 # For clients using async APIs, please use tensorflow/lite/async/c instead of this package.
 # NOTE: Targets in this package are experimental.
 
+load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load(
     "//tensorflow/lite:build_def.bzl",
     "tflite_cc_library_with_c_headers_test",
     "tflite_copts",
     "tflite_copts_warnings",
 )
-load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/core/async/c/async_kernel.cc b/tensorflow/lite/core/async/c/async_kernel.cc
index e220014954926c..08e6f0f2f8581f 100644
--- a/tensorflow/lite/core/async/c/async_kernel.cc
+++ b/tensorflow/lite/core/async/c/async_kernel.cc
@@ -100,6 +100,24 @@ void TfLiteAsyncKernelSetSetAttributes(
   async_kernel->set_attributes = set_attributes;
 }
 
+void TfLiteAsyncKernelSetSetBufferAttributes(
+    TfLiteAsyncKernel* async_kernel,
+    TfLiteStatus (*set_buffer_attributes)(TfLiteAsyncKernel* async_kernel,
+                                          const TfLiteBackendBuffer* buffer,
+                                          const TfLiteAttributeMap* attrs)) {
+  if (!async_kernel) return;
+  async_kernel->set_buffer_attributes = set_buffer_attributes;
+}
+
+void TfLiteAsyncKernelSetGetBufferAttributes(
+    TfLiteAsyncKernel* async_kernel,
+    TfLiteStatus (*get_buffer_attributes)(TfLiteAsyncKernel* async_kernel,
+                                          const TfLiteBackendBuffer* buffer,
+                                          TfLiteAttributeMap* attrs)) {
+  if (!async_kernel) return;
+  async_kernel->get_buffer_attributes = get_buffer_attributes;
+};
+
 void TfLiteAsyncKernelSetPrepare(
     TfLiteAsyncKernel* async_kernel,
     TfLiteStatus (*prepare)(TfLiteAsyncKernel* async_kernel,
diff --git a/tensorflow/lite/core/async/c/async_kernel.h b/tensorflow/lite/core/async/c/async_kernel.h
index 1b3c76acee6324..e53eca75c70f65 100644
--- a/tensorflow/lite/core/async/c/async_kernel.h
+++ b/tensorflow/lite/core/async/c/async_kernel.h
@@ -178,6 +178,44 @@ TFL_CAPI_EXPORT extern void TfLiteAsyncKernelSetSetAttributes(
                                    TfLiteOpaqueNode* node, int tensor_index,
                                    const TfLiteAttributeMap* attrs));
 
+/// Sets the callback for the backend to set buffer attributes.
+///
+/// `set_buffer_attributes`:
+/// Sets the attributes of the buffers.
+/// Backend kernel will check if the provided buffer has been registered, and
+/// update the map in the backend, so that the callers can retrieve specific
+/// buffer's attributes. `attrs` should be initialized
+/// before calling this function and could be constructed by calling
+/// TfLiteAttributeMapCreate(). The attributes will be sent to backend kernels
+/// and stored in the map with the buffer. `buffer` and `attrs` should not be
+/// nullptr. The buffer needs to be registered before calling this
+/// function. Returns kTfLiteOk if the buffer has been registered and
+/// callers can successfully set the attributes for a buffer.
+TFL_CAPI_EXPORT extern void TfLiteAsyncKernelSetSetBufferAttributes(
+    TfLiteAsyncKernel* async_kernel,
+    TfLiteStatus (*set_buffer_attributes)(TfLiteAsyncKernel* async_kernel,
+                                          const TfLiteBackendBuffer* buffer,
+                                          const TfLiteAttributeMap* attrs));
+
+/// Sets the callback for the backend to get buffer attributes.
+///
+/// `get_buffer_attributes`:
+/// Gets the attributes of the buffers.
+/// Backend kernel will check if the provided buffer has been registered, and
+/// get the corresponding attributes from the map. `attrs` should be initialized
+/// before calling this function and could be constructed by calling
+/// TfLiteAttributeMapCreate(). `attrs` will be used to store the attributes
+/// obtained from the backend kernel. If `attrs` is a non-empty map, it will be
+/// overwritten by the attributes of the buffer. `buffer` and `attrs` should not
+/// be nullptr. The buffer needs to be registered before calling this function.
+/// Returns kTfLiteOk if the buffer has been registered and callers can
+/// successfully get the attributes for a buffer.
+TFL_CAPI_EXPORT extern void TfLiteAsyncKernelSetGetBufferAttributes(
+    TfLiteAsyncKernel* async_kernel,
+    TfLiteStatus (*get_buffer_attributes)(TfLiteAsyncKernel* async_kernel,
+                                          const TfLiteBackendBuffer* buffer,
+                                          TfLiteAttributeMap* attrs));
+
 /// Sets the callback to prepare the kernels using the information from
 /// `set_attributes` calls.
 TFL_CAPI_EXPORT extern void TfLiteAsyncKernelSetPrepare(
diff --git a/tensorflow/lite/core/async/interop/c/BUILD b/tensorflow/lite/core/async/interop/c/BUILD
index 54f37ed48b1177..13f81e545b46bb 100644
--- a/tensorflow/lite/core/async/interop/c/BUILD
+++ b/tensorflow/lite/core/async/interop/c/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/core/c/BUILD b/tensorflow/lite/core/c/BUILD
index 7f10784c10ddad..1b0bccb9e4857e 100644
--- a/tensorflow/lite/core/c/BUILD
+++ b/tensorflow/lite/core/c/BUILD
@@ -1,3 +1,4 @@
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load(
     "//tensorflow/lite:build_def.bzl",
@@ -10,7 +11,6 @@ load(
     "c_api_visibility_allowlist",
     "common_header_visibility_allowlist",
 )
-load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -25,7 +25,7 @@ exports_files(
         "c_api_opaque.h",
         "c_api_types.h",
         "common.h",
-        "registration_external.h",
+        "operator.h",
     ],
     visibility = [
         "//tensorflow/lite:__subpackages__",
@@ -44,7 +44,7 @@ filegroup(
         "c_api.h",
         "c_api_types.h",
         "common.h",
-        "registration_external.h",
+        "operator.h",
     ],
 )
 
@@ -73,7 +73,7 @@ tflite_cc_library_with_c_headers_test(
     deps = [
         ":c_api_types",
         ":c_api_without_op_resolver",
-        ":registration_external",
+        ":operator",
         "//tensorflow/lite:builtin_ops",
         "//tensorflow/lite:create_op_resolver_with_builtin_ops",
         "//tensorflow/lite/c:common",
@@ -93,7 +93,7 @@ tflite_cc_library_with_c_headers_test(
     ],
     deps = [
         ":c_api_types",
-        ":registration_external",
+        ":operator",
         "//tensorflow/lite:builtin_ops",
         "//tensorflow/lite:version",
         "//tensorflow/lite/c:c_api_internal",
@@ -121,7 +121,7 @@ tflite_cc_library_with_c_headers_test(
     ],
     deps = [
         ":c_api_types",
-        ":registration_external",
+        ":operator",
         "//tensorflow/lite:builtin_ops",
         "//tensorflow/lite/core/async/c:types",
     ],
@@ -137,7 +137,7 @@ tflite_cc_library_with_c_headers_test(
     tags = ["allow_undefined_symbols"],  # For tflite::CreateOpResolver().
     deps = [
         ":c_api_types",
-        ":registration_external_without_alwayslink",
+        ":operator_without_alwayslink",
         "//tensorflow/lite:builtin_ops",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:version",
@@ -181,6 +181,7 @@ cc_test(
         ":c_api_experimental",
         ":c_api_types",
         ":common",
+        "//tensorflow/lite:builtin_ops",
         "//tensorflow/lite:string_util",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/core:subgraph",
@@ -208,6 +209,7 @@ cc_test(
         ":c_api_experimental",
         ":c_api_types",
         ":common",
+        "//tensorflow/lite:builtin_ops",
         "//tensorflow/lite:string_util",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/core:subgraph",
@@ -234,6 +236,7 @@ cc_test(
         ":c_api_types",
         ":c_api_without_op_resolver_without_alwayslink",
         ":common",
+        "//tensorflow/lite:builtin_ops",
         "//tensorflow/lite:string_util",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/c:selectively_built_c_api_test_lib",
@@ -306,7 +309,7 @@ cc_test(
         ":c_api",
         ":c_api_types",
         ":common",
-        ":registration_external",
+        ":operator",
         "//tensorflow/lite:builtin_ops",
         "//tensorflow/lite:string_util",
         "//tensorflow/lite:util",
@@ -378,6 +381,7 @@ tflite_cc_library_with_c_headers_test(
         ":c_api_opaque",
         ":c_api_types",
         ":common",
+        ":operator",
         "//tensorflow/lite:builtin_ops",
         "//tensorflow/lite:signature_runner",
         "//tensorflow/lite/c:c_api_internal",
@@ -413,6 +417,7 @@ tflite_cc_library_with_c_headers_test(
         ":c_api_types",
         ":c_api_without_op_resolver",
         ":common",
+        ":operator",
         "//tensorflow/lite:builtin_ops",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:kernel_api",
@@ -466,6 +471,7 @@ tflite_cc_library_with_c_headers_test(
         ":c_api_types",
         ":c_api_without_op_resolver_without_alwayslink",
         ":common",
+        ":operator_without_alwayslink",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:kernel_api",
         "//tensorflow/lite:string_util",
@@ -507,6 +513,7 @@ tflite_cc_library_with_c_headers_test(
         ":c_api_opaque_without_op_resolver",
         ":c_api_types",
         ":common",
+        ":operator",
         "//tensorflow/lite:string_util",
         "//tensorflow/lite:util",
         "//tensorflow/lite/c:c_api_opaque_internal",
@@ -530,6 +537,7 @@ tflite_cc_library_with_c_headers_test(
         ":c_api_types",
         ":c_api_without_op_resolver",
         ":common",
+        ":operator",
         "//tensorflow/lite:string_util",
         "//tensorflow/lite:util",
         "//tensorflow/lite/c:c_api_opaque_internal",
@@ -566,6 +574,7 @@ tflite_cc_library_with_c_headers_test(
         ":c_api_types",
         ":c_api_without_op_resolver_without_alwayslink",
         ":common",
+        ":operator_without_alwayslink",
         "//tensorflow/lite:string_util",
         "//tensorflow/lite:util",
         "//tensorflow/lite/c:c_api_opaque_internal_without_alwayslink",
@@ -609,9 +618,9 @@ cc_test(
 )
 
 tflite_cc_library_with_c_headers_test(
-    name = "registration_external",
-    srcs = ["registration_external.cc"],
-    hdrs = ["registration_external.h"],
+    name = "operator",
+    srcs = ["operator.cc"],
+    hdrs = ["operator.h"],
     compatible_with = get_compatible_with_portable(),
     copts = tflite_copts(),
     visibility = [
@@ -630,9 +639,9 @@ tflite_cc_library_with_c_headers_test(
 )
 
 tflite_cc_library_with_c_headers_test(
-    name = "registration_external_without_alwayslink",
-    srcs = ["registration_external.cc"],
-    hdrs = ["registration_external.h"],
+    name = "operator_without_alwayslink",
+    srcs = ["operator.cc"],
+    hdrs = ["operator.h"],
     compatible_with = get_compatible_with_portable(),
     copts = tflite_copts(),
     deps = [
diff --git a/tensorflow/lite/core/c/c_api.cc b/tensorflow/lite/core/c/c_api.cc
index e5150cc69c35bb..5b2011e5757e89 100644
--- a/tensorflow/lite/core/c/c_api.cc
+++ b/tensorflow/lite/core/c/c_api.cc
@@ -132,9 +132,8 @@ void TfLiteInterpreterOptionsSetErrorReporter(
   options->error_reporter_callback.user_data = user_data;
 }
 
-void TfLiteInterpreterOptionsAddRegistrationExternal(
-    TfLiteInterpreterOptions* options,
-    TfLiteRegistrationExternal* registration) {
+void TfLiteInterpreterOptionsAddOperator(TfLiteInterpreterOptions* options,
+                                         TfLiteOperator* registration) {
   options->op_registrations.push_back(registration);
 }
 
@@ -144,9 +143,8 @@ TfLiteStatus TfLiteInterpreterOptionsEnableCancellation(
   return kTfLiteOk;
 }
 
-static void InitTfLiteRegistration(
-    TfLiteRegistration* registration,
-    TfLiteRegistrationExternal* registration_external) {
+static void InitTfLiteRegistration(TfLiteRegistration* registration,
+                                   TfLiteOperator* registration_external) {
   registration->builtin_code = registration_external->builtin_code;
   registration->custom_name = registration_external->custom_name;
   registration->version = registration_external->version;
@@ -282,13 +280,13 @@ TfLiteStatus TfLiteTensorCopyToBuffer(const TfLiteTensor* tensor,
 namespace tflite {
 namespace internal {
 
-static TfLiteRegistration* RegistrationExternalToRegistration(
-    const TfLiteRegistrationExternal* registration_external) {
-  // All TfLiteRegistrationExternal objects are dynamically allocated via
-  // TfLiteRegistrationExternalCreate(), so they are guaranteed
+static TfLiteRegistration* OperatorToRegistration(
+    const TfLiteOperator* registration_external) {
+  // All TfLiteOperator objects are dynamically allocated via
+  // TfLiteOperatorCreate(), so they are guaranteed
   // to be mutable, hence the const_cast below should be safe.
   auto registration_external_non_const =
-      const_cast<TfLiteRegistrationExternal*>(registration_external);
+      const_cast<TfLiteOperator*>(registration_external);
   TfLiteRegistration* new_registration = new TfLiteRegistration{};
   InitTfLiteRegistration(new_registration, registration_external_non_const);
   return new_registration;
@@ -311,10 +309,10 @@ const TfLiteRegistration* CallbackOpResolver::FindOp(tflite::BuiltinOperator op,
     }
   }
 
-  // Try using newer RegistrationExternal API.
+  // Try using newer Operator API.
   if (op_resolver_callbacks_.find_builtin_op_external) {
-    // Get a RegistrationExternal object and create a Registration (V4) object.
-    const TfLiteRegistrationExternal* registration_external =
+    // Get a Operator object and create a Registration (V4) object.
+    const TfLiteOperator* registration_external =
         op_resolver_callbacks_.find_builtin_op_external(
             op_resolver_callbacks_.user_data,
             static_cast<TfLiteBuiltinOperator>(op), version);
@@ -325,7 +323,7 @@ const TfLiteRegistration* CallbackOpResolver::FindOp(tflite::BuiltinOperator op,
          registration_external->prepare != nullptr ||
          registration_external->async_kernel != nullptr)) {
       TfLiteRegistration* new_registration =
-          RegistrationExternalToRegistration(registration_external);
+          OperatorToRegistration(registration_external);
       temporary_builtin_registrations_.push_back(
           std::unique_ptr<TfLiteRegistration>(new_registration));
       return new_registration;
@@ -375,8 +373,8 @@ const TfLiteRegistration* CallbackOpResolver::FindOp(const char* op,
   }
 
   if (op_resolver_callbacks_.find_custom_op_external) {
-    // Get a RegistrationExternal object and create a Registration (V3) object.
-    const TfLiteRegistrationExternal* registration_external =
+    // Get a Operator object and create a Registration (V3) object.
+    const TfLiteOperator* registration_external =
         op_resolver_callbacks_.find_custom_op_external(
             op_resolver_callbacks_.user_data, op, version);
     if (registration_external && (registration_external->init != nullptr ||
@@ -384,7 +382,7 @@ const TfLiteRegistration* CallbackOpResolver::FindOp(const char* op,
                                   registration_external->invoke != nullptr ||
                                   registration_external->prepare != nullptr)) {
       TfLiteRegistration* new_registration =
-          RegistrationExternalToRegistration(registration_external);
+          OperatorToRegistration(registration_external);
       temporary_builtin_registrations_.push_back(
           std::unique_ptr<TfLiteRegistration>(new_registration));
       return new_registration;
diff --git a/tensorflow/lite/core/c/c_api.h b/tensorflow/lite/core/c/c_api.h
index f7504a315f1bff..41726d2f6dc284 100644
--- a/tensorflow/lite/core/c/c_api.h
+++ b/tensorflow/lite/core/c/c_api.h
@@ -28,7 +28,7 @@ limitations under the License.
 #include "tensorflow/lite/builtin_ops.h"
 #include "tensorflow/lite/core/async/c/types.h"
 #include "tensorflow/lite/core/c/c_api_types.h"  // IWYU pragma: export
-#include "tensorflow/lite/core/c/registration_external.h"  // IWYU pragma: export
+#include "tensorflow/lite/core/c/operator.h"  // IWYU pragma: export
 
 /// C API for TensorFlow Lite.
 ///
@@ -266,19 +266,18 @@ TFL_CAPI_EXPORT extern void TfLiteInterpreterOptionsSetErrorReporter(
 
 /// Adds an op registration to be applied during `TfLiteInterpreter` creation.
 ///
-/// The `TfLiteRegistrationExternal` object is needed to implement custom op of
+/// The `TfLiteOperator` object is needed to implement custom op of
 /// TFLite Interpreter via C API. Calling this function ensures that any
 /// `TfLiteInterpreter` created with the specified `options` can execute models
 /// that use the custom operator specified in `registration`.
 /// Please refer https://www.tensorflow.org/lite/guide/ops_custom for custom op
 /// support.
-/// \note The caller retains ownership of the TfLiteRegistrationExternal object
+/// \note The caller retains ownership of the TfLiteOperator object
 /// and should ensure that it remains valid for the duration of any created
 /// interpreter's lifetime.
 /// \warning This is an experimental API and subject to change.
-TFL_CAPI_EXPORT extern void TfLiteInterpreterOptionsAddRegistrationExternal(
-    TfLiteInterpreterOptions* options,
-    TfLiteRegistrationExternal* registration);
+TFL_CAPI_EXPORT extern void TfLiteInterpreterOptionsAddOperator(
+    TfLiteInterpreterOptions* options, TfLiteOperator* registration);
 
 /// Enables users to cancel in-flight invocations with
 /// `TfLiteInterpreterCancel`.
diff --git a/tensorflow/lite/core/c/c_api_experimental.cc b/tensorflow/lite/core/c/c_api_experimental.cc
index 86348c28938b2d..d31d30bfee5033 100644
--- a/tensorflow/lite/core/c/c_api_experimental.cc
+++ b/tensorflow/lite/core/c/c_api_experimental.cc
@@ -75,11 +75,10 @@ void TfLiteInterpreterOptionsAddCustomOp(TfLiteInterpreterOptions* options,
 
 void TfLiteInterpreterOptionsSetOpResolverExternal(
     TfLiteInterpreterOptions* options,
-    const TfLiteRegistrationExternal* (*find_builtin_op)(void* user_data,
-                                                         int op, int version),
-    const TfLiteRegistrationExternal* (*find_custom_op)(void* user_data,
-                                                        const char* custom_op,
-                                                        int version),
+    const TfLiteOperator* (*find_builtin_op)(void* user_data, int op,
+                                             int version),
+    const TfLiteOperator* (*find_custom_op)(void* user_data,
+                                            const char* custom_op, int version),
     void* op_resolver_user_data) {
   options->op_resolver_callbacks = {};  // Sets all fields to null.
   options->op_resolver_callbacks.find_builtin_op_external = find_builtin_op;
@@ -89,10 +88,11 @@ void TfLiteInterpreterOptionsSetOpResolverExternal(
 
 void TfLiteInterpreterOptionsSetOpResolverExternalWithFallback(
     TfLiteInterpreterOptions* options,
-    const TfLiteRegistrationExternal* (*find_builtin_op_external)(
-        void* user_data, int op, int version),
-    const TfLiteRegistrationExternal* (*find_custom_op_external)(
-        void* user_data, const char* custom_op, int version),
+    const TfLiteOperator* (*find_builtin_op_external)(void* user_data, int op,
+                                                      int version),
+    const TfLiteOperator* (*find_custom_op_external)(void* user_data,
+                                                     const char* custom_op,
+                                                     int version),
     const TfLiteRegistration* (*find_builtin_op)(void* user_data,
                                                  TfLiteBuiltinOperator op,
                                                  int version),
diff --git a/tensorflow/lite/core/c/c_api_experimental.h b/tensorflow/lite/core/c/c_api_experimental.h
index c981c0fd1c7c1d..8037e967675f48 100644
--- a/tensorflow/lite/core/c/c_api_experimental.h
+++ b/tensorflow/lite/core/c/c_api_experimental.h
@@ -96,7 +96,7 @@ TFL_CAPI_EXPORT void TfLiteInterpreterOptionsAddCustomOp(
 /// The `TfLiteInterpreterOptionsSetOpResolverExternal` function provides an
 /// alternative method for registering builtin ops and/or custom ops, by
 /// providing operator resolver callbacks.  Unlike using
-/// `TfLiteInterpreterOptionsAddRegistrationExternal`,
+/// `TfLiteInterpreterOptionsAddOperator`,
 /// `TfLiteInterpreterOptionsAddBuiltinOp` and/or
 /// `TfLiteInterpreterOptionsAddAddCustomOp`, these let you register all the
 /// operators in a single call.
@@ -108,18 +108,17 @@ TFL_CAPI_EXPORT void TfLiteInterpreterOptionsAddCustomOp(
 /// If `op_resolver_user_data` is non-null, its lifetime must be at least as
 /// long as the lifetime of the `TfLiteInterpreterOptions`.
 ///
-/// The TfLiteRegistrationExternal objects whose addresses are returned by
+/// The TfLiteOperator objects whose addresses are returned by
 /// `find_builtin_op` and `find_custom_op` must outlive both the
 /// InterpreterOptions object and any Interpreter object created from it.
 ///
 /// WARNING: This is an experimental API and subject to change.
 void TfLiteInterpreterOptionsSetOpResolverExternal(
     TfLiteInterpreterOptions* options,
-    const TfLiteRegistrationExternal* (*find_builtin_op)(void* user_data,
-                                                         int op, int version),
-    const TfLiteRegistrationExternal* (*find_custom_op)(void* user_data,
-                                                        const char* custom_op,
-                                                        int version),
+    const TfLiteOperator* (*find_builtin_op)(void* user_data, int op,
+                                             int version),
+    const TfLiteOperator* (*find_custom_op)(void* user_data,
+                                            const char* custom_op, int version),
     void* op_resolver_user_data);
 
 /// \private
@@ -127,8 +126,8 @@ void TfLiteInterpreterOptionsSetOpResolverExternal(
 ///
 /// This combines the effects of TfLiteInterpreterOptionsSetOpResolverExternal
 /// and TfLiteInterpreterOptionsSetOpResolver.  The callbacks that return
-/// TfLiteRegistrationExternal will be called first, but if they return a
-/// TfLiteRegistrationExternal object that has no methods set, then
+/// TfLiteOperator will be called first, but if they return a
+/// TfLiteOperator object that has no methods set, then
 /// the callbacks that return a TfLiteRegistration will be called to get
 /// the methods.
 ///
@@ -139,10 +138,11 @@ void TfLiteInterpreterOptionsSetOpResolverExternal(
 /// TF Lite itself.
 void TfLiteInterpreterOptionsSetOpResolverExternalWithFallback(
     TfLiteInterpreterOptions* options,
-    const TfLiteRegistrationExternal* (*find_builtin_op_external)(
-        void* user_data, int op, int version),
-    const TfLiteRegistrationExternal* (*find_custom_op_external)(
-        void* user_data, const char* custom_op, int version),
+    const TfLiteOperator* (*find_builtin_op_external)(void* user_data, int op,
+                                                      int version),
+    const TfLiteOperator* (*find_custom_op_external)(void* user_data,
+                                                     const char* custom_op,
+                                                     int version),
     const TfLiteRegistration* (*find_builtin_op)(void* user_data,
                                                  TfLiteBuiltinOperator op,
                                                  int version),
diff --git a/tensorflow/lite/core/c/c_api_experimental_test.cc b/tensorflow/lite/core/c/c_api_experimental_test.cc
index c52ad232b6a9e7..bb9fa71485a3d1 100644
--- a/tensorflow/lite/core/c/c_api_experimental_test.cc
+++ b/tensorflow/lite/core/c/c_api_experimental_test.cc
@@ -50,10 +50,10 @@ const TfLiteRegistration* GetNoOpRegistration() {
   return &registration;
 }
 
-const TfLiteRegistrationExternal* GetNoOpRegistrationExternal() {
-  static TfLiteRegistrationExternal* registration =
-      TfLiteRegistrationExternalCreate(kTfLiteBuiltinCustom, "NoOp", 1);
-  TfLiteRegistrationExternalSetInvoke(
+const TfLiteOperator* GetNoOpOperator() {
+  static TfLiteOperator* registration =
+      TfLiteOperatorCreate(kTfLiteBuiltinCustom, "NoOp", 1);
+  TfLiteOperatorSetInvoke(
       registration,
       /*invoke=*/[](TfLiteOpaqueContext*, TfLiteOpaqueNode*) {
         return kTfLiteOk;
@@ -188,21 +188,20 @@ TEST(CApiExperimentalTest, SetOpResolver) {
   TfLiteModelDelete(model);
 }
 
-const TfLiteRegistrationExternal* MyFindBuiltinOpExternal(void* user_data,
-                                                          int op, int version) {
+const TfLiteOperator* MyFindBuiltinOpExternal(void* user_data, int op,
+                                              int version) {
   OpResolverData* my_data = static_cast<OpResolverData*>(user_data);
   if (op == kTfLiteBuiltinAdd && version == 1) {
     my_data->called_for_add = true;
-    return GetNoOpRegistrationExternal();
+    return GetNoOpOperator();
   }
   return nullptr;
 }
 
-const TfLiteRegistrationExternal* MyFindCustomOpExternal(void*,
-                                                         const char* custom_op,
-                                                         int version) {
+const TfLiteOperator* MyFindCustomOpExternal(void*, const char* custom_op,
+                                             int version) {
   if (absl::string_view(custom_op) == "foo" && version == 1) {
-    return GetNoOpRegistrationExternal();
+    return GetNoOpOperator();
   }
   return nullptr;
 }
@@ -244,14 +243,14 @@ TfLiteStatus SinhEval(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
-const TfLiteRegistrationExternal* SinhFindCustomOpExternal(
-    void*, const char* custom_op, int version) {
+const TfLiteOperator* SinhFindCustomOpExternal(void*, const char* custom_op,
+                                               int version) {
   if (absl::string_view(custom_op) == "Sinh" && version == 1) {
-    static TfLiteRegistrationExternal* registration = []() {
-      TfLiteRegistrationExternal* reg =
-          TfLiteRegistrationExternalCreate(kTfLiteBuiltinCustom, "Sinh", 1);
-      TfLiteRegistrationExternalSetPrepare(reg, &SinhPrepareOpaque);
-      TfLiteRegistrationExternalSetInvoke(reg, &SinhEvalOpaque);
+    static TfLiteOperator* registration = []() {
+      TfLiteOperator* reg =
+          TfLiteOperatorCreate(kTfLiteBuiltinCustom, "Sinh", 1);
+      TfLiteOperatorSetPrepare(reg, &SinhPrepareOpaque);
+      TfLiteOperatorSetInvoke(reg, &SinhEvalOpaque);
       return reg;
     }();
     return registration;
@@ -301,7 +300,7 @@ TEST(CApiExperimentalTest, SetOpResolverExternal) {
 // Test using TfLiteInterpreterOptionsSetOpResolverExternalWithFallback and
 // TfLiteInterpreterCreateWithSelectedOps, for a builtin op, for the normal
 // case where the op is found with the primary op resolver callback that returns
-// a TfLiteRegistrationExternal pointer.
+// a TfLiteOperator pointer.
 TEST(CApiExperimentalTest,
      SetOpResolverExternalWithFallback_BuiltinOp_NormalCase) {
   TfLiteModel* model =
@@ -348,10 +347,11 @@ TEST(CApiExperimentalTest,
   OpResolverData my_data;
   TfLiteInterpreterOptionsSetOpResolverExternalWithFallback(
       options,
-      [](void* user_data, int op,
-         int version) -> const TfLiteRegistrationExternal* { return nullptr; },
+      [](void* user_data, int op, int version) -> const TfLiteOperator* {
+        return nullptr;
+      },
       [](void* user_data, const char* custom_op,
-         int version) -> const TfLiteRegistrationExternal* { return nullptr; },
+         int version) -> const TfLiteOperator* { return nullptr; },
       MyFindBuiltinOp, MyFindCustomOp, &my_data);
   EXPECT_FALSE(my_data.called_for_add);
 
@@ -371,7 +371,7 @@ TEST(CApiExperimentalTest,
 // Test using TfLiteInterpreterOptionsSetOpResolverExternalWithFallback and
 // TfLiteInterpreterCreateWithSelectedOps, for a custom op, for the normal
 // case where the op is found with the primary op resolver callback that returns
-// a TfLiteRegistrationExternal pointer.
+// a TfLiteOperator pointer.
 TEST(CApiExperimentalTest,
      SetOpResolverExternalWithFallback_CustomOp_NormalCase) {
   TfLiteModel* model = TfLiteModelCreateFromFile(
@@ -427,10 +427,11 @@ TEST(CApiExperimentalTest,
   OpResolverData my_data;
   TfLiteInterpreterOptionsSetOpResolverExternalWithFallback(
       options,
-      [](void* user_data, int op,
-         int version) -> const TfLiteRegistrationExternal* { return nullptr; },
+      [](void* user_data, int op, int version) -> const TfLiteOperator* {
+        return nullptr;
+      },
       [](void* user_data, const char* custom_op,
-         int version) -> const TfLiteRegistrationExternal* { return nullptr; },
+         int version) -> const TfLiteOperator* { return nullptr; },
       MyFindBuiltinOp, SinhFindCustomOp, &my_data);
   EXPECT_FALSE(my_data.called_for_add);
 
@@ -654,16 +655,16 @@ struct OpaqueTestDelegate {
     delegate_state->buffer_handle++;
 
     TfLiteRegistration registration{};
-    registration.registration_external = TfLiteRegistrationExternalCreate(
+    registration.registration_external = TfLiteOperatorCreate(
         kTfLiteBuiltinDelegate, "OpaqueTestDelegate delegate kernel",
         /* version = */ 1);
 
-    TfLiteRegistrationExternalSetPrepare(
+    TfLiteOperatorSetPrepare(
         registration.registration_external,
         [](TfLiteOpaqueContext* context,
            TfLiteOpaqueNode* node) -> TfLiteStatus { return kTfLiteOk; });
 
-    TfLiteRegistrationExternalSetInvoke(
+    TfLiteOperatorSetInvoke(
         registration.registration_external,
         [](TfLiteOpaqueContext*, TfLiteOpaqueNode*) -> TfLiteStatus {
           return kTfLiteOk;
diff --git a/tensorflow/lite/core/c/c_api_opaque.cc b/tensorflow/lite/core/c/c_api_opaque.cc
index 6cf4ce4c6b1656..ce05ee54fffa31 100644
--- a/tensorflow/lite/core/c/c_api_opaque.cc
+++ b/tensorflow/lite/core/c/c_api_opaque.cc
@@ -402,8 +402,7 @@ TfLiteStatus TfLiteOpaqueContextGetExecutionPlan(
 
 TfLiteStatus TfLiteOpaqueContextGetNodeAndRegistration(
     struct TfLiteOpaqueContext* opaque_context, int node_index,
-    TfLiteOpaqueNode** node,
-    TfLiteRegistrationExternal** registration_external) {
+    TfLiteOpaqueNode** node, TfLiteOperator** registration_external) {
   // The following casts are safe only because this code is part of the
   // TF Lite runtime implementation.  Apps using TF Lite should not rely on
   // TfLiteOpaqueContext and TfLiteContext being equivalent, or on
@@ -427,10 +426,10 @@ TfLiteStatus TfLiteOpaqueContextGetNodeAndRegistration(
 
   // When the 'registration' object obtained via 'GetNodeAndRegistration'
   // does *not* have its 'registration_external' field set then we need to
-  // create a TfLiteRegistrationExternal on the fly, and set its field according
+  // create a TfLiteOperator on the fly, and set its field according
   // to the 'TfLiteRegistration' object.
   auto derived_registration =
-      tflite::internal::CommonOpaqueConversionUtil::ObtainRegistrationExternal(
+      tflite::internal::CommonOpaqueConversionUtil::ObtainOperator(
           context, registration, node_index);
 
   if (derived_registration == nullptr) return kTfLiteError;
@@ -441,7 +440,7 @@ TfLiteStatus TfLiteOpaqueContextGetNodeAndRegistration(
 
 TfLiteStatus TfLiteOpaqueContextReplaceNodeSubsetsWithDelegateKernels(
     struct TfLiteOpaqueContext* opaque_context,
-    TfLiteRegistrationExternal* registration_external,
+    TfLiteOperator* registration_external,
     const TfLiteIntArray* nodes_to_replace,
     TfLiteOpaqueDelegate* opaque_delegate) {
   // The following casts are safe only because this code is part of the
@@ -454,7 +453,7 @@ TfLiteStatus TfLiteOpaqueContextReplaceNodeSubsetsWithDelegateKernels(
 
   // Wrap the provided 'registration_external' as a regular 'TfLiteRegistration'
   // object to reduce the places in the TF Lite runtime that need to be aware
-  // of 'TfLiteRegistrationExternal's.  Note that it is important to
+  // of 'TfLiteOperator's.  Note that it is important to
   // brace-initialize the 'TfLiteRegistration' so that we pass a registration to
   // 'ReplaceNodeSubsetsWithDelegateKernels' that has all of its fields set to
   // null, except the 'registration_external' one.
diff --git a/tensorflow/lite/core/c/c_api_opaque.h b/tensorflow/lite/core/c/c_api_opaque.h
index afeb281f0a2a5b..751e074ed7eac7 100644
--- a/tensorflow/lite/core/c/c_api_opaque.h
+++ b/tensorflow/lite/core/c/c_api_opaque.h
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/lite/core/c/c_api.h"
 #include "tensorflow/lite/core/c/c_api_types.h"  // IWYU pragma: export
 #include "tensorflow/lite/core/c/common.h"
+#include "tensorflow/lite/core/c/operator.h"  // IWYU pragma: export
 
 #ifdef __cplusplus
 extern "C" {
@@ -288,7 +289,7 @@ TFL_CAPI_EXPORT int TfLiteOpaqueNodeNumberOfOutputs(
 
 /// Returns opaque data provided by the node implementer. The value returned
 /// from this function is the value that was returned from the `init` callback
-/// that was passed to `TfLiteRegistrationExternalSetInit`.
+/// that was passed to `TfLiteOperatorSetInit`.
 TFL_CAPI_EXPORT extern void* TfLiteOpaqueNodeGetUserData(
     const TfLiteOpaqueNode* opaque_node);
 
@@ -416,7 +417,7 @@ TFL_CAPI_EXPORT extern TfLiteStatus TfLiteOpaqueContextGetExecutionPlan(
 ///
 /// This function is expected to be called from within a delegate callback, like
 /// 'Prepare', or a delegate kernel callback (i.e., a callback registered with
-/// a 'TfLiteRegistrationExternal' object).
+/// a 'TfLiteOperator' object).
 ///
 /// The loaded '*node' and '*registration_external' pointers will generally
 /// remain valid for the lifetime of the associated 'opaque_context', but can be
@@ -428,8 +429,7 @@ TFL_CAPI_EXPORT extern TfLiteStatus TfLiteOpaqueContextGetExecutionPlan(
 // are returned to the users and which actions invalidate them.
 TFL_CAPI_EXPORT TfLiteStatus TfLiteOpaqueContextGetNodeAndRegistration(
     struct TfLiteOpaqueContext* opaque_context, int node_index,
-    TfLiteOpaqueNode** node,
-    TfLiteRegistrationExternal** registration_external);
+    TfLiteOpaqueNode** node, TfLiteOperator** registration_external);
 
 /// Entry point for C API ReplaceNodeSubsetsWithDelegateKernels
 ///
@@ -446,7 +446,7 @@ TFL_CAPI_EXPORT TfLiteStatus TfLiteOpaqueContextGetNodeAndRegistration(
 TFL_CAPI_EXPORT TfLiteStatus
 TfLiteOpaqueContextReplaceNodeSubsetsWithDelegateKernels(
     struct TfLiteOpaqueContext* opaque_context,
-    TfLiteRegistrationExternal* registration_external,
+    TfLiteOperator* registration_external,
     const TfLiteIntArray* nodes_to_replace,
     TfLiteOpaqueDelegate* opaque_delegate);
 
diff --git a/tensorflow/lite/core/c/c_api_opaque_test.cc b/tensorflow/lite/core/c/c_api_opaque_test.cc
index 5cdf40ac55faab..14c9fd27c50923 100644
--- a/tensorflow/lite/core/c/c_api_opaque_test.cc
+++ b/tensorflow/lite/core/c/c_api_opaque_test.cc
@@ -578,15 +578,14 @@ TEST(TestTfLiteOpaqueNode, CustomOpWithSetAndGetTemporaries) {
       "tensorflow/lite/testdata/custom_sinh.bin");
   ASSERT_NE(model, nullptr);
 
-  TfLiteRegistrationExternal* reg =
-      TfLiteRegistrationExternalCreate(kTfLiteBuiltinCustom, "Sinh", 1);
-  TfLiteRegistrationExternalSetPrepare(reg, my_custom_op::Prepare);
-  TfLiteRegistrationExternalSetInit(reg, my_custom_op::Init);
-  TfLiteRegistrationExternalSetFree(reg, my_custom_op::Free);
-  TfLiteRegistrationExternalSetInvoke(reg, my_custom_op::Invoke);
+  TfLiteOperator* reg = TfLiteOperatorCreate(kTfLiteBuiltinCustom, "Sinh", 1);
+  TfLiteOperatorSetPrepare(reg, my_custom_op::Prepare);
+  TfLiteOperatorSetInit(reg, my_custom_op::Init);
+  TfLiteOperatorSetFree(reg, my_custom_op::Free);
+  TfLiteOperatorSetInvoke(reg, my_custom_op::Invoke);
 
   TfLiteInterpreterOptions* options = TfLiteInterpreterOptionsCreate();
-  TfLiteInterpreterOptionsAddRegistrationExternal(options, reg);
+  TfLiteInterpreterOptionsAddOperator(options, reg);
 
   TfLiteInterpreter* interpreter = TfLiteInterpreterCreate(model, options);
 
@@ -606,7 +605,7 @@ TEST(TestTfLiteOpaqueNode, CustomOpWithSetAndGetTemporaries) {
   EXPECT_EQ(output_value, input_value);
 
   TfLiteInterpreterDelete(interpreter);
-  TfLiteRegistrationExternalDelete(reg);
+  TfLiteOperatorDelete(reg);
   TfLiteModelDelete(model);
 }
 
diff --git a/tensorflow/lite/core/c/c_api_test.cc b/tensorflow/lite/core/c/c_api_test.cc
index 13caecd88a5c19..aaeecfbee21e0e 100644
--- a/tensorflow/lite/core/c/c_api_test.cc
+++ b/tensorflow/lite/core/c/c_api_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/lite/builtin_ops.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/core/c/builtin_op_data.h"
 #include "tensorflow/lite/core/c/c_api_opaque.h"
@@ -444,11 +445,10 @@ struct OpState {
 std::vector<int>* g_nodes_to_replace;
 TfLiteOpaqueDelegate* g_opaque_delegate_struct;
 
-TfLiteRegistrationExternal* CreateDelegateKernelExternalRegistration() {
-  TfLiteRegistrationExternal* delegate_kernel_registration_external =
-      TfLiteRegistrationExternalCreate(kTfLiteBuiltinDelegate,
-                                       "TEST DELEGATE KERNEL", /*version=*/1);
-  TfLiteRegistrationExternalSetInit(
+TfLiteOperator* CreateDelegateKernelExternalRegistration() {
+  TfLiteOperator* delegate_kernel_registration_external = TfLiteOperatorCreate(
+      kTfLiteBuiltinDelegate, "TEST DELEGATE KERNEL", /*version=*/1);
+  TfLiteOperatorSetInit(
       delegate_kernel_registration_external,
       [](TfLiteOpaqueContext* context, const char* buffer,
          size_t length) -> void* {
@@ -470,11 +470,10 @@ TfLiteRegistrationExternal* CreateDelegateKernelExternalRegistration() {
         }
         return new OpState{true};
       });
-  TfLiteRegistrationExternalSetFree(
-      delegate_kernel_registration_external,
-      [](TfLiteOpaqueContext* context, void* buffer) {
-        delete (reinterpret_cast<OpState*>(buffer));
-      });
+  TfLiteOperatorSetFree(delegate_kernel_registration_external,
+                        [](TfLiteOpaqueContext* context, void* buffer) {
+                          delete (reinterpret_cast<OpState*>(buffer));
+                        });
   return delegate_kernel_registration_external;
 }
 
@@ -499,7 +498,7 @@ TEST(CApiSimple, OpaqueDelegate_ReplaceNodeSubsetsWithDelegateKernels) {
     EXPECT_EQ(execution_plan->size, 2);
 
     TfLiteOpaqueNode* node = nullptr;
-    TfLiteRegistrationExternal* registration_external = nullptr;
+    TfLiteOperator* registration_external = nullptr;
     TfLiteOpaqueContextGetNodeAndRegistration(opaque_context, 0, &node,
                                               &registration_external);
     EXPECT_NE(node, nullptr);
@@ -538,8 +537,7 @@ TEST(CApiSimple, OpaqueDelegate_ReplaceNodeSubsetsWithDelegateKernels) {
   g_opaque_delegate_struct = nullptr;
 }
 
-TEST(CApiSimple,
-     OpaqueDelegate_TransferRegistrationExternalOwnershipWithoutNodeToReplace) {
+TEST(CApiSimple, OpaqueDelegate_TransferOperatorOwnershipWithoutNodeToReplace) {
   g_nodes_to_replace = new std::vector<int>();
 
   TfLiteModel* model =
@@ -556,7 +554,7 @@ TEST(CApiSimple,
     delegate_state->delegate_prepared = true;
 
     TfLiteOpaqueNode* node = nullptr;
-    TfLiteRegistrationExternal* registration_external = nullptr;
+    TfLiteOperator* registration_external = nullptr;
     TfLiteOpaqueContextGetNodeAndRegistration(opaque_context, 0, &node,
                                               &registration_external);
     EXPECT_NE(node, nullptr);
@@ -615,13 +613,13 @@ TEST_F(TestFP16Delegation,
     std::vector<int> nodes_to_replace;
     for (int i = 0; i < execution_plan->size; i++) {
       TfLiteOpaqueNode* node = nullptr;
-      TfLiteRegistrationExternal* registration_external = nullptr;
+      TfLiteOperator* registration_external = nullptr;
       TfLiteOpaqueContextGetNodeAndRegistration(opaque_context,
                                                 execution_plan->data[i], &node,
                                                 &registration_external);
       EXPECT_NE(node, nullptr);
       EXPECT_NE(registration_external, nullptr);
-      if (TfLiteRegistrationExternalGetBuiltInCode(registration_external) ==
+      if (TfLiteOperatorGetBuiltInCode(registration_external) ==
           kTfLiteBuiltinAdd) {
         nodes_to_replace.push_back(execution_plan->data[i]);
       }
@@ -754,10 +752,10 @@ struct DelegateKernelState {
   TfLiteOpaqueTensor* output_tensor = nullptr;
 };
 
-TfLiteRegistrationExternal* CreateReg() {
-  auto reg_ex = TfLiteRegistrationExternalCreate(
-      kTfLiteBuiltinDelegate, "Test driver delegate", /*version=*/1);
-  TfLiteRegistrationExternalSetInit(
+TfLiteOperator* CreateReg() {
+  auto reg_ex = TfLiteOperatorCreate(kTfLiteBuiltinDelegate,
+                                     "Test driver delegate", /*version=*/1);
+  TfLiteOperatorSetInit(
       reg_ex,
       [](TfLiteOpaqueContext* context, const char* buffer,
          size_t length) -> void* {
@@ -766,7 +764,7 @@ TfLiteRegistrationExternal* CreateReg() {
 
         for (int i = 0; i < params->nodes_to_replace->size; ++i) {
           TfLiteOpaqueNode* node = nullptr;
-          TfLiteRegistrationExternal* registration_external = nullptr;
+          TfLiteOperator* registration_external = nullptr;
           TfLiteOpaqueContextGetNodeAndRegistration(
               context, params->nodes_to_replace->data[i], &node,
               &registration_external);
@@ -774,8 +772,8 @@ TfLiteRegistrationExternal* CreateReg() {
           EXPECT_NE(nullptr, registration_external);
           EXPECT_EQ(2, TfLiteOpaqueNodeNumberOfInputs(node));
           EXPECT_EQ(1, TfLiteOpaqueNodeNumberOfOutputs(node));
-          EXPECT_EQ(kTfLiteBuiltinAdd, TfLiteRegistrationExternalGetBuiltInCode(
-                                           registration_external));
+          EXPECT_EQ(kTfLiteBuiltinAdd,
+                    TfLiteOperatorGetBuiltInCode(registration_external));
         }
 
         TfLiteIntArray* input_tensors = params->input_tensors;
@@ -790,7 +788,7 @@ TfLiteRegistrationExternal* CreateReg() {
         return new DelegateKernelState{input_tensor, output_tensor};
       });
 
-  TfLiteRegistrationExternalSetInvoke(
+  TfLiteOperatorSetInvoke(
       reg_ex,
       [](TfLiteOpaqueContext* context,
          TfLiteOpaqueNode* opaque_node) -> TfLiteStatus {
@@ -808,8 +806,7 @@ TfLiteRegistrationExternal* CreateReg() {
         return kTfLiteOk;
       });
 
-  TfLiteRegistrationExternalSetFree(reg_ex, [](TfLiteOpaqueContext* context,
-                                               void* data) {
+  TfLiteOperatorSetFree(reg_ex, [](TfLiteOpaqueContext* context, void* data) {
     DelegateKernelState* state = reinterpret_cast<DelegateKernelState*>(data);
     delete state;
   });
@@ -841,15 +838,15 @@ TEST(CApiSimple, OpaqueDelegate_TfLiteOpaqueTensorGet) {
     std::vector<int> node_ids_to_replace;
     for (int i = 0; i < execution_plan->size; ++i) {
       TfLiteOpaqueNode* node = nullptr;
-      TfLiteRegistrationExternal* registration_external = nullptr;
+      TfLiteOperator* registration_external = nullptr;
       TfLiteOpaqueContextGetNodeAndRegistration(
           context, execution_plan->data[i], &node, &registration_external);
       EXPECT_NE(nullptr, node);
       EXPECT_NE(nullptr, registration_external);
       EXPECT_EQ(2, TfLiteOpaqueNodeNumberOfInputs(node));
       EXPECT_EQ(1, TfLiteOpaqueNodeNumberOfOutputs(node));
-      EXPECT_EQ(kTfLiteBuiltinAdd, TfLiteRegistrationExternalGetBuiltInCode(
-                                       registration_external));
+      EXPECT_EQ(kTfLiteBuiltinAdd,
+                TfLiteOperatorGetBuiltInCode(registration_external));
       node_ids_to_replace.push_back(execution_plan->data[i]);
     }
 
@@ -943,21 +940,21 @@ TEST(CApiSimple, OpaqueContextGetNodeAndRegistration) {
 
     for (int i = 0; i < execution_plan->size; i++) {
       TfLiteOpaqueNode* node = nullptr;
-      TfLiteRegistrationExternal* registration_external = nullptr;
+      TfLiteOperator* registration_external = nullptr;
       TfLiteOpaqueContextGetNodeAndRegistration(opaque_context, 0, &node,
                                                 &registration_external);
       EXPECT_NE(node, nullptr);
       EXPECT_NE(registration_external, nullptr);
-      EXPECT_EQ(kTfLiteBuiltinAdd, TfLiteRegistrationExternalGetBuiltInCode(
-                                       registration_external));
-      EXPECT_EQ(1, TfLiteRegistrationExternalGetVersion(registration_external));
+      EXPECT_EQ(kTfLiteBuiltinAdd,
+                TfLiteOperatorGetBuiltInCode(registration_external));
+      EXPECT_EQ(1, TfLiteOperatorGetVersion(registration_external));
       EXPECT_EQ(2, TfLiteOpaqueNodeNumberOfInputs(node));
       EXPECT_EQ(1, TfLiteOpaqueNodeNumberOfOutputs(node));
     }
 
     {
       TfLiteOpaqueNode* node = nullptr;
-      TfLiteRegistrationExternal* registration_external = nullptr;
+      TfLiteOperator* registration_external = nullptr;
       TfLiteOpaqueContextGetNodeAndRegistration(opaque_context, 0, &node,
                                                 &registration_external);
       EXPECT_EQ(1, TfLiteOpaqueNodeGetInputTensorIndex(node, 0));
@@ -979,7 +976,7 @@ TEST(CApiSimple, OpaqueContextGetNodeAndRegistration) {
     }
     {
       TfLiteOpaqueNode* node = nullptr;
-      TfLiteRegistrationExternal* registration_external = nullptr;
+      TfLiteOperator* registration_external = nullptr;
       TfLiteOpaqueContextGetNodeAndRegistration(opaque_context, 1, &node,
                                                 &registration_external);
       EXPECT_EQ(0, TfLiteOpaqueNodeGetInputTensorIndex(node, 0));
@@ -1004,8 +1001,8 @@ TEST(CApiSimple, OpaqueContextGetNodeAndRegistration) {
   TfLiteOpaqueDelegateDelete(opaque_delegate);
 }
 
-TEST(CApiSimple, TfLiteRegistrationExternalGetVersionNullptr) {
-  EXPECT_EQ(TfLiteRegistrationExternalGetVersion(nullptr), -1);
+TEST(CApiSimple, TfLiteOperatorGetVersionNullptr) {
+  EXPECT_EQ(TfLiteOperatorGetVersion(nullptr), -1);
 }
 
 TEST(CApiSimple, TfLiteOpaqueContextResizeTensor) {
@@ -1148,18 +1145,17 @@ TEST(CApiSimple, CustomOpSupport) {
       "tensorflow/lite/testdata/custom_sinh.bin");
   ASSERT_NE(model, nullptr);
 
-  TfLiteRegistrationExternal* reg =
-      TfLiteRegistrationExternalCreate(kTfLiteBuiltinCustom, "Sinh", 1);
-  TfLiteRegistrationExternalSetPrepare(reg, &FlexSinhPrepare);
-  TfLiteRegistrationExternalSetInit(reg, &FlexSinhInit);
-  TfLiteRegistrationExternalSetFree(reg, &FlexSinhFree);
-  TfLiteRegistrationExternalSetInvoke(reg, &FlexSinhEval);
+  TfLiteOperator* reg = TfLiteOperatorCreate(kTfLiteBuiltinCustom, "Sinh", 1);
+  TfLiteOperatorSetPrepare(reg, &FlexSinhPrepare);
+  TfLiteOperatorSetInit(reg, &FlexSinhInit);
+  TfLiteOperatorSetFree(reg, &FlexSinhFree);
+  TfLiteOperatorSetInvoke(reg, &FlexSinhEval);
 
-  const char* kCustomName = TfLiteRegistrationExternalGetCustomName(reg);
+  const char* kCustomName = TfLiteOperatorGetCustomName(reg);
   EXPECT_EQ("Sinh", kCustomName);
 
   TfLiteInterpreterOptions* options = TfLiteInterpreterOptionsCreate();
-  TfLiteInterpreterOptionsAddRegistrationExternal(options, reg);
+  TfLiteInterpreterOptionsAddOperator(options, reg);
 
   TfLiteInterpreter* interpreter = TfLiteInterpreterCreate(model, options);
 
@@ -1179,7 +1175,7 @@ TEST(CApiSimple, CustomOpSupport) {
 
   TfLiteInterpreterDelete(interpreter);
   TfLiteModelDelete(model);
-  TfLiteRegistrationExternalDelete(reg);
+  TfLiteOperatorDelete(reg);
 }
 
 const TfLiteRegistration* find_builtin_op_add(void* user_data,
@@ -1518,10 +1514,10 @@ TEST(CApiSimple, OpaqueApiAccessors) {
     // Define a delegate kernel that checks that the properties of the model
     // are accessible via the opaque API function.
     //
-    TfLiteRegistrationExternal* reg = TfLiteRegistrationExternalCreate(
-        kTfLiteBuiltinDelegate, "my delegate", 123);
-    EXPECT_EQ(123, TfLiteRegistrationExternalGetVersion(reg));
-    TfLiteRegistrationExternalSetInit(
+    TfLiteOperator* reg =
+        TfLiteOperatorCreate(kTfLiteBuiltinDelegate, "my delegate", 123);
+    EXPECT_EQ(123, TfLiteOperatorGetVersion(reg));
+    TfLiteOperatorSetInit(
         reg,
         [](TfLiteOpaqueContext* opaque_context, const char* buffer,
            size_t length) -> void* {
@@ -1719,13 +1715,13 @@ TEST(CApiSimple, OpaqueApiAccessors) {
             TfLiteOpaqueTensorBuilderDelete(builder);
           }
           TfLiteOpaqueNode* node = nullptr;
-          TfLiteRegistrationExternal* registration_external = nullptr;
+          TfLiteOperator* registration_external = nullptr;
           TfLiteOpaqueContextGetNodeAndRegistration(
               opaque_context, params->nodes_to_replace->data[0], &node,
               &registration_external);
           // ADD is a builtin OP, not a custom OP.
           const char* kCustomName =
-              TfLiteRegistrationExternalGetCustomName(registration_external);
+              TfLiteOperatorGetCustomName(registration_external);
           EXPECT_EQ(nullptr, kCustomName);
 
           const void* node_custom_init_data = nullptr;
@@ -1840,9 +1836,9 @@ TEST(CApiSimple, OpaqueApiAccessorsStrings) {
   opaque_delegate_builder.Prepare = [](TfLiteOpaqueContext* context,
                                        TfLiteOpaqueDelegate* delegate,
                                        void* data) -> TfLiteStatus {
-    TfLiteRegistrationExternal* registration = TfLiteRegistrationExternalCreate(
-        kTfLiteBuiltinDelegate, "my delegate", 123);
-    TfLiteRegistrationExternalSetInit(
+    TfLiteOperator* registration =
+        TfLiteOperatorCreate(kTfLiteBuiltinDelegate, "my delegate", 123);
+    TfLiteOperatorSetInit(
         registration,
         [](TfLiteOpaqueContext* opaque_context, const char* buffer,
            size_t length) -> void* {
@@ -1863,12 +1859,12 @@ TEST(CApiSimple, OpaqueApiAccessorsStrings) {
           return nullptr;
         });
 
-    TfLiteRegistrationExternalSetPrepare(
+    TfLiteOperatorSetPrepare(
         registration,
         [](TfLiteOpaqueContext* context,
            TfLiteOpaqueNode* node) -> TfLiteStatus { return kTfLiteOk; });
 
-    TfLiteRegistrationExternalSetInvoke(
+    TfLiteOperatorSetInvoke(
         registration,
         [](TfLiteOpaqueContext* context,
            TfLiteOpaqueNode* node) -> TfLiteStatus {
@@ -2059,7 +2055,7 @@ TEST(CApiSimple, AddNodesAfterApplyingDelegate) {
     EXPECT_EQ(execution_plan->size, 1);
 
     TfLiteOpaqueNode* node = nullptr;
-    TfLiteRegistrationExternal* registration_external = nullptr;
+    TfLiteOperator* registration_external = nullptr;
     TfLiteOpaqueContextGetNodeAndRegistration(context, execution_plan->data[0],
                                               &node, &registration_external);
     EXPECT_NE(node, nullptr);
diff --git a/tensorflow/lite/core/c/c_api_types.h b/tensorflow/lite/core/c/c_api_types.h
index 1170025cbab9a2..32cefa839f4452 100644
--- a/tensorflow/lite/core/c/c_api_types.h
+++ b/tensorflow/lite/core/c/c_api_types.h
@@ -133,6 +133,7 @@ typedef enum {
   kTfLiteUInt32 = 16,
   kTfLiteUInt16 = 17,
   kTfLiteInt4 = 18,
+  kTfLiteBFloat16 = 19,
 } TfLiteType;
 
 /// Legacy. Will be deprecated in favor of `TfLiteAffineQuantization`.
diff --git a/tensorflow/lite/core/c/common.cc b/tensorflow/lite/core/c/common.cc
index fd7c415f96e634..7afecdbe885199 100644
--- a/tensorflow/lite/core/c/common.cc
+++ b/tensorflow/lite/core/c/common.cc
@@ -370,6 +370,8 @@ const char* TfLiteTypeGetName(TfLiteType type) {
       return "STRING";
     case kTfLiteFloat16:
       return "FLOAT16";
+    case kTfLiteBFloat16:
+      return "BFLOAT16";
     case kTfLiteFloat64:
       return "FLOAT64";
     case kTfLiteResource:
diff --git a/tensorflow/lite/core/c/common.h b/tensorflow/lite/core/c/common.h
index 4e4890164d3aa6..ea54be9490ef01 100644
--- a/tensorflow/lite/core/c/common.h
+++ b/tensorflow/lite/core/c/common.h
@@ -296,6 +296,13 @@ typedef struct TfLiteFloat16 {
   uint16_t data;
 } TfLiteFloat16;
 
+/// bfloat16 data type compatible with the Google Brain definition.
+/// https://cloud.google.com/tpu/docs/bfloat16.
+/// This provides 1 bit of sign, 8 bits of exponent, and 7 bits of mantissa.
+typedef struct TfLiteBFloat16 {
+  uint16_t data;
+} TfLiteBFloat16;
+
 /// Return the name of a given type, for error reporting purposes.
 const char* TfLiteTypeGetName(TfLiteType type);
 
@@ -1007,11 +1014,17 @@ typedef struct TfLiteContext {
                                          int subgraph_index);
 } TfLiteContext;
 
-/// `TfLiteRegistrationExternal` is an external version of `TfLiteRegistration`
+/// `TfLiteOperator` is an external version of `TfLiteRegistration`
 /// for C API which doesn't use internal types (such as `TfLiteContext`) but
 /// only uses stable API types (such as `TfLiteOpaqueContext`). The purpose of
 /// each field is the exactly the same as with `TfLiteRegistration`.
-typedef struct TfLiteRegistrationExternal TfLiteRegistrationExternal;
+typedef struct TfLiteOperator TfLiteOperator;
+
+#ifndef DOXYGEN_SKIP
+// For backwards compatibility.
+// Deprecated. Use TfLiteOperator instead.
+typedef TfLiteOperator TfLiteRegistrationExternal;
+#endif
 
 /// The valid values of the `inplace_operator` field in `TfLiteRegistration`.
 /// This allow an op to signal to the runtime that the same data pointer
@@ -1078,7 +1091,7 @@ static const int kTfLiteMaxSharableOpInputs = 3;
 /// It is a struct containing "methods" (C function pointers) that will be
 /// invoked by the TF Lite runtime to evaluate instances of the operation.
 ///
-/// See also `TfLiteRegistrationExternal` which is a more ABI-stable equivalent.
+/// See also `TfLiteOperator` which is a more ABI-stable equivalent.
 typedef struct TfLiteRegistration {
   /// Initializes the op from serialized data.
   /// Called only *once* for the lifetime of the op, so any one-time allocations
@@ -1149,12 +1162,12 @@ typedef struct TfLiteRegistration {
   /// properly.
   int version;
 
-  /// The external version of `TfLiteRegistration`. Since we can't use internal
-  /// types (such as `TfLiteContext`) for C API to maintain ABI stability.
-  /// C API user will provide `TfLiteRegistrationExternal` to implement custom
-  /// ops. We keep it inside of `TfLiteRegistration` and use it to route
-  /// callbacks properly.
-  TfLiteRegistrationExternal* registration_external;
+  /// The external (i.e. ABI-stable) version of `TfLiteRegistration`.
+  /// Since we can't use internal types (such as `TfLiteContext`) for C API to
+  /// maintain ABI stability.  C API user will provide `TfLiteOperator` to
+  /// implement custom ops.  We keep it inside of `TfLiteRegistration` and use
+  /// it to route callbacks properly.
+  TfLiteOperator* registration_external;
 
   /// Retrieves asynchronous kernel.
   ///
@@ -1194,7 +1207,7 @@ typedef struct TfLiteRegistration_V3 {
   int32_t builtin_code;
   const char* custom_name;
   int version;
-  TfLiteRegistrationExternal* registration_external;
+  TfLiteOperator* registration_external;
   struct TfLiteAsyncKernel* (*async_kernel)(TfLiteContext* context,
                                             TfLiteNode* node);
 } TfLiteRegistration_V3;
@@ -1220,7 +1233,7 @@ typedef struct TfLiteRegistration_V2 {
   int32_t builtin_code;
   const char* custom_name;
   int version;
-  TfLiteRegistrationExternal* registration_external;
+  TfLiteOperator* registration_external;
 } TfLiteRegistration_V2;
 
 /// \private
diff --git a/tensorflow/lite/core/c/common_test.cc b/tensorflow/lite/core/c/common_test.cc
index d2bc137378656e..58fd8654d8b171 100644
--- a/tensorflow/lite/core/c/common_test.cc
+++ b/tensorflow/lite/core/c/common_test.cc
@@ -107,6 +107,7 @@ TEST(Types, TestTypeNames) {
   EXPECT_EQ(type_name(kTfLiteFloat64), "FLOAT64");
   EXPECT_EQ(type_name(kTfLiteFloat32), "FLOAT32");
   EXPECT_EQ(type_name(kTfLiteFloat16), "FLOAT16");
+  EXPECT_EQ(type_name(kTfLiteBFloat16), "BFLOAT16");
   EXPECT_EQ(type_name(kTfLiteInt16), "INT16");
   EXPECT_EQ(type_name(kTfLiteUInt16), "UINT16");
   EXPECT_EQ(type_name(kTfLiteInt32), "INT32");
diff --git a/tensorflow/lite/core/c/operator.cc b/tensorflow/lite/core/c/operator.cc
new file mode 100644
index 00000000000000..f4571fe2b2615d
--- /dev/null
+++ b/tensorflow/lite/core/c/operator.cc
@@ -0,0 +1,92 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/core/c/operator.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "tensorflow/lite/builtin_ops.h"
+#include "tensorflow/lite/c/common_internal.h"
+#include "tensorflow/lite/core/async/c/types.h"
+#include "tensorflow/lite/core/c/c_api_types.h"
+
+TfLiteOperator* TfLiteOperatorCreate(TfLiteBuiltinOperator builtin_code,
+                                     const char* custom_name, int version) {
+  return new TfLiteOperator{/*.custom_name =*/custom_name,
+                            /*.version =*/version,
+                            /*.init =*/nullptr,
+                            /*.free =*/nullptr,
+                            /*.prepare =*/nullptr,
+                            /*.invoke =*/nullptr,
+                            /*.async_kernel =*/nullptr,
+                            /*.builtin_code =*/builtin_code,
+                            /*.node_index =*/-1};
+}
+
+void TfLiteOperatorDelete(TfLiteOperator* reg) { delete reg; }
+
+void TfLiteOperatorSetInit(TfLiteOperator* registration,
+                           void* (*init)(TfLiteOpaqueContext* context,
+                                         const char* buffer, size_t length)) {
+  registration->init = init;
+}
+
+void TfLiteOperatorSetFree(TfLiteOperator* registration,
+                           void (*free)(TfLiteOpaqueContext* context,
+                                        void* data)) {
+  registration->free = free;
+}
+
+void TfLiteOperatorSetPrepare(
+    TfLiteOperator* registration,
+    TfLiteStatus (*prepare)(TfLiteOpaqueContext* context,
+                            TfLiteOpaqueNode* node)) {
+  registration->prepare = prepare;
+}
+
+void TfLiteOperatorSetInvoke(
+    TfLiteOperator* registration,
+    TfLiteStatus (*invoke)(TfLiteOpaqueContext* context,
+                           TfLiteOpaqueNode* node)) {
+  registration->invoke = invoke;
+}
+
+void TfLiteOperatorSetAsyncKernel(
+    TfLiteOperator* registration,
+    TfLiteAsyncKernel* (*async_kernel)(TfLiteOpaqueContext* context,
+                                       TfLiteOpaqueNode* node)) {
+  registration->async_kernel = async_kernel;
+}
+
+void TfLiteOperatorSetInplaceOperator(TfLiteOperator* registration,
+                                      uint64_t inplace_operator) {
+  registration->inplace_operator = inplace_operator;
+}
+
+TfLiteBuiltinOperator TfLiteOperatorGetBuiltInCode(
+    const TfLiteOperator* registration) {
+  return static_cast<TfLiteBuiltinOperator>(registration->builtin_code);
+}
+
+const char* TfLiteOperatorGetCustomName(const TfLiteOperator* registration) {
+  return registration->custom_name;
+}
+
+int TfLiteOperatorGetVersion(const TfLiteOperator* registration) {
+  if (!registration) {
+    return -1;
+  }
+  return registration->version;
+}
diff --git a/tensorflow/lite/core/c/registration_external.h b/tensorflow/lite/core/c/operator.h
similarity index 73%
rename from tensorflow/lite/core/c/registration_external.h
rename to tensorflow/lite/core/c/operator.h
index d637e6f8cfcdaa..796b0a23d8cb98 100644
--- a/tensorflow/lite/core/c/registration_external.h
+++ b/tensorflow/lite/core/c/operator.h
@@ -17,14 +17,14 @@ limitations under the License.
 /// Only the TensorFlow Lite implementation itself should include this
 /// file directly.
 ///
-/// The types and functions declared in registration_external.h are
+/// The types and functions declared in operator.h are
 /// part of the TensorFlow Lite Extension APIs.
 /// We reserve the right to make changes to this API in future releases,
 /// potentially including non-backwards-compatible changes, on a different
 /// schedule than for the other TensorFlow Lite APIs. See
 /// https://www.tensorflow.org/guide/versions#separate_version_number_for_tensorflow_lite_extension_apis.
-#ifndef TENSORFLOW_LITE_CORE_C_REGISTRATION_EXTERNAL_H_
-#define TENSORFLOW_LITE_CORE_C_REGISTRATION_EXTERNAL_H_
+#ifndef TENSORFLOW_LITE_CORE_C_OPERATOR_H_
+#define TENSORFLOW_LITE_CORE_C_OPERATOR_H_
 
 #include <stdint.h>
 #include <stdlib.h>
@@ -37,31 +37,31 @@ limitations under the License.
 extern "C" {
 #endif  // __cplusplus
 
-// TfLiteRegistrationExternal is an opaque version of TfLiteRegistration,
+// TfLiteOperator is an opaque version of TfLiteRegistration,
 // and is used for registering custom ops.  It represents a definition of a
 // custom op or a builtin op.
 //
 // \warning This is an experimental type and subject to change.
-typedef struct TfLiteRegistrationExternal TfLiteRegistrationExternal;
+typedef struct TfLiteOperator TfLiteOperator;
 
-// Returns a new TfLiteRegistrationExternal instance.
+// Returns a new TfLiteOperator instance.
 //
-// The returned TfLiteRegistrationExternal instance represents a definition
+// The returned TfLiteOperator instance represents a definition
 // of an operator with the identity (builtin_code/custom_name and
 // version) specified by the parameters, but with all callbacks initially unset.
 //
 // Evaluation of any operation using this operator will be done using
 // the "prepare" and "invoke" callbacks, which can be set using
-// `TfLiteRegistrationExternalSetPrepare` and
-// `TfLiteRegistrationExternalSetInvoke`, or for async execution
+// `TfLiteOperatorSetPrepare` and
+// `TfLiteOperatorSetInvoke`, or for async execution
 // the "prepare", "eval", and "wait" callbacks of the `TfLiteAsyncKernel`,
-// which can be set using `TfLiteRegistrationExternalSetAsyncKernel`.
+// which can be set using `TfLiteOperatorSetAsyncKernel`.
 // If the relevant callbacks are not set, then such evaluation will result
 // in an error status.  So normally any use of this function should be followed
 // by appropriate calls to set those callbacks.
 //
 // \note The caller retains ownership and should ensure that
-// the lifetime of the `TfLiteRegistrationExternal` must be at least as long as
+// the lifetime of the `TfLiteOperator` must be at least as long as
 // the lifetime of any `TfLiteInterpreter` or `tflite::Interpreter` that it is
 // used in.
 //
@@ -74,36 +74,33 @@ typedef struct TfLiteRegistrationExternal TfLiteRegistrationExternal;
 //                     https://www.tensorflow.org/lite/guide/ops_version
 //
 // \warning This is an experimental API and subject to change.
-TFL_CAPI_EXPORT extern TfLiteRegistrationExternal*
-TfLiteRegistrationExternalCreate(TfLiteBuiltinOperator builtin_code,
-                                 const char* custom_name, int version);
+TFL_CAPI_EXPORT extern TfLiteOperator* TfLiteOperatorCreate(
+    TfLiteBuiltinOperator builtin_code, const char* custom_name, int version);
 
-// Destroys the TfLiteRegistrationExternal instance.
+// Destroys the TfLiteOperator instance.
 //
 // \warning This is an experimental API and subject to change.
-TFL_CAPI_EXPORT extern void TfLiteRegistrationExternalDelete(
-    TfLiteRegistrationExternal* registration);
+TFL_CAPI_EXPORT extern void TfLiteOperatorDelete(TfLiteOperator* registration);
 
 // Return the builtin op code of the provided external 'registration'.
 //
 // \warning This is an experimental API and subject to change.
-TFL_CAPI_EXPORT extern TfLiteBuiltinOperator
-TfLiteRegistrationExternalGetBuiltInCode(
-    const TfLiteRegistrationExternal* registration);
+TFL_CAPI_EXPORT extern TfLiteBuiltinOperator TfLiteOperatorGetBuiltInCode(
+    const TfLiteOperator* registration);
 
 /// Returns the custom name of the provided 'registration'. The returned pointer
 /// will be non-null iff the op is a custom op.
 ///
 /// \warning This is an experimental API and subject to change.
-TFL_CAPI_EXPORT extern const char* TfLiteRegistrationExternalGetCustomName(
-    const TfLiteRegistrationExternal* registration);
+TFL_CAPI_EXPORT extern const char* TfLiteOperatorGetCustomName(
+    const TfLiteOperator* registration);
 
 /// Return the OP version of the provided external 'registration'.  Return -1
 /// in case of error, or if the provided address is null.
 ///
 /// \warning This is an experimental API and subject to change.
-TFL_CAPI_EXPORT extern int TfLiteRegistrationExternalGetVersion(
-    const TfLiteRegistrationExternal* registration);
+TFL_CAPI_EXPORT extern int TfLiteOperatorGetVersion(
+    const TfLiteOperator* registration);
 
 // Sets the initialization callback for the registration.
 //
@@ -111,8 +108,8 @@ TFL_CAPI_EXPORT extern int TfLiteRegistrationExternalGetVersion(
 // Please refer `init` of `TfLiteRegistration` for the detail.
 //
 // \warning This is an experimental API and subject to change.
-TFL_CAPI_EXPORT extern void TfLiteRegistrationExternalSetInit(
-    TfLiteRegistrationExternal* registration,
+TFL_CAPI_EXPORT extern void TfLiteOperatorSetInit(
+    TfLiteOperator* registration,
     void* (*init)(TfLiteOpaqueContext* context, const char* buffer,
                   size_t length));
 
@@ -124,8 +121,8 @@ TFL_CAPI_EXPORT extern void TfLiteRegistrationExternalSetInit(
 // Please refer `free` of `TfLiteRegistration` for the detail.
 //
 // \warning This is an experimental API and subject to change.
-TFL_CAPI_EXPORT extern void TfLiteRegistrationExternalSetFree(
-    TfLiteRegistrationExternal* registration,
+TFL_CAPI_EXPORT extern void TfLiteOperatorSetFree(
+    TfLiteOperator* registration,
     void (*free)(TfLiteOpaqueContext* context, void* data));
 
 // Sets the preparation callback for the registration.
@@ -134,8 +131,8 @@ TFL_CAPI_EXPORT extern void TfLiteRegistrationExternalSetFree(
 // Please refer `prepare` of `TfLiteRegistration` for the detail.
 //
 // \warning This is an experimental API and subject to change.
-TFL_CAPI_EXPORT extern void TfLiteRegistrationExternalSetPrepare(
-    TfLiteRegistrationExternal* registration,
+TFL_CAPI_EXPORT extern void TfLiteOperatorSetPrepare(
+    TfLiteOperator* registration,
     TfLiteStatus (*prepare)(TfLiteOpaqueContext* context,
                             TfLiteOpaqueNode* node));
 
@@ -145,8 +142,8 @@ TFL_CAPI_EXPORT extern void TfLiteRegistrationExternalSetPrepare(
 // Please refer `invoke` of `TfLiteRegistration` for the detail.
 //
 // \warning This is an experimental API and subject to change.
-TFL_CAPI_EXPORT extern void TfLiteRegistrationExternalSetInvoke(
-    TfLiteRegistrationExternal* registration,
+TFL_CAPI_EXPORT extern void TfLiteOperatorSetInvoke(
+    TfLiteOperator* registration,
     TfLiteStatus (*invoke)(TfLiteOpaqueContext* context,
                            TfLiteOpaqueNode* node));
 
@@ -158,8 +155,8 @@ TFL_CAPI_EXPORT extern void TfLiteRegistrationExternalSetInvoke(
 /// `node` is the delegate TfLiteNode created by `ModifyGraphWithDelegate`.
 /// Please refer `async_kernel` of `TfLiteRegistration` for the detail.
 /// \warning This is an experimental API and subject to change.
-TFL_CAPI_EXPORT extern void TfLiteRegistrationExternalSetAsyncKernel(
-    TfLiteRegistrationExternal* registration,
+TFL_CAPI_EXPORT extern void TfLiteOperatorSetAsyncKernel(
+    TfLiteOperator* registration,
     struct TfLiteAsyncKernel* (*async_kernel)(TfLiteOpaqueContext* context,
                                               TfLiteOpaqueNode* node));
 
@@ -169,11 +166,11 @@ TFL_CAPI_EXPORT extern void TfLiteRegistrationExternalSetAsyncKernel(
 /// `TfLiteRegistration` for details.
 ///
 /// \warning This is an experimental API and subject to change.
-TFL_CAPI_EXPORT extern void TfLiteRegistrationExternalSetInplaceOperator(
-    TfLiteRegistrationExternal* registration, uint64_t inplace_operator);
+TFL_CAPI_EXPORT extern void TfLiteOperatorSetInplaceOperator(
+    TfLiteOperator* registration, uint64_t inplace_operator);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
 
-#endif  // TENSORFLOW_LITE_CORE_C_REGISTRATION_EXTERNAL_H_
+#endif  // TENSORFLOW_LITE_CORE_C_OPERATOR_H_
diff --git a/tensorflow/lite/core/c/registration_external.cc b/tensorflow/lite/core/c/registration_external.cc
deleted file mode 100644
index 4f5a0daddf81d7..00000000000000
--- a/tensorflow/lite/core/c/registration_external.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/lite/core/c/registration_external.h"
-
-#include "tensorflow/lite/c/common_internal.h"
-
-TfLiteRegistrationExternal* TfLiteRegistrationExternalCreate(
-    TfLiteBuiltinOperator builtin_code, const char* custom_name, int version) {
-  return new TfLiteRegistrationExternal{/*.custom_name =*/custom_name,
-                                        /*.version =*/version,
-                                        /*.init =*/nullptr,
-                                        /*.free =*/nullptr,
-                                        /*.prepare =*/nullptr,
-                                        /*.invoke =*/nullptr,
-                                        /*.async_kernel =*/nullptr,
-                                        /*.builtin_code =*/builtin_code,
-                                        /*.node_index =*/-1};
-}
-
-void TfLiteRegistrationExternalDelete(TfLiteRegistrationExternal* reg) {
-  delete reg;
-}
-
-void TfLiteRegistrationExternalSetInit(
-    TfLiteRegistrationExternal* registration,
-    void* (*init)(TfLiteOpaqueContext* context, const char* buffer,
-                  size_t length)) {
-  registration->init = init;
-}
-
-void TfLiteRegistrationExternalSetFree(
-    TfLiteRegistrationExternal* registration,
-    void (*free)(TfLiteOpaqueContext* context, void* data)) {
-  registration->free = free;
-}
-
-void TfLiteRegistrationExternalSetPrepare(
-    TfLiteRegistrationExternal* registration,
-    TfLiteStatus (*prepare)(TfLiteOpaqueContext* context,
-                            TfLiteOpaqueNode* node)) {
-  registration->prepare = prepare;
-}
-
-void TfLiteRegistrationExternalSetInvoke(
-    TfLiteRegistrationExternal* registration,
-    TfLiteStatus (*invoke)(TfLiteOpaqueContext* context,
-                           TfLiteOpaqueNode* node)) {
-  registration->invoke = invoke;
-}
-
-void TfLiteRegistrationExternalSetAsyncKernel(
-    TfLiteRegistrationExternal* registration,
-    TfLiteAsyncKernel* (*async_kernel)(TfLiteOpaqueContext* context,
-                                       TfLiteOpaqueNode* node)) {
-  registration->async_kernel = async_kernel;
-}
-
-void TfLiteRegistrationExternalSetInplaceOperator(
-    TfLiteRegistrationExternal* registration, uint64_t inplace_operator) {
-  registration->inplace_operator = inplace_operator;
-}
-
-TfLiteBuiltinOperator TfLiteRegistrationExternalGetBuiltInCode(
-    const TfLiteRegistrationExternal* registration) {
-  return static_cast<TfLiteBuiltinOperator>(registration->builtin_code);
-}
-
-const char* TfLiteRegistrationExternalGetCustomName(
-    const TfLiteRegistrationExternal* registration) {
-  return registration->custom_name;
-}
-
-int TfLiteRegistrationExternalGetVersion(
-    const TfLiteRegistrationExternal* registration) {
-  if (!registration) {
-    return -1;
-  }
-  return registration->version;
-}
diff --git a/tensorflow/lite/core/experimental/acceleration/configuration/BUILD b/tensorflow/lite/core/experimental/acceleration/configuration/BUILD
index 3f5cb1a87f52c8..55f376948b2eec 100644
--- a/tensorflow/lite/core/experimental/acceleration/configuration/BUILD
+++ b/tensorflow/lite/core/experimental/acceleration/configuration/BUILD
@@ -1,6 +1,6 @@
+load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow/lite/core:special_rules.bzl", "delegate_registry_visibility_allowlist")
 load("//tensorflow/lite/core/c:special_rules.bzl", "experimental_acceleration_api_allowlist")
-load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/core/kernels/BUILD b/tensorflow/lite/core/kernels/BUILD
index f200394d6123de..fcb5a458edcbce 100644
--- a/tensorflow/lite/core/kernels/BUILD
+++ b/tensorflow/lite/core/kernels/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_self_contained_libs_test_suite")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_self_contained_libs_test_suite")
 load("//tensorflow/lite/core:special_rules.bzl", "builtin_ops_visibility_allowlist")
 
 package(
diff --git a/tensorflow/lite/core/shims/BUILD b/tensorflow/lite/core/shims/BUILD
index 902a2a6d862753..7febb33af081ed 100644
--- a/tensorflow/lite/core/shims/BUILD
+++ b/tensorflow/lite/core/shims/BUILD
@@ -1,9 +1,9 @@
 # Description: this package contains shim library targets that forward
 # to the TF Lite C and C++ API targets.  See README.md.
 
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
-load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 load(":build_defs.bzl", "build_test")
 load(":cc_library_with_tflite.bzl", "custom_c_library_with_tflite")
 load(":cc_library_with_tflite_test.bzl", "cc_library_with_tflite_test_suite")
diff --git a/tensorflow/lite/core/shims/cc_library_with_tflite.bzl b/tensorflow/lite/core/shims/cc_library_with_tflite.bzl
index 7ed021d9d5c9d6..a328f8e61ddafe 100644
--- a/tensorflow/lite/core/shims/cc_library_with_tflite.bzl
+++ b/tensorflow/lite/core/shims/cc_library_with_tflite.bzl
@@ -1,5 +1,7 @@
 """Definitions for targets that use the TFLite shims."""
 
+load("@bazel_skylib//rules:build_test.bzl", "build_test")
+load("@build_bazel_rules_android//android:rules.bzl", "android_binary", "android_library")
 load("//tensorflow:tensorflow.bzl", "clean_dep")
 load(
     "//tensorflow/lite:build_def.bzl",
@@ -7,8 +9,6 @@ load(
     "tflite_custom_c_library",
     "tflite_jni_binary",
 )
-load("@build_bazel_rules_android//android:rules.bzl", "android_binary", "android_library")
-load("@bazel_skylib//rules:build_test.bzl", "build_test")
 
 def _concat(lists):
     """Concatenate a list of lists, without requiring the inner lists to be iterable.
diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc
index 489b5d4e82aae3..a1a9f66721167f 100644
--- a/tensorflow/lite/core/subgraph.cc
+++ b/tensorflow/lite/core/subgraph.cc
@@ -253,7 +253,7 @@ Subgraph::Subgraph(ErrorReporter* error_reporter,
                    resource::InitializationStatusMap* initialization_status_map,
                    int subgraph_index)
     : external_contexts_(external_contexts),
-      registration_externals_(new internal::RegistrationExternalsCache),
+      registration_externals_(new internal::OperatorsCache),
       error_reporter_(error_reporter),
       next_execution_plan_index_to_prepare_(0),
       next_execution_plan_index_to_plan_allocation_(0),
@@ -469,10 +469,10 @@ void PopulatePreviewDelegateParams(const NodeSubset& node_subset,
 // a nested 'custom_name' field defined inside the optionally set
 // 'registration_external' structure.  The top-level field takes precedence over
 // the nested field.  'TfLiteRegistration'
-// objects can optionally carry a 'TfLiteRegistrationExternal' pointer in their
+// objects can optionally carry a 'TfLiteOperator' pointer in their
 // 'registration_external' field.  If that's the case then the
 // 'TfLiteRegistration' object is merely a wrapper over a
-// 'TfLiteRegistrationExternal', with all fields except 'registration_external'
+// 'TfLiteOperator', with all fields except 'registration_external'
 // being null, that contains the actual logic that the registration represents.
 // See also the comment inside
 // 'TfLiteOpaqueContextReplaceNodeSubsetsWithDelegateKernels'.
@@ -512,7 +512,7 @@ TfLiteStatus Subgraph::ReplaceNodeSubsetsWithDelegateKernels(
   // user has supplied an opaque delegate.
   if (TfLiteDelegateHasValidOpaqueDelegateBuilder(delegate)) {
     // If the user has supplied an opaque delegate, then they _must_ also use
-    // TfLiteRegistrationExternal.
+    // TfLiteOperator.
     if (!registration.registration_external) {
       TFLITE_LOG(
           tflite::TFLITE_LOG_WARNING,
@@ -527,8 +527,8 @@ TfLiteStatus Subgraph::ReplaceNodeSubsetsWithDelegateKernels(
                            registration.registration_external->custom_name,
                            registration.registration_external->version};
     auto [it, inserted] = registration_externals_->emplace(
-        op_id, std::unique_ptr<TfLiteRegistrationExternal>(
-                   registration.registration_external));
+        op_id,
+        std::unique_ptr<TfLiteOperator>(registration.registration_external));
     // If there was already an entry for this op_id in the
     // registration_externals_ cache, the statement above will have
     // no effect on the registration_externals_ cache,
@@ -1271,17 +1271,17 @@ TfLiteStatus Subgraph::ReleaseMemory() {
 void* Subgraph::OpInit(const TfLiteRegistration& op_reg, const char* buffer,
                        size_t length) {
   // Delegates that use the stable delegate API to iterate over the nodes and
-  // registrations are presented with ABI stable 'TfLiteRegistrationExternal'
+  // registrations are presented with ABI stable 'TfLiteOperator'
   // pointers, as opposed to ABI unstable 'TfLiteRegistration' pointers, even
   // for builtin OPs like ADD or SUB.  A knock-on effect of this behavior is
   // that we need to differentiate two scenarios when interacting with a
-  // 'TfLiteRegistrationExternal'.
+  // 'TfLiteOperator'.
   // 1. In the 'wrapper' scenario described above we use the 'node_index' field
   //    that points us to the corresponding 'TfLiteRegistration' that holds the
   //    callbacks that need to be invoked.
-  // 2. Otherwise the 'TfLiteRegistrationExternal' is either a stable custom OP,
+  // 2. Otherwise the 'TfLiteOperator' is either a stable custom OP,
   //    or a stable delegate kernel, and in both of those cases we need to use
-  //    the callbacks stored within the 'TfLiteRegistrationExternal' itself.
+  //    the callbacks stored within the 'TfLiteOperator' itself.
   if (op_reg.registration_external &&
       op_reg.registration_external->node_index != -1) {
     TfLiteRegistration* referenced_registration =
@@ -1302,17 +1302,17 @@ void* Subgraph::OpInit(const TfLiteRegistration& op_reg, const char* buffer,
 TfLiteStatus Subgraph::OpPrepare(const TfLiteRegistration& op_reg,
                                  TfLiteNode* node) {
   // Delegates that use the stable delegate API to iterate over the nodes and
-  // registrations are presented with ABI stable 'TfLiteRegistrationExternal'
+  // registrations are presented with ABI stable 'TfLiteOperator'
   // pointers, as opposed to ABI unstable 'TfLiteRegistration' pointers, even
   // for builtin OPs like ADD or SUB.  A knock-on effect of this behavior is
   // that we need to differentiate two scenarios when interacting with a
-  // 'TfLiteRegistrationExternal'.
+  // 'TfLiteOperator'.
   // 1. In the 'wrapper' scenario described above we use the 'node_index' field
   //    that points us to the corresponding 'TfLiteRegistration' that holds the
   //    callbacks that need to be invoked.
-  // 2. Otherwise the 'TfLiteRegistrationExternal' is either a stable custom OP,
+  // 2. Otherwise the 'TfLiteOperator' is either a stable custom OP,
   //    or a stable delegate kernel, and in both of those cases we need to use
-  //    the callbacks stored within the 'TfLiteRegistrationExternal' itself.
+  //    the callbacks stored within the 'TfLiteOperator' itself.
   if (op_reg.registration_external &&
       op_reg.registration_external->node_index != -1) {
     TfLiteRegistration* referenced_registration =
@@ -1368,17 +1368,17 @@ TfLiteStatus Subgraph::OpPrepare(const TfLiteRegistration& op_reg,
 TfLiteStatus Subgraph::OpInvoke(const TfLiteRegistration& op_reg,
                                 TfLiteNode* node) {
   // Delegates that use the stable delegate API to iterate over the nodes and
-  // registrations are presented with ABI stable 'TfLiteRegistrationExternal'
+  // registrations are presented with ABI stable 'TfLiteOperator'
   // pointers, as opposed to ABI unstable 'TfLiteRegistration' pointers, even
   // for builtin OPs like ADD or SUB.  A knock-on effect of this behavior is
   // that we need to differentiate two scenarios when interacting with a
-  // 'TfLiteRegistrationExternal'.
+  // 'TfLiteOperator'.
   // 1. In the 'wrapper' scenario described above we use the 'node_index' field
   //    that points us to the corresponding 'TfLiteRegistration' that holds the
   //    callbacks that need to be invoked.
-  // 2. Otherwise the 'TfLiteRegistrationExternal' is either a stable custom OP,
+  // 2. Otherwise the 'TfLiteOperator' is either a stable custom OP,
   //    or a stable delegate kernel, and in both of those cases we need to use
-  //    the callbacks stored within the 'TfLiteRegistrationExternal' itself.
+  //    the callbacks stored within the 'TfLiteOperator' itself.
   if (op_reg.registration_external &&
       op_reg.registration_external->node_index != -1) {
     TfLiteRegistration* referenced_registration =
@@ -1401,17 +1401,17 @@ TfLiteStatus Subgraph::OpInvoke(const TfLiteRegistration& op_reg,
 // If registration_external is valid, use the 'free' callback from that.
 void Subgraph::OpFree(const TfLiteRegistration& op_reg, void* buffer) {
   // Delegates that use the stable delegate API to iterate over the nodes and
-  // registrations are presented with ABI stable 'TfLiteRegistrationExternal'
+  // registrations are presented with ABI stable 'TfLiteOperator'
   // pointers, as opposed to ABI unstable 'TfLiteRegistration' pointers, even
   // for builtin OPs like ADD or SUB.  A knock-on effect of this behavior is
   // that we need to differentiate two scenarios when interacting with a
-  // 'TfLiteRegistrationExternal'.
+  // 'TfLiteOperator'.
   // 1. In the 'wrapper' scenario described above we use the 'node_index' field
   //    that points us to the corresponding 'TfLiteRegistration' that holds the
   //    callbacks that need to be invoked.
-  // 2. Otherwise the 'TfLiteRegistrationExternal' is either a stable custom OP,
+  // 2. Otherwise the 'TfLiteOperator' is either a stable custom OP,
   //    or a stable delegate kernel, and in both of those cases we need to use
-  //    the callbacks stored within the 'TfLiteRegistrationExternal' itself.
+  //    the callbacks stored within the 'TfLiteOperator' itself.
   if (op_reg.registration_external &&
       op_reg.registration_external->node_index != -1 && buffer) {
     TfLiteRegistration* referenced_registration =
diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h
index d01cd1b899c000..ae1b1ca5670fda 100644
--- a/tensorflow/lite/core/subgraph.h
+++ b/tensorflow/lite/core/subgraph.h
@@ -957,27 +957,25 @@ class Subgraph {
   // sits inside the associated TFLite interpreter instance.
   TfLiteExternalContext** external_contexts_;
 
-  // A set of 'TfLiteRegistrationExternal' pointers that are owned by the
-  // subgraph.  The objects pointed to by the 'TfLiteRegistrationExternal'
+  // A set of 'TfLiteOperator' pointers that are owned by the
+  // subgraph.  The objects pointed to by the 'TfLiteOperator'
   // pointers are deleted in the 'Subgraph' destructor.
   //
   // The intended usage of this container is to provide (friend) classes
-  // the option to dynamically allocate 'TfLiteRegistrationExternal' objects
+  // the option to dynamically allocate 'TfLiteOperator' objects
   // and then tie the lifetime of these objects to a subgraph.
   //
   // WARNING: This field needs to precede 'nodes_and_registration_', to ensure
   // that it outlives that field, since that field might contain references to
-  // the TfLiteRegistrationExternal objects contained in this fielld.
+  // the TfLiteOperator objects contained in this fielld.
   //
   // LINT.IfChange
-  // The definition of RegistrationExternalsCache implicitly assumes that
-  // TfLiteRegistrationExternalDelete is the same as the standard C++ delete
-  // operator.
-  // TODO(b/238435088): in op_resolver, include registration_external.h and use
-  // 'TfLiteRegistrationExternalDelete' as the deleter, then we can eliminate
+  // The definition of OperatorsCache implicitly assumes that
+  // TfLiteOperatorDelete is the same as the standard C++ delete operator.
+  // TODO(b/238435088): in op_resolver, include operator.h and use
+  // 'TfLiteOperatorDelete' as the deleter, then we can eliminate
   // the IfChange...ThenChange directive below.
-  std::shared_ptr<::tflite::internal::RegistrationExternalsCache>
-      registration_externals_;
+  std::shared_ptr<::tflite::internal::OperatorsCache> registration_externals_;
   // LINT.ThenChange(//tensorflow/lite/core/c/c_api.cc)
 
   // Node inputs/outputs are stored in TfLiteNode and TfLiteRegistration stores
diff --git a/tensorflow/lite/core/tools/BUILD b/tensorflow/lite/core/tools/BUILD
index 3e5363b809dd06..8891caef8f75de 100644
--- a/tensorflow/lite/core/tools/BUILD
+++ b/tensorflow/lite/core/tools/BUILD
@@ -1,7 +1,7 @@
-load("//tensorflow/lite:special_rules.bzl", "verifier_internal_visibility_allowlist")
+load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow/lite:build_def.bzl", "tflite_self_contained_libs_test_suite")
+load("//tensorflow/lite:special_rules.bzl", "verifier_internal_visibility_allowlist")
 load("//tensorflow/lite/core:special_rules.bzl", "verifier_visibility_allowlist")
-load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/core/tools/verifier.cc b/tensorflow/lite/core/tools/verifier.cc
index cdf8959d55483f..c878f7c392d14a 100644
--- a/tensorflow/lite/core/tools/verifier.cc
+++ b/tensorflow/lite/core/tools/verifier.cc
@@ -409,6 +409,9 @@ bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer,
     case TensorType_FLOAT16:
       bytes_required *= sizeof(uint16_t);
       break;
+    case TensorType_BFLOAT16:
+      bytes_required *= sizeof(uint16_t);
+      break;
     case TensorType_FLOAT64:
       bytes_required *= sizeof(double);
       break;
diff --git a/tensorflow/lite/delegates/BUILD b/tensorflow/lite/delegates/BUILD
index 8660884edb2b51..d0a207e1ff6879 100644
--- a/tensorflow/lite/delegates/BUILD
+++ b/tensorflow/lite/delegates/BUILD
@@ -13,8 +13,8 @@
 # limitations under the License.
 # ==============================================================================
 
-load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_linkopts")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_linkopts")
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 package(
@@ -168,12 +168,14 @@ cc_test(
         "TFLITE_USE_OPAQUE_DELEGATE",
     ],
     deps = [
+        "//tensorflow/lite:builtin_ops",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/c:c_api_experimental",
         "//tensorflow/lite/c:common",
         "//tensorflow/lite/core/c:c_api_types",
         "//tensorflow/lite/core/c:common",
         "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/schema:schema_fbs",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest_main",
     ],
@@ -192,12 +194,14 @@ cc_test(
         "TF_LITE_STRIP_ERROR_STRINGS",
     ],
     deps = [
+        "//tensorflow/lite:builtin_ops",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/c:c_api_experimental",
         "//tensorflow/lite/c:common",
         "//tensorflow/lite/core/c:c_api_types",
         "//tensorflow/lite/core/c:common",
         "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/schema:schema_fbs",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest_main",
     ],
diff --git a/tensorflow/lite/delegates/delegate_test.cc b/tensorflow/lite/delegates/delegate_test.cc
index 560b2b4c65b940..b98c233912d1f8 100644
--- a/tensorflow/lite/delegates/delegate_test.cc
+++ b/tensorflow/lite/delegates/delegate_test.cc
@@ -447,7 +447,7 @@ struct OpaqueTestDelegate {
     delegate_state->delegate_prepared = true;
 
     TfLiteRegistration registration{};
-    registration.registration_external = TfLiteRegistrationExternalCreate(
+    registration.registration_external = TfLiteOperatorCreate(
         kTfLiteBuiltinDelegate, "OpaqueTestDelegate delegate kernel", 1);
 
     registration.prepare = [](TfLiteContext* context,
@@ -1130,10 +1130,10 @@ class TestDelegateWithDynamicTensors : public ::testing::Test {
   TfLiteDelegate delegate_;
 };
 
-TfLiteRegistrationExternal* CreateTfLiteRegistrationExternal() {
-  auto registration = TfLiteRegistrationExternalCreate(
-      kTfLiteBuiltinDelegate, "OpaqueDelegateKernel", 1);
-  TfLiteRegistrationExternalSetPrepare(
+TfLiteOperator* CreateTfLiteOperator() {
+  auto registration =
+      TfLiteOperatorCreate(kTfLiteBuiltinDelegate, "OpaqueDelegateKernel", 1);
+  TfLiteOperatorSetPrepare(
       registration,
       [](TfLiteOpaqueContext* context,
          TfLiteOpaqueNode* opaque_node) -> TfLiteStatus {
@@ -1186,7 +1186,7 @@ class TestOpaqueDelegateBuilderWithDynamicTensors
       TfLiteIntArray* execution_plan;
       TfLiteOpaqueContextGetExecutionPlan(opaque_context, &execution_plan);
       return TfLiteOpaqueContextReplaceNodeSubsetsWithDelegateKernels(
-          opaque_context, CreateTfLiteRegistrationExternal(), execution_plan,
+          opaque_context, CreateTfLiteOperator(), execution_plan,
           opaque_delegate);
     };
     delegate_external_.flags = kTfLiteDelegateFlagsNone;
diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD
index 77a16ffa032865..da620e2e011019 100644
--- a/tensorflow/lite/delegates/flex/BUILD
+++ b/tensorflow/lite/delegates/flex/BUILD
@@ -7,10 +7,10 @@ load(
     "tf_opts_nortti_if_lite_protos",
     "tf_opts_nortti_if_mobile",
 )
+load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 load("//tensorflow/lite:special_rules.bzl", "internal_visibility_allowlist")
 load("//tensorflow/lite/delegates/flex:build_def.bzl", "tflite_flex_cc_library", "tflite_flex_shared_library")
-load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 
 default_visibility = [
     "//tensorflow/compiler/mlir/lite:__subpackages__",
@@ -322,6 +322,7 @@ cc_library(
         "//tensorflow/lite:kernel_api",
         "//tensorflow/lite:string_util",
         "//tensorflow/lite:util",
+        "//tensorflow/lite/core/c:c_api_types",
         "//tensorflow/lite/core/c:common",
         "//tensorflow/lite/kernels/internal:tensor",
         "@com_google_absl//absl/strings:str_format",
@@ -341,6 +342,7 @@ tf_cc_test(
     srcs = ["util_test.cc"],
     deps = [
         ":util",
+        "//tensorflow/c:tf_datatype",
         "//tensorflow/core:framework",
         "//tensorflow/core/protobuf:error_codes_proto_impl_cc",
         "//tensorflow/lite:string",
diff --git a/tensorflow/lite/delegates/flex/build_def.bzl b/tensorflow/lite/delegates/flex/build_def.bzl
index 44588f9abad0cc..bd62777f46c9dd 100644
--- a/tensorflow/lite/delegates/flex/build_def.bzl
+++ b/tensorflow/lite/delegates/flex/build_def.bzl
@@ -1,5 +1,6 @@
 """Generate custom flex delegate library."""
 
+load("@build_bazel_rules_android//android:rules.bzl", "android_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "clean_dep",
@@ -22,7 +23,6 @@ load(
     "tflite_jni_linkopts",
 )
 load("//tensorflow/lite:special_rules.bzl", "flex_portable_tensorflow_deps")
-load("@build_bazel_rules_android//android:rules.bzl", "android_library")
 
 def generate_flex_kernel_header(
         name,
diff --git a/tensorflow/lite/delegates/flex/delegate_data.cc b/tensorflow/lite/delegates/flex/delegate_data.cc
index 2839ad526424c3..e0b0d1041e45b4 100644
--- a/tensorflow/lite/delegates/flex/delegate_data.cc
+++ b/tensorflow/lite/delegates/flex/delegate_data.cc
@@ -33,13 +33,11 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/errors.h"
-#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/platform/tstring.h"
 #include "tensorflow/core/protobuf/error_codes.pb.h"
 #include "tensorflow/lite/core/c/common.h"
 #include "tensorflow/lite/core/subgraph.h"
-#include "tensorflow/lite/delegates/flex/subgraph_resource.h"
 #include "tensorflow/lite/delegates/flex/util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/util.h"
@@ -159,8 +157,7 @@ tensorflow::Status RegisterFunctionDefForSubgraphs(
         const std::vector<std::unique_ptr<Subgraph>>&, std::set<std::string>*)>&
         select_subgraphs_to_register,
     tensorflow::ResourceMgr* resource_mgr,
-    tensorflow::EagerContext* eager_context, TfLiteDelegate* flex_delegate,
-    tensorflow::mutex* mutex) {
+    tensorflow::EagerContext* eager_context, TfLiteDelegate* flex_delegate) {
   std::vector<std::unique_ptr<Subgraph>>* subgraphs =
       main_subgraph.GetSubgraphs();
   if (!subgraphs) {
@@ -182,7 +179,7 @@ tensorflow::Status RegisterFunctionDefForSubgraphs(
     // This is to ensure that we only register FunctionDefs for subgraphs that
     // are used by TF ops to invoke functions.
     auto* subgraph_resource =
-        new TFLiteSubgraphResource(*(subgraphs->at(i)), flex_delegate, mutex);
+        new TFLiteSubgraphResource(*(subgraphs->at(i)), flex_delegate);
     TF_RETURN_IF_ERROR(resource_mgr->Create<TFLiteSubgraphResource>(
         "flex", subgraph_name, subgraph_resource));
     tensorflow::FunctionDef fdef;
@@ -235,7 +232,7 @@ tensorflow::Status DelegateData::Prepare(
     TF_RETURN_IF_ERROR(RegisterFunctionDefForSubgraphs(
         *main_subgraph, GetSubgraphNamesForFunctionExecution,
         eager_context_->HostCPU()->resource_manager(), eager_context_,
-        flex_delegate, &mutex_));
+        flex_delegate));
   }
   return tensorflow::Status();
 }
diff --git a/tensorflow/lite/delegates/flex/delegate_data.h b/tensorflow/lite/delegates/flex/delegate_data.h
index 871b49bdc551a8..7cc06f527bca24 100644
--- a/tensorflow/lite/delegates/flex/delegate_data.h
+++ b/tensorflow/lite/delegates/flex/delegate_data.h
@@ -87,8 +87,6 @@ class DelegateData {
   // lifetime information.
   std::unordered_map<const TfLiteContext*, std::map<int, int>>
       tensor_release_map_;
-  // Shared mutex for accessing TFLite subgraphs.
-  tensorflow::mutex mutex_;
 };
 
 // Creates a `TFLiteSubgraphResource` for each subgraph (execpt
@@ -101,8 +99,7 @@ tensorflow::Status RegisterFunctionDefForSubgraphs(
         const std::vector<std::unique_ptr<Subgraph>>&,
         std::set<std::string>* result)>& select_subgraphs_to_register,
     tensorflow::ResourceMgr* resource_mgr,
-    tensorflow::EagerContext* eager_context, TfLiteDelegate* flex_delegate,
-    tensorflow::mutex* mutex);
+    tensorflow::EagerContext* eager_context, TfLiteDelegate* flex_delegate);
 
 }  // namespace flex
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/flex/delegate_data_test.cc b/tensorflow/lite/delegates/flex/delegate_data_test.cc
index 70e6e3ee762173..058abe00cc66b5 100644
--- a/tensorflow/lite/delegates/flex/delegate_data_test.cc
+++ b/tensorflow/lite/delegates/flex/delegate_data_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
@@ -98,11 +97,10 @@ TEST(DelegateDataTest, CheckFunctionDef) {
                          /*resources=*/nullptr, /*resource_ids=*/nullptr,
                          /*initialization_status_map=*/nullptr);
   main_subgraph.SetName("main");
-  tensorflow::mutex mutex;
   TF_ASSERT_OK(RegisterFunctionDefForSubgraphs(
       main_subgraph, select_subgraphs_to_register,
       eager_context->HostCPU()->resource_manager(), eager_context,
-      /*flex_delegate=*/nullptr, &mutex));
+      /*flex_delegate=*/nullptr));
 
   const string add_fdef_txt = R"pb(
     signature {
@@ -230,11 +228,10 @@ TEST(DelegateDataTest, CheckFunctionDefWithOnlyMainGraph) {
                          /*resource_ids=*/nullptr,
                          /*initialization_status_map=*/nullptr);
   main_subgraph.SetName("main");
-  tensorflow::mutex mutex;
   TF_ASSERT_OK(RegisterFunctionDefForSubgraphs(
       main_subgraph, select_subgraphs_to_register,
       eager_context->HostCPU()->resource_manager(), eager_context,
-      /*flex_delegate=*/nullptr, &mutex));
+      /*flex_delegate=*/nullptr));
 
   EXPECT_EQ(eager_context->GetFunctionDef("main"), nullptr);
 
diff --git a/tensorflow/lite/delegates/flex/subgraph_resource.h b/tensorflow/lite/delegates/flex/subgraph_resource.h
index 1eca555353b80f..5b5734ecc055b0 100644
--- a/tensorflow/lite/delegates/flex/subgraph_resource.h
+++ b/tensorflow/lite/delegates/flex/subgraph_resource.h
@@ -32,9 +32,8 @@ namespace flex {
 // needs to first acquire a lock on the mutex object.
 class TFLiteSubgraphResource : public tensorflow::ResourceBase {
  public:
-  explicit TFLiteSubgraphResource(Subgraph& subgraph, TfLiteDelegate* delegate,
-                                  tensorflow::mutex* mutex)
-      : mutex_(mutex), subgraph_(subgraph), delegate_(delegate) {}
+  explicit TFLiteSubgraphResource(Subgraph& subgraph, TfLiteDelegate* delegate)
+      : subgraph_(subgraph), delegate_(delegate) {}
 
   std::string DebugString() const override { return "TFLiteSubgraphResource"; }
 
@@ -45,7 +44,7 @@ class TFLiteSubgraphResource : public tensorflow::ResourceBase {
   }
 
   tensorflow::mutex& GetExclusiveLock() TF_LOCK_RETURNED(mutex_) {
-    return *mutex_;
+    return mutex_;
   }
 
   // Returns a pointer to the TfLiteDelegate which this instance of subgraph
@@ -55,7 +54,7 @@ class TFLiteSubgraphResource : public tensorflow::ResourceBase {
   }
 
  private:
-  tensorflow::mutex* mutex_;
+  tensorflow::mutex mutex_;
   Subgraph& subgraph_ TF_GUARDED_BY(mutex_);
   TfLiteDelegate* delegate_ TF_GUARDED_BY(mutex_) = nullptr;
 };
diff --git a/tensorflow/lite/delegates/flex/test/BUILD b/tensorflow/lite/delegates/flex/test/BUILD
index 238b6adadd824c..65467a9a84f903 100644
--- a/tensorflow/lite/delegates/flex/test/BUILD
+++ b/tensorflow/lite/delegates/flex/test/BUILD
@@ -1,7 +1,7 @@
+load("@bazel_skylib//rules:build_test.bzl", "build_test")
+load("@build_bazel_rules_apple//apple:ios.bzl", "ios_static_framework")
 load("//tensorflow/java:build_defs.bzl", "JAVACOPTS")
 load("//tensorflow/lite/delegates/flex:build_def.bzl", "tflite_flex_jni_library")
-load("@build_bazel_rules_apple//apple:ios.bzl", "ios_static_framework")
-load("@bazel_skylib//rules:build_test.bzl", "build_test")
 
 # Following targets are using for testing selective-built flex delegate
 # in Java. Please don't use them for other purposes.
diff --git a/tensorflow/lite/delegates/flex/util.cc b/tensorflow/lite/delegates/flex/util.cc
index 8a115a4f33cf64..9940fadb8d7625 100644
--- a/tensorflow/lite/delegates/flex/util.cc
+++ b/tensorflow/lite/delegates/flex/util.cc
@@ -17,10 +17,12 @@ limitations under the License.
 #include <string>
 
 #include "absl/strings/str_format.h"
+#include "tensorflow/c/tf_datatype.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/platform/statusor.h"
 #include "tensorflow/core/protobuf/error_codes.pb.h"
+#include "tensorflow/lite/core/c/c_api_types.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/string_util.h"
 
@@ -74,6 +76,8 @@ TF_DataType GetTensorFlowDataType(TfLiteType type) {
       return TF_FLOAT;
     case kTfLiteFloat16:
       return TF_HALF;
+    case kTfLiteBFloat16:
+      return TF_BFLOAT16;
     case kTfLiteFloat64:
       return TF_DOUBLE;
     case kTfLiteInt16:
@@ -116,6 +120,8 @@ TfLiteType GetTensorFlowLiteType(TF_DataType type) {
       return kTfLiteFloat32;
     case TF_HALF:
       return kTfLiteFloat16;
+    case TF_BFLOAT16:
+      return kTfLiteBFloat16;
     case TF_DOUBLE:
       return kTfLiteFloat64;
     case TF_INT16:
@@ -186,6 +192,8 @@ const char* TfLiteTypeToTfTypeName(TfLiteType type) {
       return "string";
     case kTfLiteFloat16:
       return "float16";
+    case kTfLiteBFloat16:
+      return "bfloat16";
     case kTfLiteFloat64:
       return "float64";
     case kTfLiteResource:
diff --git a/tensorflow/lite/delegates/flex/util_test.cc b/tensorflow/lite/delegates/flex/util_test.cc
index c7361314aa38f5..7dfea9e6437c9d 100644
--- a/tensorflow/lite/delegates/flex/util_test.cc
+++ b/tensorflow/lite/delegates/flex/util_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/c/tf_datatype.h"
 #include "tensorflow/core/framework/resource_handle.h"
 #include "tensorflow/core/protobuf/error_codes.pb.h"
 #include "tensorflow/lite/core/c/c_api_types.h"
@@ -118,6 +119,7 @@ TEST(UtilTest, TypeConversionsFromTFLite) {
   EXPECT_EQ(TF_FLOAT, GetTensorFlowDataType(kTfLiteNoType));
   EXPECT_EQ(TF_FLOAT, GetTensorFlowDataType(kTfLiteFloat32));
   EXPECT_EQ(TF_HALF, GetTensorFlowDataType(kTfLiteFloat16));
+  EXPECT_EQ(TF_BFLOAT16, GetTensorFlowDataType(kTfLiteBFloat16));
   EXPECT_EQ(TF_DOUBLE, GetTensorFlowDataType(kTfLiteFloat64));
   EXPECT_EQ(TF_INT16, GetTensorFlowDataType(kTfLiteInt16));
   EXPECT_EQ(TF_INT32, GetTensorFlowDataType(kTfLiteInt32));
@@ -136,6 +138,7 @@ TEST(UtilTest, TypeConversionsFromTFLite) {
 
 TEST(UtilTest, TypeConversionsFromTensorFlow) {
   EXPECT_EQ(kTfLiteFloat16, GetTensorFlowLiteType(TF_HALF));
+  EXPECT_EQ(kTfLiteBFloat16, GetTensorFlowLiteType(TF_BFLOAT16));
   EXPECT_EQ(kTfLiteFloat32, GetTensorFlowLiteType(TF_FLOAT));
   EXPECT_EQ(kTfLiteFloat64, GetTensorFlowLiteType(TF_DOUBLE));
   EXPECT_EQ(kTfLiteInt16, GetTensorFlowLiteType(TF_INT16));
diff --git a/tensorflow/lite/delegates/gpu/BUILD b/tensorflow/lite/delegates/gpu/BUILD
index d6eb080e6a74e9..5eeb17646eb9d9 100644
--- a/tensorflow/lite/delegates/gpu/BUILD
+++ b/tensorflow/lite/delegates/gpu/BUILD
@@ -1,12 +1,12 @@
 load("@bazel_skylib//lib:selects.bzl", "selects")
+load("@build_bazel_rules_apple//apple:ios.bzl", "ios_static_framework")
+load("@build_bazel_rules_apple//apple:macos.bzl", "macos_dylib")
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "tf_gpu_tests_tags",
 )
 load("//tensorflow/lite:special_rules.bzl", "tflite_extra_gles_deps", "tflite_portable_test_suite")
 load("//tensorflow/lite/delegates/gpu:build_defs.bzl", "gpu_delegate_linkopts")
-load("@build_bazel_rules_apple//apple:ios.bzl", "ios_static_framework")
-load("@build_bazel_rules_apple//apple:macos.bzl", "macos_dylib")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/delegates/gpu/api.h b/tensorflow/lite/delegates/gpu/api.h
index 7b0d8312e2b154..535f077bf0f8b7 100644
--- a/tensorflow/lite/delegates/gpu/api.h
+++ b/tensorflow/lite/delegates/gpu/api.h
@@ -372,7 +372,9 @@ struct InferenceOptions {
 
   InferencePriority priority3 = InferencePriority::AUTO;
 #ifdef TFLITE_GPU_ENABLE_INVOKE_LOOP
-  int gpu_invoke_loop_times = -1;
+  // Number of times to invoke the inference in GPU delegate, to collect more
+  // accurate latency result. Default as 1, which is the original behavior.
+  int gpu_invoke_loop_times = 1;
 #endif
 };
 
diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD
index 84004f80218b9e..7e02516b5d8c87 100644
--- a/tensorflow/lite/delegates/gpu/cl/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/BUILD
@@ -134,6 +134,20 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "cl_command_buffer",
+    srcs = ["cl_command_buffer.cc"],
+    hdrs = ["cl_command_buffer.h"],
+    deps = [
+        ":cl_command_queue",
+        ":cl_event",
+        ":opencl_wrapper",
+        ":util",
+        "//tensorflow/lite/delegates/gpu/common:status",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_library(
     name = "cl_command_queue",
     srcs = ["cl_command_queue.cc"],
@@ -395,8 +409,10 @@ cc_library(
     ],
     deps = [
         ":buffer",
+        ":cl_command_buffer",
         ":cl_command_queue",
         ":cl_device",
+        ":cl_event",
         ":cl_operation",
         ":environment",
         ":gpu_object",
diff --git a/tensorflow/lite/delegates/gpu/cl/api.cc b/tensorflow/lite/delegates/gpu/cl/api.cc
index 08cf817da685b9..dc9a8d3a6ca649 100644
--- a/tensorflow/lite/delegates/gpu/cl/api.cc
+++ b/tensorflow/lite/delegates/gpu/cl/api.cc
@@ -545,6 +545,10 @@ class InferenceRunnerImpl : public CLInferenceRunner {
 #ifdef TFLITE_GPU_ENABLE_INVOKE_LOOP
     // TODO(b/328511338): Remove code enabled by TFLITE_GPU_ENABLE_INVOKE_LOOP
     // when Async API solution is ready to replace it.
+    if (gpu_invoke_loop_times_ <= 0) {
+      return absl::InvalidArgumentError(
+          "gpu_invoke_loop_times must be positive");
+    }
     for (int i = 0; i < gpu_invoke_loop_times_; i++) {
       RETURN_IF_ERROR(RunWithoutExternalBufferCopy());
     }
diff --git a/tensorflow/lite/delegates/gpu/cl/cl_command_buffer.cc b/tensorflow/lite/delegates/gpu/cl/cl_command_buffer.cc
new file mode 100644
index 00000000000000..1ee7f0c812a1b3
--- /dev/null
+++ b/tensorflow/lite/delegates/gpu/cl/cl_command_buffer.cc
@@ -0,0 +1,98 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/gpu/cl/cl_command_buffer.h"
+
+#include <utility>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
+#include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
+#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
+#include "tensorflow/lite/delegates/gpu/cl/util.h"
+#include "tensorflow/lite/delegates/gpu/common/status.h"
+
+namespace tflite {
+namespace gpu {
+namespace cl {
+
+CLCommandBuffer::CLCommandBuffer(CLCommandBuffer&& cb) : cb_(cb.cb_) {
+  cb.cb_ = nullptr;
+}
+
+CLCommandBuffer& CLCommandBuffer::operator=(CLCommandBuffer&& cb) {
+  if (this != &cb) {
+    Release();
+    std::swap(cb_, cb.cb_);
+  }
+  return *this;
+}
+
+void CLCommandBuffer::Release() {
+  if (cb_) {
+    clReleaseCommandBufferKHR(cb_);
+    cb_ = nullptr;
+  }
+}
+
+absl::Status CLCommandBuffer::Init(CLCommandQueue* queue,
+                                   bool simultaneous_use) {
+  cl_int errcode_ret = CL_SUCCESS;
+  std::vector<cl_command_buffer_properties_khr> properties;
+  if (simultaneous_use) {
+    properties.push_back(CL_COMMAND_BUFFER_FLAGS_KHR);
+    properties.push_back(CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR);
+  }
+  properties.push_back(0);
+  cl_command_buffer_properties_khr* properties_ptr =
+      properties.size() != 1 ? properties.data() : nullptr;
+  cl_command_queue cmd_queue = queue->queue();
+  cb_ = clCreateCommandBufferKHR(1, &cmd_queue, properties_ptr, &errcode_ret);
+  if (errcode_ret != CL_SUCCESS) {
+    return absl::InternalError(absl::StrCat("Failed clCreateCommandBufferKHR.",
+                                            CLErrorCodeToString(errcode_ret)));
+  }
+  return absl::OkStatus();
+}
+
+absl::Status CLCommandBuffer::Finalize() {
+  cl_int errcode_ret = clFinalizeCommandBufferKHR(cb_);
+  if (errcode_ret != CL_SUCCESS) {
+    return absl::InternalError(
+        absl::StrCat("Failed clFinalizeCommandBufferKHR.",
+                     CLErrorCodeToString(errcode_ret)));
+  }
+  return absl::OkStatus();
+}
+
+absl::Status CLCommandBuffer::Enqueue(CLCommandQueue* queue, CLEvent* event) {
+  cl_event resulting_event;
+  cl_command_queue cmd_queue = queue->queue();
+  cl_int errcode_ret = clEnqueueCommandBufferKHR(
+      1, &cmd_queue, cb_, 0, nullptr, event ? &resulting_event : nullptr);
+  if (errcode_ret != CL_SUCCESS) {
+    return absl::InternalError(absl::StrCat("Failed clEnqueueCommandBufferKHR.",
+                                            CLErrorCodeToString(errcode_ret)));
+  }
+  if (event) {
+    *event = CLEvent(resulting_event);
+  }
+  return absl::OkStatus();
+}
+
+}  // namespace cl
+}  // namespace gpu
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/cl/cl_command_buffer.h b/tensorflow/lite/delegates/gpu/cl/cl_command_buffer.h
new file mode 100644
index 00000000000000..016346ddba7998
--- /dev/null
+++ b/tensorflow/lite/delegates/gpu/cl/cl_command_buffer.h
@@ -0,0 +1,53 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_BUFFER_H_
+#define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_BUFFER_H_
+
+#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
+#include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
+#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
+#include "tensorflow/lite/delegates/gpu/common/status.h"
+
+namespace tflite {
+namespace gpu {
+namespace cl {
+
+class CLCommandBuffer {
+ public:
+  CLCommandBuffer() = default;
+  // Move only
+  CLCommandBuffer(CLCommandBuffer&& cb);
+  CLCommandBuffer& operator=(CLCommandBuffer&& cb);
+  CLCommandBuffer(const CLCommandBuffer&) = delete;
+  CLCommandBuffer& operator=(const CLCommandBuffer&) = delete;
+
+  ~CLCommandBuffer() { Release(); }
+
+  absl::Status Init(CLCommandQueue* queue, bool simultaneous_use = false);
+  absl::Status Finalize();
+  absl::Status Enqueue(CLCommandQueue* queue, CLEvent* event = nullptr);
+  cl_command_buffer_khr GetCommandBuffer() const { return cb_; }
+
+ private:
+  void Release();
+  cl_command_buffer_khr cb_ = nullptr;
+};
+
+}  // namespace cl
+}  // namespace gpu
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_BUFFER_H_
diff --git a/tensorflow/lite/delegates/gpu/cl/inference_context.cc b/tensorflow/lite/delegates/gpu/cl/inference_context.cc
index 93982f5c556ccb..32cb168b505431 100644
--- a/tensorflow/lite/delegates/gpu/cl/inference_context.cc
+++ b/tensorflow/lite/delegates/gpu/cl/inference_context.cc
@@ -32,8 +32,12 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "tensorflow/lite/delegates/gpu/cl/buffer.h"
+#include "tensorflow/lite/delegates/gpu/cl/cl_command_buffer.h"
+#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
 #include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
+#include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
 #include "tensorflow/lite/delegates/gpu/cl/cl_operation.h"
+#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
 #include "tensorflow/lite/delegates/gpu/cl/serialization_generated.h"
 #include "tensorflow/lite/delegates/gpu/common/data_type.h"
 #include "tensorflow/lite/delegates/gpu/common/gpu_model.h"
@@ -41,6 +45,7 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/common/memory_management.h"
 #include "tensorflow/lite/delegates/gpu/common/model.h"
 #include "tensorflow/lite/delegates/gpu/common/shape.h"
+#include "tensorflow/lite/delegates/gpu/common/status.h"
 #include "tensorflow/lite/delegates/gpu/common/task/gpu_operation.h"
 #include "tensorflow/lite/delegates/gpu/common/task/serialization_base.h"
 #include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
@@ -217,6 +222,50 @@ absl::Status GetBufferAsignment(
   return absl::OkStatus();
 }
 
+absl::Status ClarifyWithCommandBuffer(ProfilingCommandQueue* queue,
+                                      int num_tries, double cb_duration_ms,
+                                      const std::vector<CLNode*>& nodes,
+                                      std::vector<double>* time_ns) {
+  auto get_tasks_count = [&](int node_index) {
+    const int tasks_count = cb_duration_ms / ((*time_ns)[node_index] * 1e-6f);
+    return std::min(256, std::max(1, tasks_count));
+  };
+
+  std::vector<CLCommandBuffer> cbs(nodes.size() * num_tries);
+  for (int t = 0; t < num_tries; ++t) {
+    for (int node_index = 0; node_index < nodes.size(); ++node_index) {
+      const int index = t * nodes.size() + node_index;
+      auto& cb = cbs[index];
+      RETURN_IF_ERROR(cb.Init(queue, /*simultaneous_use=*/false));
+      const int num_kernels_in_cb = get_tasks_count(node_index);
+      for (int j = 0; j < num_kernels_in_cb; ++j) {
+        RETURN_IF_ERROR(nodes[node_index]->cl_operation.AddToCommanBuffer(
+            cb.GetCommandBuffer()));
+      }
+      RETURN_IF_ERROR(cb.Finalize());
+    }
+  }
+  std::vector<CLEvent> events(nodes.size() * num_tries);
+  for (int t = 0; t < num_tries; ++t) {
+    for (int node_index = 0; node_index < nodes.size(); ++node_index) {
+      const int index = t * nodes.size() + node_index;
+      RETURN_IF_ERROR(cbs[index].Enqueue(queue, &events[index]));
+    }
+  }
+  clFinish(queue->queue());
+  for (int node_index = 0; node_index < nodes.size(); ++node_index) {
+    double min_time_ns = std::numeric_limits<double>::max();
+    for (int t = 0; t < num_tries; ++t) {
+      const int num_kernels_in_cb = get_tasks_count(node_index);
+      double time_ns = events[t * nodes.size() + node_index].GetEventTimeNs() /
+                       num_kernels_in_cb;
+      min_time_ns = std::min(min_time_ns, time_ns);
+    }
+    (*time_ns)[node_index] = min_time_ns;
+  }
+  return absl::OkStatus();
+}
+
 }  // namespace
 
 void InferenceContext::ExecutionHints::Init(const GpuInfo& gpu_info) {
@@ -817,6 +866,50 @@ absl::Status InferenceContext::AddToQueue(CLCommandQueue* queue) {
   return absl::OkStatus();
 }
 
+absl::Status InferenceContext::ClarifyTimeWithCommandBuffer(
+    ProfilingCommandQueue* queue, ProfilingInfo* result) {
+  const int num_tries = 3;
+  const double cb_duration_ms = 10.0;  // looks like enough
+  const int node_group_count = 8;  // Current PowerVR drivers have issues with
+                                   // big amount of CB or big CB.
+  for (int node_index = 0; node_index < nodes_.size();
+       node_index += node_group_count) {
+    std::vector<CLNode*> nodes_to_clarify;
+    std::vector<double> times_ns;
+    for (int i = 0; i < node_group_count && node_index + i < nodes_.size();
+         ++i) {
+      nodes_to_clarify.push_back(&nodes_[node_index + i]);
+      times_ns.push_back(absl::ToDoubleNanoseconds(
+          result->dispatches[node_index + i].duration));
+    }
+    RETURN_IF_ERROR(ClarifyWithCommandBuffer(queue, num_tries, cb_duration_ms,
+                                             nodes_to_clarify, &times_ns));
+    for (int i = 0; i < node_group_count && node_index + i < nodes_.size();
+         ++i) {
+      result->dispatches[node_index + i].duration =
+          absl::Nanoseconds(times_ns[i]);
+    }
+  }
+  return absl::OkStatus();
+}
+
+absl::Status InferenceContext::ClarifyTimeMultipleEnqueue(
+    double ops_total_duration_ms, int min_ops, int max_ops,
+    ProfilingCommandQueue* queue, ProfilingInfo* result) {
+  queue->ResetMeasurements();
+  for (int i = 0; i < nodes_.size(); ++i) {
+    queue->SetEventsLabel(nodes_[i].name);
+    const int times =
+        ops_total_duration_ms /
+        absl::ToDoubleMilliseconds(result->dispatches[i].duration);
+    const int n = std::min(max_ops, std::max(min_ops, times));
+    RETURN_IF_ERROR(nodes_[i].cl_operation.AddToQueueNTimes(queue, n));
+  }
+  RETURN_IF_ERROR(queue->WaitForCompletion());
+  *result = queue->GetProfilingInfo();
+  return absl::OkStatus();
+}
+
 absl::Status InferenceContext::ProfileTime(ProfilingCommandQueue* queue,
                                            ProfilingInfo* result) {
   queue->ResetMeasurements();
@@ -832,42 +925,23 @@ absl::Status InferenceContext::ProfileTime(ProfilingCommandQueue* queue,
   }
 
   if (gpu_info_.IsMali()) {
-    queue->ResetMeasurements();
-    for (int i = 0; i < nodes_.size(); ++i) {
-      queue->SetEventsLabel(nodes_[i].name);
-      const double times =
-          16.0 / absl::ToDoubleMilliseconds(result->dispatches[i].duration);
-      const int n = std::min(256.0, std::max(2.0, times));
-      RETURN_IF_ERROR(nodes_[i].cl_operation.AddToQueueNTimes(queue, n));
-    }
-    RETURN_IF_ERROR(queue->WaitForCompletion());
-    *result = queue->GetProfilingInfo();
-    return absl::OkStatus();
+    return ClarifyTimeMultipleEnqueue(/*ops_total_duration_ms=*/16.0,
+                                      /*min_ops=*/2, /*max_ops=*/256, queue,
+                                      result);
   }
 
   if (gpu_info_.IsPowerVR()) {
-    queue->ResetMeasurements();
-    for (int i = 0; i < nodes_.size(); ++i) {
-      queue->SetEventsLabel(nodes_[i].name);
-      const double times =
-          32.0 / absl::ToDoubleMilliseconds(result->dispatches[i].duration);
-      const int n = std::min(64.0, std::max(4.0, times));
-      RETURN_IF_ERROR(nodes_[i].cl_operation.AddToQueueNTimes(queue, n));
-    }
-    RETURN_IF_ERROR(queue->WaitForCompletion());
-    *result = queue->GetProfilingInfo();
-
-    queue->ResetMeasurements();
-    for (int i = 0; i < nodes_.size(); ++i) {
-      queue->SetEventsLabel(nodes_[i].name);
-      const double times =
-          128.0 / absl::ToDoubleMilliseconds(result->dispatches[i].duration);
-      const int n = std::min(1024.0, std::max(4.0, times));
-      RETURN_IF_ERROR(nodes_[i].cl_operation.AddToQueueNTimes(queue, n));
+    if (gpu_info_.SupportsExtension("cl_khr_command_buffer")) {
+      RETURN_IF_ERROR(ClarifyTimeWithCommandBuffer(queue, result));
+      RETURN_IF_ERROR(ClarifyTimeWithCommandBuffer(queue, result));
+    } else {
+      RETURN_IF_ERROR(ClarifyTimeMultipleEnqueue(/*ops_total_duration_ms=*/32.0,
+                                                 /*min_ops=*/4, /*max_ops=*/64,
+                                                 queue, result));
+      return ClarifyTimeMultipleEnqueue(/*ops_total_duration_ms=*/128.0,
+                                        /*min_ops=*/4, /*max_ops=*/1024, queue,
+                                        result);
     }
-    RETURN_IF_ERROR(queue->WaitForCompletion());
-    *result = queue->GetProfilingInfo();
-    return absl::OkStatus();
   }
 
   return absl::OkStatus();
diff --git a/tensorflow/lite/delegates/gpu/cl/inference_context.h b/tensorflow/lite/delegates/gpu/cl/inference_context.h
index e5e883b07c3c34..e964b6185cb87f 100644
--- a/tensorflow/lite/delegates/gpu/cl/inference_context.h
+++ b/tensorflow/lite/delegates/gpu/cl/inference_context.h
@@ -148,6 +148,12 @@ class InferenceContext {
   void InitRecordableQueue(Environment* env);
 
   absl::Status ProfileTime(ProfilingCommandQueue* queue, ProfilingInfo* result);
+  absl::Status ClarifyTimeMultipleEnqueue(double ops_total_duration_ms,
+                                          int min_ops, int max_ops,
+                                          ProfilingCommandQueue* queue,
+                                          ProfilingInfo* result);
+  absl::Status ClarifyTimeWithCommandBuffer(ProfilingCommandQueue* queue,
+                                            ProfilingInfo* result);
 
   struct ExecutionHints {
     bool need_flush = false;
diff --git a/tensorflow/lite/delegates/gpu/cl/testing/BUILD b/tensorflow/lite/delegates/gpu/cl/testing/BUILD
index 94ee8dc3606d1e..24ef270196088e 100644
--- a/tensorflow/lite/delegates/gpu/cl/testing/BUILD
+++ b/tensorflow/lite/delegates/gpu/cl/testing/BUILD
@@ -91,6 +91,7 @@ cc_binary(
     ],
     deps = [
         "//tensorflow/lite/core/kernels:builtin_ops",
+        "//tensorflow/lite/delegates/gpu/cl:cl_command_buffer",
         "//tensorflow/lite/delegates/gpu/cl:environment",
         "//tensorflow/lite/delegates/gpu/cl:inference_context",
         "//tensorflow/lite/delegates/gpu/cl:opencl_wrapper",
diff --git a/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc b/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
index 5ec70b4bb49227..05521cbe618311 100644
--- a/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
+++ b/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "third_party/opencl_headers/CL/cl_ext.h"
 #include "third_party/opencl_headers/CL/cl_platform.h"
 #include "tensorflow/lite/core/kernels/register.h"
+#include "tensorflow/lite/delegates/gpu/cl/cl_command_buffer.h"
 #include "tensorflow/lite/delegates/gpu/cl/environment.h"
 #include "tensorflow/lite/delegates/gpu/cl/inference_context.h"
 #include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
@@ -236,50 +237,37 @@ absl::Status RunSerializedTest(const std::string& model_name) {
   return absl::OkStatus();
 }
 
-absl::Status RunCommandBufferSample(int num_tests, int num_runs_per_test,
+absl::Status RunCommandBufferSample(int num_tests, double model_time_ms,
                                     Environment* env,
                                     InferenceContext* context) {
   if (!env->device().GetInfo().SupportsExtension("cl_khr_command_buffer")) {
     return absl::OkStatus();
   }
 
-  cl_command_queue command_queue = env->queue()->queue();
-  cl_int errcode_ret{CL_SUCCESS};
-  std::vector<cl_command_buffer_khr> cbs(num_runs_per_test);
+  int num_cbs = 3;
+  int num_inferences_in_cb = std::max(1.0, 100.0 / model_time_ms);
+  std::vector<CLCommandBuffer> cbs(num_cbs);
   for (auto& cb : cbs) {
-    cb = clCreateCommandBufferKHR(1, &command_queue, nullptr, &errcode_ret);
-    if (errcode_ret != CL_SUCCESS) {
-      return absl::InternalError("Failed clCreateCommandBufferKHR.");
-    }
-    RETURN_IF_ERROR(context->AddToCommanBuffer(cb));
-    errcode_ret = clFinalizeCommandBufferKHR(cb);
-    if (errcode_ret != CL_SUCCESS) {
-      return absl::InternalError("Failed clFinalizeCommandBufferKHR.");
+    RETURN_IF_ERROR(cb.Init(env->queue(), /*simultaneous_use=*/false));
+    for (int i = 0; i < num_inferences_in_cb; ++i) {
+      RETURN_IF_ERROR(context->AddToCommanBuffer(cb.GetCommandBuffer()));
     }
+    RETURN_IF_ERROR(cb.Finalize());
   }
 
   for (int i = 0; i < num_tests; ++i) {
     const auto start = std::chrono::high_resolution_clock::now();
     for (auto& cb : cbs) {
-      cl_int error_code =
-          clEnqueueCommandBufferKHR(1, &command_queue, cb, 0, nullptr, nullptr);
-      if (error_code != CL_SUCCESS) {
-        return absl::UnknownError(
-            absl::StrCat("Failed to clEnqueueCommandBufferKHR - ",
-                         CLErrorCodeToString(error_code)));
-      }
-      clFlush(command_queue);
+      RETURN_IF_ERROR(cb.Enqueue(env->queue()));
     }
-    clFinish(command_queue);
+    clFinish(env->queue()->queue());
     const auto end = std::chrono::high_resolution_clock::now();
     const double total_time_ms = (end - start).count() * 1e-6f;
-    const double average_inference_time = total_time_ms / num_runs_per_test;
+    const double average_inference_time =
+        total_time_ms / (num_cbs * num_inferences_in_cb);
     std::cout << "Total time CB - " << average_inference_time << "ms"
               << std::endl;
   }
-  for (auto& cb : cbs) {
-    clReleaseCommandBufferKHR(cb);
-  }
   return absl::OkStatus();
 }
 
@@ -326,9 +314,10 @@ absl::Status RunModelSample(const std::string& model_name) {
             << std::endl;
 
   const int num_tests = absl::GetFlag(FLAGS_num_tests);
-  const int num_runs_per_sec = std::max(
-      1, static_cast<int>(1000.0f / absl::ToDoubleMilliseconds(
-                                        profiling_info.GetTotalTime())));
+  const double model_time_ms =
+      absl::ToDoubleMilliseconds(profiling_info.GetTotalTime());
+  const int num_runs_per_sec =
+      std::max(1, static_cast<int>(1000.0f / model_time_ms));
   int num_runs_per_test = absl::GetFlag(FLAGS_num_runs_per_test);
   if (num_runs_per_test == 0) {
     num_runs_per_test = num_runs_per_sec;
@@ -347,7 +336,7 @@ absl::Status RunModelSample(const std::string& model_name) {
   }
   if (absl::GetFlag(FLAGS_benchmark_command_buffer)) {
     RETURN_IF_ERROR(
-        RunCommandBufferSample(num_tests, num_runs_per_test, &env, &context));
+        RunCommandBufferSample(num_tests, model_time_ms, &env, &context));
   }
   return absl::OkStatus();
 }
diff --git a/tensorflow/lite/delegates/gpu/common/model_builder_helper.h b/tensorflow/lite/delegates/gpu/common/model_builder_helper.h
index 14384ce5be9a1c..27bb621c40dea9 100644
--- a/tensorflow/lite/delegates/gpu/common/model_builder_helper.h
+++ b/tensorflow/lite/delegates/gpu/common/model_builder_helper.h
@@ -154,6 +154,8 @@ absl::Status CreateVectorCopyData(const TfLiteTensor& src, T* dst) {
         return absl::OkStatus();
       case kTfLiteFloat16:
         return absl::UnimplementedError("src can't be float16.");
+      case kTfLiteBFloat16:
+        return absl::UnimplementedError("src can't be bfloat16.");
       case kTfLiteFloat64:
         for (int i = 0; i < n; ++i) {
           dst[i] = tflite::GetTensorData<double>(&src)[i];
diff --git a/tensorflow/lite/delegates/gpu/delegate.cc b/tensorflow/lite/delegates/gpu/delegate.cc
index 409c7da45d5add..9fac6e598f1b1a 100644
--- a/tensorflow/lite/delegates/gpu/delegate.cc
+++ b/tensorflow/lite/delegates/gpu/delegate.cc
@@ -117,7 +117,7 @@ using tflite::delegates::utils::WriteSyncAttrs;
 namespace tflite {
 namespace gpu {
 namespace {
-
+// TODO(b/328628170): Add productive coverage to GPU delegate.
 using delegates::Serialization;
 using delegates::SerializationParams;
 using tflite::TFLITE_LOG_WARNING;
@@ -809,6 +809,10 @@ class DelegateAsyncKernel : public BackendAsyncKernelInterface {
   TfLiteStatus SetAttributes(TfLiteOpaqueContext* context,
                              TfLiteOpaqueNode* node, int tensor_index,
                              const TfLiteAttributeMap* attrs) override;
+  TfLiteStatus SetBufferAttributes(const TfLiteBackendBuffer* buffer,
+                                   const TfLiteAttributeMap* attrs) override;
+  TfLiteStatus GetBufferAttributes(const TfLiteBackendBuffer* buffer,
+                                   TfLiteAttributeMap* attrs) override;
   TfLiteStatus Prepare(TfLiteOpaqueContext* context,
                        TfLiteOpaqueNode* node) override;
 
@@ -876,6 +880,37 @@ class DelegateAsyncKernel : public BackendAsyncKernelInterface {
     return desc_ahwb;
   }
 
+  // Validate the attributes passed in, return kTfLiteOk if the attributes
+  // meet the requirements. Return the registered buffer attributes in
+  // `buffer_attrs`.
+  static TfLiteStatus CheckAttributes(const TfLiteAttributeMap* attrs,
+                                      BufferAttributes& buffer_attrs) {
+    // Validate buffer attributes.
+    TFLITE_RET_CHECK_STATUS(
+        TfLiteAttributeMapIsBufferAttributeMap(attrs),
+        "calling RegisterBuffer with invalid attribute map type");
+    buffer_attrs = ReadBufferAttrs(attrs);
+    TFLITE_RET_CHECK_STATUS(
+        buffer_attrs.buffer_type.has_value(),
+        "calling RegisterBuffer with buffer resource type name unspecified");
+    TFLITE_RET_CHECK_STATUS(
+        buffer_attrs.buffer_type.value() != BufferType::kUnknown,
+        "calling RegisterBuffer with unknown buffer resource type");
+    size_t alignment = buffer_attrs.alignment.value_or(kRequiredByteAlignment);
+    TFLITE_RET_CHECK_STATUS(
+        alignment % kRequiredByteAlignment == 0,
+        "calling RegisterBuffer with non-zero buffer alignment");
+    size_t padding = buffer_attrs.padding.value_or(kRequiredBytePadding);
+    TFLITE_RET_CHECK_STATUS(
+        padding % kRequiredBytePadding == 0,
+        "calling RegisterBuffer with non-zero buffer padding");
+    size_t offset = buffer_attrs.offset.value_or(0);
+    TFLITE_RET_CHECK_STATUS(offset == 0,
+                            "calling RegisterBuffer with non-zero offset");
+
+    return kTfLiteOk;
+  }
+
   // For SupportedBufferTypes and SupportedSynchronizations
   const std::vector<const char*> supported_buffer_types_ = {
       ::tflite::delegates::utils::kBufferTypeAHardwareBufferBlob};
@@ -897,6 +932,9 @@ class DelegateAsyncKernel : public BackendAsyncKernelInterface {
 
   absl::flat_hash_map<TfLiteBufferHandle, UniquePtrAHardwareBuffer>
       buffer_by_handle_ ABSL_GUARDED_BY(eval_mutex_);
+
+  absl::flat_hash_map<AHardwareBuffer*, BufferAttributes> attributes_by_buffer_
+      ABSL_GUARDED_BY(eval_mutex_);
   std::vector<SyncType> output_sync_types_ ABSL_GUARDED_BY(eval_mutex_);
 };
 
@@ -1068,6 +1106,82 @@ TfLiteStatus DelegateAsyncKernel::SetAttributesImpl(
   return kTfLiteOk;
 }
 
+TfLiteStatus DelegateAsyncKernel::SetBufferAttributes(
+    const TfLiteBackendBuffer* buffer, const TfLiteAttributeMap* attrs) {
+  TFLITE_ABORT_CHECK(buffer != nullptr, "Buffer is null");
+  TFLITE_ABORT_CHECK(attrs != nullptr, "Attribute is null");
+
+  // We depend on the availability of AHardwareBuffer.
+  TFLITE_RET_CHECK_STATUS(
+      TFLITE_AHWB_AVAILABLE(),
+      "calling tflite::gpu::DelegateAsyncKernel::SetBufferAttributes on device "
+      "without AHardwareBuffer support");
+  BufferAttributes buffer_attrs;
+  TFLITE_RET_CHECK_STATUS(CheckAttributes(attrs, buffer_attrs) == kTfLiteOk,
+                          "SetBufferAttributes(): Failed to check attributes");
+
+  // Validate ahardwarebuffer.
+  auto* ahwb =
+      reinterpret_cast<AHardwareBuffer*>(TfLiteBackendBufferGetPtr(buffer));
+  TFLITE_RET_CHECK_STATUS(ahwb != nullptr,
+                          "calling SetBufferAttributes with nullptr buffer");
+  UniquePtrAHardwareBuffer uptr_ahwb = Acquire(ahwb);
+  const AHardwareBuffer_Desc desc_ahwb = Describe(uptr_ahwb);
+  TFLITE_RET_CHECK_STATUS(desc_ahwb.format == AHARDWAREBUFFER_FORMAT_BLOB,
+                          "calling SetBufferAttributes with an AHardwareBuffer "
+                          "of format other than BLOB is not supported");
+  size_t size = buffer_attrs.size.value_or(desc_ahwb.width);
+  TFLITE_RET_CHECK_STATUS(
+      size <= desc_ahwb.width,
+      "calling SetBufferAttributes with buffer size larger than the actual "
+      "AHardwareBuffer size");
+
+  absl::MutexLock eval_lock(&eval_mutex_);
+  if (attributes_by_buffer_.find(uptr_ahwb.get()) !=
+      attributes_by_buffer_.end()) {
+    attributes_by_buffer_[uptr_ahwb.get()] = buffer_attrs;
+  } else {
+    TFLITE_LOG_PROD(
+        TFLITE_LOG_ERROR,
+        "SetBufferAttributes(): Unable to find the buffer in the map.");
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus DelegateAsyncKernel::GetBufferAttributes(
+    const TfLiteBackendBuffer* buffer, TfLiteAttributeMap* attrs) {
+  TFLITE_ABORT_CHECK(buffer != nullptr, "Buffer is null");
+  TFLITE_ABORT_CHECK(attrs != nullptr, "Attribute map is null");
+
+  // We depend on the availability of AHardwareBuffer.
+  TFLITE_RET_CHECK_STATUS(
+      TFLITE_AHWB_AVAILABLE(),
+      "calling tflite::gpu::DelegateAsyncKernel::GetBufferAttributes on device "
+      "without AHardwareBuffer support");
+  TFLITE_RET_CHECK_STATUS(
+      TfLiteAttributeMapIsBufferAttributeMap(attrs),
+      "calling GetBufferAttributes with an invalid attribute map type");
+
+  // Validate ahardwarebuffer.
+  auto* ahwb =
+      reinterpret_cast<AHardwareBuffer*>(TfLiteBackendBufferGetPtr(buffer));
+  TFLITE_RET_CHECK_STATUS(ahwb != nullptr,
+                          "calling GetBufferAttributes with nullptr buffer");
+  UniquePtrAHardwareBuffer uptr_ahwb = Acquire(ahwb);
+  const AHardwareBuffer_Desc desc_ahwb = Describe(uptr_ahwb);
+  TFLITE_RET_CHECK_STATUS(desc_ahwb.format == AHARDWAREBUFFER_FORMAT_BLOB,
+                          "calling GetBufferAttributes with an AHardwareBuffer "
+                          "of format other than "
+                          "BLOB is not supported");
+
+  absl::MutexLock eval_lock(&eval_mutex_);
+  auto it = attributes_by_buffer_.find(uptr_ahwb.get());
+  TFLITE_RET_CHECK_STATUS(it != attributes_by_buffer_.end(),
+                          "Unable to find the buffer.");
+  WriteBufferAttrs(it->second, attrs);
+  return kTfLiteOk;
+}
+
 TfLiteStatus DelegateAsyncKernel::Prepare(TfLiteOpaqueContext* opaque_context,
                                           TfLiteOpaqueNode* opaque_node) {
   // The following cast is safe only because this code is part of the
@@ -1125,34 +1239,14 @@ TfLiteStatus DelegateAsyncKernel::RegisterBufferImpl(
   TFLITE_ABORT_CHECK(buffer != nullptr, "");                  // Crash OK
   TFLITE_ABORT_CHECK(attrs != nullptr, "");                   // Crash OK
   TFLITE_ABORT_CHECK(handle != kTfLiteNullBufferHandle, "");  // Crash OK
-
-  // Validate buffer attributes.
-  TFLITE_RET_CHECK_STATUS(
-      TfLiteAttributeMapIsBufferAttributeMap(attrs),
-      "calling RegisterBuffer with invalid attribute map type");
-  auto buffer_attrs = ReadBufferAttrs(attrs);
-  TFLITE_RET_CHECK_STATUS(
-      buffer_attrs.buffer_type.has_value(),
-      "calling RegisterBuffer with buffer resource type name unspecified");
-  TFLITE_RET_CHECK_STATUS(
-      buffer_attrs.buffer_type.value() != BufferType::kUnknown,
-      "calling RegisterBuffer with unknown buffer resource type");
-  size_t alignment = buffer_attrs.alignment.value_or(kRequiredByteAlignment);
-  TFLITE_RET_CHECK_STATUS(
-      alignment % kRequiredByteAlignment == 0,
-      "calling RegisterBuffer with invalid buffer alignment");
-  size_t padding = buffer_attrs.padding.value_or(kRequiredBytePadding);
-  TFLITE_RET_CHECK_STATUS(padding % kRequiredBytePadding == 0,
-                          "calling RegisterBuffer with invalid buffer padding");
-  size_t offset = buffer_attrs.offset.value_or(0);
-  TFLITE_RET_CHECK_STATUS(offset == 0,
-                          "calling RegisterBuffer with non-zero offset");
-
   // We depend on the availability of AHardwareBuffer.
   TFLITE_RET_CHECK_STATUS(
       TFLITE_AHWB_AVAILABLE(),
       "calling tflite::gpu::DelegateAsyncKernel::RegisterBuffer on device "
       "without AHardwareBuffer support");
+  BufferAttributes buffer_attrs;
+  TFLITE_RET_CHECK_STATUS(CheckAttributes(attrs, buffer_attrs) == kTfLiteOk,
+                          "RegisterBufferImpl(): Failed to check attributes");
 
   // Retrieve and validate the buffer.
   auto* ahwb =
@@ -1177,6 +1271,10 @@ TfLiteStatus DelegateAsyncKernel::RegisterBufferImpl(
       buffer_by_handle_.try_emplace(handle, std::move(uptr_ahwb));
   TFLITE_RET_CHECK_STATUS(did_something,
                           "RegisterBuffer called with duplicate handle");
+
+  auto [iterator, check] =
+      attributes_by_buffer_.try_emplace(it->second.get(), buffer_attrs);
+  TFLITE_RET_CHECK_STATUS(check, "RegisterBuffer called with same buffer");
   return kTfLiteOk;
 }
 
diff --git a/tensorflow/lite/delegates/gpu/delegate_options.cc b/tensorflow/lite/delegates/gpu/delegate_options.cc
index 7b7059df37e4d1..e596045e2a9d47 100644
--- a/tensorflow/lite/delegates/gpu/delegate_options.cc
+++ b/tensorflow/lite/delegates/gpu/delegate_options.cc
@@ -35,7 +35,7 @@ TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default() {
   options.last_delegate_node_index = std::numeric_limits<int>::max();
 #endif
 #ifdef TFLITE_GPU_ENABLE_INVOKE_LOOP
-  options.gpu_invoke_loop_times = -1;
+  options.gpu_invoke_loop_times = 1;
 #endif
   return options;
 }
diff --git a/tensorflow/lite/delegates/gpu/delegate_options.h b/tensorflow/lite/delegates/gpu/delegate_options.h
index 98f1347bb7febe..b52d45c8abd5b1 100644
--- a/tensorflow/lite/delegates/gpu/delegate_options.h
+++ b/tensorflow/lite/delegates/gpu/delegate_options.h
@@ -144,6 +144,8 @@ typedef struct {
   int last_delegate_node_index;
 #endif
 #ifdef TFLITE_GPU_ENABLE_INVOKE_LOOP
+  // Number of times to invoke the inference in GPU delegate, to collect more
+  // accurate latency result. Default as 1, which is the original behavior.
   int gpu_invoke_loop_times;
 #endif
 } TfLiteGpuDelegateOptionsV2;
diff --git a/tensorflow/lite/delegates/gpu/gl/converters/BUILD b/tensorflow/lite/delegates/gpu/gl/converters/BUILD
index 70687b224b021a..8403a80594776c 100644
--- a/tensorflow/lite/delegates/gpu/gl/converters/BUILD
+++ b/tensorflow/lite/delegates/gpu/gl/converters/BUILD
@@ -1,8 +1,8 @@
-load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "tf_gpu_tests_tags",
 )
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/delegates/gpu/metal/BUILD b/tensorflow/lite/delegates/gpu/metal/BUILD
index 0c555a4e6b9a12..8f23fa22a91fbc 100644
--- a/tensorflow/lite/delegates/gpu/metal/BUILD
+++ b/tensorflow/lite/delegates/gpu/metal/BUILD
@@ -1,16 +1,16 @@
 load("@build_bazel_rules_apple//apple:ios.bzl", "ios_application", "ios_unit_test")
+load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")
+load("//tensorflow:tensorflow.bzl", "workspace_root")
+load(
+    "//tensorflow/core/platform:build_config_root.bzl",
+    "tf_gpu_tests_tags",
+)
 load(
     "//tensorflow/lite:special_rules.bzl",
     "tflite_ios_lab_runner",
     "tflite_ios_per_kernel_test",
     "tflite_portable_test_suite",
 )
-load(
-    "//tensorflow/core/platform:build_config_root.bzl",
-    "tf_gpu_tests_tags",
-)
-load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")
-load("//tensorflow:tensorflow.bzl", "workspace_root")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD
index 06c295646eec6e..c668e8345d0395 100644
--- a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD
+++ b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD
@@ -1,14 +1,14 @@
 load("@build_bazel_rules_apple//apple:ios.bzl", "ios_unit_test")
+load(
+    "//tensorflow/core/platform:build_config_root.bzl",
+    "tf_gpu_tests_tags",
+)
 load(
     "//tensorflow/lite:special_rules.bzl",
     "tflite_ios_lab_runner",
     "tflite_ios_per_kernel_test",
     "tflite_portable_test_suite",
 )
-load(
-    "//tensorflow/core/platform:build_config_root.bzl",
-    "tf_gpu_tests_tags",
-)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/delegates/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/BUILD
index 6bb146bb6e79b3..fe3c30897dd71d 100644
--- a/tensorflow/lite/delegates/nnapi/BUILD
+++ b/tensorflow/lite/delegates/nnapi/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/delegates/opaque_delegate_test.cc b/tensorflow/lite/delegates/opaque_delegate_test.cc
index 8025da59a52bfd..be650d253d2721 100644
--- a/tensorflow/lite/delegates/opaque_delegate_test.cc
+++ b/tensorflow/lite/delegates/opaque_delegate_test.cc
@@ -13,13 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <cstdint>
+#include <stddef.h>
+#include <stdint.h>
+
 #include <fstream>
 #include <memory>
 #include <string>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/lite/builtin_ops.h"
 #include "tensorflow/lite/c/c_api_opaque.h"
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/core/c/c_api_types.h"
@@ -27,6 +30,7 @@ limitations under the License.
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/interpreter_builder.h"
 #include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/testing/util.h"
 
 using ::testing::ContainsRegex;
@@ -66,20 +70,19 @@ TEST(TestOpaqueDelegate, AddDelegate) {
                                        void* data) -> TfLiteStatus {
     // Test that an unnamed delegate kernel can be passed to the TF Lite
     // runtime.
-    TfLiteRegistrationExternal* registration_external =
-        TfLiteRegistrationExternalCreate(kTfLiteBuiltinDelegate,
-                                         /*name*/ nullptr,
-                                         /*version=*/1);
-    TfLiteRegistrationExternalSetInit(
-        registration_external,
-        [](TfLiteOpaqueContext* context, const char* buffer,
-           size_t length) -> void* { return nullptr; });
+    TfLiteOperator* registration_external =
+        TfLiteOperatorCreate(kTfLiteBuiltinDelegate,
+                             /*name*/ nullptr,
+                             /*version=*/1);
+    TfLiteOperatorSetInit(registration_external,
+                          [](TfLiteOpaqueContext* context, const char* buffer,
+                             size_t length) -> void* { return nullptr; });
     TfLiteIntArray* execution_plan;
     TF_LITE_ENSURE_STATUS(
         TfLiteOpaqueContextGetExecutionPlan(opaque_context, &execution_plan));
     for (int i = 0; i < execution_plan->size; ++i) {
       TfLiteOpaqueNode* node = nullptr;
-      TfLiteRegistrationExternal* registration = nullptr;
+      TfLiteOperator* registration = nullptr;
       TfLiteOpaqueContextGetNodeAndRegistration(opaque_context, i, &node,
                                                 &registration);
       int fd = -1;
@@ -123,20 +126,19 @@ TEST(TestOpaqueDelegate, ModelWithCustomOpAndInitData) {
   opaque_delegate_builder.Prepare = [](TfLiteOpaqueContext* opaque_context,
                                        TfLiteOpaqueDelegate* opaque_delegate,
                                        void* data) -> TfLiteStatus {
-    TfLiteRegistrationExternal* registration_external =
-        TfLiteRegistrationExternalCreate(kTfLiteBuiltinDelegate,
-                                         /*name*/ nullptr,
-                                         /*version=*/1);
-    TfLiteRegistrationExternalSetInit(
-        registration_external,
-        [](TfLiteOpaqueContext* context, const char* buffer,
-           size_t length) -> void* { return nullptr; });
+    TfLiteOperator* registration_external =
+        TfLiteOperatorCreate(kTfLiteBuiltinDelegate,
+                             /*name*/ nullptr,
+                             /*version=*/1);
+    TfLiteOperatorSetInit(registration_external,
+                          [](TfLiteOpaqueContext* context, const char* buffer,
+                             size_t length) -> void* { return nullptr; });
     TfLiteIntArray* execution_plan;
     TF_LITE_ENSURE_STATUS(
         TfLiteOpaqueContextGetExecutionPlan(opaque_context, &execution_plan));
     for (int i = 0; i < execution_plan->size; ++i) {
       TfLiteOpaqueNode* node = nullptr;
-      TfLiteRegistrationExternal* registration = nullptr;
+      TfLiteOperator* registration = nullptr;
       TfLiteOpaqueContextGetNodeAndRegistration(opaque_context, i, &node,
                                                 &registration);
       int fd = -1;
diff --git a/tensorflow/lite/delegates/utils/experimental/sample_stable_delegate/sample_stable_delegate.cc b/tensorflow/lite/delegates/utils/experimental/sample_stable_delegate/sample_stable_delegate.cc
index bc99b46631667e..e77018bc350a25 100644
--- a/tensorflow/lite/delegates/utils/experimental/sample_stable_delegate/sample_stable_delegate.cc
+++ b/tensorflow/lite/delegates/utils/experimental/sample_stable_delegate/sample_stable_delegate.cc
@@ -64,7 +64,7 @@ class SampleStableDelegateKernel : public SimpleOpaqueDelegateKernelInterface {
       const int node_index = params->nodes_to_replace->data[i];
 
       TfLiteOpaqueNode* delegated_node = nullptr;
-      TfLiteRegistrationExternal* delegated_node_registration = nullptr;
+      TfLiteOperator* delegated_node_registration = nullptr;
       TfLiteOpaqueContextGetNodeAndRegistration(
           context, node_index, &delegated_node, &delegated_node_registration);
 
@@ -82,7 +82,7 @@ class SampleStableDelegateKernel : public SimpleOpaqueDelegateKernelInterface {
       node_output_tensors_set_.insert(output_tensor);
 
       builtin_code_[i] =
-          TfLiteRegistrationExternalGetBuiltInCode(delegated_node_registration);
+          TfLiteOperatorGetBuiltInCode(delegated_node_registration);
     }
 
     // Determine which tensors are external (the TFLite runtime takes care
@@ -176,10 +176,10 @@ int helpers::CalculateNumElements(const TfLiteOpaqueTensor* opaque_tensor) {
 }
 
 bool SampleStableDelegate::IsNodeSupportedByDelegate(
-    const TfLiteRegistrationExternal* registration_external,
-    const TfLiteOpaqueNode* node, TfLiteOpaqueContext* context) const {
+    const TfLiteOperator* registration_external, const TfLiteOpaqueNode* node,
+    TfLiteOpaqueContext* context) const {
   TfLiteBuiltinOperator builtin_operator =
-      TfLiteRegistrationExternalGetBuiltInCode(registration_external);
+      TfLiteOperatorGetBuiltInCode(registration_external);
   void* builtin_data = TfLiteOpaqueNodeGetBuiltinData(node);
   if (builtin_operator == kTfLiteBuiltinAdd) {
     TfLiteAddParams* params = reinterpret_cast<TfLiteAddParams*>(builtin_data);
diff --git a/tensorflow/lite/delegates/utils/experimental/sample_stable_delegate/sample_stable_delegate.h b/tensorflow/lite/delegates/utils/experimental/sample_stable_delegate/sample_stable_delegate.h
index f497f4a4b64954..4301ea77ed4b2c 100644
--- a/tensorflow/lite/delegates/utils/experimental/sample_stable_delegate/sample_stable_delegate.h
+++ b/tensorflow/lite/delegates/utils/experimental/sample_stable_delegate/sample_stable_delegate.h
@@ -42,10 +42,9 @@ class SampleStableDelegate : public SimpleOpaqueDelegateInterface {
   // Returns true if the inputs of 'node' are two tensors of float32 with the
   // same shape and the operation is addition or subtraction (without fused
   // activation).
-  bool IsNodeSupportedByDelegate(
-      const TfLiteRegistrationExternal* registration_external,
-      const TfLiteOpaqueNode* node,
-      TfLiteOpaqueContext* context) const override;
+  bool IsNodeSupportedByDelegate(const TfLiteOperator* registration_external,
+                                 const TfLiteOpaqueNode* node,
+                                 TfLiteOpaqueContext* context) const override;
 
   // No-op. The delegate doesn't have extra steps to perform during
   // initialization.
diff --git a/tensorflow/lite/delegates/utils/experimental/sample_stable_delegate/sample_stable_delegate_with_control_flow.cc b/tensorflow/lite/delegates/utils/experimental/sample_stable_delegate/sample_stable_delegate_with_control_flow.cc
index 8a4b5df8840ed4..f6b42165cef4ad 100644
--- a/tensorflow/lite/delegates/utils/experimental/sample_stable_delegate/sample_stable_delegate_with_control_flow.cc
+++ b/tensorflow/lite/delegates/utils/experimental/sample_stable_delegate/sample_stable_delegate_with_control_flow.cc
@@ -89,12 +89,12 @@ class SampleStableDelegateKernel : public SimpleOpaqueDelegateKernelInterface {
       const int node_index = nodes_to_execute->data[i];
 
       TfLiteOpaqueNode* delegated_node = nullptr;
-      TfLiteRegistrationExternal* delegated_node_registration = nullptr;
+      TfLiteOperator* delegated_node_registration = nullptr;
       TfLiteOpaqueContextGetNodeAndRegistration(
           context, node_index, &delegated_node, &delegated_node_registration);
 
       builtin_codes_[subgraph_index][i] =
-          TfLiteRegistrationExternalGetBuiltInCode(delegated_node_registration);
+          TfLiteOperatorGetBuiltInCode(delegated_node_registration);
 
       for (int n = 0; n < TfLiteOpaqueNodeNumberOfInputs(delegated_node); ++n) {
         auto input_tensor =
@@ -419,7 +419,7 @@ TfLiteStatus SampleStableDelegate::ComputeCompatibleCalleeSubgraphs(
   for (int i = 0; i < execution_plan->size; ++i) {
     int node_index = execution_plan->data[i];
     TfLiteOpaqueNode* node = nullptr;
-    TfLiteRegistrationExternal* registration = nullptr;
+    TfLiteOperator* registration = nullptr;
     status = TfLiteOpaqueContextGetNodeAndRegistration(
         current_context, node_index, &node, &registration);
     if (status != kTfLiteOk) {
@@ -427,7 +427,7 @@ TfLiteStatus SampleStableDelegate::ComputeCompatibleCalleeSubgraphs(
     }
 
     TfLiteBuiltinOperator builtin_operator =
-        TfLiteRegistrationExternalGetBuiltInCode(registration);
+        TfLiteOperatorGetBuiltInCode(registration);
     if (builtin_operator == kTfLiteBuiltinWhile) {
       void* builtin_data = TfLiteOpaqueNodeGetBuiltinData(node);
       const auto* op_data =
@@ -503,10 +503,10 @@ int helpers::CalculateNumElements(const TfLiteOpaqueTensor* opaque_tensor) {
 }
 
 bool SampleStableDelegate::IsNodeSupportedByDelegate(
-    const TfLiteRegistrationExternal* registration_external,
-    const TfLiteOpaqueNode* node, TfLiteOpaqueContext* context) const {
+    const TfLiteOperator* registration_external, const TfLiteOpaqueNode* node,
+    TfLiteOpaqueContext* context) const {
   TfLiteBuiltinOperator builtin_operator =
-      TfLiteRegistrationExternalGetBuiltInCode(registration_external);
+      TfLiteOperatorGetBuiltInCode(registration_external);
   void* builtin_data = TfLiteOpaqueNodeGetBuiltinData(node);
   // List of supported / unsupported ops.
   switch (builtin_operator) {
diff --git a/tensorflow/lite/delegates/utils/experimental/sample_stable_delegate/sample_stable_delegate_with_control_flow.h b/tensorflow/lite/delegates/utils/experimental/sample_stable_delegate/sample_stable_delegate_with_control_flow.h
index b4db00855513a9..bf784f4de82624 100644
--- a/tensorflow/lite/delegates/utils/experimental/sample_stable_delegate/sample_stable_delegate_with_control_flow.h
+++ b/tensorflow/lite/delegates/utils/experimental/sample_stable_delegate/sample_stable_delegate_with_control_flow.h
@@ -44,10 +44,9 @@ class SampleStableDelegate : public SimpleOpaqueDelegateInterface {
   // SampleStableDelegate supports float32 input type only.
   // Returns true if the inputs of 'node' are two tensors of float32 with the
   // same shape and the operation is supported (without fused activation).
-  bool IsNodeSupportedByDelegate(
-      const TfLiteRegistrationExternal* registration_external,
-      const TfLiteOpaqueNode* node,
-      TfLiteOpaqueContext* context) const override;
+  bool IsNodeSupportedByDelegate(const TfLiteOperator* registration_external,
+                                 const TfLiteOpaqueNode* node,
+                                 TfLiteOpaqueContext* context) const override;
 
   // No-op. The delegate doesn't have extra steps to perform during
   // initialization.
diff --git a/tensorflow/lite/delegates/utils/simple_opaque_delegate.cc b/tensorflow/lite/delegates/utils/simple_opaque_delegate.cc
index 193adcc9718e15..96fbd3b3b91082 100644
--- a/tensorflow/lite/delegates/utils/simple_opaque_delegate.cc
+++ b/tensorflow/lite/delegates/utils/simple_opaque_delegate.cc
@@ -30,19 +30,19 @@ limitations under the License.
 
 namespace tflite {
 namespace {
-TfLiteRegistrationExternal* CreateDelegateKernelRegistration(
+TfLiteOperator* CreateDelegateKernelRegistration(
     SimpleOpaqueDelegateInterface* delegate) {
-  TfLiteRegistrationExternal* kernel_registration =
-      TfLiteRegistrationExternalCreate(kTfLiteBuiltinDelegate, delegate->Name(),
-                                       /*version=*/1);
+  TfLiteOperator* kernel_registration =
+      TfLiteOperatorCreate(kTfLiteBuiltinDelegate, delegate->Name(),
+                           /*version=*/1);
 
-  TfLiteRegistrationExternalSetFree(
+  TfLiteOperatorSetFree(
       kernel_registration,
       [](TfLiteOpaqueContext* context, void* buffer) -> void {
         delete reinterpret_cast<SimpleOpaqueDelegateInterface*>(buffer);
       });
 
-  TfLiteRegistrationExternalSetInit(
+  TfLiteOperatorSetInit(
       kernel_registration,
       [](TfLiteOpaqueContext* context, const char* buffer,
          size_t length) -> void* {
@@ -60,7 +60,7 @@ TfLiteRegistrationExternal* CreateDelegateKernelRegistration(
         }
         return delegate_kernel.release();
       });
-  TfLiteRegistrationExternalSetPrepare(
+  TfLiteOperatorSetPrepare(
       kernel_registration,
       [](TfLiteOpaqueContext* context,
          TfLiteOpaqueNode* opaque_node) -> TfLiteStatus {
@@ -69,7 +69,7 @@ TfLiteRegistrationExternal* CreateDelegateKernelRegistration(
                 TfLiteOpaqueNodeGetUserData(opaque_node));
         return delegate_kernel->Prepare(context, opaque_node);
       });
-  TfLiteRegistrationExternalSetInvoke(
+  TfLiteOperatorSetInvoke(
       kernel_registration,
       [](TfLiteOpaqueContext* context,
          TfLiteOpaqueNode* opaque_node) -> TfLiteStatus {
@@ -100,7 +100,7 @@ TfLiteStatus DelegatePrepare(TfLiteOpaqueContext* opaque_context,
     const int node_id = plan->data[i];
 
     TfLiteOpaqueNode* opaque_node;
-    TfLiteRegistrationExternal* registration_external;
+    TfLiteOperator* registration_external;
     TfLiteOpaqueContextGetNodeAndRegistration(
         opaque_context, node_id, &opaque_node, &registration_external);
 
@@ -110,7 +110,7 @@ TfLiteStatus DelegatePrepare(TfLiteOpaqueContext* opaque_context,
     }
   }
 
-  TfLiteRegistrationExternal* delegate_kernel_registration =
+  TfLiteOperator* delegate_kernel_registration =
       CreateDelegateKernelRegistration(simple_opaque_delegate);
 
   // Transfers ownership of delegate_kernel_registration to the opaque_context.
diff --git a/tensorflow/lite/delegates/utils/simple_opaque_delegate.h b/tensorflow/lite/delegates/utils/simple_opaque_delegate.h
index abaf3ee3dbeb9d..5096eaa135d481 100644
--- a/tensorflow/lite/delegates/utils/simple_opaque_delegate.h
+++ b/tensorflow/lite/delegates/utils/simple_opaque_delegate.h
@@ -85,8 +85,8 @@ class SimpleOpaqueDelegateInterface {
 
   // Returns true if 'node' is supported by the delegate. False otherwise.
   virtual bool IsNodeSupportedByDelegate(
-      const TfLiteRegistrationExternal* registration_external,
-      const TfLiteOpaqueNode* node, TfLiteOpaqueContext* context) const = 0;
+      const TfLiteOperator* registration_external, const TfLiteOpaqueNode* node,
+      TfLiteOpaqueContext* context) const = 0;
 
   // Initialize the delegate before finding and replacing TfLite nodes with
   // delegate kernels, for example, retrieving some TFLite settings from
diff --git a/tensorflow/lite/delegates/utils/simple_opaque_delegate_test.cc b/tensorflow/lite/delegates/utils/simple_opaque_delegate_test.cc
index eb32b9281d5419..3ec43793ad7a23 100644
--- a/tensorflow/lite/delegates/utils/simple_opaque_delegate_test.cc
+++ b/tensorflow/lite/delegates/utils/simple_opaque_delegate_test.cc
@@ -272,18 +272,17 @@ TEST(DelegateTest, TestDataMultiAddBin_MultiInputMultiOutput_FullyDelegated) {
   TfLiteModelDelete(model);
 }
 
-TfLiteRegistrationExternal* CreateDelegateKernelRegistrationImpl(
+TfLiteOperator* CreateDelegateKernelRegistrationImpl(
     SimpleOpaqueDelegateInterface* delegate) {
-  TfLiteRegistrationExternal* kernel_registration =
-      TfLiteRegistrationExternalCreate(kTfLiteBuiltinDelegate, delegate->Name(),
-                                       1);
-  TfLiteRegistrationExternalSetFree(
+  TfLiteOperator* kernel_registration =
+      TfLiteOperatorCreate(kTfLiteBuiltinDelegate, delegate->Name(), 1);
+  TfLiteOperatorSetFree(
       kernel_registration,
       [](TfLiteOpaqueContext* context, void* buffer) -> void {
         delete reinterpret_cast<SimpleOpaqueDelegateInterface*>(buffer);
       });
 
-  TfLiteRegistrationExternalSetInit(
+  TfLiteOperatorSetInit(
       kernel_registration,
       [](TfLiteOpaqueContext* context, const char* buffer,
          size_t length) -> void* {
@@ -302,7 +301,7 @@ TfLiteRegistrationExternal* CreateDelegateKernelRegistrationImpl(
         }
         return delegate_kernel.release();
       });
-  TfLiteRegistrationExternalSetPrepare(
+  TfLiteOperatorSetPrepare(
       kernel_registration,
       [](TfLiteOpaqueContext* context,
          TfLiteOpaqueNode* opaque_node) -> TfLiteStatus {
@@ -311,7 +310,7 @@ TfLiteRegistrationExternal* CreateDelegateKernelRegistrationImpl(
                 TfLiteOpaqueNodeGetUserData(opaque_node));
         return delegate_kernel->Prepare(context, opaque_node);
       });
-  TfLiteRegistrationExternalSetInvoke(
+  TfLiteOperatorSetInvoke(
       kernel_registration,
       [](TfLiteOpaqueContext* context,
          TfLiteOpaqueNode* opaque_node) -> TfLiteStatus {
@@ -394,7 +393,7 @@ TEST_F(TestDelegate, SetBufferHandle) {
     TfLiteIntArray* execution_plan;
     TF_LITE_ENSURE_STATUS(
         TfLiteOpaqueContextGetExecutionPlan(opaque_context, &execution_plan));
-    TfLiteRegistrationExternal* delegate_kernel_registration =
+    TfLiteOperator* delegate_kernel_registration =
         CreateDelegateKernelRegistrationImpl(simple_opaque_delegate);
 
     return TfLiteOpaqueContextReplaceNodeSubsetsWithDelegateKernels(
diff --git a/tensorflow/lite/delegates/xnnpack/batch_matrix_multiply_test.cc b/tensorflow/lite/delegates/xnnpack/batch_matrix_multiply_test.cc
index 4d7af0f07e73ce..30642a3df2a517 100644
--- a/tensorflow/lite/delegates/xnnpack/batch_matrix_multiply_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/batch_matrix_multiply_test.cc
@@ -25,7 +25,7 @@ limitations under the License.
 namespace tflite {
 namespace xnnpack {
 
-class BatchMatrixMultiplyTest : public testing::Test {
+class DISABLED_BatchMatrixMultiplyTest : public testing::Test {
  public:
   // std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
   auto get_delegate(int num_threads = 1) {
@@ -52,7 +52,7 @@ class BatchMatrixMultiplyTest : public testing::Test {
   std::mt19937 rng_ = std::mt19937(random_device_());
 };
 
-TEST_F(BatchMatrixMultiplyTest, 3D) {
+TEST_F(DISABLED_BatchMatrixMultiplyTest, 3D) {
   const auto batch = shape_rng();
   const auto height = shape_rng();
   const auto input1_channels = channels_rng();
@@ -65,7 +65,7 @@ TEST_F(BatchMatrixMultiplyTest, 3D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(BatchMatrixMultiplyTest, BroadcastOne3D) {
+TEST_F(DISABLED_BatchMatrixMultiplyTest, BroadcastOne3D) {
   const auto batch = shape_rng();
   const auto height = shape_rng();
   const auto input1_channels = channels_rng();
@@ -83,7 +83,7 @@ TEST_F(BatchMatrixMultiplyTest, BroadcastOne3D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(BatchMatrixMultiplyTest, BroadcastImplicit3D) {
+TEST_F(DISABLED_BatchMatrixMultiplyTest, BroadcastImplicit3D) {
   const auto batch = shape_rng();
   const auto height = shape_rng();
   const auto input1_channels = channels_rng();
@@ -101,7 +101,7 @@ TEST_F(BatchMatrixMultiplyTest, BroadcastImplicit3D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(BatchMatrixMultiplyTest, 4D) {
+TEST_F(DISABLED_BatchMatrixMultiplyTest, 4D) {
   const auto outer_batch = shape_rng();
   const auto inner_batch = shape_rng();
   const auto height = shape_rng();
@@ -115,7 +115,7 @@ TEST_F(BatchMatrixMultiplyTest, 4D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(BatchMatrixMultiplyTest, BroadcastOne4D) {
+TEST_F(DISABLED_BatchMatrixMultiplyTest, BroadcastOne4D) {
   const auto outer_batch = shape_rng();
   const auto inner_batch = shape_rng();
   const auto height = shape_rng();
@@ -149,7 +149,7 @@ TEST_F(BatchMatrixMultiplyTest, BroadcastOne4D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(BatchMatrixMultiplyTest, BroadcastImplicit4D) {
+TEST_F(DISABLED_BatchMatrixMultiplyTest, BroadcastImplicit4D) {
   const auto outer_batch = shape_rng();
   const auto inner_batch = shape_rng();
   const auto height = shape_rng();
@@ -175,7 +175,7 @@ TEST_F(BatchMatrixMultiplyTest, BroadcastImplicit4D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(BatchMatrixMultiplyTest, 4D_AdjY) {
+TEST_F(DISABLED_BatchMatrixMultiplyTest, 4D_AdjY) {
   const auto outer_batch = shape_rng();
   const auto inner_batch = shape_rng();
   const auto height = shape_rng();
@@ -190,7 +190,7 @@ TEST_F(BatchMatrixMultiplyTest, 4D_AdjY) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(BatchMatrixMultiplyTest, MultiThreading) {
+TEST_F(DISABLED_BatchMatrixMultiplyTest, MultiThreading) {
   const auto batch = shape_rng();
   const auto height = shape_rng();
   const auto input1_channels = channels_rng();
@@ -203,7 +203,7 @@ TEST_F(BatchMatrixMultiplyTest, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(BatchMatrixMultiplyTest, WeightsCache) {
+TEST_F(DISABLED_BatchMatrixMultiplyTest, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc
index 05d9c13bba6f14..5defe0e657f3e0 100644
--- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc
@@ -32,6 +32,18 @@ namespace xnnpack {
 class DynamicallyQuantizedFullyConnectedTest
     : public testing::TestWithParam<WeightsType> {};
 
+int GenInputChannels(const std::function<int()> &rng,
+                     WeightsType weights_type) {
+  switch (weights_type) {
+    case WeightsType::kChannelWiseQuantizedInt8:
+    case WeightsType::kTensorWiseQuantizedInt8:
+      return rng();
+    case WeightsType::kChannelWiseQuantizedInt4:
+      // Int4 quantized kernels only support even number of channels.
+      return (rng() / 2) * 2;
+  }
+}
+
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 1D) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
@@ -45,7 +57,8 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 1D) {
   auto rng = std::mt19937(random_device());
   auto channels_rng =
       std::bind(std::uniform_int_distribution<int32_t>(2, 9), std::ref(rng));
-  const auto input_channels = channels_rng();
+  WeightsType weights_type = GetParam();
+  const auto input_channels = GenInputChannels(channels_rng, weights_type);
   const auto output_channels = channels_rng();
 
   DynamicallyQuantizedFullyConnectedTester()
@@ -53,7 +66,7 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 1D) {
       .InputChannels(input_channels)
       .OutputChannels(output_channels)
       .KeepDims(true)
-      .WeightsType(GetParam())
+      .WeightsType(weights_type)
       .Test(xnnpack_delegate.get());
 }
 
@@ -73,14 +86,15 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 2D) {
   auto channels_rng =
       std::bind(std::uniform_int_distribution<int32_t>(2, 9), std::ref(rng));
   const auto batch = batch_rng();
-  const auto input_channels = channels_rng();
+  WeightsType weights_type = GetParam();
+  const auto input_channels = GenInputChannels(channels_rng, weights_type);
   const auto output_channels = channels_rng();
 
   DynamicallyQuantizedFullyConnectedTester()
       .InputShape({batch, input_channels})
       .InputChannels(input_channels)
       .OutputChannels(output_channels)
-      .WeightsType(GetParam())
+      .WeightsType(weights_type)
       .Test(xnnpack_delegate.get());
 }
 
@@ -100,7 +114,8 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 2DKeepDims) {
   auto channels_rng =
       std::bind(std::uniform_int_distribution<int32_t>(2, 9), std::ref(rng));
   const auto batch = batch_rng();
-  const auto input_channels = channels_rng();
+  WeightsType weights_type = GetParam();
+  const auto input_channels = GenInputChannels(channels_rng, weights_type);
   const auto output_channels = channels_rng();
 
   DynamicallyQuantizedFullyConnectedTester()
@@ -108,7 +123,7 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 2DKeepDims) {
       .InputChannels(input_channels)
       .OutputChannels(output_channels)
       .KeepDims(true)
-      .WeightsType(GetParam())
+      .WeightsType(weights_type)
       .Test(xnnpack_delegate.get());
 }
 
@@ -128,14 +143,15 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 3D) {
       std::bind(std::uniform_int_distribution<int32_t>(2, 9), std::ref(rng));
   const auto batch = shape_rng();
   const auto width = shape_rng();
-  const auto input_channels = channels_rng();
+  WeightsType weights_type = GetParam();
+  const auto input_channels = GenInputChannels(channels_rng, weights_type);
   const auto output_channels = channels_rng();
 
   DynamicallyQuantizedFullyConnectedTester()
       .InputShape({batch, width, input_channels})
       .InputChannels(input_channels)
       .OutputChannels(output_channels)
-      .WeightsType(GetParam())
+      .WeightsType(weights_type)
       .Test(xnnpack_delegate.get());
 }
 
@@ -184,7 +200,8 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DKeepDims) {
       std::bind(std::uniform_int_distribution<int32_t>(2, 9), std::ref(rng));
   const auto batch = shape_rng();
   const auto width = shape_rng();
-  const auto input_channels = channels_rng();
+  WeightsType weights_type = GetParam();
+  const auto input_channels = GenInputChannels(channels_rng, weights_type);
   const auto output_channels = channels_rng();
 
   DynamicallyQuantizedFullyConnectedTester()
@@ -192,7 +209,7 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DKeepDims) {
       .InputChannels(input_channels)
       .OutputChannels(output_channels)
       .KeepDims(true)
-      .WeightsType(GetParam())
+      .WeightsType(weights_type)
       .Test(xnnpack_delegate.get());
 }
 
@@ -214,14 +231,15 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 4D) {
   const auto batch = shape_rng();
   const auto height = shape_rng();
   const auto width = shape_rng();
-  const auto input_channels = channels_rng();
+  WeightsType weights_type = GetParam();
+  const auto input_channels = GenInputChannels(channels_rng, weights_type);
   const auto output_channels = channels_rng();
 
   DynamicallyQuantizedFullyConnectedTester()
       .InputShape({batch, height, width, input_channels})
       .InputChannels(input_channels)
       .OutputChannels(output_channels)
-      .WeightsType(GetParam())
+      .WeightsType(weights_type)
       .Test(xnnpack_delegate.get());
 }
 
@@ -243,7 +261,8 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 4DKeepDims) {
   const auto batch = shape_rng();
   const auto height = shape_rng();
   const auto width = shape_rng();
-  const auto input_channels = channels_rng();
+  WeightsType weights_type = GetParam();
+  const auto input_channels = GenInputChannels(channels_rng, weights_type);
   const auto output_channels = channels_rng();
 
   DynamicallyQuantizedFullyConnectedTester()
@@ -251,7 +270,7 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 4DKeepDims) {
       .InputChannels(input_channels)
       .OutputChannels(output_channels)
       .KeepDims(true)
-      .WeightsType(GetParam())
+      .WeightsType(weights_type)
       .Test(xnnpack_delegate.get());
 }
 
@@ -271,7 +290,8 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, NoBias) {
   auto channels_rng =
       std::bind(std::uniform_int_distribution<int32_t>(2, 9), std::ref(rng));
   const auto batch = batch_rng();
-  const auto input_channels = channels_rng();
+  WeightsType weights_type = GetParam();
+  const auto input_channels = GenInputChannels(channels_rng, weights_type);
   const auto output_channels = channels_rng();
 
   DynamicallyQuantizedFullyConnectedTester()
@@ -279,7 +299,7 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, NoBias) {
       .InputChannels(input_channels)
       .OutputChannels(output_channels)
       .NoBias()
-      .WeightsType(GetParam())
+      .WeightsType(weights_type)
       .Test(xnnpack_delegate.get());
 }
 
@@ -299,7 +319,8 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluActivation) {
   auto channels_rng =
       std::bind(std::uniform_int_distribution<int32_t>(2, 9), std::ref(rng));
   const auto batch = batch_rng();
-  const auto input_channels = channels_rng();
+  WeightsType weights_type = GetParam();
+  const auto input_channels = GenInputChannels(channels_rng, weights_type);
   const auto output_channels = channels_rng();
 
   DynamicallyQuantizedFullyConnectedTester()
@@ -307,7 +328,7 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluActivation) {
       .InputChannels(input_channels)
       .OutputChannels(output_channels)
       .ReluActivation()
-      .WeightsType(GetParam())
+      .WeightsType(weights_type)
       .Test(xnnpack_delegate.get());
 }
 
@@ -327,7 +348,8 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, Relu6Activation) {
   auto channels_rng =
       std::bind(std::uniform_int_distribution<int32_t>(2, 9), std::ref(rng));
   const auto batch = batch_rng();
-  const auto input_channels = channels_rng();
+  WeightsType weights_type = GetParam();
+  const auto input_channels = GenInputChannels(channels_rng, weights_type);
   const auto output_channels = channels_rng();
 
   DynamicallyQuantizedFullyConnectedTester()
@@ -335,7 +357,7 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, Relu6Activation) {
       .InputChannels(input_channels)
       .OutputChannels(output_channels)
       .Relu6Activation()
-      .WeightsType(GetParam())
+      .WeightsType(weights_type)
       .Test(xnnpack_delegate.get());
 }
 
@@ -355,7 +377,8 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluMinus1To1Activation) {
   auto channels_rng =
       std::bind(std::uniform_int_distribution<int32_t>(2, 9), std::ref(rng));
   const auto batch = batch_rng();
-  const auto input_channels = channels_rng();
+  WeightsType weights_type = GetParam();
+  const auto input_channels = GenInputChannels(channels_rng, weights_type);
   const auto output_channels = channels_rng();
 
   DynamicallyQuantizedFullyConnectedTester()
@@ -363,7 +386,7 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluMinus1To1Activation) {
       .InputChannels(input_channels)
       .OutputChannels(output_channels)
       .ReluMinus1To1Activation()
-      .WeightsType(GetParam())
+      .WeightsType(weights_type)
       .Test(xnnpack_delegate.get());
 }
 
@@ -384,14 +407,15 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, MultiThreading) {
   auto channels_rng =
       std::bind(std::uniform_int_distribution<int32_t>(2, 9), std::ref(rng));
   const auto batch = batch_rng();
-  const auto input_channels = channels_rng();
+  WeightsType weights_type = GetParam();
+  const auto input_channels = GenInputChannels(channels_rng, weights_type);
   const auto output_channels = channels_rng();
 
   DynamicallyQuantizedFullyConnectedTester()
       .InputShape({batch, input_channels})
       .InputChannels(input_channels)
       .OutputChannels(output_channels)
-      .WeightsType(GetParam())
+      .WeightsType(weights_type)
       .Test(xnnpack_delegate.get());
 }
 
@@ -415,7 +439,8 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, WeightsCache) {
   auto channels_rng =
       std::bind(std::uniform_int_distribution<int32_t>(2, 9), std::ref(rng));
   const auto batch = batch_rng();
-  const auto input_channels = channels_rng();
+  WeightsType weights_type = GetParam();
+  const auto input_channels = GenInputChannels(channels_rng, weights_type);
   const auto output_channels = channels_rng();
 
   DynamicallyQuantizedFullyConnectedTester()
@@ -423,13 +448,15 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, WeightsCache) {
       .InputChannels(input_channels)
       .OutputChannels(output_channels)
       .WeightsCache(weights_cache.get())
-      .WeightsType(GetParam())
+      .WeightsType(weights_type)
       .Test(xnnpack_delegate.get());
 }
 
 // Returns a human readable string representation of the test parameter.
 std::string TestParamToString(testing::TestParamInfo<WeightsType> param) {
   switch (param.param) {
+    case WeightsType::kChannelWiseQuantizedInt4:
+      return "ChannelWiseQuantizedInt4";
     case WeightsType::kChannelWiseQuantizedInt8:
       return "ChannelWiseQuantizedInt8";
     case WeightsType::kTensorWiseQuantizedInt8:
@@ -444,6 +471,7 @@ INSTANTIATE_TEST_SUITE_P(
     DynamicallyQuantizedFullyConnectedTest,
     DynamicallyQuantizedFullyConnectedTest,
     testing::Values(WeightsType::kTensorWiseQuantizedInt8,
+                    WeightsType::kChannelWiseQuantizedInt4,
                     WeightsType::kChannelWiseQuantizedInt8),
     TestParamToString);
 
diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_tester.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_tester.cc
index 13a0da90a67cf7..f07de21cf352ff 100644
--- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_tester.cc
+++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_tester.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "flatbuffers/flatbuffer_builder.h"  // from @flatbuffers
 #include "flatbuffers/string.h"  // from @flatbuffers
 #include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/core/interpreter_builder.h"
 #include "tensorflow/lite/core/kernels/register.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 #include "tensorflow/lite/interpreter.h"
@@ -157,7 +158,19 @@ std::vector<char> DynamicallyQuantizedFullyConnectedTester::CreateTfLiteModel()
   std::vector<flatbuffers::Offset<Operator>> operators;
 
   /*********************** Generate filter and bias data **********************/
-  std::vector<int8_t> filter_data(InputChannels() * OutputChannels());
+  int filter_size_bytes = -1;
+  switch (WeightsType()) {
+    case WeightsType::kChannelWiseQuantizedInt4: {
+      filter_size_bytes = (InputChannels() * OutputChannels() + 1) / 2;
+      break;
+    }
+    case WeightsType::kChannelWiseQuantizedInt8:
+    case WeightsType::kTensorWiseQuantizedInt8: {
+      filter_size_bytes = InputChannels() * OutputChannels();
+      break;
+    }
+  }
+  std::vector<int8_t> filter_data(filter_size_bytes);
   std::generate(filter_data.begin(), filter_data.end(), std::ref(filter_rng));
   std::vector<float> bias_data(OutputChannels());
   std::generate(bias_data.begin(), bias_data.end(), std::ref(bias_rng));
@@ -185,15 +198,22 @@ std::vector<char> DynamicallyQuantizedFullyConnectedTester::CreateTfLiteModel()
       builder,
       builder.CreateVector<int32_t>(InputShape().data(), InputShape().size()),
       TensorType_FLOAT32, /*buffer=*/0));
-
+  tflite::TensorType filter_tensor_type;
   std::vector<float> filter_scale;
   std::vector<int64_t> filter_zero_point;
   switch (WeightsType()) {
+    case WeightsType::kChannelWiseQuantizedInt4:
+      filter_tensor_type = tflite::TensorType_INT4;
+      filter_scale.assign(OutputChannels(), FilterScale());
+      filter_zero_point.assign(OutputChannels(), 0);
+      break;
     case WeightsType::kChannelWiseQuantizedInt8:
+      filter_tensor_type = tflite::TensorType_INT8;
       filter_scale.assign(OutputChannels(), FilterScale());
       filter_zero_point.assign(OutputChannels(), 0);
       break;
     case WeightsType::kTensorWiseQuantizedInt8: {
+      filter_tensor_type = tflite::TensorType_INT8;
       filter_scale = {FilterScale()};
       filter_zero_point = {0};
       break;
@@ -202,7 +222,7 @@ std::vector<char> DynamicallyQuantizedFullyConnectedTester::CreateTfLiteModel()
   tensors.emplace_back(CreateTensor(
       builder,
       builder.CreateVector<int32_t>(filter_shape.data(), filter_shape.size()),
-      TensorType_INT8, /*buffer=*/1, /*name=*/0,
+      filter_tensor_type, /*buffer=*/1, /*name=*/0,
       CreateQuantizationParameters(
           builder, /*min=*/0, /*max=*/0,
           builder.CreateVector<float>(filter_scale),
diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_tester.h b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_tester.h
index e3929e8ea86a51..e073fb79780f5d 100644
--- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_tester.h
+++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_tester.h
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/core/c/common.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 #include "tensorflow/lite/interpreter.h"
@@ -30,6 +31,7 @@ namespace tflite {
 namespace xnnpack {
 
 enum class WeightsType {
+  kChannelWiseQuantizedInt4,
   kChannelWiseQuantizedInt8,
   kTensorWiseQuantizedInt8,
 };
diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
index 0e1a2d3c30c6f8..518575bcbe596f 100644
--- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
+++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <limits>
 #include <map>
 #include <memory>
+#include <mutex>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -141,12 +142,14 @@ xnn_datatype GetXNNPackDatatype(TfLiteContext* context,
 
       return xnn_datatype_quint8;
     }
-    case kTfLiteInt8: {
+    case kTfLiteInt8:
+    case kTfLiteInt4: {
       if (tensor.quantization.type != kTfLiteAffineQuantization) {
         TF_LITE_KERNEL_LOG(context,
-                           "unsupported quantization type %d for INT8 "
+                           "unsupported quantization type %d for %s "
                            "tensor %d in XNNPACK delegate",
-                           tensor.quantization.type, t);
+                           tensor.quantization.type,
+                           TfLiteTypeGetName(tensor.type), t);
         return xnn_datatype_invalid;
       }
       const auto quantization_params =
@@ -154,26 +157,27 @@ xnn_datatype GetXNNPackDatatype(TfLiteContext* context,
               tensor.quantization.params);
       if (quantization_params->scale == nullptr) {
         TF_LITE_KERNEL_LOG(context,
-                           "missing scale quantization parameters for INT8 "
+                           "missing scale quantization parameters for %s "
                            "tensor %d in XNNPACK delegate",
-                           t);
+                           TfLiteTypeGetName(tensor.type), t);
         return xnn_datatype_invalid;
       }
       if (quantization_params->zero_point == nullptr) {
         TF_LITE_KERNEL_LOG(context,
                            "missing zero point quantization parameters for "
-                           "INT8 tensor %d in XNNPACK delegate",
-                           t);
+                           "%s tensor %d in XNNPACK delegate",
+                           TfLiteTypeGetName(tensor.type), t);
         return xnn_datatype_invalid;
       }
       if (quantization_params->scale->size !=
           quantization_params->zero_point->size) {
         TF_LITE_KERNEL_LOG(context,
                            "mismatching number of scale (%d) and zero "
-                           "point (%d) quantization parameters for INT8 "
+                           "point (%d) quantization parameters for %s "
                            "tensor %d in XNNPACK delegate",
                            quantization_params->scale->size,
-                           quantization_params->zero_point->size, t);
+                           quantization_params->zero_point->size,
+                           TfLiteTypeGetName(tensor.type), t);
         return xnn_datatype_invalid;
       }
 
@@ -182,14 +186,22 @@ xnn_datatype GetXNNPackDatatype(TfLiteContext* context,
         if (!std::isnormal(scale) || scale <= 0.0f) {
           TF_LITE_KERNEL_LOG(context,
                              "unsupported scale value (%f) in channel %d for "
-                             "INT8 tensor %d in XNNPACK delegate",
-                             scale, i, t);
+                             "%s tensor %d in XNNPACK delegate",
+                             scale, i, TfLiteTypeGetName(tensor.type), t);
           return xnn_datatype_invalid;
         }
       }
 
       if (quantization_params->scale->size == 1) {
         // Per-tensor quantization parameters
+        if (kTfLiteInt8 != tensor.type) {
+          TF_LITE_KERNEL_LOG(context,
+                             "unsupported per-tensor quantization scale "
+                             "parameter for %s tensor %d in XNNPACK delegate",
+                             TfLiteTypeGetName(tensor.type), t);
+          return xnn_datatype_invalid;
+        }
+
         const int zero_point = quantization_params->zero_point->data[0];
         if (zero_point < std::numeric_limits<int8_t>::min() ||
             zero_point > std::numeric_limits<int8_t>::max()) {
@@ -209,15 +221,25 @@ xnn_datatype GetXNNPackDatatype(TfLiteContext* context,
              c <
              SizeOfDimension(&tensor, quantization_params->quantized_dimension);
              c++) {
-          if (quantization_params->zero_point->data[c] != 0) {
+          if (quantization_params->zero_point->data[c] != 0 &&
+              (tensor.type != kTfLiteInt4 &&
+               quantization_params->zero_point->data[c] != 8)) {
             TF_LITE_KERNEL_LOG(context,
                                "unsupported zero-point value %d in channel "
-                               "%d of INT8 tensor %d in XNNPACK delegate",
-                               quantization_params->zero_point->data[c], c, t);
+                               "%d of %s tensor %d in XNNPACK delegate",
+                               quantization_params->zero_point->data[c], c,
+                               TfLiteTypeGetName(tensor.type), t);
             return xnn_datatype_invalid;
           }
         }
-        return xnn_datatype_qcint8;
+        switch (tensor.type) {
+          case kTfLiteInt4:
+            return xnn_datatype_qcint4;
+          case kTfLiteInt8:
+            return xnn_datatype_qcint8;
+          default:
+            return xnn_datatype_invalid;
+        }
       } else {
         TF_LITE_KERNEL_LOG(
             context,
@@ -687,6 +709,7 @@ class Delegate {
 
   TfLiteXNNPackDelegateOptions options_{};
   VariableHolder variable_holder_;
+  std::mutex workspace_mutex_;
 };
 
 class Subgraph {
@@ -989,6 +1012,7 @@ class Subgraph {
               dims.size(), dims.data(), data, XNN_INVALID_VALUE_ID, flags,
               &xnnpack_id);
           break;
+        case xnn_datatype_qcint4:
         case xnn_datatype_qcint8:
         case xnn_datatype_qcint32:
           status = xnn_define_channelwise_quantized_tensor_value(
@@ -1094,7 +1118,8 @@ class Subgraph {
   }
 
   TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node,
-                       bool enable_subgraph_reshaping) {
+                       bool enable_subgraph_reshaping, Delegate* delegate) {
+    std::lock_guard<std::mutex> lock(delegate->workspace_mutex_);
     if (enable_subgraph_reshaping) {
       xnn_status status = xnn_status_invalid_state;
       for (int i = 0; i < inputs_.size(); ++i) {
@@ -1150,7 +1175,9 @@ class Subgraph {
     }
   }
 
-  TfLiteStatus Invoke(TfLiteContext* context, bool enable_subgraph_reshaping) {
+  TfLiteStatus Invoke(TfLiteContext* context, bool enable_subgraph_reshaping,
+                      Delegate* delegate) {
+    std::lock_guard<std::mutex> lock(delegate->workspace_mutex_);
     bool any_pointers_changed = false;
     for (std::pair<int, void*> io_info : externals_) {
       const TfLiteTensor& tensor = context->tensors[io_info.first];
@@ -2009,6 +2036,77 @@ class Subgraph {
     return kTfLiteError;
   }
 
+  static TfLiteStatus CheckTensorFloat32OrQCInt4OrQCInt8Type(
+      const Delegate& delegate, TfLiteContext* context,
+      const TfLiteTensor& tensor, int expected_quantized_dimension,
+      int tensor_index, int node_index) {
+    switch (tensor.type) {
+      case kTfLiteFloat32:
+        return kTfLiteOk;
+      case kTfLiteInt4:
+      case kTfLiteInt8:
+        if (delegate.support_signed_8bit_quantization() &&
+            (kTfLiteInt8 == tensor.type || kTfLiteInt4 == tensor.type)) {
+          if (tensor.quantization.type != kTfLiteAffineQuantization) {
+            TF_LITE_MAYBE_KERNEL_LOG(
+                context,
+                "unsupported quantization type %d in tensor #%d in node #%d",
+                tensor.quantization.type, tensor_index, node_index);
+            return kTfLiteError;
+          }
+          const TfLiteAffineQuantization* quantization_params =
+              static_cast<const TfLiteAffineQuantization*>(
+                  tensor.quantization.params);
+          if (quantization_params->scale == nullptr) {
+            TF_LITE_MAYBE_KERNEL_LOG(context,
+                                     "missing scale quantization parameters in "
+                                     "tensor #%d in node #%d",
+                                     tensor_index, node_index);
+            return kTfLiteError;
+          }
+          if (quantization_params->scale->size > 1 &&
+              quantization_params->quantized_dimension !=
+                  expected_quantized_dimension) {
+            TF_LITE_MAYBE_KERNEL_LOG(
+                context,
+                "unsupported quantized dimension %d in tensor #%d in node #%d",
+                quantization_params->quantized_dimension, tensor_index,
+                node_index);
+            return kTfLiteError;
+          }
+          return kTfLiteOk;
+        }
+        break;
+      case kTfLiteUInt8:
+        if (delegate.support_unsigned_8bit_quantization()) {
+          const auto* quantization_params =
+              static_cast<const TfLiteAffineQuantization*>(
+                  tensor.quantization.params);
+          if (tensor.quantization.type != kTfLiteAffineQuantization ||
+              quantization_params->quantized_dimension != 0 ||
+              quantization_params->scale == nullptr ||
+              quantization_params->zero_point == nullptr ||
+              quantization_params->scale->size != 1 ||
+              quantization_params->zero_point->size != 1) {
+            TF_LITE_MAYBE_KERNEL_LOG(
+                context,
+                "unsupported quantization type %d in tensor #%d in node #%d",
+                tensor.quantization.type, tensor_index, node_index);
+            return kTfLiteError;
+          }
+          return kTfLiteOk;
+        }
+        break;
+      default:
+        break;
+    }
+
+    TF_LITE_MAYBE_KERNEL_LOG(
+        context, "unsupported type %s in tensor #%d in node #%d",
+        TfLiteTypeGetName(tensor.type), tensor_index, node_index);
+    return kTfLiteError;
+  }
+
   static TfLiteStatus CheckTensorFloat32OrQInt32Type(const Delegate& delegate,
                                                      TfLiteContext* context,
                                                      const TfLiteTensor& tensor,
@@ -3802,7 +3900,7 @@ class Subgraph {
           delegate, logging_context, filter_tensor, node->inputs->data[1],
           node_index));
     } else {
-      TF_LITE_ENSURE_STATUS(CheckTensorFloat32OrQCInt8Type(
+      TF_LITE_ENSURE_STATUS(CheckTensorFloat32OrQCInt4OrQCInt8Type(
           delegate, logging_context, filter_tensor,
           /*expected_quantized_dimension=*/0, node->inputs->data[1],
           node_index));
@@ -3853,7 +3951,8 @@ class Subgraph {
 
     bool dynamically_quantized = (delegate.enable_latest_operators() &&
                                   (input_tensor.type == kTfLiteFloat32 &&
-                                   filter_tensor.type == kTfLiteInt8));
+                                   (filter_tensor.type == kTfLiteInt4 ||
+                                    filter_tensor.type == kTfLiteInt8)));
     if (input_tensor.type != output_tensor.type ||
         ((input_tensor.type != filter_tensor.type) && !dynamically_quantized)) {
       TF_LITE_MAYBE_KERNEL_LOG(
@@ -3871,6 +3970,15 @@ class Subgraph {
       return kTfLiteError;
     }
 
+    if (filter_tensor.type == kTfLiteInt4 && input_channels % 2 == 1) {
+      TF_LITE_MAYBE_KERNEL_LOG(
+          logging_context,
+          "unsupported odd number of inputs channels (%d) in FULLY_CONNECTED"
+          " operator #%d",
+          input_channels, node_index);
+      return kTfLiteError;
+    }
+
     int32_t num_input_elements = 1;
     for (int i = 0; i < NumDimensions(&input_tensor); i++) {
       if (SizeOfDimension(&input_tensor, i) <= 0) {
@@ -4004,11 +4112,15 @@ class Subgraph {
         std::vector<size_t> filter_dims(
             &filter_tensor.dims->data[0],
             &filter_tensor.dims->data[NumDimensions(&filter_tensor)]);
+        const xnn_datatype filter_datatype = GetXNNPackDatatype(
+            logging_context, filter_tensor, node->inputs->data[1]);
+        int32_t zero_point_value = filter_params->zero_point->data[0];
         uint32_t kernel_id = XNN_INVALID_VALUE_ID;
-        status = xnn_define_channelwise_quantized_tensor_value(
-            subgraph, xnn_datatype_qcint8, filter_params->scale->data,
-            filter_dims.size(), /*channel_dim=*/0, filter_dims.data(),
-            GetTensorData<int8_t>(&filter_tensor), XNN_INVALID_VALUE_ID,
+        status = xnn_define_channelwise_quantized_tensor_value_v2(
+            subgraph, filter_datatype, zero_point_value,
+            filter_params->scale->data, filter_dims.size(), /*channel_dim=*/0,
+            filter_dims.data(), GetTensorData<int8_t>(&filter_tensor),
+            XNN_INVALID_VALUE_ID,
             /*flags=*/0, &kernel_id);
         if (status != xnn_status_success) {
           TF_LITE_KERNEL_LOG(
@@ -5395,6 +5507,27 @@ class Subgraph {
         delegate, logging_context, output_tensor, node->outputs->data[0],
         node_index));
 
+    if (output_tensor.type == kTfLiteUInt8 ||
+        output_tensor.type == kTfLiteInt8) {
+      if (input_tensor.params.zero_point != output_tensor.params.zero_point) {
+        TF_LITE_MAYBE_KERNEL_LOG(
+            logging_context,
+            "Mismatching quantization zero point across the input "
+            "(%" PRId32 ") and the output (%" PRId32
+            ") for RESHAPE operator #%d",
+            input_tensor.params.zero_point, output_tensor.params.zero_point,
+            node_index);
+        return kTfLiteError;
+      }
+      if (input_tensor.params.scale != output_tensor.params.scale) {
+        TF_LITE_MAYBE_KERNEL_LOG(
+            logging_context,
+            "Mismatching quantization scale across the input (%f) "
+            "and the output (%f) for RESHAPE operator #%d",
+            input_tensor.params.scale, output_tensor.params.scale, node_index);
+        return kTfLiteError;
+      }
+    }
     if (subgraph != nullptr) {
       const xnn_status status = xnn_define_static_reshape(
           subgraph, num_new_dimensions, new_shape.data(),
@@ -6497,8 +6630,10 @@ class Subgraph {
     return enable_subgraph_reshaping_;
   }
 
+  inline Delegate* GetDelegate() const { return delegate_; }
+
  private:
-  Subgraph(const Delegate& delegate, xnn_runtime_t runtime,
+  Subgraph(Delegate& delegate, xnn_runtime_t runtime,
            const std::unordered_set<int>& externals, std::vector<int>& inputs,
            std::vector<int>& outputs,
            std::unordered_map<int, uint32_t>& tflite_tensor_to_xnnpack)
@@ -6511,6 +6646,7 @@ class Subgraph {
     outputs_ = outputs;
     has_variables_ = !delegate.GetAllVariableTensors().empty();
     enable_subgraph_reshaping_ = delegate.enable_subgraph_reshaping();
+    delegate_ = &delegate;
   }
 
   // XNNPACK Runtime (subgraph + workspace) with smart-pointer for lifetime
@@ -6538,6 +6674,7 @@ class Subgraph {
   bool has_variables_ = false;
   bool variables_set_up_ = false;
   bool enable_subgraph_reshaping_ = false;
+  Delegate* delegate_;
 };
 
 TfLiteIntArray* Delegate::PrepareOpsToDelegate(TfLiteContext* context) {
@@ -6946,7 +7083,8 @@ TfLiteStatus SubgraphPrepare(TfLiteContext* context, TfLiteNode* node) {
 
   Subgraph* subgraph = static_cast<Subgraph*>(node->user_data);
   return static_cast<Subgraph*>(node->user_data)
-      ->Prepare(context, node, subgraph->EnableSubgraphReshaping());
+      ->Prepare(context, node, subgraph->EnableSubgraphReshaping(),
+                subgraph->GetDelegate());
 }
 
 TfLiteStatus SubgraphInvoke(TfLiteContext* context, TfLiteNode* node) {
@@ -6956,7 +7094,8 @@ TfLiteStatus SubgraphInvoke(TfLiteContext* context, TfLiteNode* node) {
 
   Subgraph* subgraph = static_cast<Subgraph*>(node->user_data);
   return static_cast<Subgraph*>(node->user_data)
-      ->Invoke(context, subgraph->EnableSubgraphReshaping());
+      ->Invoke(context, subgraph->EnableSubgraphReshaping(),
+               subgraph->GetDelegate());
 }
 
 void SubgraphFree(TfLiteContext* context, void* buffer) {
diff --git a/tensorflow/lite/examples/label_image/CMakeLists.txt b/tensorflow/lite/examples/label_image/CMakeLists.txt
index 08044b1675beb3..9874801f34fa31 100644
--- a/tensorflow/lite/examples/label_image/CMakeLists.txt
+++ b/tensorflow/lite/examples/label_image/CMakeLists.txt
@@ -21,7 +21,7 @@ populate_source_vars("${TFLITE_SOURCE_DIR}/examples/label_image"
   FILTER "_test\\.cc$"
 )
 list(APPEND TFLITE_LABEL_IMAGE_SRCS
-  ${TSL_SOURCE_DIR}/tsl/util/stats_calculator.cc
+  ${XLA_SOURCE_DIR}/xla/tsl/util/stats_calculator.cc
   ${TFLITE_SOURCE_DIR}/profiling/memory_info.cc
   ${TFLITE_SOURCE_DIR}/profiling/profile_summarizer.cc
   ${TFLITE_SOURCE_DIR}/profiling/profile_summary_formatter.cc
diff --git a/tensorflow/lite/experimental/acceleration/compatibility/BUILD b/tensorflow/lite/experimental/acceleration/compatibility/BUILD
index 9a5ececc1ece1b..6fa1532106e83c 100644
--- a/tensorflow/lite/experimental/acceleration/compatibility/BUILD
+++ b/tensorflow/lite/experimental/acceleration/compatibility/BUILD
@@ -13,8 +13,8 @@
 # limitations under the License.
 # ==============================================================================
 
-load("//tensorflow:strict.default.bzl", "py_strict_binary")
 load("@flatbuffers//:build_defs.bzl", "flatbuffer_android_library", "flatbuffer_cc_library", "flatbuffer_java_library")
+load("//tensorflow:strict.default.bzl", "py_strict_binary")
 load("//tensorflow/core/platform:build_config_root.bzl", "tf_gpu_tests_tags")
 load("//tensorflow/lite:special_rules.bzl", "tflite_extra_gles_deps", "tflite_portable_test_suite")
 
diff --git a/tensorflow/lite/experimental/acceleration/configuration/BUILD b/tensorflow/lite/experimental/acceleration/configuration/BUILD
index a1329c13fb6bc5..f4f92bcbe6860b 100644
--- a/tensorflow/lite/experimental/acceleration/configuration/BUILD
+++ b/tensorflow/lite/experimental/acceleration/configuration/BUILD
@@ -13,17 +13,20 @@
 # limitations under the License.
 # ==============================================================================
 
+# buildifier: disable=out-of-order-load
+
 load("@flatbuffers//:build_defs.bzl", "DEFAULT_FLATC_ARGS", "flatbuffer_android_library", "flatbuffer_cc_library", "flatbuffer_java_library")
+
+# copybara:comment_begin(oss-only)
+load("@local_tsl//tsl/platform/default:build_config.bzl", "tf_proto_library_py")
+# copybara:comment_end
+
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_copts_warnings")
 load("//tensorflow/lite:special_rules.bzl", "nnapi_plugin_impl_visibility_allowlist", "tflite_portable_test_suite")
 load("//tensorflow/lite/core/shims:cc_library_with_tflite.bzl", "cc_library_with_tflite")
 # copybara:uncomment load("//tools/build_defs/proto/cpp:cc_proto_library.bzl", "cc_proto_library")
 
-# copybara:comment_begin(oss-only)
-load("@local_tsl//tsl/platform/default:build_config.bzl", "tf_proto_library_py")
-# copybara:comment_end
-
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
     default_visibility = [
diff --git a/tensorflow/lite/experimental/acceleration/mini_benchmark/BUILD b/tensorflow/lite/experimental/acceleration/mini_benchmark/BUILD
index 23cedaae8879a4..ad625bdff90c29 100644
--- a/tensorflow/lite/experimental/acceleration/mini_benchmark/BUILD
+++ b/tensorflow/lite/experimental/acceleration/mini_benchmark/BUILD
@@ -14,11 +14,11 @@
 # ==============================================================================
 
 load("//tensorflow:strict.default.bzl", "py_strict_binary")
+load("//tensorflow:tensorflow.bzl", "clean_dep")
 
 # Mini benchmarking helps in validating HW acceleration across Android and iOS.
 load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
-load("//tensorflow:tensorflow.bzl", "clean_dep")
 load("build_defs.bzl", "cc_library_with_forced_in_process_benchmark_variant", "embedded_binary")
 load("special_rules.bzl", "libjpeg_deps", "libjpeg_handle_deps", "libjpeg_hdrs_deps", "minibenchmark_visibility_allowlist")
 
diff --git a/tensorflow/lite/experimental/resource/BUILD b/tensorflow/lite/experimental/resource/BUILD
index 45ed5395dbc9b8..bed57f7489d25e 100644
--- a/tensorflow/lite/experimental/resource/BUILD
+++ b/tensorflow/lite/experimental/resource/BUILD
@@ -6,6 +6,32 @@ package(
     licenses = ["notice"],
 )
 
+cc_library(
+    name = "cache_buffer",
+    srcs = ["cache_buffer.cc"],
+    hdrs = [
+        "cache_buffer.h",
+        "//tensorflow/lite/core/c:common.h",
+    ],
+    deps = [
+        ":resource",
+        "//tensorflow/lite/core/c:c_api_types",
+        "//tensorflow/lite/core/c:common",
+        "//tensorflow/lite/kernels:kernel_util",
+        "//tensorflow/lite/kernels/internal:compatibility",
+    ],
+)
+
+cc_test(
+    name = "cache_buffer_test",
+    srcs = ["cache_buffer_test.cc"],
+    deps = [
+        ":cache_buffer",
+        "//tensorflow/lite/c:common",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "resource",
     srcs = [
diff --git a/tensorflow/lite/experimental/resource/cache_buffer.cc b/tensorflow/lite/experimental/resource/cache_buffer.cc
new file mode 100644
index 00000000000000..0e221589b4cc64
--- /dev/null
+++ b/tensorflow/lite/experimental/resource/cache_buffer.cc
@@ -0,0 +1,56 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/resource/cache_buffer.h"
+
+#include <cstdlib>
+#include <cstring>
+
+#include "tensorflow/lite/core/c/c_api_types.h"
+#include "tensorflow/lite/core/c/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace resource {
+
+constexpr char kCacheBufferTensorName[] = "CacheBuffer";
+
+TfLiteStatus CacheBuffer::Initialize(const TfLiteIntArray &shape,
+                                     const TfLiteType &type) {
+  // Set basic parameters.
+  tensor_.name = kCacheBufferTensorName;
+  tensor_.allocation_type = kTfLiteDynamic;
+  tensor_.type = type;
+
+  // Set the shape and allocate the memory.
+  tensor_.dims = TfLiteIntArrayCopy(&shape);
+  const size_t num_bytes = TfLiteTypeGetSize(type) * NumElements(&tensor_);
+  TfLiteTensorRealloc(num_bytes, &tensor_);
+
+  memset(tensor_.data.raw, 0, tensor_.bytes);
+  is_initialized_ = true;
+  return kTfLiteOk;
+}
+
+size_t CacheBuffer::GetNumEntries() const { return num_entries_; }
+
+void CacheBuffer::SetNumEntries(size_t count) {
+  TFLITE_DCHECK(count <= tensor_.dims->data[2]);
+  num_entries_ = count;
+}
+
+}  // namespace resource
+}  // namespace tflite
diff --git a/tensorflow/lite/experimental/resource/cache_buffer.h b/tensorflow/lite/experimental/resource/cache_buffer.h
new file mode 100644
index 00000000000000..1e500fab07c269
--- /dev/null
+++ b/tensorflow/lite/experimental/resource/cache_buffer.h
@@ -0,0 +1,51 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RESOURCE_CACHE_BUFFER_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_RESOURCE_CACHE_BUFFER_H_
+
+#include <memory>
+#include <unordered_map>
+
+#include "tensorflow/lite/core/c/common.h"
+#include "tensorflow/lite/experimental/resource/resource_variable.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace resource {
+
+/// WARNING: Experimental interface, subject to change.
+// A Cache Buffer class. Useful for keeping the keys and values of a
+// transformer block attention mechanism in autoregressive decode.
+// Ops can access this buffer and add tensors to it. It also keeps track of the
+// number of used entries in the cache.
+class CacheBuffer : public ResourceVariable {
+ public:
+  CacheBuffer() = default;
+  CacheBuffer(const CacheBuffer &) = delete;
+  CacheBuffer &operator=(const CacheBuffer &) = delete;
+  // Initialize tensor of a certain shape using the provided type.
+  TfLiteStatus Initialize(const TfLiteIntArray &shape, const TfLiteType &type);
+  size_t GetNumEntries() const;
+  void SetNumEntries(size_t count);
+
+ private:
+  // The number of entries currently used in the buffer;
+  size_t num_entries_ = 0;
+};
+
+}  // namespace resource
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_RESOURCE_CACHE_BUFFER_H_
diff --git a/tensorflow/lite/experimental/resource/cache_buffer_test.cc b/tensorflow/lite/experimental/resource/cache_buffer_test.cc
new file mode 100644
index 00000000000000..6b54f6c787138d
--- /dev/null
+++ b/tensorflow/lite/experimental/resource/cache_buffer_test.cc
@@ -0,0 +1,47 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/resource/cache_buffer.h"
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/c/common.h"
+
+namespace tflite {
+namespace resource {
+
+TEST(CacheBufferTest, Initialize) {
+  TfLiteIntArray* shape = TfLiteIntArrayCreate(4);
+  shape->data[0] = 1;
+  shape->data[1] = 3;
+  shape->data[2] = 5;
+  shape->data[3] = 7;
+
+  TfLiteType type = kTfLiteFloat32;
+  CacheBuffer cache_buffer;
+  cache_buffer.Initialize(*shape, type);
+
+  EXPECT_EQ(cache_buffer.GetTensor()->type, type);
+  EXPECT_EQ(cache_buffer.GetTensor()->dims->size, 4);
+  EXPECT_EQ(cache_buffer.GetTensor()->dims->data[0], 1);
+  EXPECT_EQ(cache_buffer.GetTensor()->dims->data[1], 3);
+  EXPECT_EQ(cache_buffer.GetTensor()->bytes, 420);
+  ASSERT_NE(cache_buffer.GetTensor()->data.raw, nullptr);
+  EXPECT_EQ(cache_buffer.GetNumEntries(), 0);
+  cache_buffer.SetNumEntries(3);
+  EXPECT_EQ(cache_buffer.GetNumEntries(), 3);
+  TfLiteIntArrayFree(shape);
+}
+
+}  // namespace resource
+}  // namespace tflite
diff --git a/tensorflow/lite/experimental/resource/resource_variable.h b/tensorflow/lite/experimental/resource/resource_variable.h
index 3f34082c85f553..881aaa1f0b3a19 100644
--- a/tensorflow/lite/experimental/resource/resource_variable.h
+++ b/tensorflow/lite/experimental/resource/resource_variable.h
@@ -50,7 +50,7 @@ class ResourceVariable : public ResourceBase {
     return is_initialized_ ? tensor_.bytes : 0;
   }
 
- private:
+ protected:
   // The tensor (and its buffer stored in `tensor_.data` is fully owned by
   // the `ResourceVariable` object.
   TfLiteTensor tensor_;
diff --git a/tensorflow/lite/experimental/shlo/BUILD b/tensorflow/lite/experimental/shlo/BUILD
index ae0fefd70dc03a..a274bab28d0f4a 100644
--- a/tensorflow/lite/experimental/shlo/BUILD
+++ b/tensorflow/lite/experimental/shlo/BUILD
@@ -86,7 +86,6 @@ cc_library(
     name = "bf16",
     hdrs = ["bf16.h"],
     deps = [
-        ":has_keyword",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/log:absl_check",
     ],
@@ -106,12 +105,6 @@ cc_test(
 cc_library(
     name = "f16",
     hdrs = ["f16.h"],
-    deps = [":has_keyword"],
-)
-
-cc_library(
-    name = "has_keyword",
-    hdrs = ["has_keyword.h"],
 )
 
 cc_library(
@@ -127,7 +120,6 @@ cc_library(
     name = "dispatch",
     hdrs = ["dispatch.h"],
     visibility = ["//tensorflow/lite/experimental/shlo:__subpackages__"],
-    deps = [":data_type"],
 )
 
 cc_test(
diff --git a/tensorflow/lite/experimental/shlo/README.md b/tensorflow/lite/experimental/shlo/README.md
index 2864114b08147f..4cc46da7b13429 100644
--- a/tensorflow/lite/experimental/shlo/README.md
+++ b/tensorflow/lite/experimental/shlo/README.md
@@ -5,9 +5,13 @@ StableHLO kernels.
 
 ## Contributing
 
+Please review the [Tensorflow Contributing Guide] for the repository's
+contributing guidelines.
+
 The code makes use of C++17 and is built using Bazel.
 
-Unless specified, the [Google style guide] should be specified.
+Unless specified, the [Google style guide] should be followed. Clang-format with
+`google` style should be used for automatic code formatting.
 
 To keep familiarity for people who are used to working with StableHLO, the data
 structures try to follow the naming and hierarchy that are found in the
@@ -195,11 +199,25 @@ using ADB.
 
 ```sh
 adb push shlo/ops/op_name_test /data/local/tmp
-ash shell /data/local/tmp/op_name_test
+adb shell /data/local/tmp/op_name_test
 ```
 
 #### iOS
 
+##### Prerequisites
+
+Follow the instructions for setting up the iOS development environment in the
+TensorFlow Lite [Build for iOS] guide. The `configure` script must be run and
+you must opt-in to iOS development.
+
+##### Building
+
+```
+bazel build -c opt --config=ios_arm64 ops:op_name_test
+```
+
+##### Testing
+
 TODO:
 
 [stablehlo]: https://github.com/openxla/stablehlo/blob/main/docs/spec.md
@@ -208,3 +226,5 @@ TODO:
 [GoogleTest]: https://github.com/google/googletest
 [Google Benchmark]: https://github.com/google/benchmark
 [Google style guide]: https://google.github.io/styleguide/cppguide.html
+[Tensorflow Contributing Guide]: https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md
+[Build for iOS]: https://www.tensorflow.org/lite/guide/build_ios
diff --git a/tensorflow/lite/experimental/shlo/bf16.h b/tensorflow/lite/experimental/shlo/bf16.h
index 33b614d1888435..3f228fb161af14 100644
--- a/tensorflow/lite/experimental/shlo/bf16.h
+++ b/tensorflow/lite/experimental/shlo/bf16.h
@@ -19,7 +19,7 @@ limitations under the License.
 #if defined(__STDCPP_BFLOAT16_T__)
 #include <stdfloat>
 namespace shlo_ref {
-using BF16 = bfloat16_t;
+using BF16 = ::std::bfloat16_t;
 }  // namespace shlo_ref
 
 #else
diff --git a/tensorflow/lite/experimental/shlo/data_type.h b/tensorflow/lite/experimental/shlo/data_type.h
index 8e8fe2d6202911..f313fdc175ce58 100644
--- a/tensorflow/lite/experimental/shlo/data_type.h
+++ b/tensorflow/lite/experimental/shlo/data_type.h
@@ -95,10 +95,17 @@ using StorageType = typename Storage<data_type>::Type;
 
 constexpr bool IsBool(DataType data_type) { return data_type == DataType::kI1; }
 
-constexpr bool IsInteger(DataType data_type) {
+constexpr bool IsSignedInteger(DataType data_type) {
   return data_type == DataType::kSI4 || data_type == DataType::kSI8 ||
          data_type == DataType::kSI16 || data_type == DataType::kSI32;
 }
+
+constexpr bool IsUnsignedInteger(DataType data_type) { return false; }
+
+constexpr bool IsInteger(DataType data_type) {
+  return IsSignedInteger(data_type) || IsUnsignedInteger(data_type);
+}
+
 constexpr bool IsFloat(DataType data_type) {
   return data_type == DataType::kBF16 || data_type == DataType::kF16 ||
          data_type == DataType::kF32;
diff --git a/tensorflow/lite/experimental/shlo/dispatch.h b/tensorflow/lite/experimental/shlo/dispatch.h
index a3d4a641f2eb4c..ecadbd7e1e8fc7 100644
--- a/tensorflow/lite/experimental/shlo/dispatch.h
+++ b/tensorflow/lite/experimental/shlo/dispatch.h
@@ -81,6 +81,24 @@ limitations under the License.
     }                                                                   \
   }
 
+#define DISPATCH_BOOL_INT(name, element_type, ...)                      \
+  {                                                                     \
+    switch (element_type) {                                             \
+      case DataType::kI1:                                               \
+        RETURN_OK_STATUS_IF_VOID((name<DataType::kI1>(__VA_ARGS__)));   \
+      case DataType::kSI4:                                              \
+        RETURN_OK_STATUS_IF_VOID((name<DataType::kSI4>(__VA_ARGS__)));  \
+      case DataType::kSI8:                                              \
+        RETURN_OK_STATUS_IF_VOID((name<DataType::kSI8>(__VA_ARGS__)));  \
+      case DataType::kSI16:                                             \
+        RETURN_OK_STATUS_IF_VOID((name<DataType::kSI16>(__VA_ARGS__))); \
+      case DataType::kSI32:                                             \
+        RETURN_OK_STATUS_IF_VOID((name<DataType::kSI32>(__VA_ARGS__))); \
+      default:                                                          \
+        return absl::InvalidArgumentError("Unsupported element type");  \
+    }                                                                   \
+  }
+
 #define DISPATCH_BOOL_INT_FLOAT(name, element_type, ...)                \
   {                                                                     \
     switch (element_type) {                                             \
diff --git a/tensorflow/lite/experimental/shlo/f16.h b/tensorflow/lite/experimental/shlo/f16.h
index 2496b31b84dc9f..f18170cb052682 100644
--- a/tensorflow/lite/experimental/shlo/f16.h
+++ b/tensorflow/lite/experimental/shlo/f16.h
@@ -16,21 +16,17 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_F16_H_
 #define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_F16_H_
 
-#include "tensorflow/lite/experimental/shlo/has_keyword.h"
-
 #if defined(__STDCPP_FLOAT16_T__)
 #include <stdfloat>
 namespace shlo_ref {
-using F16 = float16_t;
+using F16 = ::std::float16_t;
 }  // namespace shlo_ref
 
-#elif __has_keyword(_Float16)
+#else
 namespace shlo_ref {
 using F16 = _Float16;
 }  // namespace shlo_ref
 
-#else
-#error Type F16 is not available
 #endif
 
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_F16_H_
diff --git a/tensorflow/lite/experimental/shlo/has_keyword.h b/tensorflow/lite/experimental/shlo/has_keyword.h
deleted file mode 100644
index 548c86eec4de36..00000000000000
--- a/tensorflow/lite/experimental/shlo/has_keyword.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_HAS_KEYWORD_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_HAS_KEYWORD_H_
-
-// CAUTION: __is_identifier behaves opposite how you would expect!
-// '__is_identifier' returns '0' if '__x' is a reserved identifier provided by
-// the compiler and '1' otherwise.  borrowed from LLVM __config header under
-// Apache license 2.
-// (https://www.mend.io/blog/top-10-apache-license-questions-answered/)
-
-#ifndef __is_identifier       // Optional of course.
-#define __is_identifier(x) 1  // Compatibility with non-clang compilers.
-#endif
-
-// More sensible macro for keyword detection
-#define __has_keyword(__x) !(__is_identifier(__x))
-
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_HAS_KEYWORD_H_
diff --git a/tensorflow/lite/experimental/shlo/legacy/BUILD b/tensorflow/lite/experimental/shlo/legacy/BUILD
index 64f986fd607afa..2bed7f5ca9c6fd 100644
--- a/tensorflow/lite/experimental/shlo/legacy/BUILD
+++ b/tensorflow/lite/experimental/shlo/legacy/BUILD
@@ -26,6 +26,7 @@ cc_library(
         "src/storage.h",
         "src/util.h",
     ],
+    tags = ["no_oss"],
     deps = [
         ":float",
         "@com_google_absl//absl/log",
@@ -42,6 +43,7 @@ cc_library(
     hdrs = [
         "src/debug.h",
     ],
+    tags = ["no_oss"],
     deps = [
         ":float",
         ":shlo",
@@ -60,6 +62,7 @@ cc_library(
         "src/f16.h",
         "src/has_keyword.h",
     ],
+    tags = ["no_oss"],
     deps = [
     ],
 )
diff --git a/tensorflow/lite/experimental/shlo/legacy/bench/BUILD b/tensorflow/lite/experimental/shlo/legacy/bench/BUILD
index 2555f70adc7b31..4f5e487f8f955c 100644
--- a/tensorflow/lite/experimental/shlo/legacy/bench/BUILD
+++ b/tensorflow/lite/experimental/shlo/legacy/bench/BUILD
@@ -7,6 +7,7 @@ cc_library(
     hdrs = [
         "util.h",
     ],
+    tags = ["no_oss"],
     deps = [
         "//tensorflow/lite/experimental/shlo/legacy:float",
     ],
@@ -17,6 +18,7 @@ cc_binary(
     srcs = [
         "shlo_benchmark.cc",
     ],
+    tags = ["no_oss"],
     deps = [
         ":util",
         "//tensorflow/lite/experimental/shlo/legacy:shlo",
@@ -35,6 +37,7 @@ cc_binary(
     ],
     data = [
     ],
+    tags = ["no_oss"],
     deps = [
         ":util",
         "//tensorflow/lite/experimental/shlo/legacy:float",
diff --git a/tensorflow/lite/experimental/shlo/legacy/test/BUILD b/tensorflow/lite/experimental/shlo/legacy/test/BUILD
index 275dae134f0edf..7adc819d12da6a 100644
--- a/tensorflow/lite/experimental/shlo/legacy/test/BUILD
+++ b/tensorflow/lite/experimental/shlo/legacy/test/BUILD
@@ -4,6 +4,7 @@ cc_library(
     name = "matchers",
     testonly = True,
     hdrs = ["matchers.h"],
+    tags = ["no_oss"],
     deps = [
         "//tensorflow/lite/experimental/shlo/legacy:debug",
         "@com_google_googletest//:gtest_main",
@@ -17,6 +18,7 @@ cc_library(
     hdrs = [
         "util.h",
     ],
+    tags = ["no_oss"],
     visibility = ["//tensorflow/lite/experimental/shlo/legacy/bench:__subpackages__"],
     deps = [
         "//tensorflow/lite/experimental/shlo/legacy:debug",
@@ -32,6 +34,7 @@ cc_test(
     ],
     data = [
     ],
+    tags = ["no_oss"],
     deps = [
         ":util",
         "//tensorflow/lite/experimental/shlo/legacy:debug",
@@ -48,6 +51,7 @@ cc_test(
     ],
     data = [
     ],
+    tags = ["no_oss"],
     deps = [
         ":util",
         "//tensorflow/lite/experimental/shlo/legacy:debug",
@@ -63,6 +67,7 @@ cc_test(
     ],
     data = [
     ],
+    tags = ["no_oss"],
     deps = [
         ":util",
         "//tensorflow/lite/experimental/shlo/legacy:debug",
@@ -78,6 +83,7 @@ cc_test(
     ],
     data = [
     ],
+    tags = ["no_oss"],
     deps = [
         ":util",
         "//tensorflow/lite/experimental/shlo/legacy:debug",
@@ -94,6 +100,7 @@ cc_test(
     ],
     data = [
     ],
+    tags = ["no_oss"],
     deps = [
         ":matchers",
         ":util",
@@ -111,6 +118,7 @@ cc_test(
     ],
     data = [
     ],
+    tags = ["no_oss"],
     deps = [
         ":matchers",
         ":util",
@@ -128,6 +136,7 @@ cc_test(
     ],
     data = [
     ],
+    tags = ["no_oss"],
     deps = [
         ":util",
         "//tensorflow/lite/experimental/shlo/legacy:debug",
@@ -143,6 +152,7 @@ cc_test(
     ],
     data = [
     ],
+    tags = ["no_oss"],
     deps = [
         "//tensorflow/lite/experimental/shlo/legacy:debug",
         "//tensorflow/lite/experimental/shlo/legacy:shlo",
@@ -157,6 +167,7 @@ cc_test(
     ],
     data = [
     ],
+    tags = ["no_oss"],
     deps = [
         ":util",
         "//tensorflow/lite/experimental/shlo/legacy:debug",
@@ -172,6 +183,7 @@ cc_test(
     ],
     data = [
     ],
+    tags = ["no_oss"],
     deps = [
         ":matchers",
         "//tensorflow/lite/experimental/shlo/legacy:debug",
diff --git a/tensorflow/lite/experimental/shlo/ops/BUILD b/tensorflow/lite/experimental/shlo/ops/BUILD
index 45eaa7771807e2..f839714f3ecaf2 100644
--- a/tensorflow/lite/experimental/shlo/ops/BUILD
+++ b/tensorflow/lite/experimental/shlo/ops/BUILD
@@ -35,10 +35,10 @@ cc_test(
     srcs = ["is_finite_test.cc"],
     linkopts = shlo_ref_linkopts(),
     deps = [
-        ":benchmark_util",
         ":is_finite",
         "//tensorflow/lite/experimental/shlo:bf16",
         "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:f16",
         "//tensorflow/lite/experimental/shlo:shape",
         "//tensorflow/lite/experimental/shlo:status_matcher",
         "//tensorflow/lite/experimental/shlo:tensor",
@@ -70,7 +70,9 @@ cc_library(
     srcs = ["util.cc"],
     hdrs = ["util.h"],
     deps = [
+        "//tensorflow/lite/experimental/shlo:data_type",
         "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:tensor",
         "@com_google_absl//absl/status",
     ],
 )
@@ -118,7 +120,6 @@ cc_test(
         "//tensorflow/lite/experimental/shlo:status_matcher",
         "//tensorflow/lite/experimental/shlo:tensor",
         "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/container:inlined_vector",
         "@com_google_googletest//:gtest_main",
     ],
 )
@@ -129,13 +130,30 @@ cc_library(
     hdrs = ["test_util.h"],
     deps = [
         "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
         "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:tensor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_googletest//:gtest",
     ],
 )
 
+cc_library(
+    name = "unary_elementwise_test_util",
+    testonly = True,
+    hdrs = ["unary_elementwise_test_util.h"],
+    deps = [
+        ":test_util",
+        "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
     name = "abs",
     srcs = ["abs.cc"],
@@ -156,6 +174,7 @@ cc_test(
     deps = [
         ":abs",
         ":test_util",
+        ":unary_elementwise_test_util",
         "//tensorflow/lite/experimental/shlo:quantize",
         "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
         "//tensorflow/lite/experimental/shlo:shape",
@@ -173,7 +192,6 @@ cc_library(
         ":unary_elementwise",
         ":util",
         "//tensorflow/lite/experimental/shlo:bf16",
-        "//tensorflow/lite/experimental/shlo:data_type",
         "//tensorflow/lite/experimental/shlo:dispatch",
         "//tensorflow/lite/experimental/shlo:f16",
         "//tensorflow/lite/experimental/shlo:tensor",
@@ -188,6 +206,7 @@ cc_test(
     deps = [
         ":cbrt",
         ":test_util",
+        ":unary_elementwise_test_util",
         "//tensorflow/lite/experimental/shlo:bf16",
         "//tensorflow/lite/experimental/shlo:f16",
         "//tensorflow/lite/experimental/shlo:quantize",
@@ -195,7 +214,6 @@ cc_test(
         "//tensorflow/lite/experimental/shlo:shape",
         "//tensorflow/lite/experimental/shlo:status_matcher",
         "//tensorflow/lite/experimental/shlo:tensor",
-        "@com_google_absl//absl/status",
         "@com_google_googletest//:gtest_main",
     ],
 )
@@ -223,6 +241,7 @@ cc_test(
     deps = [
         ":ceil",
         ":test_util",
+        ":unary_elementwise_test_util",
         "//tensorflow/lite/experimental/shlo:bf16",
         "//tensorflow/lite/experimental/shlo:f16",
         "//tensorflow/lite/experimental/shlo:quantize",
@@ -230,7 +249,6 @@ cc_test(
         "//tensorflow/lite/experimental/shlo:shape",
         "//tensorflow/lite/experimental/shlo:status_matcher",
         "//tensorflow/lite/experimental/shlo:tensor",
-        "@com_google_absl//absl/status",
         "@com_google_googletest//:gtest_main",
     ],
 )
@@ -244,7 +262,6 @@ cc_library(
         ":unary_elementwise",
         ":util",
         "//tensorflow/lite/experimental/shlo:bf16",
-        "//tensorflow/lite/experimental/shlo:data_type",
         "//tensorflow/lite/experimental/shlo:dispatch",
         "//tensorflow/lite/experimental/shlo:f16",
         "//tensorflow/lite/experimental/shlo:tensor",
@@ -258,6 +275,281 @@ cc_test(
     deps = [
         ":cosine",
         ":test_util",
+        ":unary_elementwise_test_util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:quantize",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "sine",
+    srcs = ["sine.cc"],
+    hdrs = ["sine.h"],
+    deps = [
+        ":unary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "sine_test",
+    srcs = ["sine_test.cc"],
+    linkopts = shlo_ref_linkopts(),
+    deps = [
+        ":sine",
+        ":test_util",
+        ":unary_elementwise_test_util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:quantize",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "tanh",
+    srcs = ["tanh.cc"],
+    hdrs = ["tanh.h"],
+    deps = [
+        ":unary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "tanh_test",
+    srcs = ["tanh_test.cc"],
+    linkopts = shlo_ref_linkopts(),
+    deps = [
+        ":tanh",
+        ":test_util",
+        ":unary_elementwise_test_util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:quantize",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "sqrt",
+    srcs = ["sqrt.cc"],
+    hdrs = ["sqrt.h"],
+    deps = [
+        ":unary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "sqrt_test",
+    srcs = ["sqrt_test.cc"],
+    linkopts = shlo_ref_linkopts(),
+    deps = [
+        ":sqrt",
+        ":test_util",
+        ":unary_elementwise_test_util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:quantize",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "exponential",
+    srcs = ["exponential.cc"],
+    hdrs = ["exponential.h"],
+    deps = [
+        ":unary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "exponential_test",
+    srcs = ["exponential_test.cc"],
+    linkopts = shlo_ref_linkopts(),
+    deps = [
+        ":exponential",
+        ":test_util",
+        ":unary_elementwise_test_util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:quantize",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "floor",
+    srcs = ["floor.cc"],
+    hdrs = ["floor.h"],
+    deps = [
+        ":unary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "floor_test",
+    srcs = ["floor_test.cc"],
+    linkopts = shlo_ref_linkopts(),
+    deps = [
+        ":floor",
+        ":test_util",
+        ":unary_elementwise_test_util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:quantize",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "log",
+    srcs = ["log.cc"],
+    hdrs = ["log.h"],
+    deps = [
+        ":unary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "log_test",
+    srcs = ["log_test.cc"],
+    linkopts = shlo_ref_linkopts(),
+    deps = [
+        ":log",
+        ":test_util",
+        ":unary_elementwise_test_util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:quantize",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "exponential_minus_one",
+    srcs = ["exponential_minus_one.cc"],
+    hdrs = ["exponential_minus_one.h"],
+    deps = [
+        ":unary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "exponential_minus_one_test",
+    srcs = ["exponential_minus_one_test.cc"],
+    linkopts = shlo_ref_linkopts(),
+    deps = [
+        ":exponential_minus_one",
+        ":test_util",
+        ":unary_elementwise_test_util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:quantize",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "log_plus_one",
+    srcs = ["log_plus_one.cc"],
+    hdrs = ["log_plus_one.h"],
+    deps = [
+        ":unary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "log_plus_one_test",
+    srcs = ["log_plus_one_test.cc"],
+    linkopts = shlo_ref_linkopts(),
+    deps = [
+        ":log_plus_one",
+        ":test_util",
+        ":unary_elementwise_test_util",
         "//tensorflow/lite/experimental/shlo:bf16",
         "//tensorflow/lite/experimental/shlo:f16",
         "//tensorflow/lite/experimental/shlo:quantize",
@@ -265,7 +557,467 @@ cc_test(
         "//tensorflow/lite/experimental/shlo:shape",
         "//tensorflow/lite/experimental/shlo:status_matcher",
         "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "logistic",
+    srcs = ["logistic.cc"],
+    hdrs = ["logistic.h"],
+    deps = [
+        ":unary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:tensor",
         "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "logistic_test",
+    srcs = ["logistic_test.cc"],
+    linkopts = shlo_ref_linkopts(),
+    deps = [
+        ":logistic",
+        ":test_util",
+        ":unary_elementwise_test_util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:quantize",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "sign",
+    srcs = ["sign.cc"],
+    hdrs = ["sign.h"],
+    deps = [
+        ":unary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "sign_test",
+    srcs = ["sign_test.cc"],
+    linkopts = shlo_ref_linkopts(),
+    deps = [
+        ":sign",
+        ":test_util",
+        ":unary_elementwise_test_util",
+        "//tensorflow/lite/experimental/shlo:bf16",
+        "//tensorflow/lite/experimental/shlo:f16",
+        "//tensorflow/lite/experimental/shlo:quantize",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "negate",
+    srcs = ["negate.cc"],
+    hdrs = ["negate.h"],
+    deps = [
+        ":unary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "negate_test",
+    srcs = ["negate_test.cc"],
+    linkopts = shlo_ref_linkopts(),
+    deps = [
+        ":negate",
+        ":test_util",
+        ":unary_elementwise_test_util",
+        "//tensorflow/lite/experimental/shlo:quantize",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "count_leading_zeros",
+    srcs = ["count_leading_zeros.cc"],
+    hdrs = ["count_leading_zeros.h"],
+    deps = [
+        ":unary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/numeric:bits",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "count_leading_zeros_test",
+    srcs = ["count_leading_zeros_test.cc"],
+    linkopts = shlo_ref_linkopts(),
+    deps = [
+        ":count_leading_zeros",
+        ":test_util",
+        ":unary_elementwise_test_util",
+        "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "not",
+    srcs = ["not.cc"],
+    hdrs = ["not.h"],
+    deps = [
+        ":unary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "not_test",
+    srcs = ["not_test.cc"],
+    deps = [
+        ":not",
+        ":test_util",
+        ":unary_elementwise_test_util",
+        "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "popcnt",
+    srcs = ["popcnt.cc"],
+    hdrs = ["popcnt.h"],
+    deps = [
+        ":unary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/numeric:bits",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "popcnt_test",
+    srcs = ["popcnt_test.cc"],
+    deps = [
+        ":popcnt",
+        ":test_util",
+        ":unary_elementwise_test_util",
+        "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/numeric:bits",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "binary_elementwise",
+    hdrs = ["binary_elementwise.h"],
+    deps = [
+        "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:quantize",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:tensor",
+    ],
+)
+
+cc_test(
+    name = "binary_elementwise_test",
+    srcs = ["binary_elementwise_test.cc"],
+    linkopts = shlo_ref_linkopts(),
+    deps = [
+        ":binary_elementwise",
+        ":test_util",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "binary_elementwise_test_util",
+    testonly = True,
+    hdrs = ["binary_elementwise_test_util.h"],
+    deps = [
+        ":test_util",
+        "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "multiply",
+    srcs = ["multiply.cc"],
+    hdrs = ["multiply.h"],
+    deps = [
+        ":binary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "multiply_test",
+    srcs = ["multiply_test.cc"],
+    deps = [
+        ":binary_elementwise_test_util",
+        ":multiply",
+        ":test_util",
+        "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:quantize",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "divide",
+    srcs = ["divide.cc"],
+    hdrs = ["divide.h"],
+    deps = [
+        ":binary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "divide_test",
+    srcs = ["divide_test.cc"],
+    deps = [
+        ":binary_elementwise_test_util",
+        ":divide",
+        ":test_util",
+        "//tensorflow/lite/experimental/shlo:quantize",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "subtract",
+    srcs = ["subtract.cc"],
+    hdrs = ["subtract.h"],
+    deps = [
+        ":binary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "subtract_test",
+    srcs = ["subtract_test.cc"],
+    deps = [
+        ":binary_elementwise_test_util",
+        ":subtract",
+        ":test_util",
+        "//tensorflow/lite/experimental/shlo:quantize",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "and",
+    srcs = ["and.cc"],
+    hdrs = ["and.h"],
+    deps = [
+        ":binary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "and_test",
+    srcs = ["and_test.cc"],
+    deps = [
+        ":and",
+        ":binary_elementwise_test_util",
+        ":test_util",
+        "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "or",
+    srcs = ["or.cc"],
+    hdrs = ["or.h"],
+    deps = [
+        ":binary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "or_test",
+    srcs = ["or_test.cc"],
+    deps = [
+        ":binary_elementwise_test_util",
+        ":or",
+        ":test_util",
+        "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "xor",
+    srcs = ["xor.cc"],
+    hdrs = ["xor.h"],
+    deps = [
+        ":binary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "xor_test",
+    srcs = ["xor_test.cc"],
+    deps = [
+        ":binary_elementwise_test_util",
+        ":test_util",
+        ":xor",
+        "//tensorflow/lite/experimental/shlo:data_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "maximum",
+    srcs = ["maximum.cc"],
+    hdrs = ["maximum.h"],
+    deps = [
+        ":binary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "maximum_test",
+    srcs = ["maximum_test.cc"],
+    deps = [
+        ":binary_elementwise_test_util",
+        ":maximum",
+        ":test_util",
+        "//tensorflow/lite/experimental/shlo:quantize",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "minimum",
+    srcs = ["minimum.cc"],
+    hdrs = ["minimum.h"],
+    deps = [
+        ":binary_elementwise",
+        ":util",
+        "//tensorflow/lite/experimental/shlo:dispatch",
+        "//tensorflow/lite/experimental/shlo:tensor",
+        "@com_google_absl//absl/status",
+    ],
+)
+
+cc_test(
+    name = "minimum_test",
+    srcs = ["minimum_test.cc"],
+    deps = [
+        ":binary_elementwise_test_util",
+        ":minimum",
+        ":test_util",
+        "//tensorflow/lite/experimental/shlo:quantize",
+        "//tensorflow/lite/experimental/shlo:quantized_tensor_element_type",
+        "//tensorflow/lite/experimental/shlo:shape",
+        "//tensorflow/lite/experimental/shlo:status_matcher",
+        "//tensorflow/lite/experimental/shlo:tensor",
         "@com_google_googletest//:gtest_main",
     ],
 )
diff --git a/tensorflow/lite/experimental/shlo/ops/abs.cc b/tensorflow/lite/experimental/shlo/ops/abs.cc
index 8f8f7415b3197b..dd92713df972ed 100644
--- a/tensorflow/lite/experimental/shlo/ops/abs.cc
+++ b/tensorflow/lite/experimental/shlo/ops/abs.cc
@@ -33,32 +33,26 @@ AbsOp Create(typename AbsOp::Attributes) { return AbsOp{}; }
 
 absl::Status Prepare(AbsOp& op, const Tensor& input, Tensor& output) {
   SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
-  if (BaselineType(input.element_type()) !=
-      BaselineType(output.element_type())) {
-    return absl::FailedPreconditionError(
-        "stablehlo.abs constraint (C2) is not satisfied (incompatible baseline "
-        "types.).");
-  }
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(CheckCtx("abs"), input,
+                                               IsSignedIntTensor, IsFloatTensor,
+                                               IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("abs"), input, output));
   return absl::OkStatus();
 }
 
 absl::Status Evaluate(AbsOp& op, const Tensor& input, Tensor& output) {
   Abs abs;
-  if (input.IsPerAxisQuantized()) {
-    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerChannel,
-                       input.quantized_tensor_element_type().StorageType(),
-                       input.quantized_tensor_element_type().ExpressedType(),
-                       abs, input, output);
-  } else if (input.IsPerTensorQuantized()) {
+  if (input.IsPerTensorQuantized()) {
     DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor,
                        input.quantized_tensor_element_type().StorageType(),
                        input.quantized_tensor_element_type().ExpressedType(),
                        abs, input, output)
-  } else {
-    DISPATCH_BOOL_INT_FLOAT(detail::EvaluateNoQuantization,
-                            input.tensor_element_type(), abs, input, output);
+  } else if (IsSignedIntTensor(input) || IsFloatTensor(input)) {
+    DISPATCH_INT_FLOAT(detail::EvaluateNoQuantization,
+                       input.tensor_element_type(), abs, input, output);
   }
-  return absl::OkStatus();
+  return absl::FailedPreconditionError("Unsupported tensor type.");
 }
 
 }  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/abs_test.cc b/tensorflow/lite/experimental/shlo/ops/abs_test.cc
index 66972cabe9a6bf..0e3962825c56d6 100644
--- a/tensorflow/lite/experimental/shlo/ops/abs_test.cc
+++ b/tensorflow/lite/experimental/shlo/ops/abs_test.cc
@@ -14,12 +14,12 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/experimental/shlo/ops/abs.h"
 
-#include <cstddef>
-#include <cstdint>
+#include <string>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
 #include "tensorflow/lite/experimental/shlo/quantize.h"
 #include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
 #include "tensorflow/lite/experimental/shlo/shape.h"
@@ -30,6 +30,11 @@ using testing::ElementsAreArray;
 
 namespace shlo_ref {
 
+template <>
+struct ParamName<AbsOp> {
+  static std::string Get() { return "Abs"; }
+};
+
 namespace {
 
 constexpr struct AbsRef {
@@ -39,12 +44,25 @@ constexpr struct AbsRef {
   }
 } abs_ref;
 
+INSTANTIATE_TYPED_TEST_SUITE_P(Abs, UnaryElementwiseOpShapePropagationTest,
+                               AbsOp, TestParamNames);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Abs, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    UnaryElementwiseConstraint1Types<AbsOp>, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<AbsOp, ConcatTypes<BoolTestType, PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Abs, UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
 template <class T>
 struct AbsTest : ::testing::Test {};
 
-TYPED_TEST_SUITE(AbsTest, NonQuantizedTestTypes, TestParamNames);
+TYPED_TEST_SUITE(AbsTest, ArithmeticTestTypes, TestParamNames);
 
-TYPED_TEST(AbsTest, NonQuantized) {
+TYPED_TEST(AbsTest, ArithmeticTensorsWork) {
   using StorageT = typename TypeParam::StorageT;
 
   const Shape shape({2, 3, 4});
@@ -107,64 +125,5 @@ TYPED_TEST(QuantizedAbsTest, QuantizedPerTensor) {
   EXPECT_THAT(output_data, ElementsAreArray(expected_data));
 }
 
-TYPED_TEST(QuantizedAbsTest, QuantizedPerAxis) {
-  using StorageT = typename TypeParam::StorageT;
-  using ExpressedT = typename TypeParam::ExpressedT;
-
-  const Shape shape({4, 3, 2});
-  const int quantized_dimension = 2;
-  const size_t rank = shape.Rank();
-  const Axis quantized_dimension_size = shape.Dim(quantized_dimension);
-  const size_t quantization_stride = [&] {
-    size_t res = 1;
-    for (int64_t i = rank - 1; i > quantized_dimension; --i) {
-      res *= shape.Dim(i);
-    }
-    return res;
-  }();
-  Vector<StorageT> input_data = IotaBuffer<TypeParam::kStorage>(shape);
-  Vector<StorageT> output_data(shape.NumElements());
-  Vector<StorageT> zero_points_data = RandomBuffer<TypeParam::kStorage>(
-      /*shape=*/Shape({shape.Dim(2)}), /*min=*/static_cast<StorageT>(-5),
-      /*max=*/static_cast<StorageT>(5));
-  Vector<ExpressedT> scales_data = RandomBuffer<TypeParam::kExpressed>(
-      /*shape=*/Shape({shape.Dim(2)}), /*min=*/static_cast<ExpressedT>(1),
-      /*max=*/static_cast<ExpressedT>(3));
-  const QuantizedTensorElementType tensor_type =
-      QuantizedTensorElementType::PerAxis<TypeParam::kStorage,
-                                          TypeParam::kExpressed>(
-          scales_data, zero_points_data, quantized_dimension);
-  Tensor input_tensor{
-      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
-      .data = input_data.data()};
-  Tensor output_tensor{
-      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
-      .data = output_data.data()};
-
-  Vector<StorageT> expected_data(shape.NumElements());
-  absl::c_transform(
-      input_data, expected_data.begin(),
-      [&, element_index = 0ull, quantization_index = 0ull](auto v) mutable {
-        const StorageT zero_point = zero_points_data[quantization_index];
-        const ExpressedT scale = scales_data[quantization_index];
-
-        if (++element_index >= quantization_stride) {
-          element_index = 0;
-          if (++quantization_index >= quantized_dimension_size) {
-            quantization_index = 0;
-          }
-        }
-        const ExpressedT dequantized_input = Dequantize(v, zero_point, scale);
-        const ExpressedT dequantized_res = abs_ref(dequantized_input);
-        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
-            dequantized_res, zero_point, ExpressedT(1) / scale);
-      });
-
-  auto op = Create(AbsOp::Attributes{});
-  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
-  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
-  EXPECT_THAT(output_data, ElementsAreArray(expected_data));
-}
-
 }  // namespace
 }  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/and.cc b/tensorflow/lite/experimental/shlo/ops/and.cc
new file mode 100644
index 00000000000000..844690fc194175
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/and.cc
@@ -0,0 +1,63 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/and.h"
+
+#include <functional>
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/ops/binary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+template <DataType>
+struct And : std::bit_and<void> {};
+
+template <>
+struct And<DataType::kI1> : std::logical_and<void> {};
+
+AndOp Create(AndOp::Attributes) { return {}; }
+
+absl::Status Prepare(AndOp& op, const Tensor& lhs, const Tensor& rhs,
+                     Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(lhs.shape(), rhs.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSupportedTypes(CheckCtx("and"), lhs, IsBoolTensor, IsIntTensor));
+  SHLO_REF_RETURN_ON_ERROR(CheckSameBaselineType(CheckCtx("and"), lhs, output));
+  SHLO_REF_RETURN_ON_ERROR(CheckSameBaselineType(CheckCtx("and"), rhs, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(AndOp& op, const Tensor& lhs, const Tensor& rhs,
+                      Tensor& output) {
+  if (IsIntTensor(lhs)) {
+    // Note: all the integer types share the same implementation.
+    And<DataType::kSI32> and_func;
+    DISPATCH_INT(detail::EvaluateNoQuantization, lhs.tensor_element_type(),
+                 and_func, lhs, rhs, output);
+  } else if (IsBoolTensor(lhs)) {
+    And<DataType::kI1> and_func;
+    detail::EvaluateNoQuantization<DataType::kI1>(and_func, lhs, rhs, output);
+    return absl::OkStatus();
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.and: Unsupported tensor type in Evaluate.");
+}
+
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/and.h b/tensorflow/lite/experimental/shlo/ops/and.h
new file mode 100644
index 00000000000000..5a17f42c9df46a
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/and.h
@@ -0,0 +1,36 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_AND_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_AND_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct AndOp {
+  struct Attributes {};
+};
+
+AndOp Create(AndOp::Attributes);
+absl::Status Prepare(AndOp& op, const Tensor& lhs, const Tensor& rhs,
+                     Tensor& output);
+absl::Status Evaluate(AndOp& op, const Tensor& lhs, const Tensor& rhs,
+                      Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_AND_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/and_test.cc b/tensorflow/lite/experimental/shlo/ops/and_test.cc
new file mode 100644
index 00000000000000..7b22155f7434af
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/and_test.cc
@@ -0,0 +1,107 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/and.h"
+
+#include <functional>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/ops/binary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::FloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<AndOp> {
+  static std::string Get() { return "And"; }
+};
+
+template <DataType>
+struct And : std::bit_and<void> {};
+
+template <>
+struct And<DataType::kI1> : std::logical_and<void> {};
+
+template <>
+struct SupportedOpDataType<AndOp> {
+  static constexpr DataType kStorageType = DataType::kSI32;
+};
+
+namespace {
+
+INSTANTIATE_TYPED_TEST_SUITE_P(And, BinaryElementwiseOpShapePropagationTest,
+                               AndOp, TestParamNames);
+
+using MultipyBaselineContraintTypes = BinaryElementwiseBaselineConstraintTypes<
+    AndOp, ConcatTypes<BoolTestType, BaselineConstraintIntTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    And, BinaryElementwiseSameBaselineElementTypeConstraintTest,
+    MultipyBaselineContraintTypes, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<AndOp, ConcatTypes<FloatTestTypes, PerTensorQuantizedTestTypes,
+                                   PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(And, BinaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+using SupportedTypes = ConcatTypes<BoolTestType, IntTestTypes>;
+
+template <class T>
+struct AndTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(AndTest, SupportedTypes, TestParamNames);
+
+TYPED_TEST(AndTest, ArithmeticTestTypesTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> lhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-50, /*max=*/50);
+  Vector<StorageT> rhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/1, /*max=*/5);
+  Vector<StorageT> output_data(shape.NumElements());
+  Tensor lhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = lhs_data.data()};
+  Tensor rhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = rhs_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(lhs_data, rhs_data, expected_data.begin(),
+                    And<TypeParam::kStorage>());
+
+  auto op = Create(AndOp::Attributes{});
+  ASSERT_OK(Prepare(op, lhs_tensor, rhs_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, lhs_tensor, rhs_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(FloatEq(), expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/binary_elementwise.h b/tensorflow/lite/experimental/shlo/ops/binary_elementwise.h
new file mode 100644
index 00000000000000..94d4c0c82b02b8
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/binary_elementwise.h
@@ -0,0 +1,81 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_BINARY_ELEMENTWISE_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_BINARY_ELEMENTWISE_H_
+
+#include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/quantize.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+namespace detail {
+
+template <DataType storage_type, DataType expressed_type, typename F>
+void DequantizeOpQuantizePerTensor(F&& func, const Tensor& lhs,
+                                   const Tensor& rhs, Tensor& output) {
+  using StorageT = StorageType<storage_type>;
+  using ExpressedT = StorageType<expressed_type>;
+  const DimensionSize num_elements = lhs.NumElements();
+  const StorageT lhs_zero_point =
+      lhs.quantized_tensor_element_type().ZeroPoints<storage_type>()[0];
+  const ExpressedT lhs_scale =
+      lhs.quantized_tensor_element_type().Scales<expressed_type>()[0];
+  const StorageT rhs_zero_point =
+      rhs.quantized_tensor_element_type().ZeroPoints<storage_type>()[0];
+  const ExpressedT rhs_scale =
+      rhs.quantized_tensor_element_type().Scales<expressed_type>()[0];
+  const StorageT output_zero_point =
+      output.quantized_tensor_element_type().ZeroPoints<storage_type>()[0];
+  const ExpressedT output_scale =
+      output.quantized_tensor_element_type().Scales<expressed_type>()[0];
+  const StorageT* lhs_data = lhs.GetDataAs<storage_type>();
+  const StorageT* rhs_data = rhs.GetDataAs<storage_type>();
+  StorageT* output_data = output.GetDataAs<storage_type>();
+  const ExpressedT inv_scale = static_cast<ExpressedT>(1 / output_scale);
+  for (DimensionSize i = 0; i < num_elements;
+       ++i, ++lhs_data, ++rhs_data, ++output_data) {
+    const ExpressedT dequantized_lhs =
+        Dequantize(*lhs_data, lhs_zero_point, lhs_scale);
+    const ExpressedT dequantized_rhs =
+        Dequantize(*rhs_data, rhs_zero_point, rhs_scale);
+    const ExpressedT dequantized_res = func(dequantized_lhs, dequantized_rhs);
+    *output_data = Quantize<storage_type, expressed_type>(
+        dequantized_res, output_zero_point, inv_scale);
+  }
+}
+
+template <DataType data_type, class F>
+void EvaluateNoQuantization(F&& func, const Tensor& lhs, const Tensor& rhs,
+                            Tensor& output) {
+  using T = StorageType<data_type>;
+  const T* lhs_data = lhs.GetDataAs<data_type>();
+  const T* rhs_data = rhs.GetDataAs<data_type>();
+  T* output_data = output.GetDataAs<data_type>();
+  const DimensionSize num_elements = lhs.NumElements();
+  for (DimensionSize i = 0; i < num_elements;
+       ++i, ++output_data, ++lhs_data, ++rhs_data) {
+    *output_data = func(*lhs_data, *rhs_data);
+  }
+}
+
+}  // namespace detail
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_BINARY_ELEMENTWISE_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/binary_elementwise_test.cc b/tensorflow/lite/experimental/shlo/ops/binary_elementwise_test.cc
new file mode 100644
index 00000000000000..5c2904b229ef66
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/binary_elementwise_test.cc
@@ -0,0 +1,144 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/binary_elementwise.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::ElementsAreArray;
+
+namespace shlo_ref {
+namespace {
+
+struct TestOp {
+  template <typename T>
+  T operator()(const T& lhs, const T& rhs) {
+    return lhs + rhs;
+  }
+};
+
+template <class T>
+struct EvaluateNoQuantizationTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(EvaluateNoQuantizationTest, ArithmeticTestTypes,
+                 TestParamNames);
+
+TYPED_TEST(EvaluateNoQuantizationTest, ArithmeticTensorsWithTestOp) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> lhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-5, /*max=*/5);
+  Vector<StorageT> rhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-5, /*max=*/5);
+  Vector<StorageT> output_data(shape.NumElements());
+
+  Tensor lhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = lhs_data.data()};
+  Tensor rhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = rhs_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(lhs_data, rhs_data, expected_data.begin(), TestOp());
+
+  detail::EvaluateNoQuantization<TypeParam::kStorage>(
+      TestOp(), lhs_tensor, rhs_tensor, output_tensor);
+  EXPECT_THAT(output_data, ElementsAreArray(expected_data));
+}
+
+template <class T>
+struct DequantizeOpQuantizePerTensor : ::testing::Test {};
+
+TYPED_TEST_SUITE(DequantizeOpQuantizePerTensor, QuantizedTestTypes,
+                 TestParamNames);
+
+TYPED_TEST(DequantizeOpQuantizePerTensor, QuantizedPerTensorWithTestOp) {
+  using StorageT = typename TypeParam::StorageT;
+  using ExpressedT = typename TypeParam::ExpressedT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> lhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-5, /*max=*/5);
+  Vector<StorageT> rhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-5, /*max=*/5);
+  Vector<StorageT> output_data(shape.NumElements());
+  const ExpressedT lhs_scale = static_cast<ExpressedT>(1.3);
+  const StorageT lhs_zero_point = static_cast<StorageT>(4);
+  const ExpressedT rhs_scale = static_cast<ExpressedT>(1.2);
+  const StorageT rhs_zero_point = static_cast<StorageT>(5);
+  const ExpressedT output_scale = static_cast<ExpressedT>(1.5);
+  const StorageT output_zero_point = static_cast<StorageT>(3);
+  Tensor lhs_tensor{
+      .type =
+          QuantizedTensorType{
+              .shape = shape,
+              .element_type =
+                  QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                                        TypeParam::kExpressed>(
+                      lhs_scale, lhs_zero_point)},
+      .data = lhs_data.data()};
+  Tensor rhs_tensor{
+      .type =
+          QuantizedTensorType{
+              .shape = shape,
+              .element_type =
+                  QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                                        TypeParam::kExpressed>(
+                      rhs_scale, rhs_zero_point)},
+      .data = rhs_data.data()};
+  Tensor output_tensor{
+      .type =
+          QuantizedTensorType{
+              .shape = shape,
+              .element_type =
+                  QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                                        TypeParam::kExpressed>(
+                      output_scale, output_zero_point)},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(
+      lhs_data, rhs_data, expected_data.begin(),
+      [lhs_zero_point, lhs_scale, rhs_zero_point, rhs_scale, output_zero_point,
+       output_scale](auto lhs, auto rhs) {
+        const ExpressedT dequantized_lhs =
+            Dequantize(lhs, lhs_zero_point, lhs_scale);
+        const ExpressedT dequantized_rhs =
+            Dequantize(rhs, rhs_zero_point, rhs_scale);
+        const ExpressedT dequantized_res =
+            TestOp()(dequantized_lhs, dequantized_rhs);
+        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
+            dequantized_res, output_zero_point,
+            static_cast<ExpressedT>(1.) / output_scale);
+      });
+
+  detail::DequantizeOpQuantizePerTensor<TypeParam::kStorage,
+                                        TypeParam::kExpressed>(
+      TestOp(), lhs_tensor, rhs_tensor, output_tensor);
+  EXPECT_THAT(output_data, ElementsAreArray(expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/binary_elementwise_test_util.h b/tensorflow/lite/experimental/shlo/ops/binary_elementwise_test_util.h
new file mode 100644
index 00000000000000..fccd0a5c510bea
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/binary_elementwise_test_util.h
@@ -0,0 +1,200 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_BINARY_ELEMENTWISE_TEST_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_BINARY_ELEMENTWISE_TEST_UTIL_H_
+
+#include <tuple>
+#include <utility>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+template <class Op, class List>
+struct OpTuple;
+
+template <class Op, class... Ts>
+struct OpTuple<Op, ::testing::Types<Ts...>> {
+  using Type = std::tuple<Op, Ts...>;
+};
+
+template <class Op>
+struct OpTupleFactory {
+  template <class T>
+  using WithOp = typename OpTuple<Op, T>::Type;
+};
+
+template <class Op, class SupportedTypes>
+using BinaryElementwiseBaselineConstraintTypes =
+    MapTypes<OpTupleFactory<Op>::template WithOp,
+             FilterTypes<NegatePred<SameTypes>::template Predicate,
+                         CrossProductTypes<SupportedTypes, SupportedTypes,
+                                           SupportedTypes>>>;
+
+using BaselineConstraintIntTypes = ::testing::Types<TestParam<DataType::kSI32>>;
+
+using BaselineConstraintFloatTypes =
+    ::testing::Types<TestParam<DataType::kF32>>;
+
+using BaselineConstraintQuantizedPerTensorTypes =
+    ::testing::Types<PerTensor<TestParam<DataType::kSI8, DataType::kF32>>,
+                     PerTensor<TestParam<DataType::kSI8, DataType::kBF16>>>;
+
+template <class Op>
+class BinaryElementwiseOpShapePropagationTest : public ::testing::Test {
+ protected:
+  void SetRhsShape(Shape shape) { rhs_tensor_.shape() = std::move(shape); }
+  void SetOutputShape(Shape shape) {
+    output_tensor_.shape() = std::move(shape);
+  }
+  bool LhsAndOutputShapesAreEqual() const {
+    return lhs_tensor_.shape() == output_tensor_.shape();
+  }
+
+  Op op_ = Create(typename Op::Attributes{});
+  Tensor lhs_tensor_ = {
+      .type = TensorType{.shape = Shape({2, 3, 4}),
+                         .element_type = SupportedOpDataType<Op>::kStorageType},
+      .data = nullptr};
+  Tensor rhs_tensor_ = {
+      .type = TensorType{.shape = Shape({2, 3, 4}),
+                         .element_type = SupportedOpDataType<Op>::kStorageType},
+      .data = nullptr};
+  Tensor output_tensor_ = {
+      .type = TensorType{.shape = Shape(),
+                         .element_type = SupportedOpDataType<Op>::kStorageType},
+      .data = nullptr};
+};
+
+TYPED_TEST_SUITE_P(BinaryElementwiseOpShapePropagationTest);
+
+TYPED_TEST_P(BinaryElementwiseOpShapePropagationTest, ShapePropagationWorks) {
+  ASSERT_TRUE(this->output_tensor_.shape().empty());
+  EXPECT_OK(Prepare(this->op_, this->lhs_tensor_, this->rhs_tensor_,
+                    this->output_tensor_));
+  EXPECT_THAT(this->output_tensor_.shape(),
+              ::testing::ElementsAreArray(this->lhs_tensor_.shape()));
+}
+
+TYPED_TEST_P(BinaryElementwiseOpShapePropagationTest,
+             SmallerOutputShapeRaisesAnError) {
+  this->SetOutputShape(Shape({2, 3}));
+  ASSERT_FALSE(this->LhsAndOutputShapesAreEqual());
+  EXPECT_EQ(
+      Prepare(this->op_, this->lhs_tensor_, this->rhs_tensor_,
+              this->output_tensor_),
+      absl::FailedPreconditionError("The specified output tensor shape is not "
+                                    "compatible with the input shapes."));
+}
+
+TYPED_TEST_P(BinaryElementwiseOpShapePropagationTest,
+             BiggerOutputShapeRaisesAnError) {
+  this->SetOutputShape(Shape({2, 3, 4, 5}));
+  ASSERT_FALSE(this->LhsAndOutputShapesAreEqual());
+  EXPECT_EQ(
+      Prepare(this->op_, this->lhs_tensor_, this->rhs_tensor_,
+              this->output_tensor_),
+      absl::FailedPreconditionError("The specified output tensor shape is not "
+                                    "compatible with the input shapes."));
+}
+
+TYPED_TEST_P(BinaryElementwiseOpShapePropagationTest,
+             IncompatibleOutputShapeRaisesAnError) {
+  this->SetOutputShape(Shape({2, 3, 5}));
+  ASSERT_FALSE(this->LhsAndOutputShapesAreEqual());
+  EXPECT_EQ(
+      Prepare(this->op_, this->lhs_tensor_, this->rhs_tensor_,
+              this->output_tensor_),
+      absl::FailedPreconditionError("The specified output tensor shape is not "
+                                    "compatible with the input shapes."));
+}
+
+REGISTER_TYPED_TEST_SUITE_P(BinaryElementwiseOpShapePropagationTest,
+                            ShapePropagationWorks,
+                            SmallerOutputShapeRaisesAnError,
+                            BiggerOutputShapeRaisesAnError,
+                            IncompatibleOutputShapeRaisesAnError);
+
+// Tests that the baseline element type of the input and output tensors is the
+// same.
+template <class T>
+class BinaryElementwiseSameBaselineElementTypeConstraintTest
+    : public ::testing::Test {};
+
+TYPED_TEST_SUITE_P(BinaryElementwiseSameBaselineElementTypeConstraintTest);
+
+TYPED_TEST_P(BinaryElementwiseSameBaselineElementTypeConstraintTest,
+             DifferentInputOutputStorageTypesRaiseAnError) {
+  using Op = std::tuple_element_t<0, TypeParam>;
+  using LhsTypeDesc = std::tuple_element_t<1, TypeParam>;
+  using RhsTypeDesc = std::tuple_element_t<2, TypeParam>;
+  using ResultTypeDesc = std::tuple_element_t<3, TypeParam>;
+  const Shape shape({2, 3, 4});
+  Tensor lhs_tensor{.type = TensorTypeFor(LhsTypeDesc{}, shape),
+                    .data = nullptr};
+  Tensor rhs_tensor{.type = TensorTypeFor(RhsTypeDesc{}, shape),
+                    .data = nullptr};
+  Tensor output_tensor{.type = TensorTypeFor(ResultTypeDesc{}, shape),
+                       .data = nullptr};
+  auto op = Create(typename Op::Attributes{});
+  const absl::Status status =
+      Prepare(op, lhs_tensor, rhs_tensor, output_tensor);
+  EXPECT_THAT(status, shlo_ref::testing::StatusIs(
+                          absl::StatusCode::kFailedPrecondition));
+  EXPECT_THAT(
+      status.message(),
+      ::testing::ContainsRegex(
+          "stablehlo.[_a-z]+: baseline type constraint is not satisfied"));
+}
+
+REGISTER_TYPED_TEST_SUITE_P(
+    BinaryElementwiseSameBaselineElementTypeConstraintTest,
+    DifferentInputOutputStorageTypesRaiseAnError);
+
+// Tests that unsupported types are detected during when `Prepare` is called.
+template <class T>
+class BinaryElementwiseUnsupportedTypeTest : public ::testing::Test {};
+
+TYPED_TEST_SUITE_P(BinaryElementwiseUnsupportedTypeTest);
+
+TYPED_TEST_P(BinaryElementwiseUnsupportedTypeTest, PrepareRaisesAnError) {
+  using Op = std::tuple_element_t<0, TypeParam>;
+  using TypeDesc = std::tuple_element_t<1, TypeParam>;
+  Tensor input_tensor{.type = TensorTypeFor(TypeDesc{}, Shape({2, 3, 4})),
+                      .data = nullptr};
+  Tensor output_tensor = input_tensor;
+  auto op = Create(typename Op::Attributes{});
+  const absl::Status status =
+      Prepare(op, input_tensor, input_tensor, output_tensor);
+  EXPECT_THAT(status, shlo_ref::testing::StatusIs(
+                          absl::StatusCode::kFailedPrecondition));
+  EXPECT_THAT(status.message(),
+              ::testing::HasSubstr("Unsupported tensor type"));
+}
+
+REGISTER_TYPED_TEST_SUITE_P(BinaryElementwiseUnsupportedTypeTest,
+                            PrepareRaisesAnError);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_BINARY_ELEMENTWISE_TEST_UTIL_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/cbrt.cc b/tensorflow/lite/experimental/shlo/ops/cbrt.cc
index 2a526292829363..076e5175436a5e 100644
--- a/tensorflow/lite/experimental/shlo/ops/cbrt.cc
+++ b/tensorflow/lite/experimental/shlo/ops/cbrt.cc
@@ -19,7 +19,6 @@ limitations under the License.
 
 #include "absl/status/status.h"
 #include "tensorflow/lite/experimental/shlo/bf16.h"
-#include "tensorflow/lite/experimental/shlo/data_type.h"
 #include "tensorflow/lite/experimental/shlo/dispatch.h"
 #include "tensorflow/lite/experimental/shlo/f16.h"
 #include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h"
@@ -33,36 +32,26 @@ struct Cbrt {
   T operator()(T v) const {
     return std::cbrt(v);
   }
+};
 
-  template <>
-  F16 operator()<F16>(F16 val) const {
-    return F16(operator()(static_cast<float>(val)));
-  }
+template <>
+F16 Cbrt::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
 
-  template <>
-  BF16 operator()<BF16>(BF16 val) const {
-    return BF16(operator()(static_cast<float>(val)));
-  }
-};
+template <>
+BF16 Cbrt::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
 
 CbrtOp Create(CbrtOp::Attributes) { return {}; }
 
 absl::Status Prepare(CbrtOp& op, const Tensor& input, Tensor& output) {
   SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
-  if (!input.IsQuantized() && IsInteger(input.StorageType())) {
-    return absl::FailedPreconditionError(
-        "stablehlo.cbrt does not support integer tensor types.");
-  }
-  if (input.IsPerAxisQuantized()) {
-    return absl::FailedPreconditionError(
-        "stablehlo.cbrt does not support per axis quantization.");
-  }
-  if (BaselineType(input.element_type()) !=
-      BaselineType(output.element_type())) {
-    return absl::FailedPreconditionError(
-        "stablehlo.cbrt constraint (C1) is not satisfied (incompatible "
-        "baseline types).");
-  }
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(
+      CheckCtx("cbrt"), input, IsFloatTensor, IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("cbrt"), input, output));
   return absl::OkStatus();
 }
 
@@ -73,11 +62,11 @@ absl::Status Evaluate(CbrtOp& op, const Tensor& input, Tensor& output) {
                        input.quantized_tensor_element_type().StorageType(),
                        input.quantized_tensor_element_type().ExpressedType(),
                        cbrt, input, output)
-  } else {
+  } else if (IsFloatTensor(input)) {
     DISPATCH_FLOAT(detail::EvaluateNoQuantization, input.tensor_element_type(),
                    cbrt, input, output);
   }
-  return absl::OkStatus();
+  return absl::FailedPreconditionError("Unsupported tensor type.");
 }
 
 };  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/cbrt_test.cc b/tensorflow/lite/experimental/shlo/ops/cbrt_test.cc
index 1c8ae75845c0bb..b62edcfebbfb7c 100644
--- a/tensorflow/lite/experimental/shlo/ops/cbrt_test.cc
+++ b/tensorflow/lite/experimental/shlo/ops/cbrt_test.cc
@@ -16,26 +16,31 @@ limitations under the License.
 #include "tensorflow/lite/experimental/shlo/ops/cbrt.h"
 
 #include <cmath>
+#include <string>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "absl/status/status.h"
 #include "tensorflow/lite/experimental/shlo/bf16.h"
 #include "tensorflow/lite/experimental/shlo/f16.h"
 #include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
 #include "tensorflow/lite/experimental/shlo/quantize.h"
 #include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
 #include "tensorflow/lite/experimental/shlo/shape.h"
 #include "tensorflow/lite/experimental/shlo/status_matcher.h"
 #include "tensorflow/lite/experimental/shlo/tensor.h"
 
-using shlo_ref::testing::StatusIs;
 using testing::ElementsAreArray;
 using testing::NanSensitiveFloatEq;
 using testing::Pointwise;
 
 namespace shlo_ref {
 
+template <>
+struct ParamName<CbrtOp> {
+  static std::string Get() { return "Cbrt"; }
+};
+
 namespace {
 
 struct Cbrt {
@@ -43,48 +48,37 @@ struct Cbrt {
   T operator()(T v) const {
     return std::cbrt(v);
   }
-
-  template <>
-  F16 operator()<F16>(F16 val) const {
-    return F16(operator()(static_cast<float>(val)));
-  }
-
-  template <>
-  BF16 operator()<BF16>(BF16 val) const {
-    return BF16(operator()(static_cast<float>(val)));
-  }
 } cbrt_ref;
 
-template <class T>
-struct NonQuantizedIntCbrtTest : ::testing::Test {};
+template <>
+F16 Cbrt::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
 
-TYPED_TEST_SUITE(NonQuantizedIntCbrtTest, NonQuantizedIntTestTypes,
-                 TestParamNames);
+template <>
+BF16 Cbrt::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
 
-TYPED_TEST(NonQuantizedIntCbrtTest, IntTensorsRaiseAnError) {
-  using StorageT = typename TypeParam::StorageT;
+INSTANTIATE_TYPED_TEST_SUITE_P(Cbrt, UnaryElementwiseOpShapePropagationTest,
+                               CbrtOp, TestParamNames);
 
-  const Shape shape({2, 3, 4});
-  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
-  Vector<StorageT> output_data(shape.NumElements());
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Cbrt, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    UnaryElementwiseConstraint1Types<CbrtOp>, TestParamNames);
 
-  Tensor input_tensor{
-      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
-      .data = nullptr};
-  Tensor output_tensor = input_tensor;
+using UnsupportedTypes = WithOpTypes<
+    CbrtOp, ConcatTypes<BoolTestType, IntTestTypes, PerAxisQuantizedTestTypes>>;
 
-  auto op = Create(CbrtOp::Attributes{});
-  EXPECT_THAT(Prepare(op, input_tensor, output_tensor),
-              StatusIs(absl::StatusCode::kFailedPrecondition));
-}
+INSTANTIATE_TYPED_TEST_SUITE_P(Cbrt, UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
 
 template <class T>
-struct NonQuantizedCbrtTest : ::testing::Test {};
+struct CbrtTest : ::testing::Test {};
 
-TYPED_TEST_SUITE(NonQuantizedCbrtTest, NonQuantizedFloatTestTypes,
-                 TestParamNames);
+TYPED_TEST_SUITE(CbrtTest, FloatTestTypes, TestParamNames);
 
-TYPED_TEST(NonQuantizedCbrtTest, FloatTensorsWork) {
+TYPED_TEST(CbrtTest, FloatTensorsWork) {
   using StorageT = typename TypeParam::StorageT;
 
   const Shape shape({2, 3, 4});
@@ -147,27 +141,5 @@ TYPED_TEST(QuantizedCbrtTest, PerTensorWorks) {
   EXPECT_THAT(output_data, ElementsAreArray(expected_data));
 }
 
-TYPED_TEST(QuantizedCbrtTest, PerAxisFails) {
-  using StorageT = typename TypeParam::StorageT;
-  using ExpressedT = typename TypeParam::ExpressedT;
-
-  const Shape shape({4, 3, 2});
-  const int quantized_dimension = 2;
-  Vector<ExpressedT> empty_scales;
-  Vector<StorageT> empty_zero_points;
-  const QuantizedTensorElementType tensor_type =
-      QuantizedTensorElementType::PerAxis<TypeParam::kStorage,
-                                          TypeParam::kExpressed>(
-          empty_scales, empty_zero_points, quantized_dimension);
-  Tensor input_tensor{
-      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
-      .data = nullptr};
-  Tensor output_tensor = input_tensor;
-
-  auto op = Create(CbrtOp::Attributes{});
-  EXPECT_THAT(Prepare(op, input_tensor, output_tensor),
-              StatusIs(absl::StatusCode::kFailedPrecondition));
-}
-
 }  // namespace
 }  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/ceil.cc b/tensorflow/lite/experimental/shlo/ops/ceil.cc
index 5a506dc12923af..95f96a38bafc4a 100644
--- a/tensorflow/lite/experimental/shlo/ops/ceil.cc
+++ b/tensorflow/lite/experimental/shlo/ops/ceil.cc
@@ -33,36 +33,26 @@ struct Ceil {
   T operator()(T v) const {
     return std::ceil(v);
   }
+};
 
-  template <>
-  F16 operator()<F16>(F16 val) const {
-    return F16(operator()(static_cast<float>(val)));
-  }
+template <>
+F16 Ceil::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
 
-  template <>
-  BF16 operator()<BF16>(BF16 val) const {
-    return BF16(operator()(static_cast<float>(val)));
-  }
-};
+template <>
+BF16 Ceil::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
 
 CeilOp Create(CeilOp::Attributes) { return {}; }
 
 absl::Status Prepare(CeilOp& op, const Tensor& input, Tensor& output) {
   SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
-  if (!input.IsQuantized() && IsInteger(input.StorageType())) {
-    return absl::FailedPreconditionError(
-        "stablehlo.ceil does not support integer tensor types.");
-  }
-  if (input.IsPerAxisQuantized()) {
-    return absl::FailedPreconditionError(
-        "stablehlo.ceil does not support per axis quantization.");
-  }
-  if (BaselineType(input.element_type()) !=
-      BaselineType(output.element_type())) {
-    return absl::FailedPreconditionError(
-        "stablehlo.ceil constraint (C1) is not satisfied (incompatible "
-        "baseline types).");
-  }
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(
+      CheckCtx("ceil"), input, IsFloatTensor, IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("ceil"), input, output));
   return absl::OkStatus();
 }
 
@@ -73,11 +63,11 @@ absl::Status Evaluate(CeilOp& op, const Tensor& input, Tensor& output) {
                        input.quantized_tensor_element_type().StorageType(),
                        input.quantized_tensor_element_type().ExpressedType(),
                        ceil, input, output)
-  } else {
+  } else if (IsFloatTensor(input)) {
     DISPATCH_FLOAT(detail::EvaluateNoQuantization, input.tensor_element_type(),
                    ceil, input, output);
   }
-  return absl::OkStatus();
+  return absl::FailedPreconditionError("Unsupported tensor type.");
 }
 
 };  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/ceil_test.cc b/tensorflow/lite/experimental/shlo/ops/ceil_test.cc
index 0875a02435e941..2e3e6288c14f4f 100644
--- a/tensorflow/lite/experimental/shlo/ops/ceil_test.cc
+++ b/tensorflow/lite/experimental/shlo/ops/ceil_test.cc
@@ -16,26 +16,31 @@ limitations under the License.
 #include "tensorflow/lite/experimental/shlo/ops/ceil.h"
 
 #include <cmath>
+#include <string>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "absl/status/status.h"
 #include "tensorflow/lite/experimental/shlo/bf16.h"
 #include "tensorflow/lite/experimental/shlo/f16.h"
 #include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
 #include "tensorflow/lite/experimental/shlo/quantize.h"
 #include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
 #include "tensorflow/lite/experimental/shlo/shape.h"
 #include "tensorflow/lite/experimental/shlo/status_matcher.h"
 #include "tensorflow/lite/experimental/shlo/tensor.h"
 
-using shlo_ref::testing::StatusIs;
 using testing::ElementsAreArray;
 using testing::NanSensitiveFloatEq;
 using testing::Pointwise;
 
 namespace shlo_ref {
 
+template <>
+struct ParamName<CeilOp> {
+  static std::string Get() { return "Ceil"; }
+};
+
 namespace {
 
 struct Ceil {
@@ -43,48 +48,37 @@ struct Ceil {
   T operator()(T v) const {
     return std::ceil(v);
   }
-
-  template <>
-  F16 operator()<F16>(F16 val) const {
-    return F16(operator()(static_cast<float>(val)));
-  }
-
-  template <>
-  BF16 operator()<BF16>(BF16 val) const {
-    return BF16(operator()(static_cast<float>(val)));
-  }
 } ceil_ref;
 
-template <class T>
-struct NonQuantizedIntCeilTest : ::testing::Test {};
+template <>
+F16 Ceil::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
 
-TYPED_TEST_SUITE(NonQuantizedIntCeilTest, NonQuantizedIntTestTypes,
-                 TestParamNames);
+template <>
+BF16 Ceil::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
 
-TYPED_TEST(NonQuantizedIntCeilTest, IntTensorsRaiseAnError) {
-  using StorageT = typename TypeParam::StorageT;
+INSTANTIATE_TYPED_TEST_SUITE_P(Ceil, UnaryElementwiseOpShapePropagationTest,
+                               CeilOp, TestParamNames);
 
-  const Shape shape({2, 3, 4});
-  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
-  Vector<StorageT> output_data(shape.NumElements());
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Ceil, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    UnaryElementwiseConstraint1Types<CeilOp>, TestParamNames);
 
-  Tensor input_tensor{
-      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
-      .data = nullptr};
-  Tensor output_tensor = input_tensor;
+using UnsupportedTypes = WithOpTypes<
+    CeilOp, ConcatTypes<BoolTestType, IntTestTypes, PerAxisQuantizedTestTypes>>;
 
-  auto op = Create(CeilOp::Attributes{});
-  EXPECT_THAT(Prepare(op, input_tensor, output_tensor),
-              StatusIs(absl::StatusCode::kFailedPrecondition));
-}
+INSTANTIATE_TYPED_TEST_SUITE_P(Ceil, UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
 
 template <class T>
-struct NonQuantizedCeilTest : ::testing::Test {};
+struct CeilTest : ::testing::Test {};
 
-TYPED_TEST_SUITE(NonQuantizedCeilTest, NonQuantizedFloatTestTypes,
-                 TestParamNames);
+TYPED_TEST_SUITE(CeilTest, FloatTestTypes, TestParamNames);
 
-TYPED_TEST(NonQuantizedCeilTest, FloatTensorsWork) {
+TYPED_TEST(CeilTest, FloatTensorsWork) {
   using StorageT = typename TypeParam::StorageT;
 
   const Shape shape({2, 3, 4});
@@ -147,27 +141,5 @@ TYPED_TEST(QuantizedCeilTest, PerTensorWorks) {
   EXPECT_THAT(output_data, ElementsAreArray(expected_data));
 }
 
-TYPED_TEST(QuantizedCeilTest, PerAxisFails) {
-  using StorageT = typename TypeParam::StorageT;
-  using ExpressedT = typename TypeParam::ExpressedT;
-
-  const Shape shape({4, 3, 2});
-  const int quantized_dimension = 2;
-  Vector<ExpressedT> empty_scales;
-  Vector<StorageT> empty_zero_points;
-  const QuantizedTensorElementType tensor_type =
-      QuantizedTensorElementType::PerAxis<TypeParam::kStorage,
-                                          TypeParam::kExpressed>(
-          empty_scales, empty_zero_points, quantized_dimension);
-  Tensor input_tensor{
-      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
-      .data = nullptr};
-  Tensor output_tensor = input_tensor;
-
-  auto op = Create(CeilOp::Attributes{});
-  EXPECT_THAT(Prepare(op, input_tensor, output_tensor),
-              StatusIs(absl::StatusCode::kFailedPrecondition));
-}
-
 }  // namespace
 }  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/cosine.cc b/tensorflow/lite/experimental/shlo/ops/cosine.cc
index e373708c15f369..5bd347836af75e 100644
--- a/tensorflow/lite/experimental/shlo/ops/cosine.cc
+++ b/tensorflow/lite/experimental/shlo/ops/cosine.cc
@@ -19,7 +19,6 @@ limitations under the License.
 
 #include "absl/status/status.h"
 #include "tensorflow/lite/experimental/shlo/bf16.h"
-#include "tensorflow/lite/experimental/shlo/data_type.h"
 #include "tensorflow/lite/experimental/shlo/dispatch.h"
 #include "tensorflow/lite/experimental/shlo/f16.h"
 #include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h"
@@ -33,36 +32,26 @@ struct Cosine {
   T operator()(T v) const {
     return std::cos(v);
   }
+};
 
-  template <>
-  F16 operator()<F16>(F16 val) const {
-    return F16(operator()(static_cast<float>(val)));
-  }
+template <>
+F16 Cosine::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
 
-  template <>
-  BF16 operator()<BF16>(BF16 val) const {
-    return BF16(operator()(static_cast<float>(val)));
-  }
-};
+template <>
+BF16 Cosine::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
 
 CosineOp Create(CosineOp::Attributes) { return {}; }
 
 absl::Status Prepare(CosineOp& op, const Tensor& input, Tensor& output) {
   SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
-  if (!input.IsQuantized() && IsInteger(input.StorageType())) {
-    return absl::FailedPreconditionError(
-        "stablehlo.cosine does not support integer tensor types.");
-  }
-  if (input.IsPerAxisQuantized()) {
-    return absl::FailedPreconditionError(
-        "stablehlo.cosine does not support per axis quantization.");
-  }
-  if (BaselineType(input.element_type()) !=
-      BaselineType(output.element_type())) {
-    return absl::FailedPreconditionError(
-        "stablehlo.cosine constraint (C1) is not satisfied (incompatible "
-        "baseline types).");
-  }
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(
+      CheckCtx("cosine"), input, IsFloatTensor, IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("cosine"), input, output));
   return absl::OkStatus();
 }
 
@@ -73,11 +62,11 @@ absl::Status Evaluate(CosineOp& op, const Tensor& input, Tensor& output) {
                        input.quantized_tensor_element_type().StorageType(),
                        input.quantized_tensor_element_type().ExpressedType(),
                        cosine, input, output)
-  } else {
+  } else if (IsFloatTensor(input)) {
     DISPATCH_FLOAT(detail::EvaluateNoQuantization, input.tensor_element_type(),
                    cosine, input, output);
   }
-  return absl::OkStatus();
+  return absl::FailedPreconditionError("Unsupported tensor type.");
 }
 
 };  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/cosine_test.cc b/tensorflow/lite/experimental/shlo/ops/cosine_test.cc
index 7eb8901cbe2aff..9f858c9d6cc666 100644
--- a/tensorflow/lite/experimental/shlo/ops/cosine_test.cc
+++ b/tensorflow/lite/experimental/shlo/ops/cosine_test.cc
@@ -16,26 +16,31 @@ limitations under the License.
 #include "tensorflow/lite/experimental/shlo/ops/cosine.h"
 
 #include <cmath>
+#include <string>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "absl/status/status.h"
 #include "tensorflow/lite/experimental/shlo/bf16.h"
 #include "tensorflow/lite/experimental/shlo/f16.h"
 #include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
 #include "tensorflow/lite/experimental/shlo/quantize.h"
 #include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
 #include "tensorflow/lite/experimental/shlo/shape.h"
 #include "tensorflow/lite/experimental/shlo/status_matcher.h"
 #include "tensorflow/lite/experimental/shlo/tensor.h"
 
-using shlo_ref::testing::StatusIs;
 using testing::ElementsAreArray;
 using testing::NanSensitiveFloatEq;
 using testing::Pointwise;
 
 namespace shlo_ref {
 
+template <>
+struct ParamName<CosineOp> {
+  static std::string Get() { return "Cosine"; }
+};
+
 namespace {
 
 struct Cosine {
@@ -43,48 +48,38 @@ struct Cosine {
   T operator()(T v) const {
     return std::cos(v);
   }
-
-  template <>
-  F16 operator()<F16>(F16 val) const {
-    return F16(operator()(static_cast<float>(val)));
-  }
-
-  template <>
-  BF16 operator()<BF16>(BF16 val) const {
-    return BF16(operator()(static_cast<float>(val)));
-  }
 } cosine_ref;
 
-template <class T>
-struct NonQuantizedIntCosineTest : ::testing::Test {};
+template <>
+F16 Cosine::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
 
-TYPED_TEST_SUITE(NonQuantizedIntCosineTest, NonQuantizedIntTestTypes,
-                 TestParamNames);
+template <>
+BF16 Cosine::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
 
-TYPED_TEST(NonQuantizedIntCosineTest, IntTensorsRaiseAnError) {
-  using StorageT = typename TypeParam::StorageT;
+INSTANTIATE_TYPED_TEST_SUITE_P(Cosine, UnaryElementwiseOpShapePropagationTest,
+                               CosineOp, TestParamNames);
 
-  const Shape shape({2, 3, 4});
-  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
-  Vector<StorageT> output_data(shape.NumElements());
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Cosine, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    UnaryElementwiseConstraint1Types<CosineOp>, TestParamNames);
 
-  Tensor input_tensor{
-      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
-      .data = nullptr};
-  Tensor output_tensor = input_tensor;
+using UnsupportedTypes =
+    WithOpTypes<CosineOp, ConcatTypes<BoolTestType, IntTestTypes,
+                                      PerAxisQuantizedTestTypes>>;
 
-  auto op = Create(CosineOp::Attributes{});
-  EXPECT_THAT(Prepare(op, input_tensor, output_tensor),
-              StatusIs(absl::StatusCode::kFailedPrecondition));
-}
+INSTANTIATE_TYPED_TEST_SUITE_P(Cosine, UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
 
 template <class T>
-struct NonQuantizedCosineTest : ::testing::Test {};
+struct CosineTest : ::testing::Test {};
 
-TYPED_TEST_SUITE(NonQuantizedCosineTest, NonQuantizedFloatTestTypes,
-                 TestParamNames);
+TYPED_TEST_SUITE(CosineTest, FloatTestTypes, TestParamNames);
 
-TYPED_TEST(NonQuantizedCosineTest, FloatTensorsWork) {
+TYPED_TEST(CosineTest, FloatTensorsWork) {
   using StorageT = typename TypeParam::StorageT;
 
   const Shape shape({2, 3, 4});
@@ -147,27 +142,5 @@ TYPED_TEST(QuantizedCosineTest, PerTensorWorks) {
   EXPECT_THAT(output_data, ElementsAreArray(expected_data));
 }
 
-TYPED_TEST(QuantizedCosineTest, PerAxisFails) {
-  using StorageT = typename TypeParam::StorageT;
-  using ExpressedT = typename TypeParam::ExpressedT;
-
-  const Shape shape({4, 3, 2});
-  const int quantized_dimension = 2;
-  Vector<ExpressedT> empty_scales;
-  Vector<StorageT> empty_zero_points;
-  const QuantizedTensorElementType tensor_type =
-      QuantizedTensorElementType::PerAxis<TypeParam::kStorage,
-                                          TypeParam::kExpressed>(
-          empty_scales, empty_zero_points, quantized_dimension);
-  Tensor input_tensor{
-      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
-      .data = nullptr};
-  Tensor output_tensor = input_tensor;
-
-  auto op = Create(CosineOp::Attributes{});
-  EXPECT_THAT(Prepare(op, input_tensor, output_tensor),
-              StatusIs(absl::StatusCode::kFailedPrecondition));
-}
-
 }  // namespace
 }  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/count_leading_zeros.cc b/tensorflow/lite/experimental/shlo/ops/count_leading_zeros.cc
new file mode 100644
index 00000000000000..7e3f17f551093c
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/count_leading_zeros.cc
@@ -0,0 +1,59 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/count_leading_zeros.h"
+
+#include <type_traits>
+
+#include "absl/numeric/bits.h"
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct CountLeadingZeros {
+  template <class T>
+  T operator()(T v) const {
+    return absl::countl_zero(static_cast<std::make_unsigned_t<T>>(v));
+  }
+};
+
+CountLeadingZerosOp Create(CountLeadingZerosOp::Attributes) { return {}; }
+
+absl::Status Prepare(CountLeadingZerosOp& op, const Tensor& input,
+                     Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSupportedTypes(CheckCtx("count_leading_zeros"), input, IsIntTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("count_leading_zeros"), input, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(CountLeadingZerosOp& op, const Tensor& input,
+                      Tensor& output) {
+  CountLeadingZeros count_leading_zeros;
+  if (IsIntTensor(input)) {
+    DISPATCH_INT(detail::EvaluateNoQuantization, input.tensor_element_type(),
+                 count_leading_zeros, input, output);
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.count_leading_zeros: Unsupported tensor type.");
+}
+
+};  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/count_leading_zeros.h b/tensorflow/lite/experimental/shlo/ops/count_leading_zeros.h
new file mode 100644
index 00000000000000..77b0de2dd7bcda
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/count_leading_zeros.h
@@ -0,0 +1,36 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_COUNT_LEADING_ZEROS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_COUNT_LEADING_ZEROS_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct CountLeadingZerosOp {
+  struct Attributes {};
+};
+
+CountLeadingZerosOp Create(CountLeadingZerosOp::Attributes);
+absl::Status Prepare(CountLeadingZerosOp& op, const Tensor& input,
+                     Tensor& output);
+absl::Status Evaluate(CountLeadingZerosOp& op, const Tensor& input,
+                      Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_COUNT_LEADING_ZEROS_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/count_leading_zeros_test.cc b/tensorflow/lite/experimental/shlo/ops/count_leading_zeros_test.cc
new file mode 100644
index 00000000000000..4780d1c4fb4b87
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/count_leading_zeros_test.cc
@@ -0,0 +1,120 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/count_leading_zeros.h"
+
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <type_traits>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::NanSensitiveFloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<CountLeadingZerosOp> {
+  static std::string Get() { return "CountLeadingZeros"; }
+};
+
+template <>
+struct SupportedOpDataType<CountLeadingZerosOp> {
+  static constexpr DataType kStorageType = DataType::kSI32;
+};
+
+namespace {
+
+struct CountLeadingZeros {
+  template <class T>
+  T operator()(T v) const {
+    return absl::countl_zero(static_cast<std::make_unsigned_t<T>>(v));
+  }
+} count_leading_zeros_ref;
+
+template <class T>
+struct CountLeadingZerosFunctorTest : ::testing::Test {};
+
+using CountLeadingZerosTypes = ::testing::Types<int32_t, int16_t, int8_t>;
+
+TYPED_TEST_SUITE(CountLeadingZerosFunctorTest, CountLeadingZerosTypes);
+
+TYPED_TEST(CountLeadingZerosFunctorTest, GivesCorrectResults) {
+  constexpr TypeParam byte_count = 8 * sizeof(TypeParam);
+  EXPECT_EQ(count_leading_zeros_ref(std::numeric_limits<TypeParam>::lowest()),
+            0);
+  EXPECT_EQ(count_leading_zeros_ref(static_cast<TypeParam>(-1)), 0);
+  EXPECT_EQ(count_leading_zeros_ref(static_cast<TypeParam>(0)), byte_count);
+  EXPECT_EQ(count_leading_zeros_ref(static_cast<TypeParam>(1)), byte_count - 1);
+  EXPECT_EQ(count_leading_zeros_ref(static_cast<TypeParam>(2)), byte_count - 2);
+  EXPECT_EQ(count_leading_zeros_ref(std::numeric_limits<TypeParam>::max()), 1);
+}
+
+INSTANTIATE_TYPED_TEST_SUITE_P(CountLeadingZeros,
+                               UnaryElementwiseOpShapePropagationTest,
+                               CountLeadingZerosOp, TestParamNames);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    CountLeadingZeros, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    BaselineMismatchSignedIntegerTypes<CountLeadingZerosOp>, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<CountLeadingZerosOp, ConcatTypes<BoolTestType, FloatTestTypes,
+                                                 PerTensorQuantizedTestTypes,
+                                                 PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(CountLeadingZeros,
+                               UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+template <class T>
+struct CountLeadingZerosTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(CountLeadingZerosTest, IntTestTypes, TestParamNames);
+
+TYPED_TEST(CountLeadingZerosTest, IntTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = IotaBuffer<TypeParam::kStorage>(shape, -12);
+  Vector<StorageT> output_data(shape.NumElements());
+
+  Tensor input_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(input_data, expected_data.begin(), count_leading_zeros_ref);
+
+  auto op = Create(CountLeadingZerosOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(NanSensitiveFloatEq(), expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/divide.cc b/tensorflow/lite/experimental/shlo/ops/divide.cc
new file mode 100644
index 00000000000000..420c808e6ab310
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/divide.cc
@@ -0,0 +1,62 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/divide.h"
+
+#include <functional>
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/ops/binary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct Divide : std::divides<void> {};
+
+DivideOp Create(DivideOp::Attributes) { return {}; }
+
+absl::Status Prepare(DivideOp& op, const Tensor& lhs, const Tensor& rhs,
+                     Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(lhs.shape(), rhs.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(CheckCtx("divide"), lhs,
+                                               IsIntTensor, IsFloatTensor,
+                                               IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("divide"), lhs, output));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("divide"), rhs, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(DivideOp& op, const Tensor& lhs, const Tensor& rhs,
+                      Tensor& output) {
+  Divide divide;
+  if (IsIntTensor(lhs) || IsFloatTensor(lhs)) {
+    // Note: all the arithmetic types share the same implementation.
+    DISPATCH_INT_FLOAT(detail::EvaluateNoQuantization,
+                       lhs.tensor_element_type(), divide, lhs, rhs, output);
+  } else if (IsQuantizedPerTensorTensor(lhs)) {
+    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor,
+                       lhs.quantized_tensor_element_type().StorageType(),
+                       lhs.quantized_tensor_element_type().ExpressedType(),
+                       divide, lhs, rhs, output)
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.divide: Unsupported tensor type.");
+}
+
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/divide.h b/tensorflow/lite/experimental/shlo/ops/divide.h
new file mode 100644
index 00000000000000..a1a16c1b3545ad
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/divide.h
@@ -0,0 +1,36 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_DIVIDE_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_DIVIDE_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct DivideOp {
+  struct Attributes {};
+};
+
+DivideOp Create(DivideOp::Attributes);
+absl::Status Prepare(DivideOp& op, const Tensor& lhs, const Tensor& rhs,
+                     Tensor& output);
+absl::Status Evaluate(DivideOp& op, const Tensor& lhs, const Tensor& rhs,
+                      Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_DIVIDE_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/divide_test.cc b/tensorflow/lite/experimental/shlo/ops/divide_test.cc
new file mode 100644
index 00000000000000..c83d02ca3e6569
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/divide_test.cc
@@ -0,0 +1,147 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/divide.h"
+
+#include <functional>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/ops/binary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/quantize.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::FloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<DivideOp> {
+  static std::string Get() { return "Divide"; }
+};
+
+struct Divide : std::divides<void> {};
+
+namespace {
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Divide, BinaryElementwiseOpShapePropagationTest,
+                               DivideOp, TestParamNames);
+
+using MultipyBaselineContraintTypes = BinaryElementwiseBaselineConstraintTypes<
+    DivideOp,
+    ConcatTypes<BaselineConstraintIntTypes, BaselineConstraintFloatTypes,
+                BaselineConstraintQuantizedPerTensorTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Divide, BinaryElementwiseSameBaselineElementTypeConstraintTest,
+    MultipyBaselineContraintTypes, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<DivideOp, ConcatTypes<BoolTestType, PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Divide, BinaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+using ArithmeticTypes = ConcatTypes<ArithmeticTestTypes>;
+
+template <class T>
+struct DivideTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(DivideTest, ArithmeticTypes, TestParamNames);
+
+TYPED_TEST(DivideTest, ArithmeticTestTypesTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> lhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-50, /*max=*/50);
+  Vector<StorageT> rhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/1, /*max=*/5);
+  Vector<StorageT> output_data(shape.NumElements());
+  Tensor lhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = lhs_data.data()};
+  Tensor rhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = rhs_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(lhs_data, rhs_data, expected_data.begin(), Divide());
+
+  auto op = Create(DivideOp::Attributes{});
+  ASSERT_OK(Prepare(op, lhs_tensor, rhs_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, lhs_tensor, rhs_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(FloatEq(), expected_data));
+}
+
+template <class T>
+struct QuantizedDivideTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(QuantizedDivideTest, QuantizedTestTypes, TestParamNames);
+
+TYPED_TEST(QuantizedDivideTest, PerTensorWorks) {
+  using StorageT = typename TypeParam::StorageT;
+  using ExpressedT = typename TypeParam::ExpressedT;
+
+  const Shape shape({2, 3, 4});
+  const ExpressedT scale = static_cast<ExpressedT>(1.5);
+  const StorageT zero_point = static_cast<StorageT>(2);
+  Vector<StorageT> lhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-50, /*max=*/50);
+  Vector<StorageT> rhs_data = RandomBuffer<TypeParam::kStorage>(
+      shape, /*min=*/zero_point + 1, /*max=*/zero_point + 5);
+  Vector<StorageT> output_data(shape.NumElements());
+  const QuantizedTensorElementType tensor_type =
+      QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                            TypeParam::kExpressed>(scale,
+                                                                   zero_point);
+  Tensor lhs_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = lhs_data.data()};
+  Tensor rhs_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = rhs_data.data()};
+  Tensor output_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(
+      lhs_data, rhs_data, expected_data.begin(),
+      [zero_point, scale](auto lhs, auto rhs) {
+        const ExpressedT dequantized_lhs = Dequantize(lhs, zero_point, scale);
+        const ExpressedT dequantized_rhs = Dequantize(rhs, zero_point, scale);
+        const ExpressedT dequantized_res =
+            Divide()(dequantized_lhs, dequantized_rhs);
+        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
+            dequantized_res, zero_point, static_cast<ExpressedT>(1.) / scale);
+      });
+
+  auto op = Create(DivideOp::Attributes{});
+  ASSERT_OK(Prepare(op, lhs_tensor, rhs_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, lhs_tensor, rhs_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(FloatEq(), expected_data));
+}
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/exponential.cc b/tensorflow/lite/experimental/shlo/ops/exponential.cc
new file mode 100644
index 00000000000000..979ddef45889ff
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/exponential.cc
@@ -0,0 +1,73 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/exponential.h"
+
+#include <cmath>
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct Exponential {
+  template <class T>
+  T operator()(T v) const {
+    return std::exp(v);
+  }
+};
+
+template <>
+F16 Exponential::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
+
+template <>
+BF16 Exponential::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
+
+ExponentialOp Create(ExponentialOp::Attributes) { return {}; }
+
+absl::Status Prepare(ExponentialOp& op, const Tensor& input, Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(
+      CheckCtx("cosine"), input, IsFloatTensor, IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("cosine"), input, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(ExponentialOp& op, const Tensor& input, Tensor& output) {
+  Exponential exponential;
+  if (input.IsPerTensorQuantized()) {
+    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor,
+                       input.quantized_tensor_element_type().StorageType(),
+                       input.quantized_tensor_element_type().ExpressedType(),
+                       exponential, input, output)
+  } else if (IsFloatTensor(input)) {
+    DISPATCH_FLOAT(detail::EvaluateNoQuantization, input.tensor_element_type(),
+                   exponential, input, output);
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.tanh: Unsupported tensor type.");
+}
+
+};  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/exponential.h b/tensorflow/lite/experimental/shlo/ops/exponential.h
new file mode 100644
index 00000000000000..6fe19227009cfc
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/exponential.h
@@ -0,0 +1,34 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_EXPONENTIAL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_EXPONENTIAL_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct ExponentialOp {
+  struct Attributes {};
+};
+
+ExponentialOp Create(ExponentialOp::Attributes);
+absl::Status Prepare(ExponentialOp& op, const Tensor& input, Tensor& output);
+absl::Status Evaluate(ExponentialOp& op, const Tensor& input, Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_EXPONENTIAL_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/exponential_minus_one.cc b/tensorflow/lite/experimental/shlo/ops/exponential_minus_one.cc
new file mode 100644
index 00000000000000..a5bcab04280ba1
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/exponential_minus_one.cc
@@ -0,0 +1,76 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/exponential_minus_one.h"
+
+#include <cmath>
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct ExponentialMinusOne {
+  template <class T>
+  T operator()(T v) const {
+    return std::expm1(v);
+  }
+};
+
+template <>
+F16 ExponentialMinusOne::operator()(F16 v) const {
+  return F16(operator()(static_cast<float>(v)));
+}
+
+template <>
+BF16 ExponentialMinusOne::operator()(BF16 v) const {
+  return BF16(operator()(static_cast<float>(v)));
+}
+
+ExponentialMinusOneOp Create(ExponentialMinusOneOp::Attributes) { return {}; }
+
+absl::Status Prepare(ExponentialMinusOneOp& op, const Tensor& input,
+                     Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSupportedTypes(CheckCtx("exponential_minus_one"), input,
+                          IsFloatTensor, IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("exponential_minus_one"), input, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(ExponentialMinusOneOp& op, const Tensor& input,
+                      Tensor& output) {
+  ExponentialMinusOne exponential_minus_one;
+  if (input.IsPerTensorQuantized()) {
+    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor,
+                       input.quantized_tensor_element_type().StorageType(),
+                       input.quantized_tensor_element_type().ExpressedType(),
+                       exponential_minus_one, input, output)
+  } else if (IsFloatTensor(input)) {
+    DISPATCH_FLOAT(detail::EvaluateNoQuantization, input.tensor_element_type(),
+                   exponential_minus_one, input, output);
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.exponential_minus_one: Unsupported tensor type.");
+}
+
+};  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/exponential_minus_one.h b/tensorflow/lite/experimental/shlo/ops/exponential_minus_one.h
new file mode 100644
index 00000000000000..9c4fa9bcadaa1b
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/exponential_minus_one.h
@@ -0,0 +1,36 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_EXPONENTIAL_MINUS_ONE_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_EXPONENTIAL_MINUS_ONE_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct ExponentialMinusOneOp {
+  struct Attributes {};
+};
+
+ExponentialMinusOneOp Create(ExponentialMinusOneOp::Attributes);
+absl::Status Prepare(ExponentialMinusOneOp& op, const Tensor& input,
+                     Tensor& output);
+absl::Status Evaluate(ExponentialMinusOneOp& op, const Tensor& input,
+                      Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_EXPONENTIAL_MINUS_ONE_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/exponential_minus_one_test.cc b/tensorflow/lite/experimental/shlo/ops/exponential_minus_one_test.cc
new file mode 100644
index 00000000000000..0fbe259ecefafc
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/exponential_minus_one_test.cc
@@ -0,0 +1,151 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/exponential_minus_one.h"
+
+#include <cmath>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/quantize.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::ElementsAreArray;
+using testing::NanSensitiveFloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<ExponentialMinusOneOp> {
+  static std::string Get() { return "ExponentialMinusOne"; }
+};
+
+namespace {
+
+struct ExponentialMinusOne {
+  template <class T>
+  T operator()(T v) const {
+    return std::expm1(v);
+  }
+} exponential_minus_one_ref;
+
+template <>
+F16 ExponentialMinusOne::operator()(F16 v) const {
+  return F16(operator()(static_cast<float>(v)));
+}
+
+template <>
+BF16 ExponentialMinusOne::operator()(BF16 v) const {
+  return BF16(operator()(static_cast<float>(v)));
+}
+
+INSTANTIATE_TYPED_TEST_SUITE_P(ExponentialMinusOne,
+                               UnaryElementwiseOpShapePropagationTest,
+                               ExponentialMinusOneOp, TestParamNames);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    ExponentialMinusOne, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    UnaryElementwiseConstraint1Types<ExponentialMinusOneOp>, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<ExponentialMinusOneOp, ConcatTypes<BoolTestType, IntTestTypes,
+                                                   PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(ExponentialMinusOneOp,
+                               UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+template <class T>
+struct ExponentialMinusOneTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(ExponentialMinusOneTest, FloatTestTypes, TestParamNames);
+
+TYPED_TEST(ExponentialMinusOneTest, FloatTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
+  Vector<StorageT> output_data(shape.NumElements());
+
+  Tensor input_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(input_data, expected_data.begin(),
+                    exponential_minus_one_ref);
+
+  auto op = Create(ExponentialMinusOneOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(NanSensitiveFloatEq(), expected_data));
+}
+
+template <class T>
+struct QuantizedExponentialMinusOneTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(QuantizedExponentialMinusOneTest, QuantizedTestTypes,
+                 TestParamNames);
+
+TYPED_TEST(QuantizedExponentialMinusOneTest, PerTensorWorks) {
+  using StorageT = typename TypeParam::StorageT;
+  using ExpressedT = typename TypeParam::ExpressedT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
+  Vector<StorageT> output_data(shape.NumElements());
+  const ExpressedT scale = static_cast<ExpressedT>(1.5);
+  const StorageT zero_point = static_cast<StorageT>(5);
+  const QuantizedTensorElementType tensor_type =
+      QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                            TypeParam::kExpressed>(scale,
+                                                                   zero_point);
+  Tensor input_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(
+      input_data, expected_data.begin(), [zero_point, scale](auto v) {
+        const ExpressedT dequantized_input = Dequantize(v, zero_point, scale);
+        const ExpressedT dequantized_res =
+            exponential_minus_one_ref(dequantized_input);
+        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
+            dequantized_res, zero_point, static_cast<ExpressedT>(1.) / scale);
+      });
+
+  auto op = Create(ExponentialMinusOneOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, ElementsAreArray(expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/exponential_test.cc b/tensorflow/lite/experimental/shlo/ops/exponential_test.cc
new file mode 100644
index 00000000000000..f8cab0a7afc137
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/exponential_test.cc
@@ -0,0 +1,147 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/exponential.h"
+
+#include <cmath>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/quantize.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::ElementsAreArray;
+using testing::NanSensitiveFloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<ExponentialOp> {
+  static std::string Get() { return "Exponential"; }
+};
+
+namespace {
+
+struct Exponential {
+  template <class T>
+  T operator()(T v) const {
+    return std::exp(v);
+  }
+} exponential_ref;
+
+template <>
+F16 Exponential::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
+
+template <>
+BF16 Exponential::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Exponential,
+                               UnaryElementwiseOpShapePropagationTest,
+                               ExponentialOp, TestParamNames);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Exponential, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    UnaryElementwiseConstraint1Types<ExponentialOp>, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<ExponentialOp, ConcatTypes<BoolTestType, IntTestTypes,
+                                           PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Exponential, UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+template <class T>
+struct ExponentialTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(ExponentialTest, FloatTestTypes, TestParamNames);
+
+TYPED_TEST(ExponentialTest, FloatTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
+  Vector<StorageT> output_data(shape.NumElements());
+
+  Tensor input_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(input_data, expected_data.begin(), exponential_ref);
+
+  auto op = Create(ExponentialOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(NanSensitiveFloatEq(), expected_data));
+}
+
+template <class T>
+struct QuantizedExponentialTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(QuantizedExponentialTest, QuantizedTestTypes, TestParamNames);
+
+TYPED_TEST(QuantizedExponentialTest, PerTensorWorks) {
+  using StorageT = typename TypeParam::StorageT;
+  using ExpressedT = typename TypeParam::ExpressedT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
+  Vector<StorageT> output_data(shape.NumElements());
+  const ExpressedT scale = static_cast<ExpressedT>(1.5);
+  const StorageT zero_point = static_cast<StorageT>(5);
+  const QuantizedTensorElementType tensor_type =
+      QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                            TypeParam::kExpressed>(scale,
+                                                                   zero_point);
+  Tensor input_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(
+      input_data, expected_data.begin(), [zero_point, scale](auto v) {
+        const ExpressedT dequantized_input = Dequantize(v, zero_point, scale);
+        const ExpressedT dequantized_res = exponential_ref(dequantized_input);
+        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
+            dequantized_res, zero_point, static_cast<ExpressedT>(1.) / scale);
+      });
+
+  auto op = Create(ExponentialOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, ElementsAreArray(expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/floor.cc b/tensorflow/lite/experimental/shlo/ops/floor.cc
new file mode 100644
index 00000000000000..7ef86a3cb53e93
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/floor.cc
@@ -0,0 +1,73 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/floor.h"
+
+#include <cmath>
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct Floor {
+  template <class T>
+  T operator()(T v) const {
+    return std::floor(v);
+  }
+};
+
+template <>
+F16 Floor::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
+
+template <>
+BF16 Floor::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
+
+FloorOp Create(FloorOp::Attributes) { return {}; }
+
+absl::Status Prepare(FloorOp& op, const Tensor& input, Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(
+      CheckCtx("floor"), input, IsFloatTensor, IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("floor"), input, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(FloorOp& op, const Tensor& input, Tensor& output) {
+  Floor floor;
+  if (input.IsPerTensorQuantized()) {
+    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor,
+                       input.quantized_tensor_element_type().StorageType(),
+                       input.quantized_tensor_element_type().ExpressedType(),
+                       floor, input, output)
+  } else if (IsFloatTensor(input)) {
+    DISPATCH_FLOAT(detail::EvaluateNoQuantization, input.tensor_element_type(),
+                   floor, input, output);
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.floor: Unsupported tensor type.");
+}
+
+};  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/floor.h b/tensorflow/lite/experimental/shlo/ops/floor.h
new file mode 100644
index 00000000000000..1a3f7c48c5c3e9
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/floor.h
@@ -0,0 +1,34 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_FLOOR_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_FLOOR_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct FloorOp {
+  struct Attributes {};
+};
+
+FloorOp Create(FloorOp::Attributes);
+absl::Status Prepare(FloorOp& op, const Tensor& input, Tensor& output);
+absl::Status Evaluate(FloorOp& op, const Tensor& input, Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_FLOOR_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/floor_test.cc b/tensorflow/lite/experimental/shlo/ops/floor_test.cc
new file mode 100644
index 00000000000000..bf0e19f0c10aeb
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/floor_test.cc
@@ -0,0 +1,146 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/floor.h"
+
+#include <cmath>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/quantize.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::ElementsAreArray;
+using testing::NanSensitiveFloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<FloorOp> {
+  static std::string Get() { return "Floor"; }
+};
+
+namespace {
+
+struct Floor {
+  template <class T>
+  T operator()(T v) const {
+    return std::floor(v);
+  }
+} floor_ref;
+
+template <>
+F16 Floor::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
+
+template <>
+BF16 Floor::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Floor, UnaryElementwiseOpShapePropagationTest,
+                               FloorOp, TestParamNames);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Floor, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    UnaryElementwiseConstraint1Types<FloorOp>, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<FloorOp, ConcatTypes<BoolTestType, IntTestTypes,
+                                     PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Floor, UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+template <class T>
+struct FloorTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(FloorTest, FloatTestTypes, TestParamNames);
+
+TYPED_TEST(FloorTest, FloatTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
+  Vector<StorageT> output_data(shape.NumElements());
+
+  Tensor input_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(input_data, expected_data.begin(), floor_ref);
+
+  auto op = Create(FloorOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(NanSensitiveFloatEq(), expected_data));
+}
+
+template <class T>
+struct QuantizedFloorTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(QuantizedFloorTest, QuantizedTestTypes, TestParamNames);
+
+TYPED_TEST(QuantizedFloorTest, PerTensorWorks) {
+  using StorageT = typename TypeParam::StorageT;
+  using ExpressedT = typename TypeParam::ExpressedT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
+  Vector<StorageT> output_data(shape.NumElements());
+  const ExpressedT scale = static_cast<ExpressedT>(1.5);
+  const StorageT zero_point = static_cast<StorageT>(5);
+  const QuantizedTensorElementType tensor_type =
+      QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                            TypeParam::kExpressed>(scale,
+                                                                   zero_point);
+  Tensor input_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(
+      input_data, expected_data.begin(), [zero_point, scale](auto v) {
+        const ExpressedT dequantized_input = Dequantize(v, zero_point, scale);
+        const ExpressedT dequantized_res = floor_ref(dequantized_input);
+        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
+            dequantized_res, zero_point, static_cast<ExpressedT>(1.) / scale);
+      });
+
+  auto op = Create(FloorOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, ElementsAreArray(expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/is_finite.cc b/tensorflow/lite/experimental/shlo/ops/is_finite.cc
index 36246104104e82..153bf8a6e3b777 100644
--- a/tensorflow/lite/experimental/shlo/ops/is_finite.cc
+++ b/tensorflow/lite/experimental/shlo/ops/is_finite.cc
@@ -16,7 +16,6 @@ limitations under the License.
 
 #include <algorithm>
 #include <cmath>
-#include <utility>
 
 #include "absl/status/status.h"
 #include "tensorflow/lite/experimental/shlo/data_type.h"
@@ -81,23 +80,18 @@ IsFiniteOp Create(const IsFiniteOp::Attributes& attributes) {
   return IsFiniteOp();
 }
 
-absl::Status Prepare(IsFiniteOp& op, Tensor operand, Tensor result) {
-  if (absl::Status status = CheckParameters(operand, result); !status.ok()) {
-    return status;
-  }
-  op.operand = std::move(operand);
-  op.result = std::move(result);
-  return absl::OkStatus();
+absl::Status Prepare(IsFiniteOp& op, const Tensor& operand, Tensor& result) {
+  return CheckParameters(operand, result);
 }
 
-absl::Status Evaluate(IsFiniteOp& op) {
-  if (!op.operand.data) {
+absl::Status Evaluate(IsFiniteOp& op, const Tensor& operand, Tensor& result) {
+  if (!operand.data) {
     return absl::InvalidArgumentError("No operand.data");
   }
-  if (!op.result.data) {
+  if (!result.data) {
     return absl::InvalidArgumentError("No result.data");
   }
-  return EvaluateImpl(op.operand, op.result);
+  return EvaluateImpl(operand, result);
 }
 
 }  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/is_finite.h b/tensorflow/lite/experimental/shlo/ops/is_finite.h
index aaff0eaadce7cf..0000567bb75c60 100644
--- a/tensorflow/lite/experimental/shlo/ops/is_finite.h
+++ b/tensorflow/lite/experimental/shlo/ops/is_finite.h
@@ -23,13 +23,11 @@ namespace shlo_ref {
 
 struct IsFiniteOp {
   struct Attributes {};
-  Tensor operand;
-  Tensor result;
 };
 
 IsFiniteOp Create(const IsFiniteOp::Attributes& attributes);
-absl::Status Prepare(IsFiniteOp& op, Tensor operand, Tensor result);
-absl::Status Evaluate(IsFiniteOp& op);
+absl::Status Prepare(IsFiniteOp& op, const Tensor& operand, Tensor& result);
+absl::Status Evaluate(IsFiniteOp& op, const Tensor& operand, Tensor& result);
 
 }  // namespace shlo_ref
 
diff --git a/tensorflow/lite/experimental/shlo/ops/is_finite_bench.cc b/tensorflow/lite/experimental/shlo/ops/is_finite_bench.cc
index d9a362f18008cf..46b95c86639f14 100644
--- a/tensorflow/lite/experimental/shlo/ops/is_finite_bench.cc
+++ b/tensorflow/lite/experimental/shlo/ops/is_finite_bench.cc
@@ -31,16 +31,15 @@ void BM_IsFinite(benchmark::State& state, DimensionSize num_elements,
                  const Tensor& operand) {
   IsFiniteOp op = Create(IsFiniteOp::Attributes{});
 
-  ABSL_CHECK_OK(
-      Prepare(op, operand,
-              Tensor{.type = TensorType{.shape = Shape{{num_elements}},
-                                        .element_type = DataType::kI1}}));
+  Tensor result = Tensor{.type = TensorType{.shape = Shape{{num_elements}},
+                                            .element_type = DataType::kI1}};
+  ABSL_CHECK_OK(Prepare(op, operand, result));
 
-  std::vector<std::byte> result_values(op.result.SizeInBytes());
-  op.result.data = result_values.data();
+  std::vector<std::byte> result_values(result.SizeInBytes());
+  result.data = result_values.data();
 
   for (auto _ : state) {
-    ABSL_CHECK_OK(Evaluate(op));
+    ABSL_CHECK_OK(Evaluate(op, operand, result));
   }
 }
 
diff --git a/tensorflow/lite/experimental/shlo/ops/is_finite_test.cc b/tensorflow/lite/experimental/shlo/ops/is_finite_test.cc
index 2493e570ab27dc..be5fbdbcf1817b 100644
--- a/tensorflow/lite/experimental/shlo/ops/is_finite_test.cc
+++ b/tensorflow/lite/experimental/shlo/ops/is_finite_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "tensorflow/lite/experimental/shlo/bf16.h"
 #include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
 #include "tensorflow/lite/experimental/shlo/shape.h"
 #include "tensorflow/lite/experimental/shlo/status_matcher.h"
 #include "tensorflow/lite/experimental/shlo/tensor.h"
@@ -45,15 +46,15 @@ TEST_P(IsFiniteTest, IsFinite) {
   const auto& params = GetParam();
 
   IsFiniteOp op = Create(IsFiniteOp::Attributes{});
+  Tensor result{.type = params.expected.tensor().type};
 
-  EXPECT_OK(Prepare(op, params.operand.tensor(),
-                    Tensor{.type = params.expected.tensor().type}));
+  ASSERT_OK(Prepare(op, params.operand.tensor(), result));
 
-  std::vector<std::byte> result_data(op.result.SizeInBytes());
-  op.result.data = result_data.data();
+  std::vector<std::byte> result_data(result.SizeInBytes());
+  result.data = result_data.data();
 
-  EXPECT_OK(Evaluate(op));
-  EXPECT_THAT(op.result, TensorEq(params.expected.tensor()));
+  EXPECT_OK(Evaluate(op, params.operand.tensor(), result));
+  EXPECT_THAT(result, TensorEq(params.expected.tensor()));
 }
 
 INSTANTIATE_TEST_SUITE_P(
@@ -65,11 +66,12 @@ INSTANTIATE_TEST_SUITE_P(
                     BF16{-1.0f}, BF16{0.0f}, BF16{1.0f}}),
                TensorWithData::Create<DataType::kI1>(
                    Shape{{7}}, {false, false, false, false, true, true, true})},
-        Params{TensorWithData::Create<DataType::kF16>(
-                   Shape{{7}},
-                   {+NAN, -NAN, -INFINITY, +INFINITY, -1.0f, 0.0f, 1.0f}),
-               TensorWithData::Create<DataType::kI1>(
-                   Shape{{7}}, {false, false, false, false, true, true, true})},
+        Params{
+            TensorWithData::Create<DataType::kF16>(
+                Shape{{7}}, {F16{+NAN}, F16{-NAN}, F16{-INFINITY},
+                             F16{+INFINITY}, F16{-1.0f}, F16{0.0f}, F16{1.0f}}),
+            TensorWithData::Create<DataType::kI1>(
+                Shape{{7}}, {false, false, false, false, true, true, true})},
         Params{
             TensorWithData::Create<DataType::kF32>(
                 Shape{{7}},
diff --git a/tensorflow/lite/experimental/shlo/ops/log.cc b/tensorflow/lite/experimental/shlo/ops/log.cc
new file mode 100644
index 00000000000000..9f3f68ae8e7fdf
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/log.cc
@@ -0,0 +1,73 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/log.h"
+
+#include <cmath>
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct Log {
+  template <class T>
+  T operator()(T v) const {
+    return std::log(v);
+  }
+};
+
+template <>
+F16 Log::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
+
+template <>
+BF16 Log::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
+
+LogOp Create(LogOp::Attributes) { return {}; }
+
+absl::Status Prepare(LogOp& op, const Tensor& input, Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(
+      CheckCtx("log"), input, IsFloatTensor, IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("log"), input, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(LogOp& op, const Tensor& input, Tensor& output) {
+  Log log;
+  if (input.IsPerTensorQuantized()) {
+    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor,
+                       input.quantized_tensor_element_type().StorageType(),
+                       input.quantized_tensor_element_type().ExpressedType(),
+                       log, input, output)
+  } else if (IsFloatTensor(input)) {
+    DISPATCH_FLOAT(detail::EvaluateNoQuantization, input.tensor_element_type(),
+                   log, input, output);
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.log: Unsupported tensor type.");
+}
+
+};  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/log.h b/tensorflow/lite/experimental/shlo/ops/log.h
new file mode 100644
index 00000000000000..981cf23ab379d7
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/log.h
@@ -0,0 +1,34 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_LOG_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_LOG_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct LogOp {
+  struct Attributes {};
+};
+
+LogOp Create(LogOp::Attributes);
+absl::Status Prepare(LogOp& op, const Tensor& input, Tensor& output);
+absl::Status Evaluate(LogOp& op, const Tensor& input, Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_LOG_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/log_plus_one.cc b/tensorflow/lite/experimental/shlo/ops/log_plus_one.cc
new file mode 100644
index 00000000000000..f80ebcce54d9fa
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/log_plus_one.cc
@@ -0,0 +1,74 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/log_plus_one.h"
+
+#include <cmath>
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct LogPlusOne {
+  template <class T>
+  T operator()(T v) const {
+    return std::log1p(v);
+  }
+};
+
+template <>
+F16 LogPlusOne::operator()(F16 v) const {
+  return F16(operator()(static_cast<float>(v)));
+}
+
+template <>
+BF16 LogPlusOne::operator()(BF16 v) const {
+  return BF16(operator()(static_cast<float>(v)));
+}
+
+LogPlusOneOp Create(LogPlusOneOp::Attributes) { return {}; }
+
+absl::Status Prepare(LogPlusOneOp& op, const Tensor& input, Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(CheckCtx("log_plus_one"), input,
+                                               IsFloatTensor,
+                                               IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("log_plus_one"), input, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(LogPlusOneOp& op, const Tensor& input, Tensor& output) {
+  LogPlusOne log_plus_one;
+  if (input.IsPerTensorQuantized()) {
+    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor,
+                       input.quantized_tensor_element_type().StorageType(),
+                       input.quantized_tensor_element_type().ExpressedType(),
+                       log_plus_one, input, output)
+  } else if (IsFloatTensor(input)) {
+    DISPATCH_FLOAT(detail::EvaluateNoQuantization, input.tensor_element_type(),
+                   log_plus_one, input, output);
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.log_plus_one: Unsupported tensor type.");
+}
+
+};  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/log_plus_one.h b/tensorflow/lite/experimental/shlo/ops/log_plus_one.h
new file mode 100644
index 00000000000000..de06a981e8a59e
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/log_plus_one.h
@@ -0,0 +1,34 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_LOG_PLUS_ONE_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_LOG_PLUS_ONE_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct LogPlusOneOp {
+  struct Attributes {};
+};
+
+LogPlusOneOp Create(LogPlusOneOp::Attributes);
+absl::Status Prepare(LogPlusOneOp& op, const Tensor& input, Tensor& output);
+absl::Status Evaluate(LogPlusOneOp& op, const Tensor& input, Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_LOG_PLUS_ONE_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/log_plus_one_test.cc b/tensorflow/lite/experimental/shlo/ops/log_plus_one_test.cc
new file mode 100644
index 00000000000000..9e303cc812d310
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/log_plus_one_test.cc
@@ -0,0 +1,149 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/log_plus_one.h"
+
+#include <cmath>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/quantize.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::ElementsAreArray;
+using testing::NanSensitiveFloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<LogPlusOneOp> {
+  static std::string Get() { return "LogPlusOne"; }
+};
+
+namespace {
+
+struct LogPlusOne {
+  template <class T>
+  T operator()(T v) const {
+    return std::log1p(v);
+  }
+} log_plus_one_ref;
+
+template <>
+F16 LogPlusOne::operator()(F16 v) const {
+  return F16(operator()(static_cast<float>(v)));
+}
+
+template <>
+BF16 LogPlusOne::operator()(BF16 v) const {
+  return BF16(operator()(static_cast<float>(v)));
+}
+
+INSTANTIATE_TYPED_TEST_SUITE_P(LogPlusOne,
+                               UnaryElementwiseOpShapePropagationTest,
+                               LogPlusOneOp, TestParamNames);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    LogPlusOne, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    UnaryElementwiseConstraint1Types<LogPlusOneOp>, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<LogPlusOneOp, ConcatTypes<BoolTestType, IntTestTypes,
+                                          PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(LogPlusOne, UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+template <class T>
+struct LogPlusOneTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(LogPlusOneTest, FloatTestTypes, TestParamNames);
+
+TYPED_TEST(LogPlusOneTest, FloatTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(
+      shape, /*min=*/static_cast<StorageT>(-0.99));
+  Vector<StorageT> output_data(shape.NumElements());
+
+  Tensor input_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(input_data, expected_data.begin(), log_plus_one_ref);
+
+  auto op = Create(LogPlusOneOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(NanSensitiveFloatEq(), expected_data));
+}
+
+template <class T>
+struct QuantizedLogPlusOneTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(QuantizedLogPlusOneTest, QuantizedTestTypes, TestParamNames);
+
+TYPED_TEST(QuantizedLogPlusOneTest, PerTensorWorks) {
+  using StorageT = typename TypeParam::StorageT;
+  using ExpressedT = typename TypeParam::ExpressedT;
+
+  const Shape shape({2, 3, 4});
+  const ExpressedT scale = static_cast<ExpressedT>(1.5);
+  const StorageT zero_point = static_cast<StorageT>(5);
+  Vector<StorageT> input_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/zero_point);
+  Vector<StorageT> output_data(shape.NumElements());
+  const QuantizedTensorElementType tensor_type =
+      QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                            TypeParam::kExpressed>(scale,
+                                                                   zero_point);
+  Tensor input_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(
+      input_data, expected_data.begin(), [zero_point, scale](auto v) {
+        const ExpressedT dequantized_input = Dequantize(v, zero_point, scale);
+        const ExpressedT dequantized_res = log_plus_one_ref(dequantized_input);
+        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
+            dequantized_res, zero_point, static_cast<ExpressedT>(1.) / scale);
+      });
+
+  auto op = Create(LogPlusOneOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, ElementsAreArray(expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/log_test.cc b/tensorflow/lite/experimental/shlo/ops/log_test.cc
new file mode 100644
index 00000000000000..5f2c59f147ee4e
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/log_test.cc
@@ -0,0 +1,147 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/log.h"
+
+#include <cmath>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/quantize.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::ElementsAreArray;
+using testing::NanSensitiveFloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<LogOp> {
+  static std::string Get() { return "Log"; }
+};
+
+namespace {
+
+struct Log {
+  template <class T>
+  T operator()(T v) const {
+    return std::log(v);
+  }
+} log_ref;
+
+template <>
+F16 Log::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
+
+template <>
+BF16 Log::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Log, UnaryElementwiseOpShapePropagationTest,
+                               LogOp, TestParamNames);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Log, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    UnaryElementwiseConstraint1Types<LogOp>, TestParamNames);
+
+using UnsupportedTypes = WithOpTypes<
+    LogOp, ConcatTypes<BoolTestType, IntTestTypes, PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Log, UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+template <class T>
+struct LogTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(LogTest, FloatTestTypes, TestParamNames);
+
+TYPED_TEST(LogTest, FloatTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(
+      shape, /*min=*/static_cast<StorageT>(0.1));
+  Vector<StorageT> output_data(shape.NumElements());
+
+  Tensor input_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(input_data, expected_data.begin(), log_ref);
+
+  auto op = Create(LogOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(NanSensitiveFloatEq(), expected_data));
+}
+
+template <class T>
+struct QuantizedLogTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(QuantizedLogTest, QuantizedTestTypes, TestParamNames);
+
+TYPED_TEST(QuantizedLogTest, PerTensorWorks) {
+  using StorageT = typename TypeParam::StorageT;
+  using ExpressedT = typename TypeParam::ExpressedT;
+
+  const Shape shape({2, 3, 4});
+  const ExpressedT scale = static_cast<ExpressedT>(1.5);
+  const StorageT zero_point = static_cast<StorageT>(5);
+  Vector<StorageT> input_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/zero_point + 1);
+  Vector<StorageT> output_data(shape.NumElements());
+  const QuantizedTensorElementType tensor_type =
+      QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                            TypeParam::kExpressed>(scale,
+                                                                   zero_point);
+  Tensor input_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(
+      input_data, expected_data.begin(), [zero_point, scale](auto v) {
+        const ExpressedT dequantized_input = Dequantize(v, zero_point, scale);
+        const ExpressedT dequantized_res = log_ref(dequantized_input);
+        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
+            dequantized_res, zero_point, static_cast<ExpressedT>(1.) / scale);
+      });
+
+  auto op = Create(LogOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, ElementsAreArray(expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/logistic.cc b/tensorflow/lite/experimental/shlo/ops/logistic.cc
new file mode 100644
index 00000000000000..3953cfe3441810
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/logistic.cc
@@ -0,0 +1,74 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/logistic.h"
+
+#include <cmath>
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct Logistic {
+  template <class T>
+  T operator()(T v) const {
+    constexpr T one = static_cast<T>(1);
+    return one / (one + std::exp(-v));
+  }
+};
+
+template <>
+F16 Logistic::operator()(F16 v) const {
+  return F16(operator()(static_cast<float>(v)));
+}
+
+template <>
+BF16 Logistic::operator()(BF16 v) const {
+  return BF16(operator()(static_cast<float>(v)));
+}
+
+LogisticOp Create(LogisticOp::Attributes) { return {}; }
+
+absl::Status Prepare(LogisticOp& op, const Tensor& input, Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(
+      CheckCtx("logistic"), input, IsFloatTensor, IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("logistic"), input, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(LogisticOp& op, const Tensor& input, Tensor& output) {
+  Logistic logistic;
+  if (input.IsPerTensorQuantized()) {
+    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor,
+                       input.quantized_tensor_element_type().StorageType(),
+                       input.quantized_tensor_element_type().ExpressedType(),
+                       logistic, input, output)
+  } else if (IsFloatTensor(input)) {
+    DISPATCH_FLOAT(detail::EvaluateNoQuantization, input.tensor_element_type(),
+                   logistic, input, output);
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.logistic: Unsupported tensor type.");
+}
+
+};  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/logistic.h b/tensorflow/lite/experimental/shlo/ops/logistic.h
new file mode 100644
index 00000000000000..b44ef6feb76e75
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/logistic.h
@@ -0,0 +1,34 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_LOGISTIC_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_LOGISTIC_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct LogisticOp {
+  struct Attributes {};
+};
+
+LogisticOp Create(LogisticOp::Attributes);
+absl::Status Prepare(LogisticOp& op, const Tensor& input, Tensor& output);
+absl::Status Evaluate(LogisticOp& op, const Tensor& input, Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_LOGISTIC_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/logistic_test.cc b/tensorflow/lite/experimental/shlo/ops/logistic_test.cc
new file mode 100644
index 00000000000000..3d8014e33d133a
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/logistic_test.cc
@@ -0,0 +1,147 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/logistic.h"
+
+#include <cmath>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/quantize.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::ElementsAreArray;
+using testing::NanSensitiveFloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<LogisticOp> {
+  static std::string Get() { return "Logistic"; }
+};
+
+namespace {
+
+struct Logistic {
+  template <class T>
+  T operator()(T v) const {
+    constexpr T one = static_cast<T>(1);
+    return one / (one + std::exp(-v));
+  }
+} logistic_ref;
+
+template <>
+F16 Logistic::operator()(F16 v) const {
+  return F16(operator()(static_cast<float>(v)));
+}
+
+template <>
+BF16 Logistic::operator()(BF16 v) const {
+  return BF16(operator()(static_cast<float>(v)));
+}
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Logistic, UnaryElementwiseOpShapePropagationTest,
+                               LogisticOp, TestParamNames);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Logistic, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    UnaryElementwiseConstraint1Types<LogisticOp>, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<LogisticOp, ConcatTypes<BoolTestType, IntTestTypes,
+                                        PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Logistic, UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+template <class T>
+struct LogisticTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(LogisticTest, FloatTestTypes, TestParamNames);
+
+TYPED_TEST(LogisticTest, FloatTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
+  Vector<StorageT> output_data(shape.NumElements());
+
+  Tensor input_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(input_data, expected_data.begin(), logistic_ref);
+
+  auto op = Create(LogisticOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(NanSensitiveFloatEq(), expected_data));
+}
+
+template <class T>
+struct QuantizedLogisticTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(QuantizedLogisticTest, QuantizedTestTypes, TestParamNames);
+
+TYPED_TEST(QuantizedLogisticTest, PerTensorWorks) {
+  using StorageT = typename TypeParam::StorageT;
+  using ExpressedT = typename TypeParam::ExpressedT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
+  Vector<StorageT> output_data(shape.NumElements());
+  const ExpressedT scale = static_cast<ExpressedT>(1.5);
+  const StorageT zero_point = static_cast<StorageT>(5);
+  const QuantizedTensorElementType tensor_type =
+      QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                            TypeParam::kExpressed>(scale,
+                                                                   zero_point);
+  Tensor input_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(
+      input_data, expected_data.begin(), [zero_point, scale](auto v) {
+        const ExpressedT dequantized_input = Dequantize(v, zero_point, scale);
+        const ExpressedT dequantized_res = logistic_ref(dequantized_input);
+        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
+            dequantized_res, zero_point, static_cast<ExpressedT>(1.) / scale);
+      });
+
+  auto op = Create(LogisticOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, ElementsAreArray(expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/maximum.cc b/tensorflow/lite/experimental/shlo/ops/maximum.cc
new file mode 100644
index 00000000000000..9239dadaac3f92
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/maximum.cc
@@ -0,0 +1,66 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/maximum.h"
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/ops/binary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct Maximum {
+  template <class T>
+  constexpr auto operator()(const T a, const T b) {
+    return a > b ? a : b;
+  }
+};
+
+MaximumOp Create(MaximumOp::Attributes) { return {}; }
+
+absl::Status Prepare(MaximumOp& op, const Tensor& lhs, const Tensor& rhs,
+                     Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(lhs.shape(), rhs.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSupportedTypes(CheckCtx("maximum"), lhs, IsBoolTensor, IsIntTensor,
+                          IsFloatTensor, IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("maximum"), lhs, output));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("maximum"), rhs, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(MaximumOp& op, const Tensor& lhs, const Tensor& rhs,
+                      Tensor& output) {
+  Maximum maximum;
+  if (IsBoolTensor(lhs) || IsIntTensor(lhs) || IsFloatTensor(lhs)) {
+    // Note: all the arithmetic types share the same implementation.
+    DISPATCH_BOOL_INT_FLOAT(detail::EvaluateNoQuantization,
+                            lhs.tensor_element_type(), maximum, lhs, rhs,
+                            output);
+  } else if (IsQuantizedPerTensorTensor(lhs)) {
+    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor,
+                       lhs.quantized_tensor_element_type().StorageType(),
+                       lhs.quantized_tensor_element_type().ExpressedType(),
+                       maximum, lhs, rhs, output)
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.maximum: Unsupported tensor type.");
+}
+
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/maximum.h b/tensorflow/lite/experimental/shlo/ops/maximum.h
new file mode 100644
index 00000000000000..1ce0be360542f9
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/maximum.h
@@ -0,0 +1,36 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_MAXIMUM_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_MAXIMUM_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct MaximumOp {
+  struct Attributes {};
+};
+
+MaximumOp Create(MaximumOp::Attributes);
+absl::Status Prepare(MaximumOp& op, const Tensor& lhs, const Tensor& rhs,
+                     Tensor& output);
+absl::Status Evaluate(MaximumOp& op, const Tensor& lhs, const Tensor& rhs,
+                      Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_MAXIMUM_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/maximum_test.cc b/tensorflow/lite/experimental/shlo/ops/maximum_test.cc
new file mode 100644
index 00000000000000..0422331324daf9
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/maximum_test.cc
@@ -0,0 +1,151 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/maximum.h"
+
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/ops/binary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/quantize.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::FloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<MaximumOp> {
+  static std::string Get() { return "Maximum"; }
+};
+
+struct Maximum {
+  template <class T>
+  constexpr auto operator()(const T a, const T b) {
+    return a > b ? a : b;
+  }
+};
+
+namespace {
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Maximum, BinaryElementwiseOpShapePropagationTest,
+                               MaximumOp, TestParamNames);
+
+using MaximumBaselineContraintTypes = BinaryElementwiseBaselineConstraintTypes<
+    MaximumOp, ConcatTypes<BoolTestType, BaselineConstraintIntTypes,
+                           BaselineConstraintFloatTypes,
+                           BaselineConstraintQuantizedPerTensorTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Maximum, BinaryElementwiseSameBaselineElementTypeConstraintTest,
+    MaximumBaselineContraintTypes, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<MaximumOp, ConcatTypes<PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Maximum, BinaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+using SupportedTypes = ConcatTypes<BoolTestType, ArithmeticTestTypes>;
+
+template <class T>
+struct MaximumTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(MaximumTest, SupportedTypes, TestParamNames);
+
+TYPED_TEST(MaximumTest, ArithmeticTestTypesTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> lhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-50, /*max=*/50);
+  Vector<StorageT> rhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/1, /*max=*/5);
+  Vector<StorageT> output_data(shape.NumElements());
+  Tensor lhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = lhs_data.data()};
+  Tensor rhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = rhs_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(lhs_data, rhs_data, expected_data.begin(), Maximum());
+
+  auto op = Create(MaximumOp::Attributes{});
+  ASSERT_OK(Prepare(op, lhs_tensor, rhs_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, lhs_tensor, rhs_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(FloatEq(), expected_data));
+}
+
+template <class T>
+struct QuantizedMaximumTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(QuantizedMaximumTest, QuantizedTestTypes, TestParamNames);
+
+TYPED_TEST(QuantizedMaximumTest, PerTensorWorks) {
+  using StorageT = typename TypeParam::StorageT;
+  using ExpressedT = typename TypeParam::ExpressedT;
+
+  const Shape shape({2, 3, 4});
+  const ExpressedT scale = static_cast<ExpressedT>(1.5);
+  const StorageT zero_point = static_cast<StorageT>(2);
+  Vector<StorageT> lhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-50, /*max=*/50);
+  Vector<StorageT> rhs_data = RandomBuffer<TypeParam::kStorage>(
+      shape, /*min=*/zero_point + 1, /*max=*/zero_point + 5);
+  Vector<StorageT> output_data(shape.NumElements());
+  const QuantizedTensorElementType tensor_type =
+      QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                            TypeParam::kExpressed>(scale,
+                                                                   zero_point);
+  Tensor lhs_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = lhs_data.data()};
+  Tensor rhs_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = rhs_data.data()};
+  Tensor output_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(
+      lhs_data, rhs_data, expected_data.begin(),
+      [zero_point, scale](auto lhs, auto rhs) {
+        const ExpressedT dequantized_lhs = Dequantize(lhs, zero_point, scale);
+        const ExpressedT dequantized_rhs = Dequantize(rhs, zero_point, scale);
+        const ExpressedT dequantized_res =
+            Maximum()(dequantized_lhs, dequantized_rhs);
+        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
+            dequantized_res, zero_point, static_cast<ExpressedT>(1.) / scale);
+      });
+
+  auto op = Create(MaximumOp::Attributes{});
+  ASSERT_OK(Prepare(op, lhs_tensor, rhs_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, lhs_tensor, rhs_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(FloatEq(), expected_data));
+}
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/minimum.cc b/tensorflow/lite/experimental/shlo/ops/minimum.cc
new file mode 100644
index 00000000000000..c583d7afb6b147
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/minimum.cc
@@ -0,0 +1,66 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/minimum.h"
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/ops/binary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct Minimum {
+  template <class T>
+  constexpr auto operator()(const T a, const T b) {
+    return a < b ? a : b;
+  }
+};
+
+MinimumOp Create(MinimumOp::Attributes) { return {}; }
+
+absl::Status Prepare(MinimumOp& op, const Tensor& lhs, const Tensor& rhs,
+                     Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(lhs.shape(), rhs.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSupportedTypes(CheckCtx("minimum"), lhs, IsBoolTensor, IsIntTensor,
+                          IsFloatTensor, IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("minimum"), lhs, output));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("minimum"), rhs, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(MinimumOp& op, const Tensor& lhs, const Tensor& rhs,
+                      Tensor& output) {
+  Minimum minimum;
+  if (IsBoolTensor(lhs) || IsIntTensor(lhs) || IsFloatTensor(lhs)) {
+    // Note: all the arithmetic types share the same implementation.
+    DISPATCH_BOOL_INT_FLOAT(detail::EvaluateNoQuantization,
+                            lhs.tensor_element_type(), minimum, lhs, rhs,
+                            output);
+  } else if (IsQuantizedPerTensorTensor(lhs)) {
+    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor,
+                       lhs.quantized_tensor_element_type().StorageType(),
+                       lhs.quantized_tensor_element_type().ExpressedType(),
+                       minimum, lhs, rhs, output)
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.minimum: Unsupported tensor type.");
+}
+
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/minimum.h b/tensorflow/lite/experimental/shlo/ops/minimum.h
new file mode 100644
index 00000000000000..5fc2205566de9c
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/minimum.h
@@ -0,0 +1,36 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_MINIMUM_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_MINIMUM_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct MinimumOp {
+  struct Attributes {};
+};
+
+MinimumOp Create(MinimumOp::Attributes);
+absl::Status Prepare(MinimumOp& op, const Tensor& lhs, const Tensor& rhs,
+                     Tensor& output);
+absl::Status Evaluate(MinimumOp& op, const Tensor& lhs, const Tensor& rhs,
+                      Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_MINIMUM_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/minimum_test.cc b/tensorflow/lite/experimental/shlo/ops/minimum_test.cc
new file mode 100644
index 00000000000000..586c096f1cbab0
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/minimum_test.cc
@@ -0,0 +1,151 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/minimum.h"
+
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/ops/binary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/quantize.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::FloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<MinimumOp> {
+  static std::string Get() { return "Minimum"; }
+};
+
+struct Minimum {
+  template <class T>
+  constexpr auto operator()(const T a, const T b) {
+    return a < b ? a : b;
+  }
+};
+
+namespace {
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Minimum, BinaryElementwiseOpShapePropagationTest,
+                               MinimumOp, TestParamNames);
+
+using MinimumBaselineContraintTypes = BinaryElementwiseBaselineConstraintTypes<
+    MinimumOp, ConcatTypes<BoolTestType, BaselineConstraintIntTypes,
+                           BaselineConstraintFloatTypes,
+                           BaselineConstraintQuantizedPerTensorTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Minimum, BinaryElementwiseSameBaselineElementTypeConstraintTest,
+    MinimumBaselineContraintTypes, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<MinimumOp, ConcatTypes<PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Minimum, BinaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+using SupportedTypes = ConcatTypes<BoolTestType, ArithmeticTestTypes>;
+
+template <class T>
+struct MinimumTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(MinimumTest, SupportedTypes, TestParamNames);
+
+TYPED_TEST(MinimumTest, ArithmeticTestTypesTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> lhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-50, /*max=*/50);
+  Vector<StorageT> rhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/1, /*max=*/5);
+  Vector<StorageT> output_data(shape.NumElements());
+  Tensor lhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = lhs_data.data()};
+  Tensor rhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = rhs_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(lhs_data, rhs_data, expected_data.begin(), Minimum());
+
+  auto op = Create(MinimumOp::Attributes{});
+  ASSERT_OK(Prepare(op, lhs_tensor, rhs_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, lhs_tensor, rhs_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(FloatEq(), expected_data));
+}
+
+template <class T>
+struct QuantizedMinimumTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(QuantizedMinimumTest, QuantizedTestTypes, TestParamNames);
+
+TYPED_TEST(QuantizedMinimumTest, PerTensorWorks) {
+  using StorageT = typename TypeParam::StorageT;
+  using ExpressedT = typename TypeParam::ExpressedT;
+
+  const Shape shape({2, 3, 4});
+  const ExpressedT scale = static_cast<ExpressedT>(1.5);
+  const StorageT zero_point = static_cast<StorageT>(2);
+  Vector<StorageT> lhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-50, /*max=*/50);
+  Vector<StorageT> rhs_data = RandomBuffer<TypeParam::kStorage>(
+      shape, /*min=*/zero_point + 1, /*max=*/zero_point + 5);
+  Vector<StorageT> output_data(shape.NumElements());
+  const QuantizedTensorElementType tensor_type =
+      QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                            TypeParam::kExpressed>(scale,
+                                                                   zero_point);
+  Tensor lhs_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = lhs_data.data()};
+  Tensor rhs_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = rhs_data.data()};
+  Tensor output_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(
+      lhs_data, rhs_data, expected_data.begin(),
+      [zero_point, scale](auto lhs, auto rhs) {
+        const ExpressedT dequantized_lhs = Dequantize(lhs, zero_point, scale);
+        const ExpressedT dequantized_rhs = Dequantize(rhs, zero_point, scale);
+        const ExpressedT dequantized_res =
+            Minimum()(dequantized_lhs, dequantized_rhs);
+        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
+            dequantized_res, zero_point, static_cast<ExpressedT>(1.) / scale);
+      });
+
+  auto op = Create(MinimumOp::Attributes{});
+  ASSERT_OK(Prepare(op, lhs_tensor, rhs_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, lhs_tensor, rhs_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(FloatEq(), expected_data));
+}
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/multiply.cc b/tensorflow/lite/experimental/shlo/ops/multiply.cc
new file mode 100644
index 00000000000000..03898ff46367bd
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/multiply.cc
@@ -0,0 +1,77 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/multiply.h"
+
+#include <functional>
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/ops/binary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+template <DataType expressed_type>
+struct Multiply : std::multiplies<void> {};
+
+template <>
+struct Multiply<DataType::kI1> {
+  template <class T>
+  T operator()(const T& lhs, const T& rhs) const {
+    return static_cast<T>(lhs && rhs);
+  }
+};
+
+MultiplyOp Create(MultiplyOp::Attributes) { return {}; }
+
+absl::Status Prepare(MultiplyOp& op, const Tensor& lhs, const Tensor& rhs,
+                     Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(lhs.shape(), rhs.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSupportedTypes(CheckCtx("multiply"), lhs, IsBoolTensor, IsIntTensor,
+                          IsFloatTensor, IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("multiply"), lhs, output));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("multiply"), rhs, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(MultiplyOp& op, const Tensor& lhs, const Tensor& rhs,
+                      Tensor& output) {
+  if (IsBoolTensor(lhs)) {
+    detail::EvaluateNoQuantization<DataType::kI1>(Multiply<DataType::kI1>(),
+                                                  lhs, rhs, output);
+    return absl::OkStatus();
+  } else if (IsIntTensor(lhs) || IsFloatTensor(lhs)) {
+    // Note: all the arithmetic types share the same implementation.
+    Multiply<DataType::kF32> multiply;
+    DISPATCH_INT_FLOAT(detail::EvaluateNoQuantization,
+                       lhs.tensor_element_type(), multiply, lhs, rhs, output);
+  } else if (IsQuantizedPerTensorTensor(lhs)) {
+    Multiply<DataType::kF32> multiply;
+    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor,
+                       lhs.quantized_tensor_element_type().StorageType(),
+                       lhs.quantized_tensor_element_type().ExpressedType(),
+                       multiply, lhs, rhs, output)
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.multiply: Unsupported tensor type.");
+}
+
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/multiply.h b/tensorflow/lite/experimental/shlo/ops/multiply.h
new file mode 100644
index 00000000000000..1301969b578788
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/multiply.h
@@ -0,0 +1,36 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_MULTIPLY_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_MULTIPLY_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct MultiplyOp {
+  struct Attributes {};
+};
+
+MultiplyOp Create(MultiplyOp::Attributes);
+absl::Status Prepare(MultiplyOp& op, const Tensor& lhs, const Tensor& rhs,
+                     Tensor& output);
+absl::Status Evaluate(MultiplyOp& op, const Tensor& lhs, const Tensor& rhs,
+                      Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_MULTIPLY_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/multiply_test.cc b/tensorflow/lite/experimental/shlo/ops/multiply_test.cc
new file mode 100644
index 00000000000000..241ed9214e2edb
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/multiply_test.cc
@@ -0,0 +1,159 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/multiply.h"
+
+#include <functional>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/ops/binary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/quantize.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::FloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<MultiplyOp> {
+  static std::string Get() { return "Multiply"; }
+};
+
+template <DataType expressed_type>
+struct Multiply : std::multiplies<void> {};
+
+template <>
+struct Multiply<DataType::kI1> {
+  template <class T>
+  T operator()(const T& lhs, const T& rhs) const {
+    return static_cast<T>(lhs && rhs);
+  }
+};
+
+namespace {
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Multiply,
+                               BinaryElementwiseOpShapePropagationTest,
+                               MultiplyOp, TestParamNames);
+
+using MultipyBaselineContraintTypes = BinaryElementwiseBaselineConstraintTypes<
+    MultiplyOp, ConcatTypes<BoolTestType, BaselineConstraintIntTypes,
+                            BaselineConstraintFloatTypes,
+                            BaselineConstraintQuantizedPerTensorTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Multiply, BinaryElementwiseSameBaselineElementTypeConstraintTest,
+    MultipyBaselineContraintTypes, TestParamNames);
+
+using UnsupportedTypes = WithOpTypes<MultiplyOp, PerAxisQuantizedTestTypes>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Multiply, BinaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+using ArithmeticTypes = ConcatTypes<BoolTestType, ArithmeticTestTypes>;
+
+template <class T>
+struct MultiplyTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(MultiplyTest, ArithmeticTypes, TestParamNames);
+
+TYPED_TEST(MultiplyTest, ArithmeticTestTypesTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> lhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-5, /*max=*/5);
+  Vector<StorageT> rhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-5, /*max=*/5);
+  Vector<StorageT> output_data(shape.NumElements());
+
+  Tensor lhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = lhs_data.data()};
+  Tensor rhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = rhs_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(lhs_data, rhs_data, expected_data.begin(),
+                    Multiply<TypeParam::kStorage>());
+
+  auto op = Create(MultiplyOp::Attributes{});
+  ASSERT_OK(Prepare(op, lhs_tensor, rhs_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, lhs_tensor, rhs_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(FloatEq(), expected_data));
+}
+
+template <class T>
+struct QuantizedMultiplyTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(QuantizedMultiplyTest, QuantizedTestTypes, TestParamNames);
+
+TYPED_TEST(QuantizedMultiplyTest, PerTensorWorks) {
+  using StorageT = typename TypeParam::StorageT;
+  using ExpressedT = typename TypeParam::ExpressedT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> lhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-5, /*max=*/5);
+  Vector<StorageT> rhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-5, /*max=*/5);
+  Vector<StorageT> output_data(shape.NumElements());
+  const ExpressedT scale = static_cast<ExpressedT>(1.5);
+  const StorageT zero_point = static_cast<StorageT>(5);
+  const QuantizedTensorElementType tensor_type =
+      QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                            TypeParam::kExpressed>(scale,
+                                                                   zero_point);
+  Tensor lhs_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = lhs_data.data()};
+  Tensor rhs_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = rhs_data.data()};
+  Tensor output_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(
+      lhs_data, rhs_data, expected_data.begin(),
+      [zero_point, scale](auto lhs, auto rhs) {
+        const ExpressedT dequantized_lhs = Dequantize(lhs, zero_point, scale);
+        const ExpressedT dequantized_rhs = Dequantize(rhs, zero_point, scale);
+        const ExpressedT dequantized_res =
+            Multiply<TypeParam::kExpressed>()(dequantized_lhs, dequantized_rhs);
+        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
+            dequantized_res, zero_point, static_cast<ExpressedT>(1.) / scale);
+      });
+
+  auto op = Create(MultiplyOp::Attributes{});
+  ASSERT_OK(Prepare(op, lhs_tensor, rhs_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, lhs_tensor, rhs_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(FloatEq(), expected_data));
+}
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/negate.cc b/tensorflow/lite/experimental/shlo/ops/negate.cc
new file mode 100644
index 00000000000000..079d3faf0b21bb
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/negate.cc
@@ -0,0 +1,57 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/negate.h"
+
+#include <functional>
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct Negate : std::negate<void> {};
+
+NegateOp Create(NegateOp::Attributes) { return {}; }
+
+absl::Status Prepare(NegateOp& op, const Tensor& input, Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(CheckCtx("negate"), input,
+                                               IsSignedIntTensor, IsFloatTensor,
+                                               IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("negate"), input, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(NegateOp& op, const Tensor& input, Tensor& output) {
+  Negate negate;
+  if (input.IsPerTensorQuantized()) {
+    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor,
+                       input.quantized_tensor_element_type().StorageType(),
+                       input.quantized_tensor_element_type().ExpressedType(),
+                       negate, input, output)
+  } else if (IsSignedIntTensor(input) || IsFloatTensor(input)) {
+    DISPATCH_INT_FLOAT(detail::EvaluateNoQuantization,
+                       input.tensor_element_type(), negate, input, output);
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.negate: Unsupported tensor type.");
+}
+
+};  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/negate.h b/tensorflow/lite/experimental/shlo/ops/negate.h
new file mode 100644
index 00000000000000..07fc6329d1dbbe
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/negate.h
@@ -0,0 +1,34 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_NEGATE_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_NEGATE_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct NegateOp {
+  struct Attributes {};
+};
+
+NegateOp Create(NegateOp::Attributes);
+absl::Status Prepare(NegateOp& op, const Tensor& input, Tensor& output);
+absl::Status Evaluate(NegateOp& op, const Tensor& input, Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_NEGATE_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/negate_test.cc b/tensorflow/lite/experimental/shlo/ops/negate_test.cc
new file mode 100644
index 00000000000000..4786dfe813cbca
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/negate_test.cc
@@ -0,0 +1,129 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/negate.h"
+
+#include <functional>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/quantize.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::ElementsAreArray;
+using testing::NanSensitiveFloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<NegateOp> {
+  static std::string Get() { return "Negate"; }
+};
+
+namespace {
+
+struct Negate : std::negate<void> {
+} negate_ref;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Negate, UnaryElementwiseOpShapePropagationTest,
+                               NegateOp, TestParamNames);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Negate, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    UnaryElementwiseConstraint1Types<NegateOp>, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<NegateOp, ConcatTypes<BoolTestType, PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Negate, UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+template <class T>
+struct NegateTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(NegateTest, ArithmeticTestTypes, TestParamNames);
+
+TYPED_TEST(NegateTest, ArithmeticTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
+  Vector<StorageT> output_data(shape.NumElements());
+
+  Tensor input_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(input_data, expected_data.begin(), negate_ref);
+
+  auto op = Create(NegateOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(NanSensitiveFloatEq(), expected_data));
+}
+
+template <class T>
+struct QuantizedNegateTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(QuantizedNegateTest, QuantizedTestTypes, TestParamNames);
+
+TYPED_TEST(QuantizedNegateTest, PerTensorWorks) {
+  using StorageT = typename TypeParam::StorageT;
+  using ExpressedT = typename TypeParam::ExpressedT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
+  Vector<StorageT> output_data(shape.NumElements());
+  const ExpressedT scale = static_cast<ExpressedT>(1.5);
+  const StorageT zero_point = static_cast<StorageT>(5);
+  const QuantizedTensorElementType tensor_type =
+      QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                            TypeParam::kExpressed>(scale,
+                                                                   zero_point);
+  Tensor input_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(
+      input_data, expected_data.begin(), [zero_point, scale](auto v) {
+        const ExpressedT dequantized_input = Dequantize(v, zero_point, scale);
+        const ExpressedT dequantized_res = negate_ref(dequantized_input);
+        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
+            dequantized_res, zero_point, static_cast<ExpressedT>(1.) / scale);
+      });
+
+  auto op = Create(NegateOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, ElementsAreArray(expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/not.cc b/tensorflow/lite/experimental/shlo/ops/not.cc
new file mode 100644
index 00000000000000..b8d613309b9010
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/not.cc
@@ -0,0 +1,59 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/not.h"
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct Not {
+  template <class T>
+  T operator()(T v) const {
+    return ~v;
+  }
+};
+
+template <>
+bool Not::operator()(bool v) const {
+  return !v;
+}
+
+NotOp Create(NotOp::Attributes) { return {}; }
+
+absl::Status Prepare(NotOp& op, const Tensor& input, Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSupportedTypes(CheckCtx("not"), input, IsBoolTensor, IsIntTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("not"), input, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(NotOp& op, const Tensor& input, Tensor& output) {
+  Not not_func;
+  if (IsIntTensor(input) || IsBoolTensor(input)) {
+    DISPATCH_BOOL_INT(detail::EvaluateNoQuantization,
+                      input.tensor_element_type(), not_func, input, output);
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.not: Unsupported tensor type.");
+}
+
+};  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/not.h b/tensorflow/lite/experimental/shlo/ops/not.h
new file mode 100644
index 00000000000000..c05bc0ef0ae7a4
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/not.h
@@ -0,0 +1,34 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_NOT_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_NOT_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct NotOp {
+  struct Attributes {};
+};
+
+NotOp Create(NotOp::Attributes);
+absl::Status Prepare(NotOp& op, const Tensor& input, Tensor& output);
+absl::Status Evaluate(NotOp& op, const Tensor& input, Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_NOT_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/not_test.cc b/tensorflow/lite/experimental/shlo/ops/not_test.cc
new file mode 100644
index 00000000000000..6a036867895f84
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/not_test.cc
@@ -0,0 +1,103 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/not.h"
+
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::NanSensitiveFloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<NotOp> {
+  static std::string Get() { return "Not"; }
+};
+
+template <>
+struct SupportedOpDataType<NotOp> {
+  static constexpr DataType kStorageType = DataType::kSI32;
+};
+
+namespace {
+
+struct Not {
+  template <class T>
+  T operator()(T v) const {
+    return ~v;
+  }
+} not_ref;
+
+template <>
+bool Not::operator()(bool v) const {
+  return !v;
+}
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Not, UnaryElementwiseOpShapePropagationTest,
+                               NotOp, TestParamNames);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Not, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    BaselineMismatchSignedIntegerTypes<NotOp>, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<NotOp, ConcatTypes<FloatTestTypes, PerTensorQuantizedTestTypes,
+                                   PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Not, UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+template <class T>
+struct BoolAndIntNotTest : ::testing::Test {};
+
+using SupportedTypes = ConcatTypes<BoolTestType, IntTestTypes>;
+
+TYPED_TEST_SUITE(BoolAndIntNotTest, SupportedTypes, TestParamNames);
+
+TYPED_TEST(BoolAndIntNotTest, BoolAndIntTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
+  Vector<StorageT> output_data(shape.NumElements());
+
+  Tensor input_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(input_data, expected_data.begin(), not_ref);
+
+  auto op = Create(NotOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(NanSensitiveFloatEq(), expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/or.cc b/tensorflow/lite/experimental/shlo/ops/or.cc
new file mode 100644
index 00000000000000..639bcc898afb63
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/or.cc
@@ -0,0 +1,63 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions or
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/or.h"
+
+#include <functional>
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/ops/binary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+template <DataType>
+struct Or : std::bit_or<void> {};
+
+template <>
+struct Or<DataType::kI1> : std::logical_or<void> {};
+
+OrOp Create(OrOp::Attributes) { return {}; }
+
+absl::Status Prepare(OrOp& op, const Tensor& lhs, const Tensor& rhs,
+                     Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(lhs.shape(), rhs.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSupportedTypes(CheckCtx("or"), lhs, IsBoolTensor, IsIntTensor));
+  SHLO_REF_RETURN_ON_ERROR(CheckSameBaselineType(CheckCtx("or"), lhs, output));
+  SHLO_REF_RETURN_ON_ERROR(CheckSameBaselineType(CheckCtx("or"), rhs, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(OrOp& op, const Tensor& lhs, const Tensor& rhs,
+                      Tensor& output) {
+  if (IsIntTensor(lhs)) {
+    // Note: all the integer types share the same implementation.
+    Or<DataType::kSI32> or_func;
+    DISPATCH_INT(detail::EvaluateNoQuantization, lhs.tensor_element_type(),
+                 or_func, lhs, rhs, output);
+  } else if (IsBoolTensor(lhs)) {
+    Or<DataType::kI1> or_func;
+    detail::EvaluateNoQuantization<DataType::kI1>(or_func, lhs, rhs, output);
+    return absl::OkStatus();
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.or: Unsupported tensor type in Evaluate.");
+}
+
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/or.h b/tensorflow/lite/experimental/shlo/ops/or.h
new file mode 100644
index 00000000000000..f9201c9327acf8
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/or.h
@@ -0,0 +1,36 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions or
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_OR_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_OR_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct OrOp {
+  struct Attributes {};
+};
+
+OrOp Create(OrOp::Attributes);
+absl::Status Prepare(OrOp& op, const Tensor& lhs, const Tensor& rhs,
+                     Tensor& output);
+absl::Status Evaluate(OrOp& op, const Tensor& lhs, const Tensor& rhs,
+                      Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_OR_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/or_test.cc b/tensorflow/lite/experimental/shlo/ops/or_test.cc
new file mode 100644
index 00000000000000..b0fcba048d1a50
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/or_test.cc
@@ -0,0 +1,107 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions or
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/or.h"
+
+#include <functional>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/ops/binary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::FloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<OrOp> {
+  static std::string Get() { return "Or"; }
+};
+
+template <DataType>
+struct Or : std::bit_or<void> {};
+
+template <>
+struct Or<DataType::kI1> : std::logical_or<void> {};
+
+template <>
+struct SupportedOpDataType<OrOp> {
+  static constexpr DataType kStorageType = DataType::kSI32;
+};
+
+namespace {
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Or, BinaryElementwiseOpShapePropagationTest,
+                               OrOp, TestParamNames);
+
+using MultipyBaselineContraintTypes = BinaryElementwiseBaselineConstraintTypes<
+    OrOp, ConcatTypes<BoolTestType, BaselineConstraintIntTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Or, BinaryElementwiseSameBaselineElementTypeConstraintTest,
+    MultipyBaselineContraintTypes, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<OrOp, ConcatTypes<FloatTestTypes, PerTensorQuantizedTestTypes,
+                                  PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Or, BinaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+using SupportedTypes = ConcatTypes<BoolTestType, IntTestTypes>;
+
+template <class T>
+struct OrTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(OrTest, SupportedTypes, TestParamNames);
+
+TYPED_TEST(OrTest, ArithmeticTestTypesTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> lhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-50, /*max=*/50);
+  Vector<StorageT> rhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/1, /*max=*/5);
+  Vector<StorageT> output_data(shape.NumElements());
+  Tensor lhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = lhs_data.data()};
+  Tensor rhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = rhs_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(lhs_data, rhs_data, expected_data.begin(),
+                    Or<TypeParam::kStorage>());
+
+  auto op = Create(OrOp::Attributes{});
+  ASSERT_OK(Prepare(op, lhs_tensor, rhs_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, lhs_tensor, rhs_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(FloatEq(), expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/popcnt.cc b/tensorflow/lite/experimental/shlo/ops/popcnt.cc
new file mode 100644
index 00000000000000..0de4f84e71fb4f
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/popcnt.cc
@@ -0,0 +1,57 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/popcnt.h"
+
+#include <type_traits>
+
+#include "absl/numeric/bits.h"
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct Popcnt {
+  template <class T>
+  T operator()(T v) const {
+    return absl::popcount(static_cast<std::make_unsigned_t<T>>(v));
+  }
+};
+
+PopcntOp Create(PopcntOp::Attributes) { return {}; }
+
+absl::Status Prepare(PopcntOp& op, const Tensor& input, Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSupportedTypes(CheckCtx("popcnt"), input, IsIntTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("popcnt"), input, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(PopcntOp& op, const Tensor& input, Tensor& output) {
+  Popcnt popcnt;
+  if (IsIntTensor(input)) {
+    DISPATCH_INT(detail::EvaluateNoQuantization, input.tensor_element_type(),
+                 popcnt, input, output);
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.popcnt: Unsupported tensor type.");
+}
+
+};  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/popcnt.h b/tensorflow/lite/experimental/shlo/ops/popcnt.h
new file mode 100644
index 00000000000000..56f3c3b6f441df
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/popcnt.h
@@ -0,0 +1,34 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_POPCNT_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_POPCNT_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct PopcntOp {
+  struct Attributes {};
+};
+
+PopcntOp Create(PopcntOp::Attributes);
+absl::Status Prepare(PopcntOp& op, const Tensor& input, Tensor& output);
+absl::Status Evaluate(PopcntOp& op, const Tensor& input, Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_POPCNT_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/popcnt_test.cc b/tensorflow/lite/experimental/shlo/ops/popcnt_test.cc
new file mode 100644
index 00000000000000..3be3bdb39ac4ff
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/popcnt_test.cc
@@ -0,0 +1,99 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/popcnt.h"
+
+#include <string>
+#include <type_traits>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/numeric/bits.h"
+#include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::NanSensitiveFloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<PopcntOp> {
+  static std::string Get() { return "Popcnt"; }
+};
+
+template <>
+struct SupportedOpDataType<PopcntOp> {
+  static constexpr DataType kStorageType = DataType::kSI32;
+};
+
+namespace {
+
+struct Popcnt {
+  template <class T>
+  T operator()(T v) const {
+    return absl::popcount(static_cast<std::make_unsigned_t<T>>(v));
+  }
+} popcnt_ref;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Popcnt, UnaryElementwiseOpShapePropagationTest,
+                               PopcntOp, TestParamNames);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Popcnt, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    BaselineMismatchSignedIntegerTypes<PopcntOp>, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<PopcntOp, ConcatTypes<BoolTestType, FloatTestTypes,
+                                      PerTensorQuantizedTestTypes,
+                                      PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Popcnt, UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+template <class T>
+struct PopcntTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(PopcntTest, IntTestTypes, TestParamNames);
+
+TYPED_TEST(PopcntTest, IntTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = IotaBuffer<TypeParam::kStorage>(shape, -12);
+  Vector<StorageT> output_data(shape.NumElements());
+
+  Tensor input_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(input_data, expected_data.begin(), popcnt_ref);
+
+  auto op = Create(PopcntOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(NanSensitiveFloatEq(), expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/sign.cc b/tensorflow/lite/experimental/shlo/ops/sign.cc
new file mode 100644
index 00000000000000..197b87ba6bb3bc
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/sign.cc
@@ -0,0 +1,75 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/sign.h"
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct Sign {
+  template <class T>
+  T operator()(T v) const {
+    constexpr T one = static_cast<T>(1);
+    constexpr T zero = static_cast<T>(0);
+    return v < zero ? -one : (v > zero ? one : v);
+  }
+};
+
+template <>
+F16 Sign::operator()(F16 v) const {
+  return static_cast<F16>(operator()(static_cast<float>(v)));
+}
+
+template <>
+BF16 Sign::operator()(BF16 v) const {
+  return static_cast<BF16>(operator()(static_cast<float>(v)));
+}
+
+SignOp Create(SignOp::Attributes) { return {}; }
+
+absl::Status Prepare(SignOp& op, const Tensor& input, Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
+
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(CheckCtx("sign"), input,
+                                               IsSignedIntTensor, IsFloatTensor,
+                                               IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("sign"), input, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(SignOp& op, const Tensor& input, Tensor& output) {
+  Sign sign;
+  if (input.IsPerTensorQuantized()) {
+    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor,
+                       input.quantized_tensor_element_type().StorageType(),
+                       input.quantized_tensor_element_type().ExpressedType(),
+                       sign, input, output)
+  } else if (IsSignedIntTensor(input) || IsFloatTensor(input)) {
+    DISPATCH_INT_FLOAT(detail::EvaluateNoQuantization,
+                       input.tensor_element_type(), sign, input, output);
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.sign: Unsupported tensor type.");
+}
+
+};  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/sign.h b/tensorflow/lite/experimental/shlo/ops/sign.h
new file mode 100644
index 00000000000000..70bdb71ec4035b
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/sign.h
@@ -0,0 +1,34 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_SIGN_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_SIGN_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct SignOp {
+  struct Attributes {};
+};
+
+SignOp Create(SignOp::Attributes);
+absl::Status Prepare(SignOp& op, const Tensor& input, Tensor& output);
+absl::Status Evaluate(SignOp& op, const Tensor& input, Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_SIGN_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/sign_test.cc b/tensorflow/lite/experimental/shlo/ops/sign_test.cc
new file mode 100644
index 00000000000000..67ec823482036a
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/sign_test.cc
@@ -0,0 +1,146 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/sign.h"
+
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/quantize.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::ElementsAreArray;
+using testing::NanSensitiveFloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<SignOp> {
+  static std::string Get() { return "Sign"; }
+};
+
+namespace {
+
+struct Sign {
+  template <class T>
+  T operator()(T v) const {
+    constexpr T one = static_cast<T>(1);
+    constexpr T zero = static_cast<T>(0);
+    return v < zero ? -one : (v > zero ? one : v);
+  }
+} sign_ref;
+
+template <>
+F16 Sign::operator()(F16 v) const {
+  return static_cast<F16>(operator()(static_cast<float>(v)));
+}
+
+template <>
+BF16 Sign::operator()(BF16 v) const {
+  return static_cast<BF16>(operator()(static_cast<float>(v)));
+}
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Sign, UnaryElementwiseOpShapePropagationTest,
+                               SignOp, TestParamNames);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Sign, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    UnaryElementwiseConstraint1Types<SignOp>, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<SignOp, ConcatTypes<BoolTestType, PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Sign, UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+template <class T>
+struct SignTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(SignTest, ArithmeticTestTypes, TestParamNames);
+
+TYPED_TEST(SignTest, ArithmeticTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
+  Vector<StorageT> output_data(shape.NumElements());
+
+  Tensor input_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(input_data, expected_data.begin(), sign_ref);
+
+  auto op = Create(SignOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(NanSensitiveFloatEq(), expected_data));
+}
+
+template <class T>
+struct QuantizedSignTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(QuantizedSignTest, QuantizedTestTypes, TestParamNames);
+
+TYPED_TEST(QuantizedSignTest, PerTensorWorks) {
+  using StorageT = typename TypeParam::StorageT;
+  using ExpressedT = typename TypeParam::ExpressedT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
+  Vector<StorageT> output_data(shape.NumElements());
+  const ExpressedT scale = static_cast<ExpressedT>(1.5);
+  const StorageT zero_point = static_cast<StorageT>(5);
+  const QuantizedTensorElementType tensor_type =
+      QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                            TypeParam::kExpressed>(scale,
+                                                                   zero_point);
+  Tensor input_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(
+      input_data, expected_data.begin(), [zero_point, scale](auto v) {
+        const ExpressedT dequantized_input = Dequantize(v, zero_point, scale);
+        const ExpressedT dequantized_res = sign_ref(dequantized_input);
+        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
+            dequantized_res, zero_point, static_cast<ExpressedT>(1.) / scale);
+      });
+
+  auto op = Create(SignOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, ElementsAreArray(expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/sine.cc b/tensorflow/lite/experimental/shlo/ops/sine.cc
new file mode 100644
index 00000000000000..e69d98e07c1517
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/sine.cc
@@ -0,0 +1,73 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/sine.h"
+
+#include <cmath>
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct Sine {
+  template <class T>
+  T operator()(T v) const {
+    return std::sin(v);
+  }
+};
+
+template <>
+F16 Sine::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
+
+template <>
+BF16 Sine::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
+
+SineOp Create(SineOp::Attributes) { return {}; }
+
+absl::Status Prepare(SineOp& op, const Tensor& input, Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(
+      CheckCtx("sine"), input, IsFloatTensor, IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("sine"), input, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(SineOp& op, const Tensor& input, Tensor& output) {
+  Sine sine;
+  if (input.IsPerTensorQuantized()) {
+    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor,
+                       input.quantized_tensor_element_type().StorageType(),
+                       input.quantized_tensor_element_type().ExpressedType(),
+                       sine, input, output)
+  } else if (!input.IsQuantized() && IsFloat(input.StorageType())) {
+    DISPATCH_FLOAT(detail::EvaluateNoQuantization, input.tensor_element_type(),
+                   sine, input, output);
+  }
+  return absl::FailedPreconditionError("Unsupported tensor type.");
+}
+
+};  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/sine.h b/tensorflow/lite/experimental/shlo/ops/sine.h
new file mode 100644
index 00000000000000..66ca9a0a55f8b3
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/sine.h
@@ -0,0 +1,34 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_SINE_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_SINE_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct SineOp {
+  struct Attributes {};
+};
+
+SineOp Create(SineOp::Attributes);
+absl::Status Prepare(SineOp& op, const Tensor& input, Tensor& output);
+absl::Status Evaluate(SineOp& op, const Tensor& input, Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_SINE_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/sine_test.cc b/tensorflow/lite/experimental/shlo/ops/sine_test.cc
new file mode 100644
index 00000000000000..fa16dee3b1d27f
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/sine_test.cc
@@ -0,0 +1,138 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/sine.h"
+
+#include <cmath>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/quantize.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::ElementsAreArray;
+using testing::NanSensitiveFloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<SineOp> {
+  static std::string Get() { return "Sine"; }
+};
+
+namespace {
+
+struct Sine {
+  template <class T>
+  T operator()(T v) const {
+    return std::sin(v);
+  }
+} sine_ref;
+
+template <>
+F16 Sine::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
+
+template <>
+BF16 Sine::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Sine, UnaryElementwiseOpShapePropagationTest,
+                               SineOp, TestParamNames);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Sine, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    UnaryElementwiseConstraint1Types<SineOp>, TestParamNames);
+
+using UnsupportedTypes = WithOpTypes<
+    SineOp, ConcatTypes<BoolTestType, IntTestTypes, PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Sine, UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+template <class T>
+struct FloatSineTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(FloatSineTest, FloatTestTypes, TestParamNames);
+
+TYPED_TEST(FloatSineTest, FloatTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
+  Vector<StorageT> output_data(shape.NumElements());
+  const TensorType tensor_type =
+      TensorType{.shape = shape, .element_type = TypeParam::kStorage};
+  Tensor input_tensor{.type = tensor_type, .data = input_data.data()};
+  Tensor output_tensor{.type = tensor_type, .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(input_data, expected_data.begin(), sine_ref);
+
+  auto op = Create(SineOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(NanSensitiveFloatEq(), expected_data));
+}
+
+template <class T>
+struct QuantizedSineTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(QuantizedSineTest, QuantizedTestTypes, TestParamNames);
+
+TYPED_TEST(QuantizedSineTest, PerTensorWorks) {
+  using StorageT = typename TypeParam::StorageT;
+  using ExpressedT = typename TypeParam::ExpressedT;
+
+  const Shape shape({2, 3, 4});
+  const ExpressedT scale = static_cast<ExpressedT>(1.5);
+  const StorageT zero_point = static_cast<StorageT>(5);
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
+  Vector<StorageT> output_data(shape.NumElements());
+  const QuantizedTensorType tensor_type = {
+      .shape = shape,
+      .element_type = QuantizedTensorElementType::PerTensor<
+          TypeParam::kStorage, TypeParam::kExpressed>(scale, zero_point)};
+  Tensor input_tensor{.type = tensor_type, .data = input_data.data()};
+  Tensor output_tensor{.type = tensor_type, .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(
+      input_data, expected_data.begin(), [zero_point, scale](auto v) {
+        const ExpressedT dequantized_input = Dequantize(v, zero_point, scale);
+        const ExpressedT dequantized_res = sine_ref(dequantized_input);
+        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
+            dequantized_res, zero_point, static_cast<ExpressedT>(1.) / scale);
+      });
+
+  auto op = Create(SineOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, ElementsAreArray(expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/sqrt.cc b/tensorflow/lite/experimental/shlo/ops/sqrt.cc
new file mode 100644
index 00000000000000..f34841fc9cb874
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/sqrt.cc
@@ -0,0 +1,73 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/sqrt.h"
+
+#include <cmath>
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct Sqrt {
+  template <class T>
+  T operator()(T v) const {
+    return std::sqrt(v);
+  }
+};
+
+template <>
+F16 Sqrt::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
+
+template <>
+BF16 Sqrt::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
+
+SqrtOp Create(SqrtOp::Attributes) { return {}; }
+
+absl::Status Prepare(SqrtOp& op, const Tensor& input, Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(
+      CheckCtx("sqrt"), input, IsFloatTensor, IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("sqrt"), input, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(SqrtOp& op, const Tensor& input, Tensor& output) {
+  Sqrt sqrt;
+  if (input.IsPerTensorQuantized()) {
+    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor,
+                       input.quantized_tensor_element_type().StorageType(),
+                       input.quantized_tensor_element_type().ExpressedType(),
+                       sqrt, input, output)
+  } else if (IsFloatTensor(input)) {
+    DISPATCH_FLOAT(detail::EvaluateNoQuantization, input.tensor_element_type(),
+                   sqrt, input, output);
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.sqrt: Unsupported tensor type.");
+}
+
+};  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/sqrt.h b/tensorflow/lite/experimental/shlo/ops/sqrt.h
new file mode 100644
index 00000000000000..6955efff8d5a42
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/sqrt.h
@@ -0,0 +1,34 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_SQRT_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_SQRT_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct SqrtOp {
+  struct Attributes {};
+};
+
+SqrtOp Create(SqrtOp::Attributes);
+absl::Status Prepare(SqrtOp& op, const Tensor& input, Tensor& output);
+absl::Status Evaluate(SqrtOp& op, const Tensor& input, Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_SQRT_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/sqrt_test.cc b/tensorflow/lite/experimental/shlo/ops/sqrt_test.cc
new file mode 100644
index 00000000000000..937c871b93745f
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/sqrt_test.cc
@@ -0,0 +1,148 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/sqrt.h"
+
+#include <cmath>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/quantize.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::ElementsAreArray;
+using testing::NanSensitiveFloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<SqrtOp> {
+  static std::string Get() { return "Sqrt"; }
+};
+
+namespace {
+
+struct Sqrt {
+  template <class T>
+  T operator()(T v) const {
+    return std::sqrt(v);
+  }
+
+} sqrt_ref;
+
+template <>
+F16 Sqrt::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
+
+template <>
+BF16 Sqrt::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Sqrt, UnaryElementwiseOpShapePropagationTest,
+                               SqrtOp, TestParamNames);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Sqrt, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    UnaryElementwiseConstraint1Types<SqrtOp>, TestParamNames);
+
+using UnsupportedTypes = WithOpTypes<
+    SqrtOp, ConcatTypes<BoolTestType, IntTestTypes, PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Sqrt, UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+template <class T>
+struct FloatSqrtTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(FloatSqrtTest, FloatTestTypes, TestParamNames);
+
+TYPED_TEST(FloatSqrtTest, FloatTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(
+      shape, /*min=*/static_cast<StorageT>(0));
+  Vector<StorageT> output_data(shape.NumElements());
+
+  Tensor input_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(input_data, expected_data.begin(), sqrt_ref);
+
+  auto op = Create(SqrtOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(NanSensitiveFloatEq(), expected_data));
+}
+
+template <class T>
+struct QuantizedSqrtTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(QuantizedSqrtTest, QuantizedTestTypes, TestParamNames);
+
+TYPED_TEST(QuantizedSqrtTest, PerTensorWorks) {
+  using StorageT = typename TypeParam::StorageT;
+  using ExpressedT = typename TypeParam::ExpressedT;
+
+  const Shape shape({2, 3, 4});
+  const ExpressedT scale = static_cast<ExpressedT>(1.5);
+  const StorageT zero_point = static_cast<StorageT>(5);
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(
+      shape, /*min=*/static_cast<StorageT>(zero_point));
+  Vector<StorageT> output_data(shape.NumElements());
+  const QuantizedTensorElementType tensor_type =
+      QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                            TypeParam::kExpressed>(scale,
+                                                                   zero_point);
+  Tensor input_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(
+      input_data, expected_data.begin(), [zero_point, scale](auto v) {
+        const ExpressedT dequantized_input = Dequantize(v, zero_point, scale);
+        const ExpressedT dequantized_res = sqrt_ref(dequantized_input);
+        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
+            dequantized_res, zero_point, static_cast<ExpressedT>(1.) / scale);
+      });
+
+  auto op = Create(SqrtOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, ElementsAreArray(expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/subtract.cc b/tensorflow/lite/experimental/shlo/ops/subtract.cc
new file mode 100644
index 00000000000000..0c097afaebb578
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/subtract.cc
@@ -0,0 +1,62 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/subtract.h"
+
+#include <functional>
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/ops/binary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct Subtract : std::minus<void> {};
+
+SubtractOp Create(SubtractOp::Attributes) { return {}; }
+
+absl::Status Prepare(SubtractOp& op, const Tensor& lhs, const Tensor& rhs,
+                     Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(lhs.shape(), rhs.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(CheckCtx("subtract"), lhs,
+                                               IsIntTensor, IsFloatTensor,
+                                               IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("subtract"), lhs, output));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("subtract"), rhs, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(SubtractOp& op, const Tensor& lhs, const Tensor& rhs,
+                      Tensor& output) {
+  Subtract subtract;
+  if (IsIntTensor(lhs) || IsFloatTensor(lhs)) {
+    // Note: all the arithmetic types share the same implementation.
+    DISPATCH_INT_FLOAT(detail::EvaluateNoQuantization,
+                       lhs.tensor_element_type(), subtract, lhs, rhs, output);
+  } else if (IsQuantizedPerTensorTensor(lhs)) {
+    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor,
+                       lhs.quantized_tensor_element_type().StorageType(),
+                       lhs.quantized_tensor_element_type().ExpressedType(),
+                       subtract, lhs, rhs, output)
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.subtract: Unsupported tensor type.");
+}
+
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/subtract.h b/tensorflow/lite/experimental/shlo/ops/subtract.h
new file mode 100644
index 00000000000000..f0a80dbef54350
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/subtract.h
@@ -0,0 +1,36 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_SUBTRACT_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_SUBTRACT_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct SubtractOp {
+  struct Attributes {};
+};
+
+SubtractOp Create(SubtractOp::Attributes);
+absl::Status Prepare(SubtractOp& op, const Tensor& lhs, const Tensor& rhs,
+                     Tensor& output);
+absl::Status Evaluate(SubtractOp& op, const Tensor& lhs, const Tensor& rhs,
+                      Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_SUBTRACT_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/subtract_test.cc b/tensorflow/lite/experimental/shlo/ops/subtract_test.cc
new file mode 100644
index 00000000000000..ed350d79d72150
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/subtract_test.cc
@@ -0,0 +1,149 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/subtract.h"
+
+#include <functional>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/ops/binary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/quantize.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::FloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<SubtractOp> {
+  static std::string Get() { return "Subtract"; }
+};
+
+struct Subtract : std::minus<void> {};
+
+namespace {
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Subtract,
+                               BinaryElementwiseOpShapePropagationTest,
+                               SubtractOp, TestParamNames);
+
+using MultipyBaselineContraintTypes = BinaryElementwiseBaselineConstraintTypes<
+    SubtractOp,
+    ConcatTypes<BaselineConstraintIntTypes, BaselineConstraintFloatTypes,
+                BaselineConstraintQuantizedPerTensorTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Subtract, BinaryElementwiseSameBaselineElementTypeConstraintTest,
+    MultipyBaselineContraintTypes, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<SubtractOp,
+                ConcatTypes<BoolTestType, PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Subtract, BinaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+using ArithmeticTypes = ConcatTypes<ArithmeticTestTypes>;
+
+template <class T>
+struct SubtractTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(SubtractTest, ArithmeticTypes, TestParamNames);
+
+TYPED_TEST(SubtractTest, ArithmeticTestTypesTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> lhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-50, /*max=*/50);
+  Vector<StorageT> rhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-5, /*max=*/5);
+  Vector<StorageT> output_data(shape.NumElements());
+  Tensor lhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = lhs_data.data()};
+  Tensor rhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = rhs_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(lhs_data, rhs_data, expected_data.begin(), Subtract());
+
+  auto op = Create(SubtractOp::Attributes{});
+  ASSERT_OK(Prepare(op, lhs_tensor, rhs_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, lhs_tensor, rhs_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(FloatEq(), expected_data));
+}
+
+template <class T>
+struct QuantizedSubtractTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(QuantizedSubtractTest, QuantizedTestTypes, TestParamNames);
+
+TYPED_TEST(QuantizedSubtractTest, PerTensorWorks) {
+  using StorageT = typename TypeParam::StorageT;
+  using ExpressedT = typename TypeParam::ExpressedT;
+
+  const Shape shape({2, 3, 4});
+  const ExpressedT scale = static_cast<ExpressedT>(1.5);
+  const StorageT zero_point = static_cast<StorageT>(2);
+  Vector<StorageT> lhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-50, /*max=*/50);
+  Vector<StorageT> rhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-5, /*max=*/5);
+  Vector<StorageT> output_data(shape.NumElements());
+  const QuantizedTensorElementType tensor_type =
+      QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                            TypeParam::kExpressed>(scale,
+                                                                   zero_point);
+  Tensor lhs_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = lhs_data.data()};
+  Tensor rhs_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = rhs_data.data()};
+  Tensor output_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(
+      lhs_data, rhs_data, expected_data.begin(),
+      [zero_point, scale](auto lhs, auto rhs) {
+        const ExpressedT dequantized_lhs = Dequantize(lhs, zero_point, scale);
+        const ExpressedT dequantized_rhs = Dequantize(rhs, zero_point, scale);
+        const ExpressedT dequantized_res =
+            Subtract()(dequantized_lhs, dequantized_rhs);
+        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
+            dequantized_res, zero_point, static_cast<ExpressedT>(1.) / scale);
+      });
+
+  auto op = Create(SubtractOp::Attributes{});
+  ASSERT_OK(Prepare(op, lhs_tensor, rhs_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, lhs_tensor, rhs_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(FloatEq(), expected_data));
+}
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/tanh.cc b/tensorflow/lite/experimental/shlo/ops/tanh.cc
new file mode 100644
index 00000000000000..d2518f6ba81b5f
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/tanh.cc
@@ -0,0 +1,74 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/tanh.h"
+
+#include <cmath>
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct Tanh {
+  template <class T>
+  T operator()(T v) const {
+    return std::tanh(v);
+  }
+};
+
+template <>
+F16 Tanh::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
+
+template <>
+BF16 Tanh::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
+
+TanhOp Create(TanhOp::Attributes) { return {}; }
+
+absl::Status Prepare(TanhOp& op, const Tensor& input, Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(input.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(CheckSupportedTypes(
+      CheckCtx("tanh"), input, IsFloatTensor, IsQuantizedPerTensorTensor));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSameBaselineType(CheckCtx("tanh"), input, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(TanhOp& op, const Tensor& input, Tensor& output) {
+  Tanh tanh;
+  if (input.IsPerTensorQuantized()) {
+    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerTensor,
+                       input.quantized_tensor_element_type().StorageType(),
+                       input.quantized_tensor_element_type().ExpressedType(),
+                       tanh, input, output)
+  } else if (!input.IsQuantized() && IsFloat(input.StorageType())) {
+    DISPATCH_FLOAT(detail::EvaluateNoQuantization, input.tensor_element_type(),
+                   tanh, input, output);
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.tanh: Unsupported tensor type.");
+}
+
+};  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/tanh.h b/tensorflow/lite/experimental/shlo/ops/tanh.h
new file mode 100644
index 00000000000000..fabf2b3ae0528c
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/tanh.h
@@ -0,0 +1,34 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_TANH_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_TANH_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct TanhOp {
+  struct Attributes {};
+};
+
+TanhOp Create(TanhOp::Attributes);
+absl::Status Prepare(TanhOp& op, const Tensor& input, Tensor& output);
+absl::Status Evaluate(TanhOp& op, const Tensor& input, Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_TANH_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/tanh_test.cc b/tensorflow/lite/experimental/shlo/ops/tanh_test.cc
new file mode 100644
index 00000000000000..0343a087b65a62
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/tanh_test.cc
@@ -0,0 +1,145 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/tanh.h"
+
+#include <cmath>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/bf16.h"
+#include "tensorflow/lite/experimental/shlo/f16.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/quantize.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::ElementsAreArray;
+using testing::NanSensitiveFloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<TanhOp> {
+  static std::string Get() { return "Tanh"; }
+};
+
+namespace {
+
+struct Tanh {
+  template <class T>
+  T operator()(T v) const {
+    return std::tanh(v);
+  }
+} tanh_ref;
+
+template <>
+F16 Tanh::operator()<F16>(F16 val) const {
+  return F16(operator()(static_cast<float>(val)));
+}
+
+template <>
+BF16 Tanh::operator()<BF16>(BF16 val) const {
+  return BF16(operator()(static_cast<float>(val)));
+}
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Tanh, UnaryElementwiseOpShapePropagationTest,
+                               TanhOp, TestParamNames);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Tanh, UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    UnaryElementwiseConstraint1Types<TanhOp>, TestParamNames);
+
+using UnsupportedTypes = WithOpTypes<
+    TanhOp, ConcatTypes<BoolTestType, IntTestTypes, PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Tanh, UnaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+template <class T>
+struct FloatTanhTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(FloatTanhTest, FloatTestTypes, TestParamNames);
+
+TYPED_TEST(FloatTanhTest, FloatTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
+  Vector<StorageT> output_data(shape.NumElements());
+
+  Tensor input_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(input_data, expected_data.begin(), tanh_ref);
+
+  auto op = Create(TanhOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(NanSensitiveFloatEq(), expected_data));
+}
+
+template <class T>
+struct QuantizedTanhTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(QuantizedTanhTest, QuantizedTestTypes, TestParamNames);
+
+TYPED_TEST(QuantizedTanhTest, PerTensorWorks) {
+  using StorageT = typename TypeParam::StorageT;
+  using ExpressedT = typename TypeParam::ExpressedT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> input_data = RandomBuffer<TypeParam::kStorage>(shape);
+  Vector<StorageT> output_data(shape.NumElements());
+  const ExpressedT scale = static_cast<ExpressedT>(1.5);
+  const StorageT zero_point = static_cast<StorageT>(5);
+  const QuantizedTensorElementType tensor_type =
+      QuantizedTensorElementType::PerTensor<TypeParam::kStorage,
+                                            TypeParam::kExpressed>(scale,
+                                                                   zero_point);
+  Tensor input_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = input_data.data()};
+  Tensor output_tensor{
+      .type = QuantizedTensorType{.shape = shape, .element_type = tensor_type},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(
+      input_data, expected_data.begin(), [zero_point, scale](auto v) {
+        const ExpressedT dequantized_input = Dequantize(v, zero_point, scale);
+        const ExpressedT dequantized_res = tanh_ref(dequantized_input);
+        return Quantize<TypeParam::kStorage, TypeParam::kExpressed>(
+            dequantized_res, zero_point, static_cast<ExpressedT>(1.) / scale);
+      });
+
+  auto op = Create(TanhOp::Attributes{});
+  ASSERT_OK(Prepare(op, input_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, input_tensor, output_tensor));
+  EXPECT_THAT(output_data, ElementsAreArray(expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/test_util.h b/tensorflow/lite/experimental/shlo/ops/test_util.h
index 9b64d10c8455f0..58f35dcb8c446d 100644
--- a/tensorflow/lite/experimental/shlo/ops/test_util.h
+++ b/tensorflow/lite/experimental/shlo/ops/test_util.h
@@ -16,73 +16,115 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_TEST_UTIL_H_
 #define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_TEST_UTIL_H_
 
+#include <cstdint>
 #include <random>
 #include <string>
+#include <tuple>
 #include <type_traits>
 
 #include <gtest/gtest.h>
 #include "absl/algorithm/container.h"
 #include "absl/container/inlined_vector.h"
 #include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/quantized_tensor_element_type.h"
 #include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
 
 namespace shlo_ref {
 
+// We use a vector class that is different from std::vector to have a consistent
+// API when dealing with bool tensors.
 template <class T>
 using Vector = absl::InlinedVector<T, 1>;
 
+// Helper for UniformDistribution.
 template <DataType storage_type, typename = void>
-struct Distribution;
+struct UniformDistributionImpl;
+
+template <>
+struct UniformDistributionImpl<DataType::kI1, void>
+    : std::uniform_int_distribution<int32_t> {
+  using std::uniform_int_distribution<int32_t>::uniform_int_distribution;
+};
 
 template <DataType storage_type>
-struct Distribution<storage_type, std::enable_if_t<IsInteger(storage_type)>>
+struct UniformDistributionImpl<storage_type,
+                               std::enable_if_t<IsInteger(storage_type)>>
     : std::uniform_int_distribution<typename Storage<storage_type>::Type> {
   using std::uniform_int_distribution<
       typename Storage<storage_type>::Type>::uniform_int_distribution;
 };
 
 template <DataType storage_type>
-struct Distribution<storage_type, std::enable_if_t<IsFloat(storage_type)>>
+struct UniformDistributionImpl<storage_type,
+                               std::enable_if_t<IsFloat(storage_type)>>
     : std::uniform_real_distribution<float> {
   using std::uniform_real_distribution<float>::uniform_real_distribution;
 };
 
-template <DataType storage_type, class Config = Storage<storage_type>>
-Vector<typename Config::Type> RandomBuffer(
-    const Shape& shape, const typename Config::Type min = Config::kMinValue,
-    const typename Config::Type max = Config::kMaxValue) {
+// Helps creating a uniform distribution for the given data type.
+template <DataType storage_type>
+using UniformDistribution = UniformDistributionImpl<storage_type>;
+
+// Returns a vector filled with random data according to the set distribution.
+template <DataType storage_type,
+          template <DataType> class Distribution = UniformDistribution,
+          class MinT = StorageType<storage_type>,
+          class MaxT = StorageType<storage_type>,
+          class Config = Storage<storage_type>>
+Vector<typename Config::Type> RandomBuffer(const Shape& shape,
+                                           const MinT min = Config::kMinValue,
+                                           const MaxT max = Config::kMaxValue) {
+  using StorageT = StorageType<storage_type>;
+  const StorageT min_val =
+      min > Config::kMinValue ? static_cast<StorageT>(min) : Config::kMinValue;
+  const StorageT max_val =
+      max < Config::kMaxValue ? static_cast<StorageT>(max) : Config::kMaxValue;
   Vector<typename Config::Type> vec(shape.NumElements());
   std::random_device rd;
-  Distribution<storage_type> dist(min, max);
+  Distribution<storage_type> dist(min_val, max_val);
   absl::c_generate(vec, [&] { return dist(rd); });
   return vec;
 }
 
+// Returns a vector filled with incremental value. The values wrap around
+// according to the storage type range.
 template <DataType storage_type, class Config = Storage<storage_type>>
 Vector<typename Config::Type> IotaBuffer(
     const Shape& shape, const typename Config::Type start = Config::kMinValue,
     const typename Config::Type min = Config::kMinValue,
     const typename Config::Type max = Config::kMaxValue) {
+  using StorageT = StorageType<storage_type>;
+  const StorageT min_val =
+      min > Config::kMinValue ? static_cast<StorageT>(min) : Config::kMinValue;
+  const StorageT max_val =
+      max < Config::kMaxValue ? static_cast<StorageT>(max) : Config::kMaxValue;
   Vector<typename Config::Type> vec(shape.NumElements());
-  auto v = start;
+  StorageT v = start >= min_val ? static_cast<StorageT>(start) : min_val;
+  v = v <= max_val ? v : min_val;
   for (auto& e : vec) {
     e = v;
-    if (++v > max) {
-      v = min;
+    if (v >= max_val) {
+      v = min_val;
+    } else {
+      ++v;
     }
   }
   return vec;
 }
 
+// Typed test parameter type.
 template <DataType... Types>
 struct TestParam;
 
+// Typed test parameter specialization for non quantized tensors.
 template <DataType storage_type>
 struct TestParam<storage_type> {
   static constexpr DataType kStorage = storage_type;
   using StorageT = StorageType<storage_type>;
 };
 
+// Typed test parameter specialization for quantized tensors.
 template <DataType storage_type, DataType expressed_type>
 struct TestParam<storage_type, expressed_type> {
   static constexpr DataType kStorage = storage_type;
@@ -91,6 +133,24 @@ struct TestParam<storage_type, expressed_type> {
   using ExpressedT = StorageType<expressed_type>;
 };
 
+// Typed test parameter tag to ask for a per-tensor quantized tensor.
+//
+// TestParamT should be a `TestParam<storage_type, expressed_type>`.
+template <class TestParamT>
+struct PerTensor {
+  using Param = TestParamT;
+};
+
+// Typed test parameter tag to ask for a per-channel quantized tensor.
+//
+// TestParamT should be a `TestParam<storage_type, expressed_type>`.
+template <class TestParamT, Axis kAxis = 0>
+struct PerAxis {
+  using Param = TestParamT;
+  static constexpr Axis axis = kAxis;
+};
+
+// Gets a string representation of the given DataType.
 constexpr const char* ToString(DataType t) {
   switch (t) {
     case DataType::kI1:
@@ -121,6 +181,7 @@ constexpr const char* ToString(DataType t) {
   return "Unknown data type";
 }
 
+// Helps getting a human readable typed test parameter name.
 template <class T>
 struct ParamName;
 
@@ -133,6 +194,34 @@ struct ParamName<TestParam<T, Ts...>> {
   }
 };
 
+template <DataType T, DataType... Ts>
+struct ParamName<PerTensor<TestParam<T, Ts...>>> {
+  static std::string Get() {
+    std::string name = std::string("PerTensor[") + ToString(T);
+    ((name += std::string("_") + ToString(Ts)), ...);
+    return name + "]";
+  }
+};
+
+template <DataType T, DataType... Ts, Axis axis>
+struct ParamName<PerAxis<TestParam<T, Ts...>, axis>> {
+  static std::string Get() {
+    std::string name = std::string("PerAxis[") + ToString(T);
+    ((name += std::string("_") + ToString(Ts)), ...);
+    return name + ":" + std::to_string(axis) + "]";
+  }
+};
+
+template <class TestParamT, class... TestParamTs>
+struct ParamName<std::tuple<TestParamT, TestParamTs...>> {
+  static std::string Get() {
+    std::string name = ParamName<TestParamT>::Get();
+    ((name += std::string(":") + ParamName<TestParamTs>::Get()), ...);
+    return name;
+  }
+};
+
+// Allows GTest to print a human readable version of the typed test parameters.
 class TestParamNames {
  public:
   template <class T>
@@ -141,32 +230,209 @@ class TestParamNames {
   }
 };
 
+// Applies the F template to the given testing::Types list.
+template <template <class> class F, class T>
+struct Map;
+
+template <template <class> class F, class... Ts>
+struct Map<F, ::testing::Types<Ts...>> {
+  using Types = ::testing::Types<F<Ts>...>;
+};
+
+template <template <class> class F, class T>
+using MapTypes = typename Map<F, T>::Types;
+
+// Concatenates testing::Types lists.
+template <class... Ts>
+struct Concat;
+
+template <class... Ts>
+struct Concat<::testing::Types<Ts...>> {
+  using Types = ::testing::Types<Ts...>;
+};
+
+template <class... Ts, class... Us, class... ExtraTypes>
+struct Concat<::testing::Types<Ts...>, ::testing::Types<Us...>, ExtraTypes...> {
+  using Types =
+      typename Concat<::testing::Types<Ts..., Us...>, ExtraTypes...>::Types;
+};
+
+template <class... Ts>
+using ConcatTypes = typename Concat<Ts...>::Types;
+
+// Transforms a list of types into a list of tuple<Op, type>.
+template <class Op, class T>
+struct WithOp;
+
+template <class Op, class... Ts>
+struct WithOp<Op, ::testing::Types<Ts...>> {
+  using Types = ::testing::Types<std::tuple<Op, Ts>...>;
+};
+
+template <class Op, class T>
+using WithOpTypes = typename WithOp<Op, T>::Types;
+
+// Helps generating a cross-product of lists.
+template <class Accu, class... Lists>
+struct CrossProductImpl;
+
+template <class... AccuTs, class... Ts, class... Lists>
+struct CrossProductImpl<::testing::Types<AccuTs...>, ::testing::Types<Ts...>,
+                        Lists...> {
+  using Types =
+      ConcatTypes<typename CrossProductImpl<::testing::Types<AccuTs..., Ts>,
+                                            Lists...>::Types...>;
+};
+
+template <class... AccuTs>
+struct CrossProductImpl<::testing::Types<AccuTs...>> {
+  using Types = ::testing::Types<::testing::Types<AccuTs...>>;
+};
+
+// Generates a cross-product of lists.
+template <class... Lists>
+struct CrossProduct {
+  using Types = typename CrossProductImpl<::testing::Types<>, Lists...>::Types;
+};
+
+template <class... Lists>
+using CrossProductTypes = typename CrossProduct<Lists...>::Types;
+
+static_assert(
+    std::is_same_v<
+        CrossProductTypes<::testing::Types<int, float>,
+                          ::testing::Types<char, double>>,
+        ::testing::Types<
+            ::testing::Types<int, char>, ::testing::Types<int, double>,
+            ::testing::Types<float, char>, ::testing::Types<float, double>>>);
+
+static_assert(
+    std::is_same_v<
+        CrossProductTypes<::testing::Types<int>, ::testing::Types<char, double>,
+                          ::testing::Types<float>>,
+        ::testing::Types<::testing::Types<int, char, float>,
+                         ::testing::Types<int, double, float>>>);
+
+// Filters out the types that don't satisfy the predicate.
+template <template <class...> class Predicate, class List>
+struct Filter;
+
+template <template <class...> class Predicate, class... Ts>
+struct Filter<Predicate, ::testing::Types<Ts...>> {
+  using Type =
+      ConcatTypes<std::conditional_t<Predicate<Ts>::value, ::testing::Types<Ts>,
+                                     ::testing::Types<>>...>;
+};
+
+template <template <class...> class Predicate, class List>
+using FilterTypes = typename Filter<Predicate, List>::Type;
+
+static_assert(std::is_same_v<
+              FilterTypes<std::is_integral, ::testing::Types<int, char, float>>,
+              ::testing::Types<int, char>>);
+
+// Checks if all given types are the same.
+template <class T, class... Ts>
+struct SameTypes : std::bool_constant<(std::is_same_v<T, Ts> && ...)> {};
+
+// Checks if all types in the testing::Types list are the same.
+template <class T, class... Ts>
+struct SameTypes<::testing::Types<T, Ts...>> : SameTypes<T, Ts...> {};
+
+// Provides a new predicate that negates the given one.
+template <template <class...> class Pred>
+struct NegatePred {
+  template <class... Ts>
+  using Predicate = std::negation<Pred<Ts...>>;
+};
+
+// Use this with TYPED_TEST_SUITE for boolean testing.
+using BoolTestType = ::testing::Types<TestParam<DataType::kI1>>;
+
 // Use this with TYPED_TEST_SUITE for non quantized integer testing.
-using NonQuantizedIntTestTypes =
-    testing::Types<TestParam<DataType::kSI4>, TestParam<DataType::kSI8>,
-                   TestParam<DataType::kSI16>, TestParam<DataType::kSI32>>;
+using IntTestTypes =
+    ::testing::Types<TestParam<DataType::kSI4>, TestParam<DataType::kSI8>,
+                     TestParam<DataType::kSI16>, TestParam<DataType::kSI32>>;
 
 // Use this with TYPED_TEST_SUITE for non quantized floating point testing.
-using NonQuantizedFloatTestTypes =
-    testing::Types<TestParam<DataType::kBF16>, TestParam<DataType::kF16>,
-                   TestParam<DataType::kF32>>;
+using FloatTestTypes =
+    ::testing::Types<TestParam<DataType::kBF16>, TestParam<DataType::kF16>,
+                     TestParam<DataType::kF32>>;
 
 // Use this with TYPED_TEST_SUITE for non quantized testing.
-using NonQuantizedTestTypes =
-    testing::Types<TestParam<DataType::kSI4>, TestParam<DataType::kSI8>,
-                   TestParam<DataType::kSI16>, TestParam<DataType::kSI32>,
-                   TestParam<DataType::kBF16>, TestParam<DataType::kF16>,
-                   TestParam<DataType::kF32>>;
+using ArithmeticTestTypes = ConcatTypes<IntTestTypes, FloatTestTypes>;
 
-// Use this with TYPED_TEST_SUITE for quantized testing.
+// Use this with TYPED_TEST_SUITE for unspecified quantized testing.
 using QuantizedTestTypes =
-    testing::Types<TestParam<DataType::kSI4, DataType::kF32>,
-                   TestParam<DataType::kSI8, DataType::kF32>,
-                   TestParam<DataType::kSI16, DataType::kF32>,
-                   TestParam<DataType::kSI4, DataType::kBF16>,
-                   TestParam<DataType::kSI8, DataType::kBF16>,
-                   TestParam<DataType::kSI4, DataType::kF16>,
-                   TestParam<DataType::kSI8, DataType::kF16>>;
+    ::testing::Types<TestParam<DataType::kSI4, DataType::kF32>,
+                     TestParam<DataType::kSI8, DataType::kF32>,
+                     TestParam<DataType::kSI16, DataType::kF32>,
+                     TestParam<DataType::kSI4, DataType::kBF16>,
+                     TestParam<DataType::kSI8, DataType::kBF16>,
+                     TestParam<DataType::kSI4, DataType::kF16>,
+                     TestParam<DataType::kSI8, DataType::kF16>>;
+
+// Use this with TYPED_TEST_SUITE for quantized per tensor testing.
+using PerTensorQuantizedTestTypes = MapTypes<PerTensor, QuantizedTestTypes>;
+
+template <class T>
+using PerAxis0 = PerAxis<T, 0>;
+
+// Use this with TYPED_TEST_SUITE for quantized per axis testing.
+using PerAxisQuantizedTestTypes = MapTypes<PerAxis0, QuantizedTestTypes>;
+
+// Customization point for generic tests that need to create a supported tensor
+// for an op but that don't care what that type is.
+//
+// Specialize this in the test file if F32 isn't supported by the op under test.
+template <class Op>
+struct SupportedOpDataType {
+  static constexpr DataType kStorageType = DataType::kF32;
+};
+
+// Builds a TensorType object and returns it in a variant that can be passed to
+// a tensor.
+template <DataType storage_type>
+TensorTypeVariant TensorTypeFor(TestParam<storage_type>, const Shape& shape) {
+  return TensorType{.shape = shape, .element_type = storage_type};
+}
+
+// Builds a per tensor QuantizedTensorType object and returns it in a variant
+// that can be passed to a tensor.
+//
+// WARNING: the scale and zero point are randomly generated:
+//   - scale is in [0.5, 1.5]
+//   - zero_point is in [-5, 5]
+template <DataType storage_type, DataType expressed_type>
+TensorTypeVariant TensorTypeFor(
+    PerTensor<TestParam<storage_type, expressed_type>>, const Shape& shape) {
+  std::random_device rd;
+  UniformDistribution<expressed_type> expressed_dist(0.5, 1.5);
+  UniformDistribution<storage_type> storage_dist(-5, 5);
+  StorageType<expressed_type> scale =
+      static_cast<StorageType<expressed_type>>(expressed_dist(rd));
+  StorageType<storage_type> zero_point = storage_dist(rd);
+  return QuantizedTensorType{
+      .shape = shape,
+      .element_type =
+          QuantizedTensorElementType::PerTensor<storage_type, expressed_type>(
+              scale, zero_point)};
+}
+
+// Builds a per axis QuantizedTensorType object and returns it in a variant
+// that can be passed to a tensor.
+//
+// WARNING: scales and zero points are unspecified and may be empty.
+template <DataType storage_type, DataType expressed_type, Axis axis>
+TensorTypeVariant TensorTypeFor(
+    PerAxis<TestParam<storage_type, expressed_type>, axis>,
+    const Shape& shape) {
+  return QuantizedTensorType{
+      .shape = shape,
+      .element_type =
+          QuantizedTensorElementType::PerAxis<storage_type, expressed_type>(
+              /*scales=*/{}, /*zero_points=*/{}, axis)};
+}
 
 }  // namespace shlo_ref
 
diff --git a/tensorflow/lite/experimental/shlo/ops/unary_elementwise.h b/tensorflow/lite/experimental/shlo/ops/unary_elementwise.h
index 89217c1306083e..554c1096f845e0 100644
--- a/tensorflow/lite/experimental/shlo/ops/unary_elementwise.h
+++ b/tensorflow/lite/experimental/shlo/ops/unary_elementwise.h
@@ -34,7 +34,7 @@ namespace shlo_ref {
 namespace detail {
 
 template <typename StorageT, typename ExpressedT, typename F>
-void DequantizeOpQuantizePerChannelImpl(
+void DequantizeOpQuantizePerAxisImpl(
     F& op, const Shape& shape, const Axis quantization_dimension,
     const StorageT quantization_min, const StorageT quantization_max,
     const absl::Span<const StorageT> input_zero_points,
@@ -65,7 +65,7 @@ void DequantizeOpQuantizePerChannelImpl(
       if (depth == quantization_dimension) {
         quantization_index = i;
       }
-      DequantizeOpQuantizePerChannelImpl(
+      DequantizeOpQuantizePerAxisImpl(
           op, shape, quantization_dimension, quantization_min, quantization_max,
           input_zero_points, input_scales, output_zero_points, output_scales,
           strides, input_data, output_data, depth + 1, quantization_index);
@@ -76,8 +76,8 @@ void DequantizeOpQuantizePerChannelImpl(
 }
 
 template <DataType storage_type, DataType expressed_type, typename F>
-void DequantizeOpQuantizePerChannel(F&& func, const Tensor& input,
-                                    Tensor& output) {
+void DequantizeOpQuantizePerAxis(F&& func, const Tensor& input,
+                                 Tensor& output) {
   using StorageT = StorageType<storage_type>;
   using ExpressedT = StorageType<expressed_type>;
   const Shape& shape = input.shape();
@@ -94,7 +94,7 @@ void DequantizeOpQuantizePerChannel(F&& func, const Tensor& input,
   const Strides& strides = ComputeStrides(shape);
   const StorageT* input_data = input.GetDataAs<storage_type>();
   StorageT* output_data = output.GetDataAs<storage_type>();
-  DequantizeOpQuantizePerChannelImpl(
+  DequantizeOpQuantizePerAxisImpl(
       func, shape, quantization_dimension, Storage<storage_type>::kMinValue,
       Storage<storage_type>::kMaxValue, input_zero_points, input_scales,
       output_zero_points, output_scales, strides, input_data, output_data,
@@ -169,7 +169,7 @@ template <class F>
 absl::Status Evaluate(UnaryElementwiseOp<F>& op, const Tensor& input,
                       Tensor& output) {
   if (input.IsPerAxisQuantized()) {
-    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerChannel,
+    DISPATCH_QUANTIZED(detail::DequantizeOpQuantizePerAxis,
                        input.quantized_tensor_element_type().StorageType(),
                        input.quantized_tensor_element_type().ExpressedType(),
                        op.func, input, output);
diff --git a/tensorflow/lite/experimental/shlo/ops/unary_elementwise_test.cc b/tensorflow/lite/experimental/shlo/ops/unary_elementwise_test.cc
index cd81c4dbee48f4..5dc5e51fd3de18 100644
--- a/tensorflow/lite/experimental/shlo/ops/unary_elementwise_test.cc
+++ b/tensorflow/lite/experimental/shlo/ops/unary_elementwise_test.cc
@@ -51,7 +51,7 @@ struct TestParam {
 template <class T>
 struct UnaryElementWiseTest : ::testing::Test {};
 
-TYPED_TEST_SUITE(UnaryElementWiseTest, NonQuantizedTestTypes);
+TYPED_TEST_SUITE(UnaryElementWiseTest, ArithmeticTestTypes);
 
 TYPED_TEST(UnaryElementWiseTest, NonQuantizedWithAbs) {
   using StorageT = typename TypeParam::StorageT;
diff --git a/tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h b/tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h
new file mode 100644
index 00000000000000..2f5fab7a712ea2
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/unary_elementwise_test_util.h
@@ -0,0 +1,262 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_UNARY_ELEMENTWISE_TEST_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_UNARY_ELEMENTWISE_TEST_UTIL_H_
+
+#include <tuple>
+#include <utility>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+template <class Op>
+using BaselineMismatchSignedIntegerTypes = ::testing::Types<
+    std::tuple<Op, TestParam<DataType::kSI4>, TestParam<DataType::kSI8>>,
+    std::tuple<Op, TestParam<DataType::kSI4>, TestParam<DataType::kSI16>>,
+    std::tuple<Op, TestParam<DataType::kSI8>, TestParam<DataType::kSI4>>,
+    std::tuple<Op, TestParam<DataType::kSI8>, TestParam<DataType::kSI16>>,
+    std::tuple<Op, TestParam<DataType::kSI16>, TestParam<DataType::kSI4>>,
+    std::tuple<Op, TestParam<DataType::kSI16>, TestParam<DataType::kSI8>>>;
+
+// Lists couples of unmatched baseline element types.
+template <class Op>
+using UnaryElementwiseConstraint1Types = ::testing::Types<
+    std::tuple<Op, TestParam<DataType::kF16>, TestParam<DataType::kBF16>>,
+    std::tuple<Op, TestParam<DataType::kF16>, TestParam<DataType::kF32>>,
+    std::tuple<Op, TestParam<DataType::kBF16>, TestParam<DataType::kF16>>,
+    std::tuple<Op, TestParam<DataType::kBF16>, TestParam<DataType::kF32>>,
+    std::tuple<Op, TestParam<DataType::kF32>, TestParam<DataType::kF16>>,
+    std::tuple<Op, TestParam<DataType::kF32>, TestParam<DataType::kBF16>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kF16>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kBF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kF16>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kF32>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kF16>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kF16>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kBF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kF16>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kF32>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kF16>>,
+               PerTensor<TestParam<DataType::kSI16, DataType::kF32>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kBF16>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kBF16>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kF32>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kBF16>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kBF16>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kBF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kBF16>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kF32>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kBF16>>,
+               PerTensor<TestParam<DataType::kSI16, DataType::kF32>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kBF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kBF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kF32>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI4, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI16, DataType::kF32>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kF16>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kF16>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kBF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kF16>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kF32>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kF16>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kBF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kF16>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kF32>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kF16>>,
+               PerTensor<TestParam<DataType::kSI16, DataType::kF32>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kBF16>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kBF16>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kBF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kBF16>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kF32>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kBF16>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kBF16>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kF32>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kBF16>>,
+               PerTensor<TestParam<DataType::kSI16, DataType::kF32>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kBF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kF32>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kBF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI8, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI16, DataType::kF32>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI16, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI16, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kBF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI16, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI4, DataType::kF32>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI16, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI16, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kBF16>>>,
+    std::tuple<Op, PerTensor<TestParam<DataType::kSI16, DataType::kF32>>,
+               PerTensor<TestParam<DataType::kSI8, DataType::kF32>>>>;
+
+// Tests that the input shape is compared to the output shape and that it is
+// propagated if needed.
+
+template <class Op>
+class UnaryElementwiseOpShapePropagationTest : public ::testing::Test {
+ protected:
+  void SetOutputShape(Shape shape) {
+    output_tensor_.shape() = std::move(shape);
+  }
+  bool InputAndOutputShapesAreEqual() const {
+    return input_tensor_.shape() == output_tensor_.shape();
+  }
+
+  Op op_ = Create(typename Op::Attributes{});
+  Tensor input_tensor_ = {
+      .type = TensorType{.shape = Shape({2, 3, 4}),
+                         .element_type = SupportedOpDataType<Op>::kStorageType},
+      .data = nullptr};
+  Tensor output_tensor_ = {
+      .type = TensorType{.shape = Shape(),
+                         .element_type = SupportedOpDataType<Op>::kStorageType},
+      .data = nullptr};
+};
+
+TYPED_TEST_SUITE_P(UnaryElementwiseOpShapePropagationTest);
+
+TYPED_TEST_P(UnaryElementwiseOpShapePropagationTest, ShapePropagationWorks) {
+  ASSERT_TRUE(this->output_tensor_.shape().empty());
+  EXPECT_OK(Prepare(this->op_, this->input_tensor_, this->output_tensor_));
+  EXPECT_THAT(this->output_tensor_.shape(),
+              ::testing::ElementsAreArray(this->input_tensor_.shape()));
+}
+
+TYPED_TEST_P(UnaryElementwiseOpShapePropagationTest,
+             SmallerOutputShapeRaisesAnError) {
+  this->SetOutputShape(Shape({2, 3}));
+  ASSERT_FALSE(this->InputAndOutputShapesAreEqual());
+  EXPECT_EQ(
+      Prepare(this->op_, this->input_tensor_, this->output_tensor_),
+      absl::FailedPreconditionError("The specified output tensor shape is not "
+                                    "compatible with the input shape."));
+}
+
+TYPED_TEST_P(UnaryElementwiseOpShapePropagationTest,
+             BiggerOutputShapeRaisesAnError) {
+  this->SetOutputShape(Shape({2, 3, 4, 5}));
+  ASSERT_FALSE(this->InputAndOutputShapesAreEqual());
+  EXPECT_EQ(
+      Prepare(this->op_, this->input_tensor_, this->output_tensor_),
+      absl::FailedPreconditionError("The specified output tensor shape is not "
+                                    "compatible with the input shape."));
+}
+
+TYPED_TEST_P(UnaryElementwiseOpShapePropagationTest,
+             IncompatibleOutputShapeRaisesAnError) {
+  this->SetOutputShape(Shape({2, 3, 5}));
+  ASSERT_FALSE(this->InputAndOutputShapesAreEqual());
+  EXPECT_EQ(
+      Prepare(this->op_, this->input_tensor_, this->output_tensor_),
+      absl::FailedPreconditionError("The specified output tensor shape is not "
+                                    "compatible with the input shape."));
+}
+
+REGISTER_TYPED_TEST_SUITE_P(UnaryElementwiseOpShapePropagationTest,
+                            ShapePropagationWorks,
+                            SmallerOutputShapeRaisesAnError,
+                            BiggerOutputShapeRaisesAnError,
+                            IncompatibleOutputShapeRaisesAnError);
+
+// Tests that the baseline element type of the input and output tensors is the
+// same.
+template <class T>
+class UnaryElementwiseSameBaselineElementTypeConstraintTest
+    : public ::testing::Test {};
+
+TYPED_TEST_SUITE_P(UnaryElementwiseSameBaselineElementTypeConstraintTest);
+
+TYPED_TEST_P(UnaryElementwiseSameBaselineElementTypeConstraintTest,
+             DifferentInputOutputStorageTypesRaiseAnError) {
+  using Op = std::tuple_element_t<0, TypeParam>;
+  using OperandTypeDesc = std::tuple_element_t<1, TypeParam>;
+  using ResultTypeDesc = std::tuple_element_t<2, TypeParam>;
+  const Shape shape({2, 3, 4});
+  Tensor input_tensor{.type = TensorTypeFor(OperandTypeDesc{}, shape),
+                      .data = nullptr};
+  Tensor output_tensor{.type = TensorTypeFor(ResultTypeDesc{}, shape),
+                       .data = nullptr};
+  auto op = Create(typename Op::Attributes{});
+  const absl::Status status = Prepare(op, input_tensor, output_tensor);
+  EXPECT_THAT(status, shlo_ref::testing::StatusIs(
+                          absl::StatusCode::kFailedPrecondition));
+  EXPECT_THAT(
+      status.message(),
+      ::testing::ContainsRegex(
+          "stablehlo.[_a-z]+: baseline type constraint is not satisfied"));
+}
+
+REGISTER_TYPED_TEST_SUITE_P(
+    UnaryElementwiseSameBaselineElementTypeConstraintTest,
+    DifferentInputOutputStorageTypesRaiseAnError);
+
+// Tests that unsupported types are detected during when `Prepare` is called.
+template <class T>
+class UnaryElementwiseUnsupportedTypeTest : public ::testing::Test {};
+
+TYPED_TEST_SUITE_P(UnaryElementwiseUnsupportedTypeTest);
+
+TYPED_TEST_P(UnaryElementwiseUnsupportedTypeTest, PrepareRaisesAnError) {
+  using Op = std::tuple_element_t<0, TypeParam>;
+  using TypeDesc = std::tuple_element_t<1, TypeParam>;
+  Tensor input_tensor{.type = TensorTypeFor(TypeDesc{}, Shape({2, 3, 4})),
+                      .data = nullptr};
+  Tensor output_tensor = input_tensor;
+  auto op = Create(typename Op::Attributes{});
+  const absl::Status status = Prepare(op, input_tensor, output_tensor);
+  EXPECT_THAT(status, shlo_ref::testing::StatusIs(
+                          absl::StatusCode::kFailedPrecondition));
+  EXPECT_THAT(status.message(),
+              ::testing::HasSubstr("Unsupported tensor type"));
+}
+
+REGISTER_TYPED_TEST_SUITE_P(UnaryElementwiseUnsupportedTypeTest,
+                            PrepareRaisesAnError);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_UNARY_ELEMENTWISE_TEST_UTIL_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/util.cc b/tensorflow/lite/experimental/shlo/ops/util.cc
index 4371e9f7edd5ef..6698649dae6332 100644
--- a/tensorflow/lite/experimental/shlo/ops/util.cc
+++ b/tensorflow/lite/experimental/shlo/ops/util.cc
@@ -15,19 +15,73 @@ limitations under the License.
 #include "tensorflow/lite/experimental/shlo/ops/util.h"
 
 #include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/data_type.h"
 #include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
 
 namespace shlo_ref {
 
 absl::Status Propagate(const Shape& input_shape, Shape& output_shape) {
   if (output_shape.Dimensions().empty()) {
     output_shape = input_shape;
-  } else {
-    if (output_shape != input_shape) {
-      return absl::FailedPreconditionError(
-          "The specified output tensor shape is not compatible with the input "
-          "shape.");
-    }
+  } else if (output_shape != input_shape) {
+    return absl::FailedPreconditionError(
+        "The specified output tensor shape is not compatible with the input "
+        "shape.");
+  }
+  return absl::OkStatus();
+}
+
+absl::Status Propagate(const Shape& lhs_shape, const Shape& rhs_shape,
+                       Shape& output_shape) {
+  if (lhs_shape != rhs_shape) {
+    return absl::FailedPreconditionError(
+        "The LHS and RHS shapes are incompatible.");
+  } else if (output_shape.Dimensions().empty()) {
+    output_shape = lhs_shape;
+  } else if (output_shape != lhs_shape) {
+    return absl::FailedPreconditionError(
+        "The specified output tensor shape is not compatible with the input "
+        "shapes.");
+  }
+  return absl::OkStatus();
+}
+
+bool IsBoolTensor(const Tensor& tensor) {
+  return !tensor.IsQuantized() && IsBool(tensor.StorageType());
+}
+
+bool IsSignedIntTensor(const Tensor& tensor) {
+  return !tensor.IsQuantized() && IsSignedInteger(tensor.StorageType());
+}
+
+bool IsUnsignedIntTensor(const Tensor& tensor) {
+  return !tensor.IsQuantized() && IsUnsignedInteger(tensor.StorageType());
+}
+
+bool IsIntTensor(const Tensor& tensor) {
+  return !tensor.IsQuantized() && IsInteger(tensor.StorageType());
+}
+
+bool IsFloatTensor(const Tensor& tensor) {
+  return !tensor.IsQuantized() && IsFloat(tensor.StorageType());
+}
+
+bool IsQuantizedPerTensorTensor(const Tensor& tensor) {
+  return tensor.IsPerTensorQuantized();
+}
+
+bool IsQuantizedPerAxisTensor(const Tensor& tensor) {
+  return tensor.IsPerAxisQuantized();
+}
+
+absl::Status CheckSameBaselineType(CheckCtx ctx, const Tensor& tensor1,
+                                   const Tensor& tensor2) {
+  if (BaselineType(tensor1.element_type()) !=
+      BaselineType(tensor2.element_type())) {
+    return absl::FailedPreconditionError(
+        "stablehlo." + ctx.op_name +
+        ": baseline type constraint is not satisfied.");
   }
   return absl::OkStatus();
 }
diff --git a/tensorflow/lite/experimental/shlo/ops/util.h b/tensorflow/lite/experimental/shlo/ops/util.h
index d13238db144264..ccb858c6194439 100644
--- a/tensorflow/lite/experimental/shlo/ops/util.h
+++ b/tensorflow/lite/experimental/shlo/ops/util.h
@@ -15,8 +15,11 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_UTIL_H_
 #define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_UTIL_H_
 
+#include <string>
+
 #include "absl/status/status.h"
 #include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
 
 namespace shlo_ref {
 
@@ -25,12 +28,71 @@ namespace shlo_ref {
     return s;                             \
   }
 
-// Propages the input shape to the output shape.
+// Propagates the input shape to the output shape.
 //
 // If the output shape is already populated, checks that is it compatible with
 // the input.
 absl::Status Propagate(const Shape& input_shape, Shape& output_shape);
 
+// Propagates the input shapes to the output shape.
+//
+// If the output shape is already populated, checks that is it compatible with
+// the inputs.
+absl::Status Propagate(const Shape& lhs_shape, const Shape& rhs_shape,
+                       Shape& output_shape);
+
+// Provides context information for the `Check*` functions error messages.
+struct CheckCtx {
+  explicit CheckCtx(std::string name) : op_name(name) {}
+  // The operation that requested the check.
+  std::string op_name;
+};
+
+// Checks that the `tensor` element type is supported by one the the `checks`
+// functions.
+//
+// Returns a failed precondition error when no check succeeds.
+//
+// The check functions should have the following signature.
+//
+// ```
+// bool Check(const Tensor& tensor);
+// ```
+template <class... CheckFuncs>
+absl::Status CheckSupportedTypes(CheckCtx ctx, const Tensor& tensor,
+                                 CheckFuncs&&... checks) {
+  if ((static_cast<CheckFuncs&&>(checks)(tensor) || ...)) {
+    return absl::OkStatus();
+  }
+  return absl::FailedPreconditionError("stablehlo." + ctx.op_name +
+                                       ": Unsupported tensor type.");
+}
+
+// Returns true if the tensor's storage type is boolean.
+bool IsBoolTensor(const Tensor& tensor);
+
+// Returns true if the tensor's storage type is a signed integer type.
+bool IsSignedIntTensor(const Tensor& tensor);
+
+// Returns true if the tensor's storage type is an unsigned integer type.
+bool IsUnsignedIntTensor(const Tensor& tensor);
+
+// Returns true if the tensor's storage type is an integer type.
+bool IsIntTensor(const Tensor& tensor);
+
+// Returns true if the tensor's storage type is an floating point type.
+bool IsFloatTensor(const Tensor& tensor);
+
+// Returns true if the tensor's storage type is quantized per tensor.
+bool IsQuantizedPerTensorTensor(const Tensor& tensor);
+
+// Returns true if the tensor's storage type is quantized per axis.
+bool IsQuantizedPerAxisTensor(const Tensor& tensor);
+
+// Checks that both tensors have the same baseline element type.
+absl::Status CheckSameBaselineType(CheckCtx ctx, const Tensor& tensor1,
+                                   const Tensor& tensor2);
+
 }  // namespace shlo_ref
 
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_UTIL_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/xor.cc b/tensorflow/lite/experimental/shlo/ops/xor.cc
new file mode 100644
index 00000000000000..6b17220e2a49ff
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/xor.cc
@@ -0,0 +1,68 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions xor
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/xor.h"
+
+#include <functional>
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/dispatch.h"
+#include "tensorflow/lite/experimental/shlo/ops/binary_elementwise.h"
+#include "tensorflow/lite/experimental/shlo/ops/util.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+template <DataType>
+struct Xor : std::bit_xor<void> {};
+
+template <>
+struct Xor<DataType::kI1> {
+  template <class T>
+  bool operator()(T lhs, T rhs) const {
+    return static_cast<bool>(lhs) != static_cast<bool>(rhs);
+  }
+};
+
+XorOp Create(XorOp::Attributes) { return {}; }
+
+absl::Status Prepare(XorOp& op, const Tensor& lhs, const Tensor& rhs,
+                     Tensor& output) {
+  SHLO_REF_RETURN_ON_ERROR(Propagate(lhs.shape(), rhs.shape(), output.shape()));
+  SHLO_REF_RETURN_ON_ERROR(
+      CheckSupportedTypes(CheckCtx("xor"), lhs, IsBoolTensor, IsIntTensor));
+  SHLO_REF_RETURN_ON_ERROR(CheckSameBaselineType(CheckCtx("xor"), lhs, output));
+  SHLO_REF_RETURN_ON_ERROR(CheckSameBaselineType(CheckCtx("xor"), rhs, output));
+  return absl::OkStatus();
+}
+
+absl::Status Evaluate(XorOp& op, const Tensor& lhs, const Tensor& rhs,
+                      Tensor& output) {
+  if (IsIntTensor(lhs)) {
+    // Note: all the integer types share the same implementation.
+    Xor<DataType::kSI32> xor_func;
+    DISPATCH_INT(detail::EvaluateNoQuantization, lhs.tensor_element_type(),
+                 xor_func, lhs, rhs, output);
+  } else if (IsBoolTensor(lhs)) {
+    Xor<DataType::kI1> xor_func;
+    detail::EvaluateNoQuantization<DataType::kI1>(xor_func, lhs, rhs, output);
+    return absl::OkStatus();
+  }
+  return absl::FailedPreconditionError(
+      "stablehlo.xor: Unsupported tensor type in Evaluate.");
+}
+
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/ops/xor.h b/tensorflow/lite/experimental/shlo/ops/xor.h
new file mode 100644
index 00000000000000..0a98aca65c24ca
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/xor.h
@@ -0,0 +1,36 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions xor
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_XOR_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_XOR_H_
+
+#include "absl/status/status.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+namespace shlo_ref {
+
+struct XorOp {
+  struct Attributes {};
+};
+
+XorOp Create(XorOp::Attributes);
+absl::Status Prepare(XorOp& op, const Tensor& lhs, const Tensor& rhs,
+                     Tensor& output);
+absl::Status Evaluate(XorOp& op, const Tensor& lhs, const Tensor& rhs,
+                      Tensor& output);
+
+}  // namespace shlo_ref
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_SHLO_OPS_XOR_H_
diff --git a/tensorflow/lite/experimental/shlo/ops/xor_test.cc b/tensorflow/lite/experimental/shlo/ops/xor_test.cc
new file mode 100644
index 00000000000000..591b185327aed6
--- /dev/null
+++ b/tensorflow/lite/experimental/shlo/ops/xor_test.cc
@@ -0,0 +1,112 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions xor
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/shlo/ops/xor.h"
+
+#include <functional>
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/shlo/data_type.h"
+#include "tensorflow/lite/experimental/shlo/ops/binary_elementwise_test_util.h"
+#include "tensorflow/lite/experimental/shlo/ops/test_util.h"
+#include "tensorflow/lite/experimental/shlo/shape.h"
+#include "tensorflow/lite/experimental/shlo/status_matcher.h"
+#include "tensorflow/lite/experimental/shlo/tensor.h"
+
+using testing::FloatEq;
+using testing::Pointwise;
+
+namespace shlo_ref {
+
+template <>
+struct ParamName<XorOp> {
+  static std::string Get() { return "Xor"; }
+};
+
+template <DataType>
+struct Xor : std::bit_xor<void> {};
+
+template <>
+struct Xor<DataType::kI1> {
+  template <class T>
+  bool operator()(T lhs, T rhs) const {
+    return static_cast<bool>(lhs) != static_cast<bool>(rhs);
+  }
+};
+
+template <>
+struct SupportedOpDataType<XorOp> {
+  static constexpr DataType kStorageType = DataType::kSI32;
+};
+
+namespace {
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Xor, BinaryElementwiseOpShapePropagationTest,
+                               XorOp, TestParamNames);
+
+using MultipyBaselineContraintTypes = BinaryElementwiseBaselineConstraintTypes<
+    XorOp, ConcatTypes<BoolTestType, BaselineConstraintIntTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(
+    Xor, BinaryElementwiseSameBaselineElementTypeConstraintTest,
+    MultipyBaselineContraintTypes, TestParamNames);
+
+using UnsupportedTypes =
+    WithOpTypes<XorOp, ConcatTypes<FloatTestTypes, PerTensorQuantizedTestTypes,
+                                   PerAxisQuantizedTestTypes>>;
+
+INSTANTIATE_TYPED_TEST_SUITE_P(Xor, BinaryElementwiseUnsupportedTypeTest,
+                               UnsupportedTypes, TestParamNames);
+
+using SupportedTypes = ConcatTypes<BoolTestType, IntTestTypes>;
+
+template <class T>
+struct XorTest : ::testing::Test {};
+
+TYPED_TEST_SUITE(XorTest, SupportedTypes, TestParamNames);
+
+TYPED_TEST(XorTest, ArithmeticTestTypesTensorsWork) {
+  using StorageT = typename TypeParam::StorageT;
+
+  const Shape shape({2, 3, 4});
+  Vector<StorageT> lhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/-50, /*max=*/50);
+  Vector<StorageT> rhs_data =
+      RandomBuffer<TypeParam::kStorage>(shape, /*min=*/1, /*max=*/5);
+  Vector<StorageT> output_data(shape.NumElements());
+  Tensor lhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = lhs_data.data()};
+  Tensor rhs_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = rhs_data.data()};
+  Tensor output_tensor{
+      .type = TensorType{.shape = shape, .element_type = TypeParam::kStorage},
+      .data = output_data.data()};
+
+  Vector<StorageT> expected_data(shape.NumElements());
+  absl::c_transform(lhs_data, rhs_data, expected_data.begin(),
+                    Xor<TypeParam::kStorage>());
+
+  auto op = Create(XorOp::Attributes{});
+  ASSERT_OK(Prepare(op, lhs_tensor, rhs_tensor, output_tensor));
+  ASSERT_OK(Evaluate(op, lhs_tensor, rhs_tensor, output_tensor));
+  EXPECT_THAT(output_data, Pointwise(FloatEq(), expected_data));
+}
+
+}  // namespace
+}  // namespace shlo_ref
diff --git a/tensorflow/lite/experimental/shlo/shape.h b/tensorflow/lite/experimental/shlo/shape.h
index 262c7b0bc1e901..72a322299972d1 100644
--- a/tensorflow/lite/experimental/shlo/shape.h
+++ b/tensorflow/lite/experimental/shlo/shape.h
@@ -66,6 +66,24 @@ class Shape {
   // and possible confusion with C++ container's usage of size().
   DimensionSize NumElements() const;
 
+  // The following members are provided for compatibility with the standard
+  // library.
+  using value_type = DimensionSize;
+
+  const value_type& operator[](int dim) const { return dims_[dim]; }
+  value_type& operator[](int dim) { return dims_[dim]; }
+
+  auto cbegin() const { return dims_.begin(); }
+  auto begin() const { return dims_.begin(); }
+  auto begin() { return dims_.begin(); }
+  auto cend() const { return dims_.end(); }
+  auto end() const { return dims_.end(); }
+  auto end() { return dims_.end(); }
+  bool empty() const { return dims_.empty(); }
+  size_t size() const { return dims_.size(); }
+  const value_type* data() const { return dims_.data(); }
+  value_type* data() { return dims_.data(); }
+
  private:
   absl::InlinedVector<DimensionSize, kMaxNumDimensions> dims_;
 };
diff --git a/tensorflow/lite/experimental/shlo/status_matcher.h b/tensorflow/lite/experimental/shlo/status_matcher.h
index f79e732c14a750..77f4fad9ad2706 100644
--- a/tensorflow/lite/experimental/shlo/status_matcher.h
+++ b/tensorflow/lite/experimental/shlo/status_matcher.h
@@ -16,8 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_EXPERIMENTAL_SHLO_TEST_MACROS_H_
 #define TENSORFLOW_LITE_EXPERIMENTAL_SHLO_TEST_MACROS_H_
 
+// IWYU pragma: always_keep
+
 #include <gmock/gmock.h>
-#include "absl/status/status.h"
+#include "absl/status/status.h"  // IWYU pragma: keep - used in the
+                                             // provided macros in OSS builds.
 
 namespace shlo_ref {
 namespace testing {
diff --git a/tensorflow/lite/experimental/shlo/tensor.h b/tensorflow/lite/experimental/shlo/tensor.h
index 6904ad92db4689..57029105d1a218 100644
--- a/tensorflow/lite/experimental/shlo/tensor.h
+++ b/tensorflow/lite/experimental/shlo/tensor.h
@@ -33,8 +33,10 @@ constexpr TensorElementType BaselineType(TensorElementType type) {
   return type;
 }
 
-std::variant<TensorElementType, QuantizedTensorElementType> BaselineType(
-    const std::variant<TensorElementType, QuantizedTensorElementType>& type);
+using TensorElementTypeVariant =
+    std::variant<TensorElementType, QuantizedTensorElementType>;
+
+TensorElementTypeVariant BaselineType(const TensorElementTypeVariant& type);
 
 struct TensorType {
   Shape shape;
@@ -46,6 +48,8 @@ struct QuantizedTensorType {
   QuantizedTensorElementType element_type;
 };
 
+using TensorTypeVariant = std::variant<TensorType, QuantizedTensorType>;
+
 struct Tensor {
   const Shape& shape() const;
   Shape& shape();
@@ -69,8 +73,7 @@ struct Tensor {
   const TensorElementType& tensor_element_type() const;
   const QuantizedTensorElementType& quantized_tensor_element_type() const;
 
-  std::variant<TensorElementType, QuantizedTensorElementType> element_type()
-      const;
+  TensorElementTypeVariant element_type() const;
 
   template <DataType data_type, typename T = typename Storage<data_type>::Type>
   T* GetDataAs() {
@@ -88,7 +91,7 @@ struct Tensor {
                                static_cast<size_t>(NumElements()));
   }
 
-  std::variant<TensorType, QuantizedTensorType> type;
+  TensorTypeVariant type;
 
   // If type is TensorType, the type should be Storage<type.element_type>::Type.
   // If type is QuantizedTensorType, the type should be
diff --git a/tensorflow/lite/g3doc/android/lite_build.md b/tensorflow/lite/g3doc/android/lite_build.md
index b9e0ab86649537..aea79a35d14356 100644
--- a/tensorflow/lite/g3doc/android/lite_build.md
+++ b/tensorflow/lite/g3doc/android/lite_build.md
@@ -137,7 +137,7 @@ in the `.tf_configure.bazelrc` file in the root folder:
 
 ```shell
 build --action_env ANDROID_NDK_HOME="/usr/local/android/android-ndk-r25b"
-build --action_env ANDROID_NDK_API_LEVEL="26"
+build --action_env ANDROID_NDK_API_LEVEL="21"
 build --action_env ANDROID_BUILD_TOOLS_VERSION="30.0.3"
 build --action_env ANDROID_SDK_API_LEVEL="30"
 build --action_env ANDROID_SDK_HOME="/usr/local/android/android-sdk-linux"
diff --git a/tensorflow/lite/g3doc/guide/ops_custom.md b/tensorflow/lite/g3doc/guide/ops_custom.md
index 296fd6216ac397..4f7089aca4ec3e 100644
--- a/tensorflow/lite/g3doc/guide/ops_custom.md
+++ b/tensorflow/lite/g3doc/guide/ops_custom.md
@@ -125,113 +125,374 @@ Encountered unresolved custom op: Atan.
 
 ### Create and register the operator.
 
-All TensorFlow Lite operators (both custom and builtin) are defined using a
-simple pure-C interface that consists of four functions:
+```c++
+#include "tensorflow/lite/c/c_api.h"
+#include "tensorflow/lite/c/c_api_opaque.h"
+```
+
+TensorFlow Lite custom operators are defined using a simple pure-C API that
+consists of an opaque type (`TfLiteRegistrationExternal`) and related functions.
+
+`TfLiteRegistrationExternal` is an opaque type:
+
+```c++
+typedef struct TfLiteRegistrationExternal TfLiteRegistrationExternal;
+```
+
+`TfLiteRegistrationExternal` stores the operator's identity and implementation.
+(Note that the operator is distinct from its operands, which are stored in the
+TF Lite graph nodes for nodes that call the operator.)
+
+Instances of this type are constructed with calls to
+`TfLiteRegistrationExternalCreate` and can be destroyed by calling
+`TfLiteRegistrationExternalDelete`.
+
+The operator's identity is set via the parameters to the constructor function
+`TfLiteRegistrationExternalCreate`:
+
+```c++
+TfLiteRegistrationExternal*
+TfLiteRegistrationExternalCreate(
+    TfLiteBuiltinOperator builtin_code,  // Normally `TfLiteBuiltinCustom`.
+    const char* custom_name,  // The name of the custom op.
+    int version  // Normally `1` for the first version of a custom op.
+);
+```
+
+The operator implementation can define "methods" with the following signatures.
+All of these methods are optional, but for an operator to be successfully
+evaluated, the operator implementation needs to define and set (using the setter
+functions) at least the `Prepare` and `Invoke` methods.
 
 ```c++
-typedef struct {
-  void* (*init)(TfLiteContext* context, const char* buffer, size_t length);
-  void (*free)(TfLiteContext* context, void* buffer);
-  TfLiteStatus (*prepare)(TfLiteContext* context, TfLiteNode* node);
-  TfLiteStatus (*invoke)(TfLiteContext* context, TfLiteNode* node);
-} TfLiteRegistration;
+// Initializes the op from serialized data.
+void* Init(TfLiteOpaqueContext* context, const char* buffer, size_t length);
+
+// Deallocates the op.
+// The pointer `buffer` is the data previously returned by an Init invocation.
+void Free(TfLiteOpaqueContext* context, void* buffer);
+
+// Called when the inputs that this node depends on have been resized.
+TfLiteStatus Prepare(TfLiteOpaqueContext* context, TfLiteOpaqueNode* node);
+
+// Called when the node is executed. (Should read node inputs and write to
+// node outputs).
+TfLiteStatus Invoke(TfLiteOpaqueContext* context, TfLiteOpaqueNode* node);
+
+// Retrieves the async kernel.
+TfLiteAsyncKernel AsyncKernel(TfLiteOpaqueContext* context,
+                              TfLiteOpaqueNode* node);
+```
+
+The function *names* (or namespace prefixes, for C++) in your op implementation
+don't have to match the function names in the above code snippet, since the TF
+Lite custom ops API will only use their addresses. Indeed we recommend that you
+declare them in an anonymous namespace or as static functions.
+
+But it is a good idea to include your operator name as a namespace or prefix on
+these function names:
+
+<div>
+  <devsite-selector>
+    <section>
+      <h3>C++</h3>
+      <p><pre class="prettyprint lang-cpp">
+namespace my_namespace::my_custom_op {
+  void* Init(TfLiteOpaqueContext* context,
+             const char* buffer, size_t length) { ... }
+  // ... plus definitions of Free, Prepare, and Invoke ...
+}
+      </pre></p>
+    </section>
+    <section>
+      <h3>C</h3>
+      <p><pre class="prettyprint lang-cpp">
+void* MyCustomOpInit(TfLiteOpaqueContext* context,
+                     const char* buffer, size_t length) { ... }
+// ... plus definitions of MyCustomOpFree, MyCustomOpPrepare, and
+// MyCustomOpInvoke.
+      </pre></p>
+    </section>
+  </devsite-selector>
+</div>
+
+Since this is a C API, these "methods" are implemented as C function pointers in
+the `TfLiteRegistrationExternal` type, which are set by passing the addresses of
+your implementation functions to the corresponding setter functions
+`TfLiteRegistrationExternalSet`*MethodName*:
+
+```c++
+void TfLiteRegistrationExternalSetInit(
+    TfLiteRegistrationExternal* registration,
+    void* (*init)(TfLiteOpaqueContext* context, const char* buffer,
+                  size_t length));
+void TfLiteRegistrationExternalSetFree(
+    TfLiteRegistrationExternal* registration,
+    void (*free)(TfLiteOpaqueContext* context, void* data));
+void TfLiteRegistrationExternalSetPrepare(
+    TfLiteRegistrationExternal* registration,
+    TfLiteStatus (*prepare)(TfLiteOpaqueContext* context,
+                            TfLiteOpaqueNode* node));
+void TfLiteRegistrationExternalSetInvoke(
+    TfLiteRegistrationExternal* registration,
+    TfLiteStatus (*invoke)(TfLiteOpaqueContext* context,
+                           TfLiteOpaqueNode* node));
+void TfLiteRegistrationExternalSetAsyncKernel(
+    TfLiteRegistrationExternal* registration,
+    struct TfLiteAsyncKernel* (*async_kernel)(TfLiteOpaqueContext* context,
+                                              TfLiteOpaqueNode* node));
 ```
 
 Refer to
-[`common.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/c/common.h)
-for details on `TfLiteContext` and `TfLiteNode`. The former provides error
+[`common.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/core/c/common.h)
+for details on `TfLiteContext` and `TfLiteNode`. `TfLiteContext` provides error
 reporting facilities and access to global objects, including all the tensors.
-The latter allows implementations to access their inputs and outputs.
-
-When the interpreter loads a model, it calls `init()` once for each node in the
-graph. A given `init()` will be called more than once if the op is used multiple
-times in the graph. For custom ops a configuration buffer will be provided,
-containing a flexbuffer that maps parameter names to their values. The buffer is
-empty for builtin ops because the interpreter has already parsed the op
-parameters. Kernel implementations that require state should initialize it here
-and transfer ownership to the caller. For each `init()` call, there will be a
-corresponding call to `free()`, allowing implementations to dispose of the
-buffer they might have allocated in `init()`.
+`TfLiteNode` allows operator implementations to access their inputs and outputs.
+
+When the interpreter loads a model, it calls the `Init()` method once for each
+node in the graph. A given `Init()` will be called more than once if the op is
+used multiple times in the graph. For custom ops a configuration buffer will be
+provided, containing a flexbuffer that maps parameter names to their values. The
+buffer is empty for builtin ops because the interpreter has already parsed the
+op parameters. Kernel implementations that require state should initialize it
+here and transfer ownership to the caller. For each `Init()` call, there will be
+a corresponding call to `Free()`, allowing implementations to dispose of the
+buffer they might have allocated in `Init()`.
 
 Whenever the input tensors are resized, the interpreter will go through the
 graph notifying implementations of the change. This gives them the chance to
 resize their internal buffer, check validity of input shapes and types, and
-recalculate output shapes. This is all done through `prepare()`, and
-implementations can access their state using `node->user_data`.
+recalculate output shapes. This is all done through the `Prepare()` method, and
+implementations can access their state using
+`TfLiteOpaqueNodeGetUserData(node)`.
 
 Finally, each time inference runs, the interpreter traverses the graph calling
-`invoke()`, and here too the state is available as `node->user_data`.
-
-Custom ops can be implemented in exactly the same way as builtin ops, by
-defining those four functions and a global registration function that usually
-looks like this:
+the `Invoke()` method, and here too the state is available as
+`TfLiteOpaqueNodeGetUserData(node)`.
+
+Custom ops can be implemented by defining those "method" functions, and then
+defining a function that returns an instance of `TfLiteRegistrationExternal`
+constructed by calling `TfLiteRegistrationExternalCreate` and then the relevant
+setter methods:
+
+<div>
+  <devsite-selector>
+    <section>
+      <h3>C++</h3>
+      <p><pre class="prettyprint lang-cpp">
+namespace my_namespace::my_custom_op {
+  namespace {
+    void* Init(TfLiteOpaqueContext* context,
+               const char* buffer, size_t length) { ... }
+    void Free(TfLiteOpaqueContext* context, void* buffer) { ... }
+    TfLiteStatus Prepare(TfLiteOpaqueContext* context,
+                         TfLiteOpaqueNode* node) { ... }
+    TfLiteStatus Invoke(TfLiteOpaqueContext* context,
+                        TfLiteOpaqueNode* node) {... }
+  };
+
+  const TfLiteRegistrationExternal* MyCustomOpRegistrationExternal() {
+    // Singleton instance, intentionally never destroyed.
+    static const TfLiteRegistrationExternal* my_custom_op = ()[] {
+        TfLiteRegistrationExternal* r =
+            TfLiteRegistrationExternalCreate(
+                kTfLiteBuiltinCustom, "MyCustomOp", /*version=*/ 1);
+        TfLiteRegistrationExternalSetInit(r, Init);
+        TfLiteRegistrationExternalSetFree(r, Free);
+        TfLiteRegistrationExternalSetPrepare(r, Prepare);
+        TfLiteRegistrationExternalSetInvoke(r, Eval);
+        return r;
+      };
+    return my_custom_op;
+  }
 
-```c++
-namespace my_namespace {
-  const TfLiteRegistration* Register_MY_CUSTOM_OP() {
-    static const TfLiteRegistration r = {my_custom_op::Init,
-                                         my_custom_op::Free,
-                                         my_custom_op::Prepare,
-                                         my_custom_op::Eval};
-    return &r;
+  const TfLiteRegistration* MyCustomOpRegistration() {
+    static const TfLiteRegistration my_custom_op {
+      .registration_external = MyCustomOpRegistrationExternal();
+    };
+    return my_custom_op;
   }
 }  // namespace my_namespace
-```
+      </pre></p>
+    </section>
+    <section>
+      <h3>C</h3>
+      <p><pre class="prettyprint lang-cpp">
+static void* MyCustomOpInit(TfLiteOpaqueContext* context, const char* buffer,
+                     size_t length) { ... }
+static void MyCustomOpFree(TfLiteOpaqueContext* context, void* buffer) { ... }
+static TfLiteStatus MyCustomOpPrepare(TfLiteOpaqueContext* context,
+                                      TfLiteOpaqueNode* node) { ... }
+static TfLiteStatus MyCustomOpInvoke(TfLiteOpaqueContext* context,
+                                     TfLiteOpaqueNode* node) {... }
+
+static TfLiteRegistrationExternal* MyCustomOpCreate() {
+  const TfLiteRegistrationExternal* r =
+      TfLiteRegistrationExternalCreate(
+          kTfLiteBuiltinCustom, "MyCustomOp", /*version=*/ 1);
+  TfLiteRegistrationExternalSetInit(r, MyCustomOpInit);
+  TfLiteRegistrationExternalSetFree(r, MyCustomOpFree);
+  TfLiteRegistrationExternalSetPrepare(r, MyCustomOpPrepare);
+  TfLiteRegistrationExternalSetInvoke(r, MyCustomOpEval);
+  return r;
+}
 
-Note that registration is not automatic and an explicit call to
-`Register_MY_CUSTOM_OP` should be made. While the standard `BuiltinOpResolver`
-(available from the `:builtin_ops` target) takes care of the registration of
-builtins, custom ops will have to be collected in separate custom libraries.
+const TfLiteRegistrationExternal* MyCustomOpRegistrationExternal() {
+  // Singleton instance, intentionally never destroyed.
+  static const TfLiteRegistrationExternal* my_custom_op = MyCustomOpCreate();
+  return my_custom_op;
+}
+
+const TfLiteRegistration MyCustomOpRegistration() {
+  static const TfLiteRegistration my_custom_op {
+    .registration_external = MyCustomOpRegistrationExternal();
+  };
+  return my_custom_op;
+}
+      </pre></p>
+    </section>
+  </devsite-selector>
+</div>
+
+Note that registration is not automatic and an explicit call to your
+`MyCustomOpRegistration` function should be made (see details below). While the
+standard `BuiltinOpResolver` (available from the `:builtin_ops` target) takes
+care of the registration of builtins, custom ops will have to be collected in
+separate custom libraries.
 
 ### Defining the kernel in the TensorFlow Lite runtime
 
 All we need to do to use the op in TensorFlow Lite is define two functions
-(`Prepare` and `Eval`), and construct a `TfLiteRegistration`:
+(`Prepare` and `Eval`), and a third to construct a `TfLiteRegistrationExternal`:
+
+<div>
+  <devsite-selector>
+    <section>
+      <h3>C++</h3>
+      <p><pre class="prettyprint lang-cpp">
+namespace atan_op {
+  namespace {
+    TfLiteStatus AtanPrepare(TfLiteOpaqueContext* context, TfLiteOpaqueNode* node) {
+      TF_LITE_OPAQUE_ENSURE_EQ(context, TfLiteOpaqueNodeNumInputs(node), 1);
+      TF_LITE_OPAQUE_ENSURE_EQ(context, TfLiteOpaqueNodeNumOutputs(node), 1);
+
+      const TfLiteOpaqueTensor* input = TfLiteOpaqueNodeGetInput(context, node, 0);
+      TfLiteOpaqueTensor* output = TfLiteOpaqueNodeGetOutput(context, node, 0);
+
+      int num_dims = TfLiteOpaqueTensorNumDimensions(input);
+
+      TfLiteIntArray* output_size = TfLiteIntArrayCreate(num_dims);
+      for (int i=0; i < num_dims; ++i) {
+        output_size->data[i] = input->dims->data[i];
+      }
+
+      return TfLiteOpaqueContextResizeTensor(context, output, output_size);
+    }
+
+    TfLiteStatus AtanEval(TfLiteOpaqueContext* context, TfLiteOpaqueNode* node) {
+      const TfLiteOpaqueTensor* input = TfLiteOpaqueNodeGetInput(context, node, 0);
+      TfLiteOpaqueTensor* output = TfLiteOpaqueNodeGetOutput(context, node, 0);
+
+      float* input_data = static_cast<float*>(TfLiteOpaqueTensorData(input));
+      float* output_data = static_cast<float*>(TfLiteOpaqueTensorData(output));
+
+      size_t count = 1;
+      int num_dims = TfLiteOpaqueTensorNumDimensions(input);
+      for (int i = 0; i < num_dims; ++i) {
+        count *= input->dims->data[i];
+      }
+
+      for (size_t i = 0; i < count; ++i) {
+        output_data[i] = atan(input_data[i]);
+      }
+      return kTfLiteOk;
+    }
+  }  // anonymous namespace
+
+  const TfLiteRegistrationExternal* AtanOpRegistrationExternal() {
+    // Singleton instance, intentionally never destroyed.
+    static const TfLiteRegistrationExternal* atan_op = ()[] {
+        auto* r = TfLiteRegistrationExternalCreate(
+            kTfLiteBuiltinCustom, "ATAN", /*version=*/ 1);
+        TfLiteRegistrationExternalSetPrepare(r, Prepare);
+        TfLiteRegistrationExternalSetInvoke(r, Eval);
+        return r;
+      };
+    return atan_op;
+  }
 
-```cpp
-TfLiteStatus AtanPrepare(TfLiteContext* context, TfLiteNode* node) {
-  using namespace tflite;
-  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  const TfLiteRegistration AtanOpRegistration() {
+    static const TfLiteRegistration atan_op {
+      .registration_external = AtanOpRegistrationExternal();
+    };
+    return atan_op;
+  }
+}  // namespace atan_op
+      </pre></p>
+    </section>
+    <section>
+      <h3>C</h3>
+      <p><pre class="prettyprint lang-cpp">
+static TfLiteStatus AtanPrepare(TfLiteOpaqueContext* context, TfLiteOpaqueNode* node) {
+  TF_LITE_OPAQUE_ENSURE_EQ(context, TfLiteOpaqueNodeNumInputs(node), 1);
+  TF_LITE_OPAQUE_ENSURE_EQ(context, TfLiteOpaqueNodeNumOutputs(node), 1);
 
-  const TfLiteTensor* input = GetInput(context, node, 0);
-  TfLiteTensor* output = GetOutput(context, node, 0);
+  const TfLiteOpaqueTensor* input = TfLiteOpaqueNodeGetInput(context, node, 0);
+  TfLiteOpaqueTensor* output = TfLiteOpaqueNodeGetOutput(context, node, 0);
 
-  int num_dims = NumDimensions(input);
+  int num_dims = TfLiteOpaqueTensorNumDimensions(input);
 
   TfLiteIntArray* output_size = TfLiteIntArrayCreate(num_dims);
-  for (int i=0; i<num_dims; ++i) {
+  for (int i = 0; i < num_dims; ++i) {
     output_size->data[i] = input->dims->data[i];
   }
 
-  return context->ResizeTensor(context, output, output_size);
+  return TfLiteOpaqueContextResizeTensor(context, output, output_size);
 }
 
-TfLiteStatus AtanEval(TfLiteContext* context, TfLiteNode* node) {
-  using namespace tflite;
-  const TfLiteTensor* input = GetInput(context, node, 0);
-  TfLiteTensor* output = GetOutput(context, node, 0);
+static TfLiteStatus AtanEval(TfLiteOpaqueContext* context, TfLiteOpaqueNode* node) {
+  const TfLiteOpaqueTensor* input = TfLiteOpaqueNodeGetInput(context, node, 0);
+  TfLiteOpaqueTensor* output = TfLiteOpaqueNodeGetOutput(context, node, 0);
 
-  float* input_data = GetTensorData<float>(input);
-  float* output_data = GetTensorData<float>(output);
+  float* input_data = static_cast<float*>(TfLiteOpaqueTensorData(input));
+  float* output_data = static_cast<float*>(TfLiteOpaqueTensorData(output));
 
   size_t count = 1;
-  int num_dims = NumDimensions(input);
+  int num_dims = TfLiteOpaqueTensorNumDimensions(input);
   for (int i = 0; i < num_dims; ++i) {
     count *= input->dims->data[i];
   }
 
-  for (size_t i=0; i<count; ++i) {
+  for (size_t i = 0; i < count; ++i) {
     output_data[i] = atan(input_data[i]);
   }
   return kTfLiteOk;
 }
 
-const TfLiteRegistration* Register_ATAN() {
-  static const TfLiteRegistration r = {nullptr, nullptr, AtanPrepare, AtanEval};
-  return &r;
+static const TfLiteRegistrationExternal* AtanOpCreate() {
+  TfLiteRegistrationExternal* r = TfLiteRegistrationExternalCreate(
+          kTfLiteBuiltinCustom, "ATAN", /*version=*/ 1);
+  TfLiteRegistrationExternalSetPrepare(r, Prepare);
+  TfLiteRegistrationExternalSetInvoke(r, Eval);
+  return r;
+}
+
+const TfLiteRegistrationExternal* AtanOpRegistrationExternal() {
+  // Singleton instance, intentionally never destroyed.
+  static const TfLiteRegistrationExternal* atan_op = AtanOpCreate();
+  return atan_op;
 }
-```
+
+const TfLiteRegistration AtanOpRegistration() {
+  static const TfLiteRegistration atan_op {
+    .registration_external = AtanOpRegistrationExternal();
+  };
+  return atan_op;
+}
+      </pre></p>
+    </section>
+  </devsite-selector>
+</div>
 
 When initializing the `OpResolver`, add the custom op into the resolver (see
 below for an example). This will register the operator with Tensorflow Lite so
@@ -262,6 +523,11 @@ class OpResolver {
 };
 ```
 
+Note that for backwards compatibility, this class uses the older concrete type
+`TfLiteRegistration` rather than the opaque type `TfLiteRegistrationExternal`,
+but the `TfLiteRegistration` struct contains a `registration_external` field of
+type `TfLiteRegistrationExternal*`.
+
 The `MutableOpResolver` and `BuiltinOpResolver` classes are derived from
 `OpResolver`:
 
@@ -281,7 +547,8 @@ class BuiltinOpResolver : public MutableOpResolver {
 };
 ```
 
-Regular usage requires that you use the `BuiltinOpResolver` and write:
+Regular usage (without custom ops) requires that you use the `BuiltinOpResolver`
+and write:
 
 ```c++
 tflite::ops::builtin::BuiltinOpResolver resolver;
@@ -294,7 +561,7 @@ and call `AddCustom` (before you pass the resolver to the
 ```c++
 tflite::ops::builtin::MutableOpResolver resolver;
 resolver.AddAll(tflite::ops::builtin::BuiltinOpResolver());
-resolver.AddCustom("Atan", Register_ATAN());
+resolver.AddCustom("Atan", AtanOpRegistration());
 ```
 
 If the set of builtin ops is deemed to be too large, a new `OpResolver` could be
@@ -332,20 +599,64 @@ call (as show above) to
     of copying as much as possible.
 
 2.  If a data structure will persist during the entire operation, we advise
-    pre-allocating the memory using temporary tensors. You may need to use
+    pre-allocating the memory using temporary tensors. You may need to use an
     OpData struct to reference the tensor indices in other functions. See the
     example in the
     [kernel for convolution](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/conv.cc).
-    A sample code snippet is below
-
-    ```
-    auto* op_data = reinterpret_cast<OpData*>(node->user_data);
-    TfLiteIntArrayFree(node->temporaries);
-    node->temporaries = TfLiteIntArrayCreate(1);
-    node->temporaries->data[0] = op_data->temp_tensor_index;
-    TfLiteTensor* temp_tensor = &context->tensors[op_data->temp_tensor_index];
-    temp_tensor->type =  kTfLiteFloat32;
-    temp_tensor->allocation_type = kTfLiteArenaRw;
+    A sample code snippet is below.
+
+    ```c++
+    struct MyOpData {
+      int temp_tensor_index;
+      ...
+    };
+
+    void* Init(TfLiteOpaqueContext* context,
+        const char* buffer, size_t length) {
+      auto* op_data = new MyOpData{};
+      ...
+      return op_data;
+    }
+    void Free(TfLiteOpaqueContext* context, void* buffer) {
+      ...
+      delete reinterpret_cast<MyOpData*>(buffer);
+    }
+    TfLiteStatus Prepare(TfLiteOpaqueContext* context,
+                         TfLiteOpaqueNode* node) {
+      ...
+      auto* op_data =
+          reinterpret_cast<MyOpData*>(TfLiteOpaqueNodeGetUserData(node));
+      const int num_temporaries = 1;
+      int temporary_tensor_indices[num_temporaries];
+      TfLiteOpaqueTensorBuilder* builder = TfLiteOpaqueTensorBuilderCreate();
+      TfLiteOpaqueTensorBuilderSetType(builder, kTfLiteFloat32);
+      TfLiteOpaqueTensorBuilderSetAllocationType(builder, kTfLiteArenaRw);
+      TfLiteOpaqueContextAddTensor(context, builder,
+          &temporary_tensor_indices[0]);
+      TfLiteOpaqueTensorBuilderDelete(builder);
+      TfLiteOpaqueNodeSetTemporaries(node, temporary_tensor_indices,
+          num_temporaries);
+      op_data->temp_tensor_index = temporary_tensor_indices[0];
+      ...
+      return kTfLiteOk;
+    }
+    TfLiteStatus Invoke(TfLiteOpaqueContext* context,
+                        TfLiteOpaqueNode* node) {
+      ...
+      auto* op_data = reinterpret_cast<MyOpData*>(
+          TfLiteOpaqueNodeGetUserData(node));
+      TfLiteOpaqueTensor* temp_tensor =
+          TfLiteOpaqueContextGetOpaqueTensor(context,
+              op_data->temp_tensor_index);
+      TF_LITE_OPAQUE_ENSURE(context,
+          TfLiteTensorType(temp_tensor) == kTfLiteFloat32);
+      TF_LITE_OPAQUE_ENSURE(context,
+          TfLiteTensorGetAllocationType(temp_Tensor) == kTfLiteArenaRw);
+      void *temp_data = TfLiteTensorData(temp_tensor);
+      TF_LITE_OPAQUE_ENSURE(context, temp_data != nullptr);
+      ...
+      return kTfLiteOk;
+    }
     ```
 
 3.  If it doesn't cost too much wasted memory, prefer using a static fixed size
@@ -363,7 +674,7 @@ call (as show above) to
     `malloc` in a function and have an error exit, deallocate memory before you
     exit.
 
-6.  Use `TF_LITE_ENSURE(context, condition)` to check for a specific condition.
-    Your code must not leave memory hanging when `TF_LITE_ENSURE` is used, i.e.,
-    these macros should be used before any resources are allocated that will
-    leak.
+6.  Use `TF_LITE_OPAQUE_ENSURE(context, condition)` to check for a specific
+    condition. Your code must not leave memory hanging when
+    `TF_LITE_OPAQUE_ENSURE` is used, i.e., these macros should be used before
+    any resources are allocated that will leak.
diff --git a/tensorflow/lite/g3doc/tools/BUILD b/tensorflow/lite/g3doc/tools/BUILD
index bbde7a9e01a1df..2c9a83b8ccec65 100644
--- a/tensorflow/lite/g3doc/tools/BUILD
+++ b/tensorflow/lite/g3doc/tools/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow:strict.default.bzl", "py_strict_binary")
 load("//tensorflow:pytype.default.bzl", "pytype_strict_binary")
+load("//tensorflow:strict.default.bzl", "py_strict_binary")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/ios/BUILD.apple b/tensorflow/lite/ios/BUILD.apple
index fd5bdd9e3e4e1e..66d51b14f85edc 100644
--- a/tensorflow/lite/ios/BUILD.apple
+++ b/tensorflow/lite/ios/BUILD.apple
@@ -1,6 +1,9 @@
 # TensorFlow Lite for iOS
 
 load("@bazel_skylib//rules:build_test.bzl", "build_test")
+load("@build_bazel_rules_apple//apple:ios.bzl", "ios_static_framework")
+load("//tensorflow:pytype.default.bzl", "pytype_strict_binary", "pytype_strict_library")
+load("//tensorflow:strict.default.bzl", "py_strict_test")
 load(
     "//tensorflow/lite/ios:ios.bzl",
     "TFL_MINIMUM_OS_VERSION",
@@ -8,9 +11,6 @@ load(
     "tflite_ios_framework",
     "tflite_ios_xcframework",
 )
-load("@build_bazel_rules_apple//apple:ios.bzl", "ios_static_framework")
-load("//tensorflow:pytype.default.bzl", "pytype_strict_binary", "pytype_strict_library")
-load("//tensorflow:strict.default.bzl", "py_strict_test")
 
 package(
     default_visibility = [
@@ -97,7 +97,7 @@ strip_common_include_path_prefix(
         "//tensorflow/lite/core/c:c_api_opaque.h",
         "//tensorflow/lite/core/c:c_api_types.h",
         "//tensorflow/lite/core/c:common.h",
-        "//tensorflow/lite/core/c:registration_external.h",
+        "//tensorflow/lite/core/c:operator.h",
         "//tensorflow/lite/core/async/c:types.h",
         "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate.h",
         "//tensorflow/lite/profiling/telemetry/c:profiler.h",
@@ -124,8 +124,8 @@ tflite_ios_framework(
         ":c_api_opaque.h",
         ":c_api_types.h",
         ":common.h",
+        ":operator.h",
         ":profiler.h",
-        ":registration_external.h",
         ":telemetry_setting.h",
         ":types.h",
         ":xnnpack_delegate.h",
@@ -178,8 +178,8 @@ ios_static_framework(
         ":c_api_opaque.h",
         ":c_api_types.h",
         ":common.h",
+        ":operator.h",
         ":profiler.h",
-        ":registration_external.h",
         ":telemetry_setting.h",
         ":types.h",
         ":xnnpack_delegate.h",
@@ -262,7 +262,7 @@ cc_library(
         "//tensorflow/lite/core/c:c_api_opaque.h",
         "//tensorflow/lite/core/c:c_api_types.h",
         "//tensorflow/lite/core/c:common.h",
-        "//tensorflow/lite/core/c:registration_external.h",
+        "//tensorflow/lite/core/c:operator.h",
         "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate.h",
         "//tensorflow/lite/profiling/telemetry/c:profiler.h",
         "//tensorflow/lite/profiling/telemetry/c:telemetry_setting.h",
diff --git a/tensorflow/lite/ios/TensorFlowLiteC.h b/tensorflow/lite/ios/TensorFlowLiteC.h
index 63a687ea306d30..ae72f47b895583 100644
--- a/tensorflow/lite/ios/TensorFlowLiteC.h
+++ b/tensorflow/lite/ios/TensorFlowLiteC.h
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/lite/core/c/c_api_opaque.h"
 #include "tensorflow/lite/core/c/c_api_types.h"
 #include "tensorflow/lite/core/c/common.h"
-#include "tensorflow/lite/core/c/registration_external.h"
+#include "tensorflow/lite/core/c/operator.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 #include "tensorflow/lite/profiling/telemetry/c/profiler.h"
 
diff --git a/tensorflow/lite/ios/ios.bzl b/tensorflow/lite/ios/ios.bzl
index a123f2d998209b..2320d016557595 100644
--- a/tensorflow/lite/ios/ios.bzl
+++ b/tensorflow/lite/ios/ios.bzl
@@ -1,10 +1,10 @@
 """TensorFlow Lite Build Configurations for iOS"""
 
-load("//tensorflow:tensorflow.bzl", "clean_dep")
 load("@build_bazel_rules_apple//apple:apple.bzl", "apple_static_xcframework")
 
 # Placeholder for Google-internal load statements.
 load("@build_bazel_rules_apple//apple:ios.bzl", "ios_static_framework")
+load("//tensorflow:tensorflow.bzl", "clean_dep")
 
 # LINT.IfChange
 TFL_MINIMUM_OS_VERSION = "12.0"
diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD
index ae7a0fd7dad92f..46e01dda5b49c6 100644
--- a/tensorflow/lite/java/BUILD
+++ b/tensorflow/lite/java/BUILD
@@ -1,13 +1,13 @@
 # Description:
 # TensorFlow Lite Java API.
 
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
+load("@build_bazel_rules_android//android:rules.bzl", "android_library")
 load("//tensorflow/java:build_defs.bzl", "JAVACOPTS")
 load("//tensorflow/lite:build_def.bzl", "tflite_jni_linkopts", "tflite_linkopts_no_undefined")
 load("//tensorflow/lite/core/shims:cc_library_with_tflite.bzl", "android_library_with_tflite", "cc_library_with_tflite", "java_library_with_tflite", "java_test_with_tflite", "jni_binary_with_tflite")
 load("//tensorflow/lite/delegates/flex:build_def.bzl", "tflite_flex_android_library")
 load("//tensorflow/lite/java:aar_with_jni.bzl", "aar_with_jni", "aar_without_jni")
-load("@build_bazel_rules_android//android:rules.bzl", "android_library")
-load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
index 9445fee50db352..1130c3d1b5e432 100644
--- a/tensorflow/lite/kernels/BUILD
+++ b/tensorflow/lite/kernels/BUILD
@@ -2245,6 +2245,7 @@ cc_test(
         ":builtin_ops",
         ":test_main",
         ":test_util",
+        "//tensorflow/lite/schema:schema_fbs",
         "@com_google_googletest//:gtest",
     ],
 )
diff --git a/tensorflow/lite/kernels/cast.cc b/tensorflow/lite/kernels/cast.cc
index 57c65247f8788d..e345a12aeb529c 100644
--- a/tensorflow/lite/kernels/cast.cc
+++ b/tensorflow/lite/kernels/cast.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <algorithm>
+#include <cmath>
 #include <complex>
 #include <cstddef>
 #include <cstdint>
+#include <limits>
 
 #include "Eigen/Core"  // from @eigen_archive
 #include "tensorflow/lite/core/c/common.h"
@@ -35,6 +37,17 @@ namespace {
 constexpr int kInputTensor = 0;
 constexpr int kOutputTensor = 0;
 
+void copyCast(const float* in, int32_t* out, int num_elements) {
+  float min_int_float =
+      std::nextafterf((float)std::numeric_limits<int32_t>::min(), 0);
+  float max_int_float =
+      std::nextafterf((float)std::numeric_limits<int32_t>::max(), 0);
+  std::transform(in, in + num_elements, out, [=](float a) {
+    return static_cast<int32_t>(
+        std::max(std::min(a, max_int_float), min_int_float));
+  });
+}
+
 template <typename FromT, typename ToT>
 void copyCast(const FromT* in, ToT* out, int num_elements) {
   std::transform(in, in + num_elements, out,
diff --git a/tensorflow/lite/kernels/cast_test.cc b/tensorflow/lite/kernels/cast_test.cc
index c2eef57197119b..d83c26b0b224ce 100644
--- a/tensorflow/lite/kernels/cast_test.cc
+++ b/tensorflow/lite/kernels/cast_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <complex>
+#include <limits>
 #include <random>
 #include <vector>
 
@@ -59,6 +60,15 @@ TEST(CastOpModel, CastInt4ToFloatLarge) {
   }
 }
 
+TEST(CastOpModel, CastFloatToInt32Infinity) {
+  CastOpModel m({TensorType_FLOAT32, {2}}, {TensorType_INT32, {2}});
+  m.PopulateTensor<float>(m.input(), {std::numeric_limits<float>::infinity(),
+                                      -std::numeric_limits<float>::infinity()});
+  ASSERT_EQ(m.Invoke(), kTfLiteOk);
+  EXPECT_THAT(m.ExtractVector<int32_t>(m.output()),
+              ElementsAreArray({2147483520, -2147483520}));
+}
+
 TEST(CastOpModel, CastInt16ToFloat) {
   CastOpModel m({TensorType_INT16, {2, 3}}, {TensorType_FLOAT32, {2, 3}});
   m.PopulateTensor<int16_t>(m.input(), {100, 200, 300, 400, 500, 600});
diff --git a/tensorflow/lite/kernels/detection_postprocess.cc b/tensorflow/lite/kernels/detection_postprocess.cc
index 793fcd9dab8219..d1ccdd7fad8c45 100644
--- a/tensorflow/lite/kernels/detection_postprocess.cc
+++ b/tensorflow/lite/kernels/detection_postprocess.cc
@@ -55,8 +55,8 @@ constexpr int kBatchSize = 1;
 constexpr int kNumDetectionsPerClass = 100;
 
 // Object Detection model produces axis-aligned boxes in two formats:
-// BoxCorner represents the lower left corner (xmin, ymin) and
-// the upper right corner (xmax, ymax).
+// BoxCorner represents the upper left corner (xmin, ymin) and
+// the lower right corner (xmax, ymax).
 // CenterSize represents the center (xcenter, ycenter), height and width.
 // BoxCornerEncoding and CenterSizeEncoding are related as follows:
 // ycenter = y / y_scale * anchor.h + anchor.y;
diff --git a/tensorflow/lite/kernels/fully_connected.cc b/tensorflow/lite/kernels/fully_connected.cc
index 0c92b0e2641b85..670f7a771da7ee 100644
--- a/tensorflow/lite/kernels/fully_connected.cc
+++ b/tensorflow/lite/kernels/fully_connected.cc
@@ -118,6 +118,19 @@ TfLiteStatus VerifyPerChannelQuantization(TfLiteContext* context,
   return affine_quantization->scale->size > 1 ? kTfLiteOk : kTfLiteError;
 }
 
+TfLiteStatus VerifyQuantizationZeroPoint(const TfLiteTensor* tensor,
+                                         int expected_value) {
+  const auto* params =
+      reinterpret_cast<TfLiteAffineQuantization*>(tensor->quantization.params);
+  if (params && params->zero_point &&
+      std::any_of(params->zero_point->data,
+                  params->zero_point->data + params->zero_point->size,
+                  [expected_value](int v) { return v != expected_value; })) {
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
 }  // namespace
 
 // This file has four implementations of FullyConnected
@@ -640,6 +653,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                                 params->activation == kTfLiteActReluN1To1 ||
                                 params->activation == kTfLiteActRelu6);
   }
+  if (filter->type == kTfLiteInt4) {
+    TF_LITE_ENSURE_MSG(
+        context,
+        kTfLiteOk == VerifyQuantizationZeroPoint(filter, /*expected_value=*/0),
+        "Unsupported filter quantization zero-point value.");
+  }
   return PrepareImpl(context, node, kernel_type);
 }
 
diff --git a/tensorflow/lite/kernels/fully_connected_4bit_test.cc b/tensorflow/lite/kernels/fully_connected_4bit_test.cc
index 6130af553b284c..f4deb0f2017219 100644
--- a/tensorflow/lite/kernels/fully_connected_4bit_test.cc
+++ b/tensorflow/lite/kernels/fully_connected_4bit_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/fully_connected.h"
 #include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
 
@@ -105,7 +106,7 @@ TEST(Hybrid4BitFullyConnectedOpTest, SimpleTestHybridInt4) {
   FullyConnected4BitOpModel m(
       units, batches,
       /*input=*/{TensorType_FLOAT32, {batches, cols}},
-      /*weights=*/{TensorType_INT4, {units, cols}, 0.0, 7.0, 1.0},
+      /*weights=*/{TensorType_INT4, {units, cols}, 0.0, 0.0, 1.0},
       /*output=*/{TensorType_FLOAT32, {units, batches}},
       {
           -1, 2, 3, 4, 5, 6, 7, 1, 2, 3,  -1, 2, 3, 4, 5, 6, 7, 1, 2, 3,
@@ -148,7 +149,7 @@ TEST(Hybrid4BitFullyConnectedOpTest, TestHybridInt4AllZeroBatch) {
   FullyConnected4BitOpModel m(
       units, batches,
       /*input=*/{TensorType_FLOAT32, {batches, cols}},
-      /*weights=*/{TensorType_INT4, {units, cols}, 0.0, 7.0, 1.0},
+      /*weights=*/{TensorType_INT4, {units, cols}, 0.0, 0.0, 1.0},
       /*output=*/{TensorType_FLOAT32, {units, batches}},
       {
           -1, 2, 3, 4, 5, 6, 7, 1, 2, 3,  -1, 2, 3, 4, 5, 6, 7, 1, 2, 3,
@@ -211,7 +212,7 @@ TEST_P(Hybrid4BitFullyConnectedVsReferenceOpTests, TestHybridInt4) {
   FullyConnected4BitOpModel test(
       units, batches,
       /*input=*/{TensorType_FLOAT32, {batches, cols}},
-      /*weights=*/{TensorType_INT4, {units, cols}, 0.0, 7.0, 1.0},
+      /*weights=*/{TensorType_INT4, {units, cols}, 0.0, 0.0, 1.0},
       /*output=*/{TensorType_FLOAT32, {units, batches}}, weight_data,
       ops::builtin::Register_FULLY_CONNECTED_GENERIC_OPT(),
       ActivationFunctionType_RELU);
@@ -222,7 +223,7 @@ TEST_P(Hybrid4BitFullyConnectedVsReferenceOpTests, TestHybridInt4) {
   FullyConnected4BitOpModel expected(
       units, batches,
       /*input=*/{TensorType_FLOAT32, {batches, cols}},
-      /*weights=*/{TensorType_INT4, {units, cols}, 0.0, 7.0, 1.0},
+      /*weights=*/{TensorType_INT4, {units, cols}, 0.0, 0.0, 1.0},
       /*output=*/{TensorType_FLOAT32, {units, batches}}, weight_data,
       ops::builtin::Register_FULLY_CONNECTED_REF(),
       ActivationFunctionType_RELU);
diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
index 1bbf1516f82c3f..8b103baeda2423 100644
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -1006,6 +1006,7 @@ cc_library(
         "//tensorflow/lite/kernels:cpu_backend_context",
         "//tensorflow/lite/kernels:cpu_backend_gemm",
         "//tensorflow/lite/kernels:op_macros",
+        "@com_google_absl//absl/base:prefetch",
         "@ruy//ruy/profiler:instrumentation",
     ],
 )
@@ -1506,9 +1507,7 @@ filegroup(
     srcs = glob([
         "optimized/*.h",
     ]),
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
 )
 
 filegroup(
@@ -1516,9 +1515,7 @@ filegroup(
     srcs = glob([
         "reference/*.h",
     ]),
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
 )
 
 filegroup(
@@ -1526,9 +1523,7 @@ filegroup(
     srcs = glob([
         "*.h",
     ]),
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
 )
 
 transitive_hdrs(
diff --git a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc
index 014bb2fe531d1e..c4331e099a21d7 100644
--- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc
@@ -23,6 +23,8 @@ limitations under the License.
 #endif
 #ifdef __AVX2__
 #include <immintrin.h>
+
+#include "absl/base/prefetch.h"
 #endif
 
 #include <cstdint>
@@ -217,8 +219,11 @@ void Avx2MatrixBatchVectorMultiplyAccumulateImpl(
       // Initialize the dot product sum for the row to 0.
       __m256i dotprod_32x8 = _mm256_setzero_si256();
       std::intptr_t col = 0;
+      constexpr int prefetch_distance = 704;
       // For every block of 32x 8-bit inputs.
       while (col < (m_cols & ~31)) {
+        absl::PrefetchToLocalCache(vectors + col + prefetch_distance);
+        absl::PrefetchToLocalCache(row_ptr + col + prefetch_distance);
         const __m256i vec_16x16 =
             _mm256_loadu_si256(reinterpret_cast<const __m256i*>(vectors + col));
         const __m256i row_16x16 =
diff --git a/tensorflow/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/lite/kernels/internal/tensor_utils_test.cc
index 00bd42bec0c4a3..902f42cca68c5b 100644
--- a/tensorflow/lite/kernels/internal/tensor_utils_test.cc
+++ b/tensorflow/lite/kernels/internal/tensor_utils_test.cc
@@ -2185,7 +2185,13 @@ BENCHMARK(BM_DotprodBatchOneMultiply)
     ->Args({640, 2048, 8, 8})
     ->Args({2048, 2048, 1, 1})
     ->Args({2048, 2048, 1, 8})
-    ->Args({2048, 2048, 8, 1});
+    ->Args({2048, 2048, 8, 1})
+    ->Args({4096, 4096, 8, 1})
+    ->Args({4096, 4096, 1, 8})
+    ->Args({8192, 8192, 8, 1})
+    ->Args({8192, 8192, 1, 8})
+    ->Args({16384, 16384, 8, 1})
+    ->Args({16384, 16384, 1, 8});
 
 void BM_DotprodBatchFourMultiply(benchmark::State& state) {
   const int rows = state.range(0);
@@ -2242,7 +2248,14 @@ BENCHMARK(BM_DotprodBatchFourMultiply)
     ->Args({2048, 2048, 4, 1})
     ->Args({2048, 2048, 4, 8})
     ->Args({2048, 2048, 5, 1})
-    ->Args({2048, 2048, 8, 1});
+    ->Args({2048, 2048, 8, 1})
+    ->Args({2048, 2048, 64, 1})
+    ->Args({2048, 2048, 1024, 1})
+    ->Args({4096, 4096, 1024, 1})
+    ->Args({8192, 8192, 1024, 1})
+    ->Args({8192, 8192, 1024, 8})
+    ->Args({16384, 16384, 1024, 1})
+    ->Args({16384, 8192, 1024, 1});
 
 void BM_DotprodSparseMultiply(benchmark::State& state) {
   const int rows = state.range(0);
diff --git a/tensorflow/lite/kernels/internal/utils/BUILD b/tensorflow/lite/kernels/internal/utils/BUILD
index c11cc252e97670..d1fdf50b761963 100644
--- a/tensorflow/lite/kernels/internal/utils/BUILD
+++ b/tensorflow/lite/kernels/internal/utils/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/kernels/parse_example/BUILD b/tensorflow/lite/kernels/parse_example/BUILD
index a4ad08d79a001f..b18d23b6c7d607 100644
--- a/tensorflow/lite/kernels/parse_example/BUILD
+++ b/tensorflow/lite/kernels/parse_example/BUILD
@@ -1,6 +1,6 @@
 load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_features_nolayering_check_if_ios")
-load("//tensorflow/lite:special_rules.bzl", "nonportable_visibility_allowlist")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load("//tensorflow/lite:special_rules.bzl", "nonportable_visibility_allowlist")
 
 # Kernel for custom parse_example
 package(
diff --git a/tensorflow/lite/kernels/perception/BUILD b/tensorflow/lite/kernels/perception/BUILD
index f08f103df7eb02..52e5e9421babbd 100644
--- a/tensorflow/lite/kernels/perception/BUILD
+++ b/tensorflow/lite/kernels/perception/BUILD
@@ -22,13 +22,13 @@ cc_library(
     compatible_with = get_compatible_with_portable(),
     deps = [
         "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:common",
         "//tensorflow/lite/core/c:common",
         "//tensorflow/lite/kernels:kernel_util",
         "//tensorflow/lite/kernels:padding",
         "//tensorflow/lite/kernels/internal:common",
         "//tensorflow/lite/kernels/internal:compatibility",
         "//tensorflow/lite/kernels/internal:tensor",
-        "//tensorflow/lite/kernels/internal:tensor_utils",
         "//tensorflow/lite/kernels/internal:types",
         "@flatbuffers",
     ],
@@ -46,9 +46,11 @@ cc_test(
         ":perception_ops",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/core:framework_stable",
+        "//tensorflow/lite/core/c:common",
         "//tensorflow/lite/kernels:test_main",
         "//tensorflow/lite/kernels:test_util",
-        "//tensorflow/lite/testing:util",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest",
         "@flatbuffers",
     ],
 )
@@ -68,7 +70,7 @@ pybind_extension(
     deps = [
         ":perception_ops",
         "//tensorflow/lite:mutable_op_resolver",
-        "//third_party/python_runtime:headers",
+        "//tensorflow/lite/c:common",
         "@pybind11",
     ],
 )
diff --git a/tensorflow/lite/kernels/perception/dense_image_warp.cc b/tensorflow/lite/kernels/perception/dense_image_warp.cc
index 1dd6170bfb608a..b469897b34d34f 100644
--- a/tensorflow/lite/kernels/perception/dense_image_warp.cc
+++ b/tensorflow/lite/kernels/perception/dense_image_warp.cc
@@ -15,13 +15,12 @@ limitations under the License.
 #include <algorithm>
 #include <cmath>
 
-#include "tensorflow/lite/core/c/builtin_op_data.h"
 #include "tensorflow/lite/core/c/common.h"
-#include "tensorflow/lite/kernels/internal/common.h"
-#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/padding.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/lite/kernels/perception/dense_image_warp_test.cc b/tensorflow/lite/kernels/perception/dense_image_warp_test.cc
index 4f65a1bf2cd516..7d89d79b3b1635 100644
--- a/tensorflow/lite/kernels/perception/dense_image_warp_test.cc
+++ b/tensorflow/lite/kernels/perception/dense_image_warp_test.cc
@@ -16,10 +16,12 @@ limitations under the License.
 #include <cstdint>
 #include <vector>
 
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
 #include "tensorflow/lite/core/interpreter.h"
 #include "tensorflow/lite/kernels/perception/perception_ops.h"
 #include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/testing/util.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/lite/kernels/perception/max_pool_with_argmax.cc b/tensorflow/lite/kernels/perception/max_pool_with_argmax.cc
index 521deb3799408d..d1b924066b23e3 100644
--- a/tensorflow/lite/kernels/perception/max_pool_with_argmax.cc
+++ b/tensorflow/lite/kernels/perception/max_pool_with_argmax.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/lite/core/c/common.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
diff --git a/tensorflow/lite/kernels/perception/max_pool_with_argmax_test.cc b/tensorflow/lite/kernels/perception/max_pool_with_argmax_test.cc
index a35917803ef700..082851d59c4488 100644
--- a/tensorflow/lite/kernels/perception/max_pool_with_argmax_test.cc
+++ b/tensorflow/lite/kernels/perception/max_pool_with_argmax_test.cc
@@ -17,11 +17,14 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
 #include "flatbuffers/flexbuffers.h"  // from @flatbuffers
+#include "tensorflow/lite/core/c/builtin_op_data.h"
 #include "tensorflow/lite/core/interpreter.h"
 #include "tensorflow/lite/kernels/perception/perception_ops.h"
 #include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/testing/util.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/lite/kernels/perception/max_unpooling_2d.cc b/tensorflow/lite/kernels/perception/max_unpooling_2d.cc
index e87f7f6e0c248d..869a9457a9f49d 100644
--- a/tensorflow/lite/kernels/perception/max_unpooling_2d.cc
+++ b/tensorflow/lite/kernels/perception/max_unpooling_2d.cc
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/core/c/builtin_op_data.h"
 #include "tensorflow/lite/core/c/common.h"
-#include "tensorflow/lite/kernels/internal/common.h"
-#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/runtime_shape.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/padding.h"
 namespace tflite {
 namespace ops {
 namespace custom {
diff --git a/tensorflow/lite/kernels/perception/max_unpooling_2d_test.cc b/tensorflow/lite/kernels/perception/max_unpooling_2d_test.cc
index 8ac9d15283f114..784a3f4127a814 100644
--- a/tensorflow/lite/kernels/perception/max_unpooling_2d_test.cc
+++ b/tensorflow/lite/kernels/perception/max_unpooling_2d_test.cc
@@ -16,10 +16,13 @@ limitations under the License.
 #include <cstdint>
 #include <vector>
 
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/core/c/builtin_op_data.h"
 #include "tensorflow/lite/core/interpreter.h"
 #include "tensorflow/lite/kernels/perception/perception_ops.h"
 #include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/testing/util.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/lite/kernels/perception/perception_ops.cc b/tensorflow/lite/kernels/perception/perception_ops.cc
index faf4557aa5df4a..a4e62c90b76e57 100644
--- a/tensorflow/lite/kernels/perception/perception_ops.cc
+++ b/tensorflow/lite/kernels/perception/perception_ops.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/lite/kernels/perception/perception_ops.h"
 
+#include "tensorflow/lite/mutable_op_resolver.h"
+
 namespace tflite {
 namespace ops {
 namespace custom {
diff --git a/tensorflow/lite/kernels/perception/perception_ops.h b/tensorflow/lite/kernels/perception/perception_ops.h
index e70d64b016055d..121c844b6e2a5a 100644
--- a/tensorflow/lite/kernels/perception/perception_ops.h
+++ b/tensorflow/lite/kernels/perception/perception_ops.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_PERCEPTION_PERCEPTION_OPS_H_
 #define TENSORFLOW_LITE_KERNELS_PERCEPTION_PERCEPTION_OPS_H_
 
+#include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/mutable_op_resolver.h"
 
 namespace tflite {
diff --git a/tensorflow/lite/kernels/perception/perception_ops_wrapper.cc b/tensorflow/lite/kernels/perception/perception_ops_wrapper.cc
index b1c1c813962f95..cd5e96eceacfdd 100644
--- a/tensorflow/lite/kernels/perception/perception_ops_wrapper.cc
+++ b/tensorflow/lite/kernels/perception/perception_ops_wrapper.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "pybind11/pybind11.h"  // from @pybind11
 #include "pybind11/pytypes.h"  // from @pybind11
 #include "tensorflow/lite/kernels/perception/perception_ops.h"
+#include "tensorflow/lite/mutable_op_resolver.h"
 
 PYBIND11_MODULE(pywrap_perception_ops, m) {
   m.doc() = R"pbdoc(
diff --git a/tensorflow/lite/kernels/shim/test_op/BUILD b/tensorflow/lite/kernels/shim/test_op/BUILD
index ecb8b7e3b9f70b..c0a35b6a4655ff 100644
--- a/tensorflow/lite/kernels/shim/test_op/BUILD
+++ b/tensorflow/lite/kernels/shim/test_op/BUILD
@@ -1,7 +1,7 @@
 # A simple op. for testing and demonstrating the OpKernel interface.
 
-load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_kernel_library")
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/mutable_op_resolver_utils.cc b/tensorflow/lite/mutable_op_resolver_utils.cc
index d0d03c68d9ab20..fec9571746c824 100644
--- a/tensorflow/lite/mutable_op_resolver_utils.cc
+++ b/tensorflow/lite/mutable_op_resolver_utils.cc
@@ -21,18 +21,16 @@ limitations under the License.
 
 namespace tflite {
 
-void AddOp(MutableOpResolver* mutable_op_resolver,
-           const TfLiteRegistrationExternal* op, int min_version,
-           int max_version) {
+void AddOp(MutableOpResolver* mutable_op_resolver, const TfLiteOperator* op,
+           int min_version, int max_version) {
   TfLiteRegistration registration{};
-  registration.builtin_code = TfLiteRegistrationExternalGetBuiltInCode(op);
-  registration.custom_name = TfLiteRegistrationExternalGetCustomName(op);
-  registration.version = TfLiteRegistrationExternalGetVersion(op);
-  // This const cast is safe because TfLiteRegistrationExternal is an opaque
-  // type and TfLiteRegistrationExternal objects are always allocated with
-  // TfLiteRegistrationExternalCreate() which allocates non-const objects.
-  registration.registration_external =
-      const_cast<TfLiteRegistrationExternal*>(op);
+  registration.builtin_code = TfLiteOperatorGetBuiltInCode(op);
+  registration.custom_name = TfLiteOperatorGetCustomName(op);
+  registration.version = TfLiteOperatorGetVersion(op);
+  // This const cast is safe because TfLiteOperator is an opaque
+  // type and TfLiteOperator objects are always allocated with
+  // TfLiteOperatorCreate() which allocates non-const objects.
+  registration.registration_external = const_cast<TfLiteOperator*>(op);
   if (registration.custom_name != nullptr) {
     mutable_op_resolver->AddCustom(registration.custom_name, &registration,
                                    min_version, max_version);
@@ -42,9 +40,8 @@ void AddOp(MutableOpResolver* mutable_op_resolver,
   }
 }
 
-void AddOp(MutableOpResolver* mutable_op_resolver,
-           const TfLiteRegistrationExternal* op) {
-  int version = TfLiteRegistrationExternalGetVersion(op);
+void AddOp(MutableOpResolver* mutable_op_resolver, const TfLiteOperator* op) {
+  int version = TfLiteOperatorGetVersion(op);
   AddOp(mutable_op_resolver, op, version, version);
 }
 
diff --git a/tensorflow/lite/mutable_op_resolver_utils.h b/tensorflow/lite/mutable_op_resolver_utils.h
index 70ae56caec48d7..905b8a49792d72 100644
--- a/tensorflow/lite/mutable_op_resolver_utils.h
+++ b/tensorflow/lite/mutable_op_resolver_utils.h
@@ -23,15 +23,13 @@ namespace tflite {
 
 /// Registers (the specified version of) the operator `op`.
 /// Replaces any previous registration for the same operator version.
-void AddOp(MutableOpResolver* mutable_op_resolver,
-           const TfLiteRegistrationExternal* op);
+void AddOp(MutableOpResolver* mutable_op_resolver, const TfLiteOperator* op);
 
 /// Registers the specified version range (versions `min_version` to
 /// `max_version`, inclusive) of the specified operator `op`.
 /// Replaces any previous registration for the same operator version.
-void AddOp(MutableOpResolver* mutable_op_resolver,
-           const TfLiteRegistrationExternal* op, int min_version,
-           int max_version);
+void AddOp(MutableOpResolver* mutable_op_resolver, const TfLiteOperator* op,
+           int min_version, int max_version);
 
 }  // namespace tflite
 
diff --git a/tensorflow/lite/mutable_op_resolver_utils_test.cc b/tensorflow/lite/mutable_op_resolver_utils_test.cc
index 8484d35ec48410..d9d9e0320486db 100644
--- a/tensorflow/lite/mutable_op_resolver_utils_test.cc
+++ b/tensorflow/lite/mutable_op_resolver_utils_test.cc
@@ -39,21 +39,20 @@ TfLiteStatus DummyPrepare(TfLiteOpaqueContext* context,
   return kTfLiteOk;
 }
 
-TfLiteRegistrationExternal* GetDummyRegistration() {
-  static TfLiteRegistrationExternal* registration = []() {
-    auto* r =
-        TfLiteRegistrationExternalCreate(kTfLiteBuiltinCustom, "dummy", 1);
-    TfLiteRegistrationExternalSetPrepare(r, DummyPrepare);
-    TfLiteRegistrationExternalSetInvoke(r, DummyInvoke);
+TfLiteOperator* GetDummyRegistration() {
+  static TfLiteOperator* registration = []() {
+    auto* r = TfLiteOperatorCreate(kTfLiteBuiltinCustom, "dummy", 1);
+    TfLiteOperatorSetPrepare(r, DummyPrepare);
+    TfLiteOperatorSetInvoke(r, DummyInvoke);
     return r;
   }();
   return registration;
 }
 
-TfLiteRegistrationExternal* GetAdditionOpRegistration() {
-  static TfLiteRegistrationExternal* registration = []() {
-    auto* r = TfLiteRegistrationExternalCreate(kTfLiteBuiltinAdd, nullptr, 1);
-    TfLiteRegistrationExternalSetInvoke(r, DummyInvoke);
+TfLiteOperator* GetAdditionOpRegistration() {
+  static TfLiteOperator* registration = []() {
+    auto* r = TfLiteOperatorCreate(kTfLiteBuiltinAdd, nullptr, 1);
+    TfLiteOperatorSetInvoke(r, DummyInvoke);
     return r;
   }();
   return registration;
@@ -69,11 +68,10 @@ TEST_F(MutableOpResolverTest, FindOp) {
       resolver.FindOp(BuiltinOperator_ADD, 1);
   ASSERT_NE(found_registration, nullptr);
   EXPECT_TRUE(found_registration->registration_external->invoke == DummyInvoke);
-  EXPECT_EQ(TfLiteRegistrationExternalGetBuiltInCode(
-                found_registration->registration_external),
-            kTfLiteBuiltinAdd);
-  EXPECT_EQ(TfLiteRegistrationExternalGetVersion(
-                found_registration->registration_external),
+  EXPECT_EQ(
+      TfLiteOperatorGetBuiltInCode(found_registration->registration_external),
+      kTfLiteBuiltinAdd);
+  EXPECT_EQ(TfLiteOperatorGetVersion(found_registration->registration_external),
             1);
   EXPECT_EQ(found_registration->builtin_code, BuiltinOperator_ADD);
   EXPECT_EQ(found_registration->version, 1);
diff --git a/tensorflow/lite/nnapi/BUILD b/tensorflow/lite/nnapi/BUILD
index 1f1e5e1d8e6d72..847cfada1ad4ad 100644
--- a/tensorflow/lite/nnapi/BUILD
+++ b/tensorflow/lite/nnapi/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/lite:special_rules.bzl", "if_nnapi")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load("//tensorflow/lite:special_rules.bzl", "if_nnapi")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/objc/BUILD.apple b/tensorflow/lite/objc/BUILD.apple
index ef9ec8372db42d..856ed56ad35967 100644
--- a/tensorflow/lite/objc/BUILD.apple
+++ b/tensorflow/lite/objc/BUILD.apple
@@ -1,8 +1,8 @@
 # TensorFlow Lite for Objective-C
 
+load("@build_bazel_rules_apple//apple:ios.bzl", "ios_application", "ios_unit_test")
 load("//tensorflow/lite:special_rules.bzl", "ios_visibility_allowlist", "tflite_ios_lab_runner")
 load("//tensorflow/lite/ios:ios.bzl", "TFL_DEFAULT_TAGS", "TFL_DISABLED_SANITIZER_TAGS", "TFL_MINIMUM_OS_VERSION")
-load("@build_bazel_rules_apple//apple:ios.bzl", "ios_application", "ios_unit_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/objc/TensorFlowLiteObjC.podspec b/tensorflow/lite/objc/TensorFlowLiteObjC.podspec
index b71f830d07eceb..623908feb1d7f3 100644
--- a/tensorflow/lite/objc/TensorFlowLiteObjC.podspec
+++ b/tensorflow/lite/objc/TensorFlowLiteObjC.podspec
@@ -47,7 +47,7 @@ Pod::Spec.new do |s|
       tfl_dir + 'c/c_api_types.h',
       tfl_dir + 'c/common.h',
       tfl_dir + 'delegates/xnnpack/xnnpack_delegate.h',
-      tfl_dir + 'core/c/registration_external.h',
+      tfl_dir + 'core/c/operator.h',
     ]
     core.exclude_files = [
       objc_dir + '{apis,sources}/TFL{Metal,CoreML}Delegate.{h,m}',
diff --git a/tensorflow/lite/objc/apis/TFLTensor.h b/tensorflow/lite/objc/apis/TFLTensor.h
index cd60b2144a0e6c..deaf52f9e5843f 100644
--- a/tensorflow/lite/objc/apis/TFLTensor.h
+++ b/tensorflow/lite/objc/apis/TFLTensor.h
@@ -52,6 +52,9 @@ typedef NS_ENUM(NSUInteger, TFLTensorDataType) {
 
   /** 64-bit double precision floating point. */
   TFLTensorDataTypeFloat64,
+
+  /** 16-bit bfloat16 floating point. */
+  TFLTensorDataTypeBFloat16,
 };
 
 /**
diff --git a/tensorflow/lite/objc/sources/TFLCommonUtil.mm b/tensorflow/lite/objc/sources/TFLCommonUtil.mm
index 57362ceabb6597..8f9e37ebb421b6 100644
--- a/tensorflow/lite/objc/sources/TFLCommonUtil.mm
+++ b/tensorflow/lite/objc/sources/TFLCommonUtil.mm
@@ -32,6 +32,8 @@ TFLTensorDataType TFLTensorDataTypeFromCTensor(const TfLiteTensor *cTensor) {
       return TFLTensorDataTypeFloat32;
     case kTfLiteFloat16:
       return TFLTensorDataTypeFloat16;
+    case kTfLiteBFloat16:
+      return TFLTensorDataTypeBFloat16;
     case kTfLiteFloat64:
       return TFLTensorDataTypeFloat64;
     case kTfLiteInt32:
diff --git a/tensorflow/lite/optional_debug_tools.cc b/tensorflow/lite/optional_debug_tools.cc
index ce6e9e4973f702..9b716cdffb17c9 100644
--- a/tensorflow/lite/optional_debug_tools.cc
+++ b/tensorflow/lite/optional_debug_tools.cc
@@ -336,6 +336,8 @@ const char* TensorTypeName(TfLiteType type) {
       return "kTfLiteComplex128";
     case kTfLiteFloat16:
       return "kTfLiteFloat16";
+    case kTfLiteBFloat16:
+      return "kTfLiteBFloat16";
     case kTfLiteFloat64:
       return "kTfLiteFloat64";
     case kTfLiteResource:
diff --git a/tensorflow/lite/profiling/telemetry/BUILD b/tensorflow/lite/profiling/telemetry/BUILD
index 513c2a6253175e..ac8ea8854ddf9f 100644
--- a/tensorflow/lite/profiling/telemetry/BUILD
+++ b/tensorflow/lite/profiling/telemetry/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/profiling/telemetry/c/BUILD b/tensorflow/lite/profiling/telemetry/c/BUILD
index a02a5207d0b663..e05618beae4ce4 100644
--- a/tensorflow/lite/profiling/telemetry/c/BUILD
+++ b/tensorflow/lite/profiling/telemetry/c/BUILD
@@ -1,9 +1,9 @@
+load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load(
     "//tensorflow/lite:build_def.bzl",
     "tflite_cc_library_with_c_headers_test",
     "tflite_copts",
 )
-load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD
index f4f8b9fe4db33a..7a5cd8ff42dc42 100644
--- a/tensorflow/lite/python/BUILD
+++ b/tensorflow/lite/python/BUILD
@@ -1,6 +1,6 @@
+load("@flatbuffers//:build_defs.bzl", "flatbuffer_py_library")
 load("//tensorflow:pytype.default.bzl", "pytype_strict_contrib_test", "pytype_strict_library")
 load("//tensorflow:strict.default.bzl", "py_strict_binary", "py_strict_library", "py_strict_test")
-load("@flatbuffers//:build_defs.bzl", "flatbuffer_py_library")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow/lite:special_rules.bzl", "internal_visibility_allowlist")
 
diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py
index 6ae3a1724f5202..32fc17a1ce5ae3 100644
--- a/tensorflow/lite/python/convert.py
+++ b/tensorflow/lite/python/convert.py
@@ -595,6 +595,7 @@ def build_conversion_flags(
     reduce_type_precision=False,
     qdq_conversion_mode=None,
     disable_per_channel_quantization_for_dense_layers=False,
+    enable_composite_direct_lowering=False,
     **_,
 ):
   """Builds protocol buffer describing a conversion of a model.
@@ -724,6 +725,8 @@ def build_conversion_flags(
     disable_per_channel_quantization_for_dense_layers: If set, disables per
       channel end enables per tensor integer quantization for weights in Dense
       layers. The flag works only for integer quantized model.
+    enable_composite_direct_lowering: If set, attempts to lower composite ops
+      directly to tflite ops.
 
   Returns:
     conversion_flags: protocol buffer describing the conversion process.
@@ -844,6 +847,9 @@ def build_conversion_flags(
   conversion_flags.disable_per_channel_quantization_for_dense_layers = (
       disable_per_channel_quantization_for_dense_layers
   )
+  conversion_flags.enable_composite_direct_lowering = (
+      enable_composite_direct_lowering
+  )
   return conversion_flags
 
 
diff --git a/tensorflow/lite/python/interpreter.py b/tensorflow/lite/python/interpreter.py
index 77933ee4110018..463204b05753ab 100644
--- a/tensorflow/lite/python/interpreter.py
+++ b/tensorflow/lite/python/interpreter.py
@@ -396,6 +396,7 @@ def __init__(
       experimental_op_resolver_type=OpResolverType.AUTO,
       experimental_preserve_all_tensors=False,
       experimental_disable_delegate_clustering=False,
+      experimental_default_delegate_latest_features=False,
   ):
     """Constructor.
 
@@ -437,6 +438,8 @@ def __init__(
         this flag is currently experimental, and it might be removed/updated if
         the TF Lite converter doesn't drop such control dependencies in the
         model. Default is False.
+      experimental_default_delegate_latest_features: If true, default delegates
+        may enable all flag protected features. Default is False;
 
     Raises:
       ValueError: If the interpreter was unable to create.
@@ -454,6 +457,12 @@ def __init__(
       raise ValueError('Unrecognized passed in op resolver type: {}'.format(
           experimental_op_resolver_type))
 
+    if num_threads is not None:
+      if not isinstance(num_threads, int):
+        raise ValueError('type of num_threads should be int')
+      if num_threads < 1:
+        raise ValueError('num_threads should >= 1')
+
     if model_path and not model_content:
       custom_op_registerers_by_name = [
           x for x in self._custom_op_registerers if isinstance(x, str)
@@ -468,6 +477,8 @@ def __init__(
           custom_op_registerers_by_func,
           experimental_preserve_all_tensors,
           experimental_disable_delegate_clustering,
+          int(num_threads or 1),
+          experimental_default_delegate_latest_features,
       )
       if not self._interpreter:
         raise ValueError('Failed to open {}'.format(model_path))
@@ -489,19 +500,14 @@ def __init__(
           custom_op_registerers_by_func,
           experimental_preserve_all_tensors,
           experimental_disable_delegate_clustering,
+          int(num_threads or 1),
+          experimental_default_delegate_latest_features,
       )
     elif not model_content and not model_path:
       raise ValueError('`model_path` or `model_content` must be specified.')
     else:
       raise ValueError('Can\'t both provide `model_path` and `model_content`')
 
-    if num_threads is not None:
-      if not isinstance(num_threads, int):
-        raise ValueError('type of num_threads should be int')
-      if num_threads < 1:
-        raise ValueError('num_threads should >= 1')
-      self._interpreter.SetNumThreads(num_threads)
-
     # Each delegate is a wrapper that owns the delegates that have been loaded
     # as plugins. The interpreter wrapper will be using them, but we need to
     # hold them in a list so that the lifetime is preserved at least as long as
diff --git a/tensorflow/lite/python/interpreter_wrapper/BUILD b/tensorflow/lite/python/interpreter_wrapper/BUILD
index fa0af673063325..e56b861a5143c7 100644
--- a/tensorflow/lite/python/interpreter_wrapper/BUILD
+++ b/tensorflow/lite/python/interpreter_wrapper/BUILD
@@ -13,6 +13,7 @@ cc_library(
     compatible_with = get_compatible_with_portable(),
     deps = [
         "//tensorflow/lite:string_util",
+        "//tensorflow/lite/core/c:c_api_types",
         "//tensorflow/lite/core/c:common",
         "//third_party/py/numpy:headers",
         "//third_party/python_runtime:headers",  # buildcleaner: keep
@@ -38,6 +39,7 @@ cc_library(
         "//tensorflow/lite/core/api",
         "//tensorflow/lite/core/c:common",
         "//tensorflow/lite/core/kernels:builtin_ops",
+        "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate",
         "//tensorflow/lite/kernels:reference_ops",
         "//tensorflow/lite/kernels/internal:compatibility",
         "//third_party/python_runtime:headers",  # buildcleaner: keep
diff --git a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index bce64edb32f8ba..6c6fa7c8f3282e 100644
--- a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/lite/core/interpreter.h"
 #include "tensorflow/lite/core/kernels/register.h"
 #include "tensorflow/lite/core/model.h"
+#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
 #include "tensorflow/lite/kernels/register_ref.h"
 #include "tensorflow/lite/mutable_op_resolver.h"
@@ -85,18 +86,31 @@ using python_utils::PyDecrefDeleter;
 std::unique_ptr<Interpreter> CreateInterpreter(
     const InterpreterWrapper::Model* model,
     const tflite::MutableOpResolver& resolver, bool preserve_all_tensors,
-    bool disable_delegate_clustering) {
+    bool disable_delegate_clustering, int num_threads,
+    bool default_delegate_latest_features) {
   if (!model) {
     return nullptr;
   }
 
   ::tflite::python::ImportNumpy();
 
+  TfLiteDelegate* xnnpack_delegate = nullptr;
+  if (default_delegate_latest_features) {
+    auto opts = TfLiteXNNPackDelegateOptionsDefault();
+    opts.flags |= TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+    opts.flags |= TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_SUBGRAPH_RESHAPING;
+    opts.num_threads = num_threads;
+    xnnpack_delegate = TfLiteXNNPackDelegateCreate(&opts);
+  }
   std::unique_ptr<Interpreter> interpreter;
   InterpreterOptions options;
   options.SetPreserveAllTensors(preserve_all_tensors);
   options.SetDisableDelegateClustering(disable_delegate_clustering);
   InterpreterBuilder builder(*model, resolver, &options);
+  if (default_delegate_latest_features) {
+    builder.AddDelegate(xnnpack_delegate);
+  }
+  builder.SetNumThreads(num_threads);
   if (builder(&interpreter) != kTfLiteOk) {
     return nullptr;
   }
@@ -200,29 +214,36 @@ InterpreterWrapper* InterpreterWrapper::CreateInterpreterWrapper(
     const std::vector<std::string>& registerers_by_name,
     const std::vector<std::function<void(uintptr_t)>>& registerers_by_func,
     std::string* error_msg, bool preserve_all_tensors,
-    bool disable_delegate_clustering) {
+    bool disable_delegate_clustering, int num_threads,
+    bool default_delegate_latest_features) {
   if (!model) {
     *error_msg = error_reporter->message();
     return nullptr;
   }
 
   std::unique_ptr<tflite::MutableOpResolver> resolver;
-  switch (op_resolver_id) {
-    case kBuiltinOpResolver:
-      resolver = std::make_unique<tflite::ops::builtin::BuiltinOpResolver>();
-      break;
-    case kBuiltinRefOpResolver:
-      resolver = std::make_unique<tflite::ops::builtin::BuiltinRefOpResolver>();
-      break;
-    case kBuiltinOpResolverWithoutDefaultDelegates:
-      resolver = std::make_unique<
-          tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates>();
-      break;
-    default:
-      // This should not never happen because the eventual caller in
-      // interpreter.py should have passed a valid id here.
-      TFLITE_DCHECK(false);
-      return nullptr;
+  if (default_delegate_latest_features) {
+    resolver = std::make_unique<
+        tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates>();
+  } else {
+    switch (op_resolver_id) {
+      case kBuiltinOpResolver:
+        resolver = std::make_unique<tflite::ops::builtin::BuiltinOpResolver>();
+        break;
+      case kBuiltinRefOpResolver:
+        resolver =
+            std::make_unique<tflite::ops::builtin::BuiltinRefOpResolver>();
+        break;
+      case kBuiltinOpResolverWithoutDefaultDelegates:
+        resolver = std::make_unique<
+            tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates>();
+        break;
+      default:
+        // This should not never happen because the eventual caller in
+        // interpreter.py should have passed a valid id here.
+        TFLITE_DCHECK(false);
+        return nullptr;
+    }
   }
 
   for (const auto& registerer : registerers_by_name) {
@@ -232,9 +253,9 @@ InterpreterWrapper* InterpreterWrapper::CreateInterpreterWrapper(
   for (const auto& registerer : registerers_by_func) {
     registerer(reinterpret_cast<uintptr_t>(resolver.get()));
   }
-  auto interpreter =
-      CreateInterpreter(model.get(), *resolver, preserve_all_tensors,
-                        disable_delegate_clustering);
+  auto interpreter = CreateInterpreter(
+      model.get(), *resolver, preserve_all_tensors, disable_delegate_clustering,
+      num_threads, default_delegate_latest_features);
   if (!interpreter) {
     *error_msg = error_reporter->message();
     return nullptr;
@@ -806,14 +827,16 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromFile(
     const std::vector<std::string>& registerers_by_name,
     const std::vector<std::function<void(uintptr_t)>>& registerers_by_func,
     std::string* error_msg, bool preserve_all_tensors,
-    bool disable_delegate_clustering) {
+    bool disable_delegate_clustering, int num_threads,
+    bool default_delegate_latest_features) {
   std::unique_ptr<PythonErrorReporter> error_reporter(new PythonErrorReporter);
   std::unique_ptr<InterpreterWrapper::Model> model =
       Model::BuildFromFile(model_path, error_reporter.get());
   return CreateInterpreterWrapper(
       std::move(model), op_resolver_id, std::move(error_reporter),
       registerers_by_name, registerers_by_func, error_msg, preserve_all_tensors,
-      disable_delegate_clustering);
+      disable_delegate_clustering, num_threads,
+      default_delegate_latest_features);
 }
 
 InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromFile(
@@ -822,7 +845,8 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromFile(
     bool preserve_all_tensors, bool disable_delegate_clustering) {
   return CreateWrapperCPPFromFile(
       model_path, op_resolver_id, registerers, {} /*registerers_by_func*/,
-      error_msg, preserve_all_tensors, disable_delegate_clustering);
+      error_msg, preserve_all_tensors, disable_delegate_clustering,
+      /*num_threads=*/1, /*default_delegate_latest_features=*/false);
 }
 
 InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer(
@@ -830,7 +854,8 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer(
     const std::vector<std::string>& registerers_by_name,
     const std::vector<std::function<void(uintptr_t)>>& registerers_by_func,
     std::string* error_msg, bool preserve_all_tensors,
-    bool disable_delegate_clustering) {
+    bool disable_delegate_clustering, int num_threads,
+    bool default_delegate_latest_features) {
   char* buf = nullptr;
   Py_ssize_t length;
   std::unique_ptr<PythonErrorReporter> error_reporter(new PythonErrorReporter);
@@ -843,16 +868,18 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer(
   return CreateInterpreterWrapper(
       std::move(model), op_resolver_id, std::move(error_reporter),
       registerers_by_name, registerers_by_func, error_msg, preserve_all_tensors,
-      disable_delegate_clustering);
+      disable_delegate_clustering, num_threads,
+      default_delegate_latest_features);
 }
 
 InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer(
     PyObject* data, int op_resolver_id,
     const std::vector<std::string>& registerers, std::string* error_msg,
     bool preserve_all_tensors, bool disable_delegate_clustering) {
-  return CreateWrapperCPPFromBuffer(data, op_resolver_id, registerers, {},
-                                    error_msg, preserve_all_tensors,
-                                    disable_delegate_clustering);
+  return CreateWrapperCPPFromBuffer(
+      data, op_resolver_id, registerers, {}, error_msg, preserve_all_tensors,
+      disable_delegate_clustering, /*num_threads=*/1,
+      /*default_delegate_latest_features=*/false);
 }
 
 PyObject* InterpreterWrapper::ResetVariableTensors() {
diff --git a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.h
index 6d8809bdffb5ca..66e4d8126312b0 100644
--- a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.h
+++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.h
@@ -57,7 +57,8 @@ class InterpreterWrapper {
       const std::vector<std::string>& registerers_by_name,
       const std::vector<std::function<void(uintptr_t)>>& registerers_by_func,
       std::string* error_msg, bool preserve_all_tensors,
-      bool disable_delegate_clustering);
+      bool disable_delegate_clustering, int num_threads,
+      bool default_delegate_latest_features);
 
   // SWIG caller takes ownership of pointer.
   static InterpreterWrapper* CreateWrapperCPPFromBuffer(
@@ -69,7 +70,8 @@ class InterpreterWrapper {
       const std::vector<std::string>& registerers_by_name,
       const std::vector<std::function<void(uintptr_t)>>& registerers_by_func,
       std::string* error_msg, bool preserve_all_tensors,
-      bool disable_delegate_clustering);
+      bool disable_delegate_clustering, int num_threads,
+      bool default_delegate_latest_features);
 
   ~InterpreterWrapper();
   PyObject* AllocateTensors(int subgraph_index);
@@ -126,7 +128,8 @@ class InterpreterWrapper {
       const std::vector<std::string>& registerers_by_name,
       const std::vector<std::function<void(uintptr_t)>>& registerers_by_func,
       std::string* error_msg, bool preserve_all_tensors,
-      bool disable_delegate_clustering);
+      bool disable_delegate_clustering, int num_threads,
+      bool default_delegate_latest_features);
 
   InterpreterWrapper(std::unique_ptr<Model> model,
                      std::unique_ptr<PythonErrorReporter> error_reporter,
diff --git a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper_pybind11.cc b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper_pybind11.cc
index 5f8f2a50e13f81..728b16c6d19001 100644
--- a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper_pybind11.cc
+++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper_pybind11.cc
@@ -53,12 +53,14 @@ PYBIND11_MODULE(_pywrap_tensorflow_interpreter_wrapper, m) {
       [](const std::string& model_path, int op_resolver_id,
          const std::vector<std::string>& registerers_by_name,
          const std::vector<std::function<void(uintptr_t)>>& registerers_by_func,
-         bool preserve_all_tensors, bool disable_delegate_clustering) {
+         bool preserve_all_tensors, bool disable_delegate_clustering,
+         int num_threads, bool default_delegate_latest_features) {
         std::string error;
         auto* wrapper = ::InterpreterWrapper::CreateWrapperCPPFromFile(
             model_path.c_str(), op_resolver_id, registerers_by_name,
             registerers_by_func, &error, preserve_all_tensors,
-            disable_delegate_clustering);
+            disable_delegate_clustering, num_threads,
+            default_delegate_latest_features);
         if (!wrapper) {
           throw std::invalid_argument(error);
         }
@@ -82,12 +84,14 @@ PYBIND11_MODULE(_pywrap_tensorflow_interpreter_wrapper, m) {
       [](const py::bytes& data, int op_resolver_id,
          const std::vector<std::string>& registerers_by_name,
          const std::vector<std::function<void(uintptr_t)>>& registerers_by_func,
-         bool preserve_all_tensors, bool disable_delegate_clustering) {
+         bool preserve_all_tensors, bool disable_delegate_clustering,
+         int num_threads, bool default_delegate_latest_features) {
         std::string error;
         auto* wrapper = ::InterpreterWrapper::CreateWrapperCPPFromBuffer(
             data.ptr(), op_resolver_id, registerers_by_name,
             registerers_by_func, &error, preserve_all_tensors,
-            disable_delegate_clustering);
+            disable_delegate_clustering, num_threads,
+            default_delegate_latest_features);
         if (!wrapper) {
           throw std::invalid_argument(error);
         }
diff --git a/tensorflow/lite/python/interpreter_wrapper/numpy.cc b/tensorflow/lite/python/interpreter_wrapper/numpy.cc
index 0e07563702fcb0..45146cf88b0616 100644
--- a/tensorflow/lite/python/interpreter_wrapper/numpy.cc
+++ b/tensorflow/lite/python/interpreter_wrapper/numpy.cc
@@ -13,11 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <memory>
+
 #define TFLITE_IMPORT_NUMPY  // See numpy.h for explanation.
+#include "tensorflow/lite/core/c/c_api_types.h"
 #include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
 
-#include <memory>
-
 namespace tflite {
 namespace python {
 
@@ -38,6 +39,9 @@ int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type) {
       return NPY_FLOAT32;
     case kTfLiteFloat16:
       return NPY_FLOAT16;
+    case kTfLiteBFloat16:
+      // TODO(b/329491949): NPY_BFLOAT16 currently doesn't exist
+      return NPY_FLOAT16;
     case kTfLiteFloat64:
       return NPY_FLOAT64;
     case kTfLiteInt32:
diff --git a/tensorflow/lite/python/interpreter_wrapper/numpy.h b/tensorflow/lite/python/interpreter_wrapper/numpy.h
index acc3dbd9fdab3a..e04418c32df7f4 100644
--- a/tensorflow/lite/python/interpreter_wrapper/numpy.h
+++ b/tensorflow/lite/python/interpreter_wrapper/numpy.h
@@ -40,7 +40,8 @@ limitations under the License.
 // translation unit boundaries.
 //
 // For more info see https://sourceforge.net/p/numpy/mailman/message/5700519
-// See also tensorflow/tsl/python/lib/core/numpy.h for a similar approach.
+// See also tensorflow/compiler/xla/tsl/python/lib/core/numpy.h for a similar
+// approach.
 #define PY_ARRAY_UNIQUE_SYMBOL _tflite_numpy_api
 #ifndef TFLITE_IMPORT_NUMPY
 #define NO_IMPORT_ARRAY
diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py
index c6804b0f35ed18..9b91a640bc7923 100644
--- a/tensorflow/lite/python/lite.py
+++ b/tensorflow/lite/python/lite.py
@@ -665,6 +665,7 @@ def __init__(self):
     self._experimental_reduce_type_precision = False
     self._experimental_qdq_conversion_mode = None
     self._experimental_disable_per_channel_quantization_for_dense_layers = False
+    self._experimental_enable_composite_direct_lowering = False
 
     # Debug parameters
     self.ir_dump_dir = None
@@ -819,6 +820,9 @@ def _get_base_converter_args(self):
         "disable_per_channel_quantization_for_dense_layers": (
             self._experimental_disable_per_channel_quantization_for_dense_layers
         ),
+        "enable_composite_direct_lowering": (
+            self._experimental_enable_composite_direct_lowering
+        ),
     }
 
     if self.saved_model_dir:
@@ -864,7 +868,12 @@ def _get_base_converter_args(self):
                     )
                 ],
                 enable_per_channel_quantized_weight=True,
-            )
+                enable_full_int_quantization=True,
+            ),
+            # For ODML use cases, uniform quantized types should be left intact.
+            pipeline_config=qc.PipelineConfig(
+                unpack_quantized_types=False,
+            ),
         )
 
         args["quantization_config"] = quantization_config
diff --git a/tensorflow/lite/python/metrics/BUILD b/tensorflow/lite/python/metrics/BUILD
index 8f6b4d3863df0d..0258eb2dcefd15 100644
--- a/tensorflow/lite/python/metrics/BUILD
+++ b/tensorflow/lite/python/metrics/BUILD
@@ -1,7 +1,8 @@
-# Placeholder: load py_proto_library
+load("//tensorflow:pytype.default.bzl", "pytype_strict_library")
 load("//tensorflow:strict.default.bzl", "py_strict_test")
+
+# Placeholder: load py_proto_library
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable", "if_portable", "pybind_extension")
-load("//tensorflow:pytype.default.bzl", "pytype_strict_library")
 load(
     "//tensorflow/core/platform:build_config.bzl",
     "tf_proto_library",
diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.cc b/tensorflow/lite/python/optimize/calibration_wrapper.cc
index ffccf71a40635e..65f5dfe49d51ca 100644
--- a/tensorflow/lite/python/optimize/calibration_wrapper.cc
+++ b/tensorflow/lite/python/optimize/calibration_wrapper.cc
@@ -114,6 +114,8 @@ inline TensorType TfLiteTypeToSchemaType(TfLiteType type) {
       return TensorType_FLOAT32;
     case kTfLiteFloat16:
       return TensorType_FLOAT16;
+    case kTfLiteBFloat16:
+      return TensorType_BFLOAT16;
     case kTfLiteFloat64:
       return TensorType_FLOAT64;
     case kTfLiteInt32:
diff --git a/tensorflow/lite/python/testdata/BUILD b/tensorflow/lite/python/testdata/BUILD
index 40417f5d6e9421..b339e0c107b32d 100644
--- a/tensorflow/lite/python/testdata/BUILD
+++ b/tensorflow/lite/python/testdata/BUILD
@@ -1,6 +1,4 @@
 load("//tensorflow:strict.default.bzl", "py_strict_library")
-load("//tensorflow/lite:build_def.bzl", "tf_to_tflite", "tflite_copts")
-load("//tensorflow:tensorflow.default.bzl", "pybind_extension", "tf_custom_op_py_strict_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_custom_op_library",
@@ -8,6 +6,8 @@ load(
     "tf_gen_op_wrapper_py",
     "tf_opts_nortti_if_android",
 )
+load("//tensorflow:tensorflow.default.bzl", "pybind_extension", "tf_custom_op_py_strict_library")
+load("//tensorflow/lite:build_def.bzl", "tf_to_tflite", "tflite_copts")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -81,9 +81,7 @@ cc_library(
     name = "test_delegate",
     testonly = 1,
     srcs = ["test_delegate.cc"],
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
     deps = [
         "//tensorflow/lite/core/c:common",
     ],
@@ -160,9 +158,7 @@ cc_library(
     name = "test_registerer",
     srcs = ["test_registerer.cc"],
     hdrs = ["test_registerer.h"],
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
     deps = [
         "//tensorflow/lite:framework",
         "//tensorflow/lite/kernels:builtin_ops",
diff --git a/tensorflow/lite/schema/BUILD b/tensorflow/lite/schema/BUILD
index 246791ec392c45..3195a5016aa118 100644
--- a/tensorflow/lite/schema/BUILD
+++ b/tensorflow/lite/schema/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow:strict.default.bzl", "py_strict_binary", "py_strict_library", "py_strict_test")
 load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")
+load("//tensorflow:strict.default.bzl", "py_strict_binary", "py_strict_library", "py_strict_test")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite", "tflite_schema_utils_friends")
 
diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs
index 382462f938d93b..fe9ee4c11cc5c9 100644
--- a/tensorflow/lite/schema/schema.fbs
+++ b/tensorflow/lite/schema/schema.fbs
@@ -58,6 +58,7 @@ enum TensorType : byte {
   UINT32 = 15,
   UINT16 = 16,
   INT4 = 17,
+  BFLOAT16 = 18,
 }
 
 // Custom quantization parameters for experimenting with new quantization
diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h
index b416555e837c3f..79d78c1fc84341 100755
--- a/tensorflow/lite/schema/schema_generated.h
+++ b/tensorflow/lite/schema/schema_generated.h
@@ -1,4 +1,4 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -700,11 +700,12 @@ enum TensorType : int8_t {
   TensorType_UINT32 = 15,
   TensorType_UINT16 = 16,
   TensorType_INT4 = 17,
+  TensorType_BFLOAT16 = 18,
   TensorType_MIN = TensorType_FLOAT32,
-  TensorType_MAX = TensorType_INT4
+  TensorType_MAX = TensorType_BFLOAT16
 };
 
-inline const TensorType (&EnumValuesTensorType())[18] {
+inline const TensorType (&EnumValuesTensorType())[19] {
   static const TensorType values[] = {
     TensorType_FLOAT32,
     TensorType_FLOAT16,
@@ -723,13 +724,14 @@ inline const TensorType (&EnumValuesTensorType())[18] {
     TensorType_VARIANT,
     TensorType_UINT32,
     TensorType_UINT16,
-    TensorType_INT4
+    TensorType_INT4,
+    TensorType_BFLOAT16
   };
   return values;
 }
 
 inline const char * const *EnumNamesTensorType() {
-  static const char * const names[19] = {
+  static const char * const names[20] = {
     "FLOAT32",
     "FLOAT16",
     "INT32",
@@ -748,13 +750,14 @@ inline const char * const *EnumNamesTensorType() {
     "UINT32",
     "UINT16",
     "INT4",
+    "BFLOAT16",
     nullptr
   };
   return names;
 }
 
 inline const char *EnumNameTensorType(TensorType e) {
-  if (::flatbuffers::IsOutRange(e, TensorType_FLOAT32, TensorType_INT4)) return "";
+  if (::flatbuffers::IsOutRange(e, TensorType_FLOAT32, TensorType_BFLOAT16)) return "";
   const size_t index = static_cast<size_t>(e);
   return EnumNamesTensorType()[index];
 }
diff --git a/tensorflow/lite/swift/BUILD.apple b/tensorflow/lite/swift/BUILD.apple
index 0102871e2bc50e..292c2ce94e4c63 100644
--- a/tensorflow/lite/swift/BUILD.apple
+++ b/tensorflow/lite/swift/BUILD.apple
@@ -1,10 +1,10 @@
 # TensorFlow Lite for Swift
 
-load("//tensorflow/lite:special_rules.bzl", "ios_visibility_allowlist", "tflite_ios_lab_runner")
-load("//tensorflow/lite/ios:ios.bzl", "TFL_DEFAULT_TAGS", "TFL_DISABLED_SANITIZER_TAGS", "TFL_MINIMUM_OS_VERSION")
 load("@build_bazel_rules_apple//apple:apple.bzl", "apple_static_xcframework")
 load("@build_bazel_rules_apple//apple:ios.bzl", "ios_static_framework", "ios_unit_test")
 load("@build_bazel_rules_swift//swift:swift.bzl", "swift_library")
+load("//tensorflow/lite:special_rules.bzl", "ios_visibility_allowlist", "tflite_ios_lab_runner")
+load("//tensorflow/lite/ios:ios.bzl", "TFL_DEFAULT_TAGS", "TFL_DISABLED_SANITIZER_TAGS", "TFL_MINIMUM_OS_VERSION")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD
index 863ca7c6b72170..1ffa19e4e7b65d 100644
--- a/tensorflow/lite/testing/BUILD
+++ b/tensorflow/lite/testing/BUILD
@@ -106,7 +106,7 @@ _test_size_override = {
             "@local_tsl//tsl/platform:env",
             "@local_tsl//tsl/platform:status",
             "@local_tsl//tsl/platform:subprocess",
-            "@local_tsl//tsl/util:command_line_flags",
+            "@local_xla//xla/tsl/util:command_line_flags",
         ],
         "//tensorflow:android": [
             "//tensorflow/core:portable_tensorflow_lib",
diff --git a/tensorflow/lite/testing/generated_examples_zip_test.cc b/tensorflow/lite/testing/generated_examples_zip_test.cc
index 43c0c0851d5e10..35e6737137a6ac 100644
--- a/tensorflow/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/lite/testing/generated_examples_zip_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/strings/match.h"
 #include "re2/re2.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "tensorflow/lite/string_type.h"
 #include "tensorflow/lite/testing/parse_testdata.h"
 #include "tensorflow/lite/testing/tflite_driver.h"
@@ -35,7 +36,6 @@ limitations under the License.
 #include "tsl/platform/env.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/subprocess.h"
-#include "tsl/util/command_line_flags.h"
 
 namespace tflite {
 namespace testing {
diff --git a/tensorflow/lite/toco/BUILD b/tensorflow/lite/toco/BUILD
index c5b5a50cac6e2d..66718eb70c185f 100644
--- a/tensorflow/lite/toco/BUILD
+++ b/tensorflow/lite/toco/BUILD
@@ -1,8 +1,4 @@
 # Placeholder: load py_proto_library
-load(
-    "//tensorflow/core/platform:build_config.bzl",
-    "tf_proto_library",
-)
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_binary",
@@ -10,6 +6,10 @@ load(
     "tf_copts",
 )
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load(
+    "//tensorflow/core/platform:build_config.bzl",
+    "tf_proto_library",
+)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/toco/logging/BUILD b/tensorflow/lite/toco/logging/BUILD
index 83daab2357364a..4f6efd6d03a31c 100644
--- a/tensorflow/lite/toco/logging/BUILD
+++ b/tensorflow/lite/toco/logging/BUILD
@@ -1,13 +1,14 @@
-# Placeholder: load py_proto_library
 load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test")
-load(
-    "//tensorflow/core/platform:build_config.bzl",
-    "tf_proto_library",
-)
+
+# Placeholder: load py_proto_library
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
 )
+load(
+    "//tensorflow/core/platform:build_config.bzl",
+    "tf_proto_library",
+)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/toco/toco_flags.proto b/tensorflow/lite/toco/toco_flags.proto
index e7a31365a65330..1760841a333f6a 100644
--- a/tensorflow/lite/toco/toco_flags.proto
+++ b/tensorflow/lite/toco/toco_flags.proto
@@ -41,7 +41,7 @@ enum FileFormat {
 // of as properties of models, instead describing how models are to be
 // processed in the context of the present tooling job.
 //
-// Next ID to use: 63.
+// Next ID to use: 64.
 message TocoFlags {
   // Input file format
   optional FileFormat input_format = 1;
@@ -356,4 +356,8 @@ message TocoFlags {
   // inconsistent with Conv 1x1 which always performs per channel quantization.
   optional bool disable_per_channel_quantization_for_dense_layers = 62
       [default = false];
+
+  // Enables the attempt to directly lower composites into tflite ops.
+  // WARNING: Experimental interface, subject to change.
+  optional bool enable_composite_direct_lowering = 63 [default = false];
 }
diff --git a/tensorflow/lite/tools/BUILD b/tensorflow/lite/tools/BUILD
index 704021241d6e07..8c60f8ad012bd8 100644
--- a/tensorflow/lite/tools/BUILD
+++ b/tensorflow/lite/tools/BUILD
@@ -1,14 +1,15 @@
-# Placeholder: load py_proto_library
 load("//tensorflow:strict.default.bzl", "py_strict_binary", "py_strict_library", "py_strict_test")
-load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
-load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_copts_warnings")
+
+# Placeholder: load py_proto_library
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
-load("//tensorflow/lite/core/shims:cc_library_with_tflite.bzl", "cc_library_with_tflite")
+load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load(
     "//tensorflow/core/platform:build_config.bzl",
     "tf_proto_library",
 )
-load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_copts_warnings")
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+load("//tensorflow/lite/core/shims:cc_library_with_tflite.bzl", "cc_library_with_tflite")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -408,6 +409,8 @@ cc_library(
     hdrs = ["utils.h"],
     deps = [
         ":logging",
+        "//tensorflow/lite/c:c_api_types",
+        "//tensorflow/lite/c:common",
         "//tensorflow/lite/core/c:common",
         "//tensorflow/lite/kernels:kernel_util",
     ],
diff --git a/tensorflow/lite/tools/benchmark/CMakeLists.txt b/tensorflow/lite/tools/benchmark/CMakeLists.txt
index 6702c1f9d352c6..fc2a1be282f985 100644
--- a/tensorflow/lite/tools/benchmark/CMakeLists.txt
+++ b/tensorflow/lite/tools/benchmark/CMakeLists.txt
@@ -21,7 +21,7 @@ populate_source_vars("${TFLITE_SOURCE_DIR}/tools/benchmark"
   FILTER "(_test|_plus_flex_main|_performance_options.*)\\.cc$"
 )
 list(APPEND TFLITE_BENCHMARK_SRCS
-  ${TSL_SOURCE_DIR}/tsl/util/stats_calculator.cc
+  ${XLA_SOURCE_DIR}/xla/tsl/util/stats_calculator.cc
   ${TFLITE_SOURCE_DIR}/kernels/internal/utils/sparsity_format_converter.cc
   ${TFLITE_SOURCE_DIR}/profiling/memory_info.cc
   ${TFLITE_SOURCE_DIR}/profiling/memory_usage_monitor.cc
diff --git a/tensorflow/lite/tools/benchmark/android/BUILD b/tensorflow/lite/tools/benchmark/android/BUILD
index 1d1482e0aec5bf..4ea40dbe232b65 100644
--- a/tensorflow/lite/tools/benchmark/android/BUILD
+++ b/tensorflow/lite/tools/benchmark/android/BUILD
@@ -1,9 +1,9 @@
 # Description:
 #   BenchmarkModel Android harness for TensorFlow Lite benchmarks.
 
+load("@build_bazel_rules_android//android:rules.bzl", "android_binary")
 load("//tensorflow/lite:build_def.bzl", "tflite_jni_binary")
 load("//tensorflow/lite:special_rules.bzl", "tflite_hexagon_nn_skel_libraries")
-load("@build_bazel_rules_android//android:rules.bzl", "android_binary")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/tools/benchmark/benchmark_model.cc b/tensorflow/lite/tools/benchmark/benchmark_model.cc
index 9fd9aac8508f90..77b20d1b9b1db8 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_model.cc
@@ -73,7 +73,7 @@ BenchmarkParams BenchmarkModel::DefaultParams() {
                   BenchmarkParam::Create<bool>(false));
   params.AddParam("memory_footprint_check_interval_ms",
                   BenchmarkParam::Create<int32_t>(kMemoryCheckIntervalMs));
-  params.AddParam("gpu_invoke_loop_times", BenchmarkParam::Create<int32_t>(-1));
+  params.AddParam("gpu_invoke_loop_times", BenchmarkParam::Create<int32_t>(1));
   return params;
 }
 
@@ -207,10 +207,10 @@ void BenchmarkModel::LogParams() {
   LOG_BENCHMARK_PARAM(int32_t, "memory_footprint_check_interval_ms",
                       "Memory footprint check interval (ms)", verbose);
 #ifdef TFLITE_GPU_ENABLE_INVOKE_LOOP
-  LOG_BENCHMARK_PARAM(
-      int32_t, "gpu_invoke_loop_times",
-      "Number of GPU delegate invoke loop iterations to divide latency by",
-      verbose);
+  LOG_BENCHMARK_PARAM(int32_t, "gpu_invoke_loop_times",
+                      "Number of GPU delegate invoke loop iterations. Latency "
+                      "will be divided by it.",
+                      verbose);
 #endif
 }
 
diff --git a/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc b/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc
index ecce8cb1572046..1645c7ec50fd4c 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc
@@ -149,7 +149,7 @@ BenchmarkParams BenchmarkPerformanceOptions::DefaultParams() {
                   BenchmarkParam::Create<float>(-1.0f));
   params.AddParam("random_shuffle_benchmark_runs",
                   BenchmarkParam::Create<bool>(true));
-  params.AddParam("gpu_invoke_loop_times", BenchmarkParam::Create<int32_t>(-1));
+  params.AddParam("gpu_invoke_loop_times", BenchmarkParam::Create<int32_t>(1));
   return params;
 }
 
@@ -250,7 +250,7 @@ void BenchmarkPerformanceOptions::ResetPerformanceOptions() {
   single_option_run_params_->Set<int32_t>("num_threads", 1);
   single_option_run_params_->Set<bool>("use_gpu", false);
 #ifdef TFLITE_GPU_ENABLE_INVOKE_LOOP
-  single_option_run_params_->Set<int32_t>("gpu_invoke_loop_times", -1);
+  single_option_run_params_->Set<int32_t>("gpu_invoke_loop_times", 1);
   single_option_run_params_->Set<bool>("require_full_delegation", false);
 #endif
 #if defined(__ANDROID__)
diff --git a/tensorflow/lite/tools/benchmark/experimental/c/BUILD b/tensorflow/lite/tools/benchmark/experimental/c/BUILD
index edefa5e6e35158..2a7a93671c265d 100644
--- a/tensorflow/lite/tools/benchmark/experimental/c/BUILD
+++ b/tensorflow/lite/tools/benchmark/experimental/c/BUILD
@@ -31,6 +31,6 @@ cc_library(
     deps = [
         "//tensorflow/lite/core/c:c_api_types",
         "//tensorflow/lite/tools/benchmark:benchmark_tflite_model_lib",
-        "@local_tsl//tsl/util:stats_calculator_portable",
+        "@local_xla//xla/tsl/util:stats_calculator_portable",
     ],
 )
diff --git a/tensorflow/lite/tools/benchmark/experimental/c/benchmark_c_api.cc b/tensorflow/lite/tools/benchmark/experimental/c/benchmark_c_api.cc
index d2cc7f65ceb9e0..2f07561e42c416 100644
--- a/tensorflow/lite/tools/benchmark/experimental/c/benchmark_c_api.cc
+++ b/tensorflow/lite/tools/benchmark/experimental/c/benchmark_c_api.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #include <utility>
 
+#include "xla/tsl/util/stats_calculator.h"
 #include "tensorflow/lite/tools/benchmark/benchmark_tflite_model.h"
-#include "tsl/util/stats_calculator.h"
 
 extern "C" {
 
diff --git a/tensorflow/lite/tools/benchmark/experimental/delegate_performance/android/models/BUILD b/tensorflow/lite/tools/benchmark/experimental/delegate_performance/android/models/BUILD
index 8f944837c3c15c..fd8657c8b9a8fa 100644
--- a/tensorflow/lite/tools/benchmark/experimental/delegate_performance/android/models/BUILD
+++ b/tensorflow/lite/tools/benchmark/experimental/delegate_performance/android/models/BUILD
@@ -1,9 +1,9 @@
 # Description:
 #  Holds model-specific files. The app will bundle the files into assets.
 
-load("//tensorflow/lite/tools/benchmark/experimental/delegate_performance/android:proto.bzl", "proto_data")
-load("//tensorflow/lite/tools/benchmark/experimental/delegate_performance/android:build_defs.bzl", "accuracy_benchmark_extra_models", "latency_benchmark_extra_models")
 load("//tensorflow/lite/experimental/acceleration/mini_benchmark:build_defs.bzl", "validation_model")
+load("//tensorflow/lite/tools/benchmark/experimental/delegate_performance/android:build_defs.bzl", "accuracy_benchmark_extra_models", "latency_benchmark_extra_models")
+load("//tensorflow/lite/tools/benchmark/experimental/delegate_performance/android:proto.bzl", "proto_data")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/tools/benchmark/experimental/delegate_performance/android/src/main/native/BUILD b/tensorflow/lite/tools/benchmark/experimental/delegate_performance/android/src/main/native/BUILD
index 5209526ab6712c..d2d1f807604ab5 100644
--- a/tensorflow/lite/tools/benchmark/experimental/delegate_performance/android/src/main/native/BUILD
+++ b/tensorflow/lite/tools/benchmark/experimental/delegate_performance/android/src/main/native/BUILD
@@ -1,8 +1,8 @@
 # Description:
 #  Holds the native layer of the app.
 
-load("//tensorflow/lite/tools/benchmark/experimental/delegate_performance/android:build_defs.bzl", "accuracy_benchmark_extra_deps", "latency_benchmark_extra_deps")
 load("//tensorflow/lite/core/shims:cc_library_with_tflite.bzl", "cc_library_with_tflite", "jni_binary_with_tflite")
+load("//tensorflow/lite/tools/benchmark/experimental/delegate_performance/android:build_defs.bzl", "accuracy_benchmark_extra_deps", "latency_benchmark_extra_deps")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/tools/benchmark/experimental/firebase/android/BUILD b/tensorflow/lite/tools/benchmark/experimental/firebase/android/BUILD
index a03a18fcbe7ab3..36193f17394955 100644
--- a/tensorflow/lite/tools/benchmark/experimental/firebase/android/BUILD
+++ b/tensorflow/lite/tools/benchmark/experimental/firebase/android/BUILD
@@ -1,9 +1,9 @@
 # Description:
 #   BenchmarkModel Android harness for Firebase Test Lab.
 
+load("@build_bazel_rules_android//android:rules.bzl", "android_binary")
 load("//tensorflow/lite:build_def.bzl", "tflite_jni_binary")
 load("//tensorflow/lite:special_rules.bzl", "tflite_hexagon_nn_skel_libraries")
-load("@build_bazel_rules_android//android:rules.bzl", "android_binary")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/tools/benchmark/experimental/ios/BUILD.apple b/tensorflow/lite/tools/benchmark/experimental/ios/BUILD.apple
index 86289ca2e846c9..fef7eff82bf8f1 100644
--- a/tensorflow/lite/tools/benchmark/experimental/ios/BUILD.apple
+++ b/tensorflow/lite/tools/benchmark/experimental/ios/BUILD.apple
@@ -1,6 +1,6 @@
 load("@bazel_skylib//rules:build_test.bzl", "build_test")
-load("//tensorflow/lite/ios:ios.bzl", "TFL_MINIMUM_OS_VERSION", "strip_common_include_path_prefix")
 load("@build_bazel_rules_apple//apple:ios.bzl", "ios_static_framework")
+load("//tensorflow/lite/ios:ios.bzl", "TFL_MINIMUM_OS_VERSION", "strip_common_include_path_prefix")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
index e252870628271c..bc5a62d3180590 100644
--- a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
+++ b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
@@ -23,7 +23,7 @@ OverridableFetchContent_Declare(
   xnnpack
   GIT_REPOSITORY https://github.com/google/XNNPACK
   # Sync with tensorflow/workspace2.bzl
-  GIT_TAG db84c8c822a70183e10ef6b7402c6730f5d54da5
+  GIT_TAG 65ca94a2d3f39e7ff8e4045852b32b65ca6a16b7
   GIT_PROGRESS TRUE
   PREFIX "${CMAKE_BINARY_DIR}"
   SOURCE_DIR "${CMAKE_BINARY_DIR}/xnnpack"
diff --git a/tensorflow/lite/tools/delegates/default_execution_provider.cc b/tensorflow/lite/tools/delegates/default_execution_provider.cc
index 93dc242a8e2bfd..38e8fa56632784 100644
--- a/tensorflow/lite/tools/delegates/default_execution_provider.cc
+++ b/tensorflow/lite/tools/delegates/default_execution_provider.cc
@@ -81,7 +81,7 @@ std::vector<Flag> DefaultExecutionProvider::CreateFlags(
       CreateFlag<int32_t>(
           "gpu_invoke_loop_times", params,
           "Number of GPU delegate invoke loop iterations. Used only when "
-          "TFLITE_GPU_ENABLE_INVOKE_LOOP is defined. Default is -1."),
+          "TFLITE_GPU_ENABLE_INVOKE_LOOP is defined. Default is 1."),
       CreateFlag<std::string>(
           "delegate_serialize_dir", params,
           "Directory to be used by delegates for serializing any model data. "
diff --git a/tensorflow/lite/tools/evaluation/stages/BUILD b/tensorflow/lite/tools/evaluation/stages/BUILD
index 2ebe3a67bd729e..b2eb3a2ee0cdde 100644
--- a/tensorflow/lite/tools/evaluation/stages/BUILD
+++ b/tensorflow/lite/tools/evaluation/stages/BUILD
@@ -49,7 +49,7 @@ cc_library(
         "//tensorflow/lite/tools/evaluation/proto:preprocessing_steps_cc_proto",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/util:stats_calculator_portable",
+        "@local_xla//xla/tsl/util:stats_calculator_portable",
     ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:portable_jpeg_internal",
@@ -118,7 +118,7 @@ cc_library(
         "//tensorflow/lite/tools/evaluation/proto:evaluation_config_cc_proto",
         "//tensorflow/lite/tools/evaluation/proto:evaluation_stages_cc_proto",
         "@com_google_absl//absl/base:core_headers",
-        "@local_tsl//tsl/util:stats_calculator_portable",
+        "@local_xla//xla/tsl/util:stats_calculator_portable",
     ],
 )
 
@@ -172,7 +172,7 @@ cc_library(
         "//tensorflow/lite/tools/evaluation/proto:evaluation_config_cc_proto",
         "//tensorflow/lite/tools/evaluation/proto:evaluation_stages_cc_proto",
         "@FP16",
-        "@local_tsl//tsl/util:stats_calculator_portable",
+        "@local_xla//xla/tsl/util:stats_calculator_portable",
     ],
 )
 
diff --git a/tensorflow/lite/tools/evaluation/stages/image_preprocessing_stage.h b/tensorflow/lite/tools/evaluation/stages/image_preprocessing_stage.h
index 5de1c1bcf96288..1e7d36e098fb5c 100644
--- a/tensorflow/lite/tools/evaluation/stages/image_preprocessing_stage.h
+++ b/tensorflow/lite/tools/evaluation/stages/image_preprocessing_stage.h
@@ -21,12 +21,12 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "xla/tsl/util/stats_calculator.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/tools/evaluation/evaluation_stage.h"
 #include "tensorflow/lite/tools/evaluation/proto/evaluation_config.pb.h"
 #include "tensorflow/lite/tools/evaluation/proto/evaluation_stages.pb.h"
 #include "tensorflow/lite/tools/evaluation/proto/preprocessing_steps.pb.h"
-#include "tsl/util/stats_calculator.h"
 
 namespace tflite {
 namespace evaluation {
diff --git a/tensorflow/lite/tools/evaluation/stages/inference_profiler_stage.h b/tensorflow/lite/tools/evaluation/stages/inference_profiler_stage.h
index f81a7c3b2a1f3d..5f1bee82d33a35 100644
--- a/tensorflow/lite/tools/evaluation/stages/inference_profiler_stage.h
+++ b/tensorflow/lite/tools/evaluation/stages/inference_profiler_stage.h
@@ -21,11 +21,11 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "xla/tsl/util/stats_calculator.h"
 #include "tensorflow/lite/tools/evaluation/evaluation_delegate_provider.h"
 #include "tensorflow/lite/tools/evaluation/evaluation_stage.h"
 #include "tensorflow/lite/tools/evaluation/proto/evaluation_config.pb.h"
 #include "tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.h"
-#include "tsl/util/stats_calculator.h"
 
 namespace tflite {
 namespace evaluation {
diff --git a/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.h b/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.h
index b9451c2ee17f99..e6f2436f739b12 100644
--- a/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.h
+++ b/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.h
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "xla/tsl/util/stats_calculator.h"
 #include "tensorflow/lite/core/c/common.h"
 #include "tensorflow/lite/core/interpreter.h"
 #include "tensorflow/lite/core/kernels/register.h"
@@ -26,7 +27,6 @@ limitations under the License.
 #include "tensorflow/lite/tools/evaluation/evaluation_delegate_provider.h"
 #include "tensorflow/lite/tools/evaluation/evaluation_stage.h"
 #include "tensorflow/lite/tools/evaluation/proto/evaluation_config.pb.h"
-#include "tsl/util/stats_calculator.h"
 
 namespace tflite {
 namespace evaluation {
diff --git a/tensorflow/lite/tools/evaluation/tasks/ios/BUILD.apple b/tensorflow/lite/tools/evaluation/tasks/ios/BUILD.apple
index 37177ac5316cc8..1db59b4821e75e 100644
--- a/tensorflow/lite/tools/evaluation/tasks/ios/BUILD.apple
+++ b/tensorflow/lite/tools/evaluation/tasks/ios/BUILD.apple
@@ -1,6 +1,6 @@
 load("@bazel_skylib//rules:build_test.bzl", "build_test")
-load("//tensorflow/lite/ios:ios.bzl", "TFL_MINIMUM_OS_VERSION")
 load("@build_bazel_rules_apple//apple:ios.bzl", "ios_static_framework")
+load("//tensorflow/lite/ios:ios.bzl", "TFL_MINIMUM_OS_VERSION")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/tools/optimize/BUILD b/tensorflow/lite/tools/optimize/BUILD
index 89e21a86805331..dbdddcedecf06f 100644
--- a/tensorflow/lite/tools/optimize/BUILD
+++ b/tensorflow/lite/tools/optimize/BUILD
@@ -1,10 +1,10 @@
-load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load(
     "//tensorflow/lite:build_def.bzl",
     "tflite_copts",
 )
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/tools/optimize/calibration/BUILD b/tensorflow/lite/tools/optimize/calibration/BUILD
index f199ba272b1a33..d2ba8553c62e74 100644
--- a/tensorflow/lite/tools/optimize/calibration/BUILD
+++ b/tensorflow/lite/tools/optimize/calibration/BUILD
@@ -1,6 +1,6 @@
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/tools/serialization/enum_mapping.h b/tensorflow/lite/tools/serialization/enum_mapping.h
index 574b1ee3e21cf7..d218b66258581f 100644
--- a/tensorflow/lite/tools/serialization/enum_mapping.h
+++ b/tensorflow/lite/tools/serialization/enum_mapping.h
@@ -64,6 +64,8 @@ inline TensorType TfLiteTypeToSchemaType(TfLiteType type) {
       return TensorType_FLOAT32;
     case kTfLiteFloat16:
       return TensorType_FLOAT16;
+    case kTfLiteBFloat16:
+      return TensorType_BFLOAT16;
     case kTfLiteFloat64:
       return TensorType_FLOAT64;
     case kTfLiteInt32:
diff --git a/tensorflow/lite/tools/signature/BUILD b/tensorflow/lite/tools/signature/BUILD
index 161dcd1554d04b..26b7acb7665d4a 100644
--- a/tensorflow/lite/tools/signature/BUILD
+++ b/tensorflow/lite/tools/signature/BUILD
@@ -1,9 +1,9 @@
 load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test")
+load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable", "pybind_extension", "replace_with_portable_tf_lib_when_required")
 
 # Utilities for signature_defs in TFLite
 load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
-load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable", "pybind_extension", "replace_with_portable_tf_lib_when_required")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/lite/tools/tool_params_test.cc b/tensorflow/lite/tools/tool_params_test.cc
index 248db53b0a4d42..e34c40b3cba143 100644
--- a/tensorflow/lite/tools/tool_params_test.cc
+++ b/tensorflow/lite/tools/tool_params_test.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/lite/tools/tool_params.h"
 
-#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
 namespace tflite {
diff --git a/tensorflow/lite/tools/utils.cc b/tensorflow/lite/tools/utils.cc
index 846f76471f2ce1..12396ed7c3ce05 100644
--- a/tensorflow/lite/tools/utils.cc
+++ b/tensorflow/lite/tools/utils.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include <complex>
 #include <random>
 
+#include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/tools/logging.h"
 
diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc
index b5d8bb151e7145..e6044fc6881990 100644
--- a/tensorflow/lite/tools/versioning/op_version.cc
+++ b/tensorflow/lite/tools/versioning/op_version.cc
@@ -1045,8 +1045,11 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) {
       }
       return 2;
     case BuiltinOperator_CAST:
-      if (op_sig.inputs.at(0).type == kTfLiteInt4 &&
-          op_sig.outputs.at(0).type == kTfLiteFloat32) {
+      if (op_sig.inputs.at(0).type == kTfLiteBFloat16 ||
+          op_sig.outputs.at(0).type == kTfLiteBFloat16) {
+        return 7;
+      } else if (op_sig.inputs.at(0).type == kTfLiteInt4 &&
+                 op_sig.outputs.at(0).type == kTfLiteFloat32) {
         return 6;
       } else if (op_sig.inputs.at(0).type == kTfLiteFloat64 ||
                  op_sig.outputs.at(0).type == kTfLiteFloat64 ||
diff --git a/tensorflow/lite/util.cc b/tensorflow/lite/util.cc
index d0d385a310d732..cecda0e5eb44a1 100644
--- a/tensorflow/lite/util.cc
+++ b/tensorflow/lite/util.cc
@@ -118,6 +118,9 @@ TfLiteStatus GetSizeOfType(TfLiteContext* context, const TfLiteType type,
     case kTfLiteFloat16:
       *bytes = sizeof(TfLiteFloat16);
       break;
+    case kTfLiteBFloat16:
+      *bytes = sizeof(TfLiteBFloat16);
+      break;
     case kTfLiteFloat64:
       *bytes = sizeof(double);
       break;
diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files
index 563714c9810e3a..6cfd11e5d688d8 100644
--- a/tensorflow/opensource_only.files
+++ b/tensorflow/opensource_only.files
@@ -164,7 +164,6 @@ tf_staging/tensorflow/tools/lib_package/libtensorflow_test.sh:
 tf_staging/tensorflow/tools/pip_package/BUILD:
 tf_staging/tensorflow/tools/pip_package/MANIFEST.in:
 tf_staging/tensorflow/tools/pip_package/README:
-tf_staging/tensorflow/tools/pip_package/build_pip_package.sh:
 tf_staging/tensorflow/tools/pip_package/check_load_py_test:.py
 tf_staging/tensorflow/tools/pip_package/pip_smoke_test:.py
 tf_staging/tensorflow/tools/pip_package/setup:.py
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 8b51ed5d808e6a..159e6db204f0cb 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -2,6 +2,10 @@
 # Public targets:
 #  ":platform" - Low-level and platform-specific Python code.
 
+load(
+    "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
+    "if_cuda_is_configured",
+)
 load("//tensorflow:pytype.default.bzl", "pytype_strict_library")
 load("//tensorflow:strict.default.bzl", "py_strict_library")
 
@@ -19,10 +23,6 @@ load(
     "tf_additional_plugin_deps",
     "tf_additional_profiler_deps",
 )
-load(
-    "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
-    "if_cuda_is_configured",
-)
 
 # TODO(mdan): Break into per-directory files.
 
@@ -834,9 +834,9 @@ pywrap_tensorflow_macro(
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/profiler/rpc:profiler_server_impl",
         "@local_tsl//tsl/profiler/rpc/client:profiler_client_impl",
-        "@local_tsl//tsl/python/lib/core:numpy",
         "@local_xla//xla/backends/profiler/cpu:python_tracer",
         "@local_xla//xla/stream_executor:stream_executor_impl",
+        "@local_xla//xla/tsl/python/lib/core:numpy",
     ] + select({
         "//tensorflow/compiler/mlir/python:disable_mlir_config": [],
         "//conditions:default": [
@@ -951,9 +951,9 @@ filegroup(
         "@local_tsl//tsl/profiler/lib:profiler_session_impl",  # profiler
         "@local_tsl//tsl/profiler/rpc:profiler_server_impl",  # profiler
         "@local_tsl//tsl/profiler/rpc/client:profiler_client_impl",
-        "@local_tsl//tsl/python/lib/core:ml_dtypes_lib",  # bfloat16, float8_e4m3fn, float8_e5m2
-        "@local_tsl//tsl/python/lib/core:numpy",  # checkpoint_reader
         "@local_xla//xla/stream_executor",  # stat_summarizer
+        "@local_xla//xla/tsl/python/lib/core:ml_dtypes_lib",  # bfloat16, float8_e4m3fn, float8_e5m2
+        "@local_xla//xla/tsl/python/lib/core:numpy",  # checkpoint_reader
     ] + select({
         "//tensorflow/compiler/mlir/python:disable_mlir_config": [],
         "//conditions:default": [
@@ -1151,7 +1151,7 @@ cc_library(
     name = "unified_api_pywrap_required_headers",
     textual_hdrs = [
         "//tensorflow/python/lib/core:basic_hdrs",
-        "@local_tsl//tsl/python/lib/core:basic_hdrs",
+        "@local_xla//xla/tsl/python/lib/core:basic_hdrs",
         "//tensorflow/c:headers",
         "//tensorflow/c:safe_ptr_hdr",
         "//tensorflow/c/eager:headers",
@@ -1205,7 +1205,7 @@ tf_python_pybind_extension(
         "//tensorflow/python/util:util_hdr",
         "@local_tsl//tsl/distributed_runtime:pywrap_required_hdrs",
         "@local_tsl//tsl/distributed_runtime/coordination:pywrap_required_hdrs",
-        "@local_tsl//tsl/python/lib/core:numpy_hdr",
+        "@local_xla//xla/tsl/python/lib/core:numpy_hdr",
     ],
     dynamic_deps = [":_pywrap_tensorflow_internal.so"] + select({
         "//tensorflow:macos": ["//tensorflow:libtensorflow_framework.%s.dylib" % VERSION],
diff --git a/tensorflow/python/_pywrap_dtensor_device.pyi b/tensorflow/python/_pywrap_dtensor_device.pyi
index bf8f123c893e0a..0362a8c0f59a99 100644
--- a/tensorflow/python/_pywrap_dtensor_device.pyi
+++ b/tensorflow/python/_pywrap_dtensor_device.pyi
@@ -88,13 +88,13 @@ class Mesh:
     def device_location(self, arg0: int) -> list[int]: ...
     def device_type(self) -> str: ...
     def dim_size(self, dim_name: str) -> int: ...
-    def global_device_ids(self): ...
+    def global_device_ids(self) -> Sequence[int]: ...
     def global_devices(self) -> list[str]: ...
     def host_mesh(self) -> Mesh: ...
     def is_remote(self) -> bool: ...
     def is_single_device(self) -> bool: ...
-    def local_device_ids(self): ...
-    def local_devices(self): ...
+    def local_device_ids(self) -> Sequence[int]: ...
+    def local_devices(self) -> Sequence[str]: ...
     def min_global_device_id(self) -> int: ...
     def num_local_devices(self) -> int: ...
     def shape(self) -> list[int]: ...
diff --git a/tensorflow/python/autograph/impl/BUILD b/tensorflow/python/autograph/impl/BUILD
index d3c4b9e07b2bb3..812eb5740f36a1 100644
--- a/tensorflow/python/autograph/impl/BUILD
+++ b/tensorflow/python/autograph/impl/BUILD
@@ -56,9 +56,7 @@ py_strict_library(
 py_strict_library(
     name = "conversion",
     srcs = ["conversion.py"],
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
     deps = [
         "//tensorflow/python/autograph/core:config",
         "//tensorflow/python/autograph/pyct:cache",
diff --git a/tensorflow/python/autograph/operators/BUILD b/tensorflow/python/autograph/operators/BUILD
index 25dd28737fce2e..9f4f87dad148e4 100644
--- a/tensorflow/python/autograph/operators/BUILD
+++ b/tensorflow/python/autograph/operators/BUILD
@@ -30,9 +30,7 @@ py_strict_library(
 py_strict_library(
     name = "exceptions",
     srcs = ["exceptions.py"],
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
     deps = [
         "//tensorflow/python/framework:tensor_util",
         "//tensorflow/python/ops:control_flow_assert",
@@ -59,9 +57,7 @@ py_strict_library(
 py_strict_library(
     name = "logical",
     srcs = ["logical.py"],
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
     deps = [
         "//tensorflow/python/framework:tensor_util",
         "//tensorflow/python/ops:cond",
@@ -94,9 +90,7 @@ py_strict_library(
 py_strict_library(
     name = "conditional_expressions",
     srcs = ["conditional_expressions.py"],
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
     deps = [
         ":control_flow",
         "//tensorflow/python/autograph/utils:tensors",
@@ -139,9 +133,7 @@ py_strict_library(
 py_strict_library(
     name = "slices",
     srcs = ["slices.py"],
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
     deps = [
         "//tensorflow/python/framework:dtypes",
         "//tensorflow/python/framework:tensor_util",
diff --git a/tensorflow/python/build_defs.bzl b/tensorflow/python/build_defs.bzl
index fe17a30249aefb..845da5ef95991b 100644
--- a/tensorflow/python/build_defs.bzl
+++ b/tensorflow/python/build_defs.bzl
@@ -10,9 +10,9 @@
 # consumers of the tf_gen_op_wrapper_py rule would be simplified if we don't
 # hard code the ops/ directory.
 
-load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
-load("//tensorflow:strict.default.bzl", "py_strict_library")
 load("//tensorflow:py.default.bzl", "py_library")
+load("//tensorflow:strict.default.bzl", "py_strict_library")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 
 # This is a private function only intended to be used in this directory, no need to
 # document all its args for public consumption.
diff --git a/tensorflow/python/client/BUILD b/tensorflow/python/client/BUILD
index 96f66e375cd7ff..8adf234819eebe 100644
--- a/tensorflow/python/client/BUILD
+++ b/tensorflow/python/client/BUILD
@@ -45,7 +45,7 @@ tf_python_pybind_extension(
         "//tensorflow/python/lib/core:safe_pyobject_ptr_required_hdrs",
         "@local_tsl//tsl/distributed_runtime:pywrap_required_hdrs",
         "@local_tsl//tsl/distributed_runtime/coordination:pywrap_required_hdrs",
-        "@local_tsl//tsl/python/lib/core:numpy_hdr",
+        "@local_xla//xla/tsl/python/lib/core:numpy_hdr",
     ],
     enable_stub_generation = True,
     pytype_srcs = [
@@ -68,7 +68,7 @@ tf_python_pybind_extension(
         "//third_party/python_runtime:headers",
         "@com_google_absl//absl/types:optional",
         "@eigen_archive//:eigen3",
-        "@local_tsl//tsl/python/lib/core:numpy",
+        "@local_xla//xla/tsl/python/lib/core:numpy",
         "@pybind11",
         "@pybind11_abseil//pybind11_abseil:absl_casters",
         "@pybind11_protobuf//pybind11_protobuf:native_proto_caster",
@@ -133,11 +133,6 @@ py_strict_library(
     name = "client",
     srcs = ["client_lib.py"],
     srcs_version = "PY3",
-    visibility = [
-        "//tensorflow:internal",
-        "//third_party/mlperf:__subpackages__",
-        "//third_party/py/tf_slim:__subpackages__",
-    ],
     deps = [
         ":_pywrap_device_lib",
         "//tensorflow/core:protos_all_py",
@@ -264,7 +259,7 @@ tf_cuda_library(
         "//tensorflow/python/lib/core:safe_pyobject_ptr",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/python/lib/core:numpy",
+        "@local_xla//xla/tsl/python/lib/core:numpy",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index 19e04fd1c5b29a..8a9682307c4ff5 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -18,7 +18,7 @@ limitations under the License.
 
 // Must be included first
 // clang-format off
-#include "tsl/python/lib/core/numpy.h" //NOLINT
+#include "xla/tsl/python/lib/core/numpy.h" //NOLINT
 // clang-format on
 
 #include "tensorflow/c/c_api.h"
diff --git a/tensorflow/python/client/tf_session_wrapper.cc b/tensorflow/python/client/tf_session_wrapper.cc
index 0e7a5efe142b13..7492c91bbdb355 100644
--- a/tensorflow/python/client/tf_session_wrapper.cc
+++ b/tensorflow/python/client/tf_session_wrapper.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/c/safe_ptr.h"
 #include "tensorflow/c/tf_buffer.h"
 #include "tensorflow/c/tf_datatype.h"
+#include "xla/tsl/python/lib/core/numpy.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/framework/full_type.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
@@ -50,7 +51,6 @@ limitations under the License.
 #include "tensorflow/python/lib/core/pybind11_status.h"
 #include "tensorflow/python/lib/core/safe_pyobject_ptr.h"
 #include "tsl/platform/mutex.h"
-#include "tsl/python/lib/core/numpy.h"
 
 namespace pybind11 {
 namespace detail {
diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 5a949daa30884e..75aeecbbc1ab94 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2024, 3, 16)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2024, 4, 2)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 
diff --git a/tensorflow/python/compiler/xla/experimental/BUILD b/tensorflow/python/compiler/xla/experimental/BUILD
index 8cc63502e0869a..c2e2dd9d45af60 100644
--- a/tensorflow/python/compiler/xla/experimental/BUILD
+++ b/tensorflow/python/compiler/xla/experimental/BUILD
@@ -1,4 +1,5 @@
 load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test")
+load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_strict_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -40,3 +41,27 @@ py_strict_test(
         "@absl_py//absl/testing:absltest",
     ],
 )
+
+tpu_py_strict_test(
+    name = "resource_variable_xla_sharding_test",
+    srcs = ["resource_variable_xla_sharding_test.py"],
+    disable_v3_4chips = False,
+    python_version = "PY3",
+    srcs_version = "PY3",
+    tags = ["requires-net:external"],
+    deps = [
+        ":xla_sharding",
+        "//tensorflow/python/distribute/cluster_resolver:tpu_cluster_resolver_py",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/framework:config",
+        "//tensorflow/python/framework:dtypes",
+        "//tensorflow/python/ops:array_ops",
+        "//tensorflow/python/ops:math_ops",
+        "//tensorflow/python/ops:variables",
+        "//tensorflow/python/tpu:device_assignment",
+        "//tensorflow/python/tpu:tpu_py",
+        "//tensorflow/python/training:adagrad",
+    ],
+)
diff --git a/tensorflow/python/compiler/xla/experimental/resource_variable_xla_sharding_test.py b/tensorflow/python/compiler/xla/experimental/resource_variable_xla_sharding_test.py
new file mode 100644
index 00000000000000..cab7c9810063fa
--- /dev/null
+++ b/tensorflow/python/compiler/xla/experimental/resource_variable_xla_sharding_test.py
@@ -0,0 +1,186 @@
+# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from tensorflow.python.compiler.xla.experimental import xla_sharding
+from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import test
+from tensorflow.python.framework import config
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.tpu import device_assignment
+from tensorflow.python.tpu import tpu
+from tensorflow.python.training import adagrad
+
+
+# Gets all the nodes of `op` in graph that have `input_node_name` as one of the
+# inputs
+def _get_op_nodes_with_input(input_node_name, op, graph):
+  nodes_with_input = []
+  for node in graph.node:
+    nodes_with_input += [
+        node
+        for input in node.input
+        if input == input_node_name and node.op == op
+    ]
+  return nodes_with_input
+
+
+# Gets XlaSharding ops connected to ReadVariableOp for the given variable_name
+def _get_xla_sharding_nodes_for_variable(variable_name, graph):
+  read_variable_op_nodes = _get_op_nodes_with_input(
+      variable_name, 'ReadVariableOp', graph
+  )
+  xla_sharding_op_nodes = []
+  for read_variable_op_node in read_variable_op_nodes:
+    xla_sharding_op_nodes += _get_op_nodes_with_input(
+        read_variable_op_node.name, 'XlaSharding', graph
+    )
+  return xla_sharding_op_nodes
+
+
+def _get_xla_sharding_proto_from_node(node):
+  sharding_proto = xla_sharding.xla_data_pb2.OpSharding()
+  sharding_proto.ParseFromString(node.attr['sharding'].s)
+  return sharding_proto
+
+
+class ResourceVariableXlaShardingTest(test.TestCase):
+
+  def setUp(self) -> None:
+    super().setUp()
+
+    context.enable_xla_sharding_for_resource_variables()
+    self.topology = tpu_cluster_resolver.initialize_tpu_system()
+    if len(config.list_logical_devices('TPU')) != 8:
+      self.skipTest('All tests require 8 TPUs.')
+
+    self.da = device_assignment.DeviceAssignment.build(
+        self.topology, computation_shape=[2, 2, 1, 2], num_replicas=1
+    )
+
+  def test_xla_sharding_ops_created_for_optimizer_slot_variables(self):
+    w = variables.Variable(
+        initial_value=math_ops.range(8, dtype=dtypes.float32),
+        name='w',
+    )
+    self.assertIsInstance(w, resource_variable_ops.BaseResourceVariable)
+    w = xla_sharding.split(
+        w,
+        split_dimension=0,
+        num_devices=8,
+    )
+    sharding_proto = xla_sharding.xla_data_pb2.OpSharding()
+    sharding_proto.ParseFromString(xla_sharding.get_tensor_sharding(w))
+    opt = adagrad.AdagradOptimizer(1.0)
+
+    @def_function.function
+    def computation(x):
+      def tpu_fn(x):
+        y = math_ops.add(w, x)
+        loss = math_ops.reduce_sum(y)
+        opt.minimize(loss, None, [w])
+        return loss
+
+      output = tpu.replicate(tpu_fn, [[x]], device_assignment=self.da)
+      return output
+
+    inputs = array_ops.reshape(math_ops.range(16, dtype=dtypes.float32), (2, 8))
+    result = computation(inputs)
+    self.assertSequenceEqual([[176.0]], self.evaluate(result))
+    graph = computation.get_concrete_function(inputs).graph.as_graph_def()
+
+    update_op_nodes = [
+        node for node in graph.node if node.op == 'ResourceApplyAdagrad'
+    ]
+    self.assertLen(update_op_nodes, 1)
+    update_op_node = update_op_nodes[0]
+
+    var_input_name = update_op_node.input[0]
+    var_sharding_nodes = _get_xla_sharding_nodes_for_variable(
+        var_input_name, graph
+    )
+    self.assertLen(var_sharding_nodes, 1)
+    self.assertProtoEquals(
+        _get_xla_sharding_proto_from_node(var_sharding_nodes[0]), sharding_proto
+    )
+
+    slot_var_input_name = update_op_node.input[1]
+    slot_var_sharding_nodes = _get_xla_sharding_nodes_for_variable(
+        slot_var_input_name, graph
+    )
+    self.assertLen(slot_var_sharding_nodes, 1)
+    self.assertProtoEquals(
+        _get_xla_sharding_proto_from_node(slot_var_sharding_nodes[0]),
+        sharding_proto,
+    )
+
+  def test_disabling_xla_sharding_ops_temporarily(self):
+    w = variables.Variable(
+        initial_value=math_ops.range(8, dtype=dtypes.float32),
+        name='w',
+    )
+    self.assertIsInstance(w, resource_variable_ops.BaseResourceVariable)
+
+    context.enable_xla_sharding_for_resource_variables()
+    with context.temporarily_disable_xla_sharding_for_resource_variables():
+      with self.assertRaisesRegex(
+          AttributeError,
+          '.*Tensor.op is undefined when eager execution is enabled.*',
+      ):
+        xla_sharding.split(
+            w,
+            split_dimension=0,
+            num_devices=8,
+        )
+
+    # xla_sharding_for_resource_variables is enabled again. Following line
+    # doesn't throw an error.
+    xla_sharding.split(
+        w,
+        split_dimension=0,
+        num_devices=8,
+    )
+
+    context.disable_xla_sharding_for_resource_variables()
+    with context.temporarily_disable_xla_sharding_for_resource_variables():
+      with self.assertRaisesRegex(
+          AttributeError,
+          '.*Tensor.op is undefined when eager execution is enabled.*',
+      ):
+        xla_sharding.split(
+            w,
+            split_dimension=0,
+            num_devices=8,
+        )
+
+    # xla_sharding_for_resource_variables stays disabled.
+    with self.assertRaisesRegex(
+        AttributeError,
+        '.*Tensor.op is undefined when eager execution is enabled.*',
+    ):
+      xla_sharding.split(
+          w,
+          split_dimension=0,
+          num_devices=8,
+      )
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD
index 94ee7f74a5edbb..5cf6657ea5d292 100644
--- a/tensorflow/python/data/experimental/kernel_tests/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/BUILD
@@ -65,6 +65,7 @@ tf_py_strict_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data/experimental/ops:cardinality",
         "//tensorflow/python/data/experimental/ops:distribute",
+        "//tensorflow/python/data/experimental/ops:global_shuffle_op",
         "//tensorflow/python/data/experimental/ops:interleave_ops",
         "//tensorflow/python/data/experimental/ops:readers",
         "//tensorflow/python/data/experimental/ops:testing",
@@ -175,8 +176,10 @@ tf_py_strict_test(
     name = "from_list_test",
     size = "small",
     srcs = ["from_list_test.py"],
+    shard_count = 4,
     deps = [
         "//tensorflow/python/data/experimental/ops:from_list",
+        "//tensorflow/python/data/experimental/ops:global_shuffle_op",
         "//tensorflow/python/data/experimental/ops:random_access",
         "//tensorflow/python/data/kernel_tests:checkpoint_test_base",
         "//tensorflow/python/data/kernel_tests:test_base",
diff --git a/tensorflow/python/data/experimental/kernel_tests/assert_cardinality_test.py b/tensorflow/python/data/experimental/kernel_tests/assert_cardinality_test.py
index fae498f0869906..46dbc5fb6a22c2 100644
--- a/tensorflow/python/data/experimental/kernel_tests/assert_cardinality_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/assert_cardinality_test.py
@@ -97,8 +97,7 @@ def testIncorrectCardinality(self, num_elements, asserted_cardinality,
               asserted_cardinality=100,
               expected_error=errors.FailedPreconditionError,
               expected_error_message=(
-                  "Input dataset was expected to contain 100 elements but "
-                  "contained only 10 elements.")) +
+                  "Input dataset was expected to contain 100 elements.")) +
           combinations.combine(
               num_elements=10,
               asserted_cardinality=cardinality.INFINITE,
diff --git a/tensorflow/python/data/experimental/kernel_tests/auto_shard_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/auto_shard_dataset_test.py
index e4f33b1ed1571a..c84b889267ff66 100644
--- a/tensorflow/python/data/experimental/kernel_tests/auto_shard_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/auto_shard_dataset_test.py
@@ -14,6 +14,7 @@
 # ==============================================================================
 """Tests for the private `_AutoShardDataset` transformation."""
 import os
+from typing import Optional
 
 from absl.testing import parameterized
 
@@ -21,6 +22,7 @@
 from tensorflow.core.example import feature_pb2
 from tensorflow.python.data.experimental.ops import cardinality
 from tensorflow.python.data.experimental.ops import distribute
+from tensorflow.python.data.experimental.ops import global_shuffle_op
 from tensorflow.python.data.experimental.ops import interleave_ops
 from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.experimental.ops import testing
@@ -713,5 +715,48 @@ def build_dataset():
     verify_fn(self, build_dataset, num_outputs=20)
 
 
+class AutoShardGlobalShuffleTest(
+    test_base.DatasetTestBase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          test_base.default_test_combinations(),
+          combinations.combine(
+              dataset_range=[100],
+              num_shards=[1, 3, 5],
+              shard_index=[0, 1, 2, 4],
+              seed=[None, 42],
+              reshuffle_each_iteration=[True, False])))
+  def test(
+      self,
+      dataset_range: int,
+      num_shards: int,
+      shard_index: int,
+      seed: Optional[int],
+      reshuffle_each_iteration: bool):
+    if shard_index >= num_shards:
+      return
+
+    dataset = dataset_ops.Dataset.range(dataset_range)
+    dataset = distribute._AutoShardDataset(dataset, num_shards, shard_index)
+    dataset = global_shuffle_op._global_shuffle(
+        dataset, seed=seed, reshuffle_each_iteration=reshuffle_each_iteration)
+
+    expected = list(range(shard_index, dataset_range, num_shards))
+    dataset_output = self.getDatasetOutput(
+        dataset, requires_initialization=True)
+    self.assertCountEqual(dataset_output, expected)
+    self.assertNotEqual(dataset_output, expected)
+
+  @combinations.generate(test_base.default_test_combinations())
+  def testNotSufficientInput(self):
+    dataset = dataset_ops.Dataset.range(1)
+    dataset = distribute._AutoShardDataset(dataset, 5, 4)
+
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = global_shuffle_op._global_shuffle(dataset)
+      self.getDatasetOutput(dataset, requires_initialization=True)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/from_list_test.py b/tensorflow/python/data/experimental/kernel_tests/from_list_test.py
index 7249aa596caaad..cf07d43195314d 100644
--- a/tensorflow/python/data/experimental/kernel_tests/from_list_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/from_list_test.py
@@ -13,11 +13,15 @@
 # limitations under the License.
 # ==============================================================================
 """Tests for `tf.data.experimental.from_list()."""
+
 import collections
+from typing import Callable, Optional
+
 from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.data.experimental.ops import from_list
+from tensorflow.python.data.experimental.ops import global_shuffle_op
 from tensorflow.python.data.experimental.ops import random_access
 from tensorflow.python.data.kernel_tests import checkpoint_test_base
 from tensorflow.python.data.kernel_tests import test_base
@@ -226,5 +230,74 @@ def testDict(self, verify_fn):
         self, lambda: self._build_list_dataset(dict_elements), num_outputs=3)
 
 
+class FromListGlobalShuffleTest(
+    test_base.DatasetTestBase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          test_base.default_test_combinations(),
+          combinations.combine(
+              dataset_range=[100],
+              repetitions=[1, 2],
+              seed=[None, 42],
+              reshuffle_each_iteration=[True, False])))
+  def test(
+      self,
+      dataset_range: int,
+      repetitions: int,
+      seed: Optional[int],
+      reshuffle_each_iteration: bool):
+    dataset = from_list.from_list(list(range(dataset_range)))
+    dataset = dataset.prefetch(buffer_size=dataset_ops.AUTOTUNE)
+    if repetitions > 1:
+      dataset = dataset.repeat(repetitions)
+    dataset = global_shuffle_op._global_shuffle(
+        dataset, seed=seed, reshuffle_each_iteration=reshuffle_each_iteration)
+
+    expected = list(range(dataset_range)) * repetitions
+    dataset_output = self.getDatasetOutput(
+        dataset, requires_initialization=True)
+    self.assertCountEqual(dataset_output, expected)
+    self.assertNotEqual(dataset_output, expected)
+    self.assertLen(dataset_output, self.evaluate(dataset.cardinality()))
+
+
+class FromListGlobalShuffleCheckpointTest(
+    checkpoint_test_base.CheckpointTestBase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          test_base.default_test_combinations(),
+          checkpoint_test_base.default_test_combinations(),
+          combinations.combine(
+              dataset_range=[10],
+              repetitions=[1, 2],
+              reshuffle_each_iteration=[True, False],
+              symbolic_checkpoint=[True, False])))
+  def test(
+      self,
+      verify_fn: Callable[..., None],
+      dataset_range: int,
+      repetitions: int,
+      reshuffle_each_iteration: bool,
+      symbolic_checkpoint: bool):
+    def _build_dataset() -> dataset_ops.Dataset:
+      dataset = from_list.from_list(list(range(dataset_range)))
+      dataset = dataset.prefetch(buffer_size=dataset_ops.AUTOTUNE)
+      if repetitions > 1:
+        dataset = dataset.repeat(repetitions)
+      dataset = global_shuffle_op._global_shuffle(
+          dataset, seed=42, reshuffle_each_iteration=reshuffle_each_iteration)
+      options = options_lib.Options()
+      options.experimental_symbolic_checkpoint = symbolic_checkpoint
+      return dataset.with_options(options)
+
+    verify_fn(
+        self,
+        _build_dataset,
+        num_outputs=dataset_range * repetitions,
+        assert_items_equal=reshuffle_each_iteration)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/ops/BUILD b/tensorflow/python/data/experimental/ops/BUILD
index 696d943e50dcc9..3151856ff5831d 100644
--- a/tensorflow/python/data/experimental/ops/BUILD
+++ b/tensorflow/python/data/experimental/ops/BUILD
@@ -99,6 +99,8 @@ py_strict_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/framework:ops",
         "//tensorflow/python/ops:experimental_dataset_ops_gen",
         "//tensorflow/python/saved_model:nested_structure_coder",
     ],
@@ -179,6 +181,7 @@ py_strict_library(
     srcs_version = "PY3",
     deps = [
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:options",
         "//tensorflow/python/data/util:random_seed",
         "//tensorflow/python/framework:tensor",
         "//tensorflow/python/ops:dataset_ops_gen",
diff --git a/tensorflow/python/data/experimental/ops/distributed_save_op.py b/tensorflow/python/data/experimental/ops/distributed_save_op.py
index 3e986a72681309..bd2b547a355bbc 100644
--- a/tensorflow/python/data/experimental/ops/distributed_save_op.py
+++ b/tensorflow/python/data/experimental/ops/distributed_save_op.py
@@ -14,32 +14,42 @@
 # ==============================================================================
 """Distributed saving of a dataset to disk."""
 
+from typing import Optional
+
 from tensorflow.core.protobuf import snapshot_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_experimental_dataset_ops
 # TODO(b/238903802): Use TypeSpec serialization methods directly.
 from tensorflow.python.saved_model import nested_structure_coder
 
 
 # TODO(b/250921378): Add example to docstring and export to TF API.
-def distributed_save(dataset, path, dispatcher_address, compression="AUTO"):
+def distributed_save(
+    dataset: dataset_ops.Dataset,
+    path: str,
+    dispatcher_address: str,
+    compression: str = "AUTO",
+) -> Optional[ops.OperationType]:
   """Initiates the process of distributedly saving a dataset to disk.
 
   Args:
     dataset: The `tf.data.Dataset` to save.
-    path: A string indicating the filepath of the directory to which to save
-      `dataset`.
-    dispatcher_address: A string indicating the address of the dispatcher for
-      the tf.data service instance used to save `dataset`.
-    compression: (Optional.) A string indicating whether and how to compress the
-      `dataset` materialization.  If `"AUTO"`, the tf.data runtime decides which
-      algorithm to use.  If `"GZIP"` or `"SNAPPY"`, that specific algorithm is
-      used.  If `None`, the `dataset` materialization is not compressed.
+    path: The directory path to save the dataset. Requires that the directory
+      do not exist and will create the directory.
+    dispatcher_address: The address of the tf.data service dispatcher used to
+      save `dataset`.
+    compression: (Optional.) Whether and how to compress the `dataset` snapshot.
+      If `"AUTO"`, the tf.data runtime decides which algorithm to use. If
+      `"GZIP"` or `"SNAPPY"`, that specific algorithm is used.  If `None`, the
+      `dataset` snapshot is not compressed.
 
   Returns:
     An operation which when executed performs the distributed save.
 
   Raises:
     ValueError: If `dispatcher_address` is invalid.
+    tf.errors.AlreadyExistsError: If the snapshot already exists.
   """
   if not isinstance(dispatcher_address, str):
     raise ValueError("`dispatcher_address` must be a string, but is a "
@@ -50,12 +60,10 @@ def distributed_save(dataset, path, dispatcher_address, compression="AUTO"):
   metadata = snapshot_pb2.DistributedSnapshotMetadata(
       element_spec=nested_structure_coder.encode_structure(
           dataset.element_spec).SerializeToString(),
-      compression=compression,
-  )
+      compression=compression)
 
   return gen_experimental_dataset_ops.distributed_save(
       dataset._variant_tensor,  # pylint: disable=protected-access
       directory=path,
       address=dispatcher_address,
-      metadata=metadata.SerializeToString(),
-  )
+      metadata=metadata.SerializeToString())
diff --git a/tensorflow/python/data/experimental/ops/global_shuffle_op.py b/tensorflow/python/data/experimental/ops/global_shuffle_op.py
index 43714f43088911..6527c4980e326e 100644
--- a/tensorflow/python/data/experimental/ops/global_shuffle_op.py
+++ b/tensorflow/python/data/experimental/ops/global_shuffle_op.py
@@ -17,6 +17,7 @@
 from typing import Optional, Union
 
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import options as options_lib
 from tensorflow.python.data.util import random_seed
 from tensorflow.python.framework import tensor
 from tensorflow.python.ops import gen_dataset_ops
@@ -55,8 +56,9 @@ def _global_shuffle(  # pylint: disable=unused-private-name
     A new `Dataset` where elements are produced in a globally shuffled order.
 
   Raises:
-    InvalidArgumentError if the input dataset does not support random access, or
-    it has infinite or unknown cardinality.
+    - InvalidArgumentError if the input dataset does not support random access,
+      or it has infinite or unknown cardinality.
+    - FailedPreconditionError for batching with `drop_remainder=False`.
   """
   return _GlobalShuffleDataset(
       input_dataset,
@@ -74,6 +76,14 @@ def __init__(
       seed: Optional[Union[int, tensor.Tensor]] = None,
       reshuffle_each_iteration: bool = True,
       name: Optional[str] = None):
+
+    options = options_lib.Options()
+    # Currently, prefetching threads cannot access the runtime context required
+    # for global shuffling when `warm_start` is enabled. Supporting it will be
+    # future work.
+    options.experimental_warm_start = False
+    input_dataset = input_dataset.with_options(options)
+
     self._input_dataset = input_dataset
     self._seed, self._seed2 = random_seed.get_seed(seed)
     self._reshuffle_each_iteration = reshuffle_each_iteration
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 3b63533996613b..bfdb79b802c64b 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -681,11 +681,16 @@ tf_py_strict_test(
 
 tf_py_strict_test(
     name = "list_files_test",
-    size = "small",
+    size = "medium",
     srcs = ["list_files_test.py"],
+    shard_count = 4,
     deps = [
+        ":checkpoint_test_base",
         ":test_base",
+        "//tensorflow/python/data/experimental/ops:global_shuffle_op",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:options",
+        "//tensorflow/python/data/ops:test_mode",
         "//tensorflow/python/framework:combinations",
         "//tensorflow/python/framework:errors",
         "//tensorflow/python/platform:client_testlib",
@@ -1173,12 +1178,13 @@ tf_py_strict_test(
 
 tf_py_strict_test(
     name = "skip_test",
-    size = "small",
+    size = "medium",
     srcs = ["skip_test.py"],
-    shard_count = 4,
+    shard_count = 8,
     deps = [
         ":checkpoint_test_base",
         ":test_base",
+        "//tensorflow/python/data/experimental/ops:global_shuffle_op",
         "//tensorflow/python/data/experimental/ops:random_access",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:options",
diff --git a/tensorflow/python/data/kernel_tests/batch_test.py b/tensorflow/python/data/kernel_tests/batch_test.py
index d1fc8ddcfc5b49..d8fe368fb19b77 100644
--- a/tensorflow/python/data/kernel_tests/batch_test.py
+++ b/tensorflow/python/data/kernel_tests/batch_test.py
@@ -15,7 +15,6 @@
 # ==============================================================================
 """Tests for `tf.data.Dataset.batch()`."""
 
-import math
 import time
 from typing import Callable, Optional
 
@@ -453,19 +452,16 @@ class BatchGlobalShuffleTest(test_base.DatasetTestBase, parameterized.TestCase):
           test_base.default_test_combinations(),
           combinations.combine(
               dataset_range=[100],
-              batch_size=[2, 7],
-              drop_remainder=[True, False])))
+              batch_size=[2, 7])))
   def testBatch(
-      self, dataset_range: int, batch_size: int, drop_remainder: bool):
+      self, dataset_range: int, batch_size: int):
     dataset = dataset_ops.Dataset.range(dataset_range)
-    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
+    dataset = dataset.batch(batch_size, drop_remainder=True)
     dataset = dataset.prefetch(buffer_size=dataset_ops.AUTOTUNE)
     dataset = global_shuffle_op._global_shuffle(dataset)
-    dataset = dataset.map(lambda x: x[0])
+    dataset = dataset.unbatch()
 
-    expected = list(range(0, dataset_range, batch_size))
-    if drop_remainder:
-      expected = expected[: (dataset_range // batch_size)]
+    expected = list(range(0, (dataset_range // batch_size) * batch_size))
     dataset_output = self.getDatasetOutput(
         dataset, requires_initialization=True)
     self.assertCountEqual(dataset_output, expected)
@@ -477,27 +473,23 @@ def testBatch(
           combinations.combine(
               dataset_range=[100],
               batch_size=[2, 7],
-              drop_remainder=[True, False],
               reshuffle=[True, False],
               seed=[None, 42])))
   def testReshuffleRepeatEpochs(
       self,
       dataset_range: int,
       batch_size: int,
-      drop_remainder: bool,
       reshuffle: bool,
       seed: Optional[int]):
     dataset = dataset_ops.Dataset.range(dataset_range)
-    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
+    dataset = dataset.batch(batch_size, drop_remainder=True)
     dataset = dataset.prefetch(buffer_size=dataset_ops.AUTOTUNE)
     dataset = global_shuffle_op._global_shuffle(
         dataset, seed=seed, reshuffle_each_iteration=reshuffle)
-    dataset = dataset.map(lambda x: x[0])
     dataset = dataset.repeat(2)
+    dataset = dataset.unbatch()
 
-    expected = list(range(0, dataset_range, batch_size))
-    if drop_remainder:
-      expected = expected[: (dataset_range // batch_size)]
+    expected = list(range(0, (dataset_range // batch_size) * batch_size))
     len_per_iteration = len(expected)
     expected *= 2
 
@@ -511,6 +503,24 @@ def testReshuffleRepeatEpochs(
     else:
       self.assertEqual(output_per_iteration[0], output_per_iteration[1])
 
+  @combinations.generate(
+      combinations.times(
+          test_base.default_test_combinations(),
+          combinations.combine(
+              dataset_range=[100],
+              batch_size=[2, 7])))
+  def testNoDropRemainder(
+      self, dataset_range: int, batch_size: int):
+    dataset = dataset_ops.Dataset.range(dataset_range)
+    dataset = dataset.batch(batch_size, drop_remainder=False)
+    dataset = dataset.prefetch(buffer_size=dataset_ops.AUTOTUNE)
+
+    with self.assertRaisesRegex(
+        errors.FailedPreconditionError,
+        "does not support global shuffling with `drop_remainder=False`."):
+      dataset = global_shuffle_op._global_shuffle(dataset)
+      self.getDatasetOutput(dataset, requires_initialization=True)
+
 
 class BatchGlobalShuffleCheckpointTest(checkpoint_test_base.CheckpointTestBase,
                                        parameterized.TestCase):
@@ -522,22 +532,20 @@ class BatchGlobalShuffleCheckpointTest(checkpoint_test_base.CheckpointTestBase,
           combinations.combine(
               dataset_range=[10],
               batch_size=[2, 3],
-              drop_remainder=[True, False],
               symbolic_checkpoint=[True, False])))
   def testBatch(
       self,
       verify_fn: Callable[..., None],
       dataset_range: int,
       batch_size: int,
-      drop_remainder: bool,
       symbolic_checkpoint: bool):
 
     def _build_dataset() -> dataset_ops.Dataset:
       dataset = dataset_ops.Dataset.range(dataset_range)
-      dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
+      dataset = dataset.batch(batch_size, drop_remainder=True)
       dataset = dataset.prefetch(buffer_size=dataset_ops.AUTOTUNE)
       dataset = global_shuffle_op._global_shuffle(dataset, seed=42)
-      dataset = dataset.map(lambda x: x[0])
+      dataset = dataset.unbatch()
       options = options_lib.Options()
       options.experimental_symbolic_checkpoint = symbolic_checkpoint
       return dataset.with_options(options)
@@ -545,10 +553,7 @@ def _build_dataset() -> dataset_ops.Dataset:
     verify_fn(
         self,
         _build_dataset,
-        num_outputs=(
-            dataset_range // batch_size
-            if drop_remainder
-            else math.ceil(dataset_range / batch_size)),
+        num_outputs=(dataset_range // batch_size) * batch_size,
         assert_items_equal=True)
 
   # Creating multiple iterators with the same seed is only supported in v2 API.
@@ -559,7 +564,6 @@ def _build_dataset() -> dataset_ops.Dataset:
           combinations.combine(
               dataset_range=[10],
               batch_size=[2, 3],
-              drop_remainder=[True, False],
               reshuffle_each_iteration=[True, False],
               symbolic_checkpoint=[True, False])))
   def testReshuffleEachIteration(
@@ -567,17 +571,16 @@ def testReshuffleEachIteration(
       verify_fn: Callable[..., None],
       dataset_range: int,
       batch_size: int,
-      drop_remainder: bool,
       reshuffle_each_iteration: bool,
       symbolic_checkpoint: bool):
 
     def _build_dataset() -> dataset_ops.Dataset:
       dataset = dataset_ops.Dataset.range(dataset_range)
-      dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
+      dataset = dataset.batch(batch_size, drop_remainder=True)
       dataset = dataset.prefetch(buffer_size=dataset_ops.AUTOTUNE)
       dataset = global_shuffle_op._global_shuffle(
           dataset, seed=42, reshuffle_each_iteration=reshuffle_each_iteration)
-      dataset = dataset.map(lambda x: x[0])
+      dataset = dataset.unbatch()
       options = options_lib.Options()
       options.experimental_symbolic_checkpoint = symbolic_checkpoint
       return dataset.with_options(options)
@@ -585,10 +588,7 @@ def _build_dataset() -> dataset_ops.Dataset:
     verify_fn(
         self,
         _build_dataset,
-        num_outputs=(
-            dataset_range // batch_size
-            if drop_remainder
-            else math.ceil(dataset_range / batch_size)),
+        num_outputs=(dataset_range // batch_size) * batch_size,
         assert_items_equal=reshuffle_each_iteration)
 
 
diff --git a/tensorflow/python/data/kernel_tests/list_files_test.py b/tensorflow/python/data/kernel_tests/list_files_test.py
index 47f4223505a1b3..bad5e73bccb664 100644
--- a/tensorflow/python/data/kernel_tests/list_files_test.py
+++ b/tensorflow/python/data/kernel_tests/list_files_test.py
@@ -17,11 +17,16 @@
 from os import path
 import shutil
 import tempfile
+from typing import Callable, Optional
 
 from absl.testing import parameterized
 
+from tensorflow.python.data.experimental.ops import global_shuffle_op
+from tensorflow.python.data.kernel_tests import checkpoint_test_base
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import options as options_lib
+from tensorflow.python.data.ops import test_mode
 from tensorflow.python.framework import combinations
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
@@ -230,5 +235,98 @@ def testMultiplePatternsAsTensor(self):
         assert_items_equal=True)
 
 
+class ListFilesGlobalShuffleTest(ListFilesTest, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          test_base.default_test_combinations(),
+          combinations.combine(
+              repetitions=[1, 2],
+              seed=[None, 42],
+              reshuffle_each_iteration=[True, False])))
+  def test(
+      self,
+      repetitions: int,
+      seed: Optional[int],
+      reshuffle_each_iteration: bool):
+    filenames = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']
+    self._touchTempFiles(filenames)
+    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'),
+                                             shuffle=False)
+    dataset = dataset.prefetch(buffer_size=dataset_ops.AUTOTUNE)
+    if repetitions > 1:
+      dataset = dataset.repeat(repetitions)
+    dataset = global_shuffle_op._global_shuffle(
+        dataset, seed=seed, reshuffle_each_iteration=reshuffle_each_iteration)
+
+    expected = [
+        compat.as_bytes(path.join(self.tmp_dir, filename))
+        for filename in filenames
+    ] * repetitions
+    dataset_output = self.getDatasetOutput(
+        dataset, requires_initialization=True)
+    self.assertCountEqual(dataset_output, expected)
+    self.assertNotEqual(dataset_output, expected)
+    self.assertLen(dataset_output, self.evaluate(dataset.cardinality()))
+
+  @combinations.generate(test_base.default_test_combinations())
+  def testShuffleNotSupported(self):
+    filenames = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']
+    self._touchTempFiles(filenames)
+    dataset = dataset_ops.Dataset.list_files(
+        path.join(self.tmp_dir, '*'), shuffle=True)
+    with self.assertRaises(errors.FailedPreconditionError):
+      dataset = global_shuffle_op._global_shuffle(dataset)
+      self.getDatasetOutput(dataset, requires_initialization=True)
+
+
+class ListFilesGlobalShuffleCheckpointTest(
+    ListFilesTest,
+    checkpoint_test_base.CheckpointTestBase,
+    parameterized.TestCase):
+
+  def setUp(self):
+    super().setUp()
+    # Bypasses the default value for `warm_start`, which is not supported for
+    # global shuffling:
+    # https://github.com/tensorflow/tensorflow/blob/29561af231863afb3b6b89e3aa8a6a550c2b7bb0/tensorflow/python/data/ops/options.py#L633
+    test_mode.toggle_test_mode(False)
+
+  @combinations.generate(
+      combinations.times(
+          test_base.default_test_combinations(),
+          checkpoint_test_base.default_test_combinations(),
+          combinations.combine(
+              repetitions=[1, 2],
+              reshuffle_each_iteration=[True, False],
+              symbolic_checkpoint=[True, False])))
+  def test(
+      self,
+      verify_fn: Callable[..., None],
+      repetitions: int,
+      reshuffle_each_iteration: bool,
+      symbolic_checkpoint: bool):
+    filenames = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']
+    self._touchTempFiles(filenames)
+
+    def _build_dataset() -> dataset_ops.Dataset:
+      dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'),
+                                               shuffle=False)
+      dataset = dataset.prefetch(buffer_size=dataset_ops.AUTOTUNE)
+      if repetitions > 1:
+        dataset = dataset.repeat(repetitions)
+      dataset = global_shuffle_op._global_shuffle(
+          dataset, seed=42, reshuffle_each_iteration=reshuffle_each_iteration)
+      options = options_lib.Options()
+      options.experimental_symbolic_checkpoint = symbolic_checkpoint
+      return dataset.with_options(options)
+
+    verify_fn(
+        self,
+        _build_dataset,
+        num_outputs=len(filenames) * repetitions,
+        assert_items_equal=reshuffle_each_iteration)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/map_test.py b/tensorflow/python/data/kernel_tests/map_test.py
index 9256df3a5224a6..2d00e6bcaf7aa9 100644
--- a/tensorflow/python/data/kernel_tests/map_test.py
+++ b/tensorflow/python/data/kernel_tests/map_test.py
@@ -1887,7 +1887,6 @@ def _build_dataset() -> dataset_ops.Dataset:
           dataset, seed=42, reshuffle_each_iteration=reshuffle_each_iteration)
       options = options_lib.Options()
       options.experimental_optimization.apply_default_optimizations = False
-      options.experimental_warm_start = False
       options.experimental_symbolic_checkpoint = symbolic_checkpoint
       return dataset.with_options(options)
 
diff --git a/tensorflow/python/data/kernel_tests/shard_test.py b/tensorflow/python/data/kernel_tests/shard_test.py
index 1405c30d74e4f7..b9ca03cf840220 100644
--- a/tensorflow/python/data/kernel_tests/shard_test.py
+++ b/tensorflow/python/data/kernel_tests/shard_test.py
@@ -193,9 +193,9 @@ class ShardGlobalShuffleTest(
           test_base.default_test_combinations(),
           combinations.combine(
               dataset_range=[100],
-              num_shards=[1, 5, 13],
-              shard_index=[0, 1, 4],
-              seed=[None, 19],
+              num_shards=[1, 3, 5],
+              shard_index=[0, 1, 2, 4],
+              seed=[None, 42],
               reshuffle_each_iteration=[True, False])))
   def testShard(
       self,
@@ -228,17 +228,26 @@ class ShardGlobalShuffleCheckpointTest(
           test_base.default_test_combinations(),
           checkpoint_test_base.default_test_combinations(),
           combinations.combine(
+              dataset_range=[10],
+              num_shards=[1, 3],
+              shard_index=[0, 1, 2],
               reshuffle_each_iteration=[True, False],
               symbolic_checkpoint=[True, False])))
   def testShard(
       self,
       verify_fn: Callable[..., None],
+      dataset_range: int,
+      num_shards: int,
+      shard_index: int,
       reshuffle_each_iteration: bool,
       symbolic_checkpoint: bool):
 
+    if shard_index >= num_shards:
+      return
+
     def _build_dataset() -> dataset_ops.Dataset:
-      dataset = dataset_ops.Dataset.range(10)
-      dataset = dataset.shard(3, 0)
+      dataset = dataset_ops.Dataset.range(dataset_range)
+      dataset = dataset.shard(num_shards, shard_index)
       dataset = dataset.prefetch(buffer_size=dataset_ops.AUTOTUNE)
       dataset = global_shuffle_op._global_shuffle(
           dataset, seed=42, reshuffle_each_iteration=reshuffle_each_iteration)
@@ -249,7 +258,7 @@ def _build_dataset() -> dataset_ops.Dataset:
     verify_fn(
         self,
         _build_dataset,
-        num_outputs=4,
+        num_outputs=len(range(shard_index, dataset_range, num_shards)),
         assert_items_equal=reshuffle_each_iteration)
 
 
diff --git a/tensorflow/python/data/kernel_tests/skip_test.py b/tensorflow/python/data/kernel_tests/skip_test.py
index d117ced2b12222..bba8d1e30ca68d 100644
--- a/tensorflow/python/data/kernel_tests/skip_test.py
+++ b/tensorflow/python/data/kernel_tests/skip_test.py
@@ -13,9 +13,13 @@
 # limitations under the License.
 # ==============================================================================
 """Tests for `tf.data.Dataset.skip()`."""
+
+from typing import Callable, Optional
+
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python.data.experimental.ops import global_shuffle_op
 from tensorflow.python.data.experimental.ops import random_access
 from tensorflow.python.data.kernel_tests import checkpoint_test_base
 from tensorflow.python.data.kernel_tests import test_base
@@ -124,5 +128,89 @@ def testMultipleCombinations(self, elements, skip):
           self.evaluate(random_access.at(dataset, index=i)), i + skip)
 
 
+class SkipGlobalShuffleTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          test_base.default_test_combinations(),
+          combinations.combine(
+              dataset_range=[10],
+              count=[0, 2],
+              repetitions=[1, 2],
+              seed=[None, 42],
+              reshuffle_each_iteration=[True, False])))
+  def testSkip(
+      self,
+      dataset_range: int,
+      count: int,
+      repetitions: int,
+      seed: Optional[int],
+      reshuffle_each_iteration: bool):
+    dataset = dataset_ops.Dataset.range(dataset_range)
+    dataset = dataset.skip(count)
+    dataset = dataset.prefetch(buffer_size=dataset_ops.AUTOTUNE)
+    if repetitions > 1:
+      dataset = dataset.repeat(repetitions)
+    dataset = global_shuffle_op._global_shuffle(
+        dataset, seed=seed, reshuffle_each_iteration=reshuffle_each_iteration)
+
+    expected = list(range(count, dataset_range)) * repetitions
+    dataset_output = self.getDatasetOutput(
+        dataset, requires_initialization=True)
+    self.assertCountEqual(dataset_output, expected)
+    self.assertNotEqual(dataset_output, expected)
+    self.assertLen(dataset_output, self.evaluate(dataset.cardinality()))
+
+  @combinations.generate(
+      combinations.times(test_base.default_test_combinations(),
+                         combinations.combine(skip=[-2, -1])))
+  def testNegativeSkip(self, skip: int):
+    dataset = dataset_ops.Dataset.range(10).skip(skip)
+    with self.assertRaises(errors.FailedPreconditionError):
+      dataset = global_shuffle_op._global_shuffle(dataset)
+      self.getDatasetOutput(dataset, requires_initialization=True)
+
+
+class SkipGlobalShuffleCheckpointTest(
+    checkpoint_test_base.CheckpointTestBase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          test_base.default_test_combinations(),
+          checkpoint_test_base.default_test_combinations(),
+          combinations.combine(
+              dataset_range=[10],
+              count=[0, 2],
+              repetitions=[1, 2],
+              reshuffle_each_iteration=[True, False],
+              symbolic_checkpoint=[True, False])))
+  def testSkip(
+      self,
+      verify_fn: Callable[..., None],
+      dataset_range: int,
+      count: int,
+      repetitions: int,
+      reshuffle_each_iteration: bool,
+      symbolic_checkpoint: bool):
+    def _build_dataset() -> dataset_ops.Dataset:
+      dataset = dataset_ops.Dataset.range(dataset_range)
+      dataset = dataset.skip(count)
+      dataset = dataset.prefetch(buffer_size=dataset_ops.AUTOTUNE)
+      if repetitions > 1:
+        dataset = dataset.repeat(repetitions)
+      dataset = global_shuffle_op._global_shuffle(
+          dataset, seed=42, reshuffle_each_iteration=reshuffle_each_iteration)
+      options = options_lib.Options()
+      options.experimental_symbolic_checkpoint = symbolic_checkpoint
+      return dataset.with_options(options)
+
+    verify_fn(
+        self,
+        _build_dataset,
+        num_outputs=(dataset_range - count) * repetitions,
+        assert_items_equal=reshuffle_each_iteration,
+    )
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index 0257e270a6d203..5534e7487ff827 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -651,6 +651,31 @@ py_strict_test(
     ],
 )
 
+cuda_py_strict_test(
+    name = "mwms_pjrt_gpu_test",
+    srcs = ["mwms_pjrt_gpu_test.py"],
+    python_version = "PY3",
+    srcs_version = "PY3",
+    tags = [
+        "multi_and_single_gpu",
+    ],
+    xla_enabled = True,
+    deps = [
+        ":multi_process_runner",
+        ":multi_worker_test_base",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/framework:config",
+        "//tensorflow/python/framework:constant_op",
+        "//tensorflow/python/framework:test_lib",
+        "//tensorflow/python/ops:array_ops",
+        "//tensorflow/python/ops:collective_ops",
+    ],
+)
+
 py_strict_library(
     name = "tpu_strategy",
     srcs = ["tpu_strategy.py"],
diff --git a/tensorflow/python/distribute/integration_test/BUILD b/tensorflow/python/distribute/integration_test/BUILD
index 26ee3c7b51f59f..5500a86fb64b3b 100644
--- a/tensorflow/python/distribute/integration_test/BUILD
+++ b/tensorflow/python/distribute/integration_test/BUILD
@@ -1,6 +1,6 @@
 load("//tensorflow:strict.default.bzl", "py_strict_library")
-load("//tensorflow/core/platform:distribute.bzl", "distribute_py_strict_test")
 load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test")
+load("//tensorflow/core/platform:distribute.bzl", "distribute_py_strict_test")
 load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_strict_test")
 
 package(
diff --git a/tensorflow/python/distribute/mwms_pjrt_gpu_test.py b/tensorflow/python/distribute/mwms_pjrt_gpu_test.py
new file mode 100644
index 00000000000000..dc5f85aef2486c
--- /dev/null
+++ b/tensorflow/python/distribute/mwms_pjrt_gpu_test.py
@@ -0,0 +1,116 @@
+# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import copy
+
+from tensorflow.core.protobuf import tensorflow_server_pb2
+from tensorflow.python.distribute import cluster_resolver as cluster_resolver_lib
+from tensorflow.python.distribute import multi_process_runner
+from tensorflow.python.distribute import multi_worker_test_base
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import test
+from tensorflow.python.framework import config as tf_config
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import collective_ops
+
+
+# Based on collective_ops_multi_worker_test.py
+def enable_collective_ops(cluster_resolver):
+  context.context().configure_collective_ops(
+      collective_leader="/job:worker/replica:0/task:0")
+  config_proto = copy.deepcopy(context.context().config)
+  server_def = tensorflow_server_pb2.ServerDef(
+      cluster=cluster_resolver.cluster_spec().as_cluster_def(),
+      default_session_config=config_proto,
+      job_name=cluster_resolver.task_type,
+      task_index=cluster_resolver.task_id,
+      protocol=cluster_resolver.rpc_layer or "grpc")
+  context.context().enable_collective_ops(server_def)
+
+
+def configure_coordination_service():
+  context.context().configure_coordination_service(
+      service_type="standalone",
+      service_leader="/job:worker/replica:0/task:0",
+      enable_health_check=False,
+  )
+
+
+class MultiWorkerMirroredStrategyPjRtRemoteGpuTest(
+    multi_worker_test_base.MultiWorkerTestBase, test.TestCase
+):
+  def testRemoteGpusFound(self):
+
+    def worker_fn():
+
+      configure_coordination_service()
+      cluster_resolver = cluster_resolver_lib.TFConfigClusterResolver()
+      enable_collective_ops(cluster_resolver=cluster_resolver)
+      context.context().ensure_initialized()
+
+      group_size = 2
+      group_key = 1
+      instance_key1 = 1
+      instance_key2 = 2
+      tensor_size = 10
+
+      # cluster_resolver.task_id is int 0 or 1.
+      tensor_val = [cluster_resolver.task_id + 1.] * tensor_size
+      constant = constant_op.constant(tensor_val)
+
+      @def_function.function(jit_compile=True)
+      def g():
+
+        def f(x):
+          return 2 * x + 1
+
+        input_tensor1 = array_ops.identity(f(constant))
+        input_tensor2 = array_ops.identity(f(constant))
+
+        reduced_tensor1 = collective_ops.all_reduce_v2(
+            input_tensor1, group_size, group_key, instance_key1, "Add", "Id")
+        reduced_tensor2 = collective_ops.all_reduce_v2(
+            input_tensor2, group_size, group_key, instance_key2, "Add", "Id")
+        return reduced_tensor1, reduced_tensor2
+
+      return g()
+
+    num_gpus = len(tf_config.list_physical_devices("GPU"))
+    skip_flag = (num_gpus < 2) or not test_util.is_xla_enabled()
+    if skip_flag:
+      self.skipTest(
+          "This test is intended to test the 2 GPU (1 per worker with 2"
+          " workers) with XLA case (%d GPUs found, using XLA = %s)."
+          % (num_gpus, test_util.is_xla_enabled())
+      )
+
+    cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2)
+    mpr = multi_process_runner.MultiProcessRunner(
+        worker_fn, cluster_spec, share_gpu=False
+    )
+    mpr.start()
+    mpr_result = mpr.join()
+    self.assertLen(mpr_result.return_value, 2)
+    for rval in mpr_result.return_value:
+      for t in rval:
+        # for IDs 0 and 1: (2*(0+1)+1) + (2*(1+1)+1) = 8
+        self.assertAllClose(t.numpy(), [8., 8., 8., 8., 8., 8., 8., 8., 8., 8.])
+
+
+if __name__ == "__main__":
+  multi_process_runner.test_main()
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index 57dd33a786266e..4e2bf2e4160b67 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -1,7 +1,7 @@
 load("//tensorflow:strict.default.bzl", "py_strict_binary", "py_strict_library", "py_strict_test")
-load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_strict_test")
 load("//tensorflow:tensorflow.bzl", "check_deps")
 load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test", "tf_py_strict_test", "tf_python_pybind_extension")
+load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_strict_test")
 load(
     "//tensorflow/core/platform:build_config.bzl",
     "tf_additional_rpc_deps",
@@ -80,7 +80,7 @@ cc_library(
         "@com_google_absl//absl/types:variant",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/profiler/lib:traceme",
-        "@local_tsl//tsl/python/lib/core:numpy",
+        "@local_xla//xla/tsl/python/lib/core:numpy",
         "@pybind11",
     ],
 )
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 0cb1074d251f04..e400fbaa117209 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -150,6 +150,25 @@ def xla_sharding_for_resource_variables_enabled():
   return _XLA_SHARDING_FOR_RESOURCE_VARIABLES
 
 
+@contextlib.contextmanager
+def temporarily_disable_xla_sharding_for_resource_variables():
+  """Temporarily disables XLA sharding for resource variables.
+
+  Should be a no-op if it is already disabled.
+
+  Yields:
+    None.
+  """
+  previously_enabled = xla_sharding_for_resource_variables_enabled()
+
+  try:
+    disable_xla_sharding_for_resource_variables()
+    yield
+  finally:
+    if previously_enabled:
+      enable_xla_sharding_for_resource_variables()
+
+
 # Expose it as internally public APIs for Keras use cases in b/171080602.
 tf_export("__internal__.is_tfrt_enabled", v1=[])(is_tfrt_enabled)
 
diff --git a/tensorflow/python/eager/polymorphic_function/BUILD b/tensorflow/python/eager/polymorphic_function/BUILD
index 2160b5a8f91c2d..2492bd4bd06124 100644
--- a/tensorflow/python/eager/polymorphic_function/BUILD
+++ b/tensorflow/python/eager/polymorphic_function/BUILD
@@ -1,7 +1,7 @@
 load("//tensorflow:pytype.default.bzl", "pytype_strict_library")
 load("//tensorflow:strict.default.bzl", "py_strict_library")
-load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_strict_test")
 load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test", "tf_py_strict_test")
+load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_strict_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -639,6 +639,7 @@ tf_xla_py_strict_test(
     disabled_backends = [
         "cpu_ondemand",
         "gpu_a100",
+        "gpu_h100",
     ],
     enable_mlir_bridge = True,
     python_version = "PY3",
diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 167511e80c65ef..6dc8541ef09592 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 // Must be included first
 // clang-format off
-#include "tsl/python/lib/core/numpy.h" //NOLINT
+#include "xla/tsl/python/lib/core/numpy.h" //NOLINT
 // clang-format on
 
 #include "tensorflow/python/eager/pywrap_tensor.h"
diff --git a/tensorflow/python/eager/pywrap_tensor.h b/tensorflow/python/eager/pywrap_tensor.h
index 53c5b66a93fcb9..bebf4e8558c463 100644
--- a/tensorflow/python/eager/pywrap_tensor.h
+++ b/tensorflow/python/eager/pywrap_tensor.h
@@ -17,7 +17,7 @@ limitations under the License.
 
 // Must be included first
 // clang-format off
-#include "tsl/python/lib/core/numpy.h" //NOLINT
+#include "xla/tsl/python/lib/core/numpy.h" //NOLINT
 // clang-format on
 
 #include "tensorflow/c/eager/c_api.h"
diff --git a/tensorflow/python/framework/BUILD b/tensorflow/python/framework/BUILD
index 927ddaffcec70c..dc7e9bc6dbb87b 100644
--- a/tensorflow/python/framework/BUILD
+++ b/tensorflow/python/framework/BUILD
@@ -93,6 +93,7 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
+        "@local_tsl//tsl/platform:protobuf",
     ],
     alwayslink = 1,
 )
@@ -202,7 +203,7 @@ tf_cc_binary(
         "@local_tsl//tsl/platform:platform_port",
         "@local_tsl//tsl/platform:strcat",
         "@local_tsl//tsl/platform:types",
-        "@local_tsl//tsl/util:command_line_flags",
+        "@local_xla//xla/tsl/util:command_line_flags",
     ],
 )
 
@@ -864,7 +865,7 @@ tf_python_pybind_extension(
         "//tensorflow/python/lib/core:safe_pyobject_ptr_required_hdrs",
         "@local_tsl//tsl/distributed_runtime:pywrap_required_hdrs",
         "@local_tsl//tsl/distributed_runtime/coordination:pywrap_required_hdrs",
-        "@local_tsl//tsl/python/lib/core:numpy_hdr",
+        "@local_xla//xla/tsl/python/lib/core:numpy_hdr",
     ],
     enable_stub_generation = True,
     pytype_srcs = [
@@ -964,7 +965,7 @@ tf_python_pybind_extension(
         "//tensorflow/python/lib/core:safe_pyobject_ptr_required_hdrs",
         "@local_tsl//tsl/distributed_runtime:pywrap_required_hdrs",
         "@local_tsl//tsl/distributed_runtime/coordination:pywrap_required_hdrs",
-        "@local_tsl//tsl/python/lib/core:numpy_hdr",
+        "@local_xla//xla/tsl/python/lib/core:numpy_hdr",
     ],
     enable_stub_generation = True,
     pytype_srcs = [
@@ -1108,7 +1109,7 @@ tf_python_pybind_extension(
         "//tensorflow/python/lib/core:safe_pyobject_ptr_required_hdrs",
         "@local_tsl//tsl/distributed_runtime:pywrap_required_hdrs",
         "@local_tsl//tsl/distributed_runtime/coordination:pywrap_required_hdrs",
-        "@local_tsl//tsl/python/lib/core:numpy_hdr",
+        "@local_xla//xla/tsl/python/lib/core:numpy_hdr",
     ],
     enable_stub_generation = True,
     pytype_srcs = [
diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py
index 228bacb7f6443d..6ee1860950cc40 100644
--- a/tensorflow/python/framework/config.py
+++ b/tensorflow/python/framework/config.py
@@ -242,8 +242,8 @@ def set_optimizer_experimental_options(options):
       - implementation_selector: Enable the swap of kernel implementations based
         on the device placement.
       - auto_mixed_precision: Change certain float32 ops to float16 on Volta
-        GPUs and above. Without the use of loss scaling, this can cause
-        numerical underflow (see
+        GPUs and above; and on CPUs with AMX FP16 support. Without the use of
+        loss scaling, this can cause numerical underflow (see
         `keras.mixed_precision.experimental.LossScaleOptimizer`).
       - disable_meta_optimizer: Disable the entire meta optimizer.
       - min_graph_nodes: The minimum number of nodes in a graph to optimizer.
diff --git a/tensorflow/python/framework/offset_counter.cc b/tensorflow/python/framework/offset_counter.cc
index 4dbae6a231a7cb..09a6facfb5c6b7 100644
--- a/tensorflow/python/framework/offset_counter.cc
+++ b/tensorflow/python/framework/offset_counter.cc
@@ -22,13 +22,13 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/string_view.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "tensorflow/python/framework/offset_counter_helper.h"
 #include "tensorflow/python/framework/op_reg_offset.pb.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/init_main.h"
 #include "tsl/platform/strcat.h"
 #include "tsl/platform/types.h"
-#include "tsl/util/command_line_flags.h"
 
 inline constexpr absl::string_view kUsage =
     "offset_counter reads C++ source codes, scans for the location of where "
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index 2478db37b7f977..e064900002be16 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -52,6 +52,7 @@ limitations under the License.
 #include "tensorflow/core/platform/strcat.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/python/framework/python_op_gen_annotator.h"
+#include "tsl/platform/protobuf.h"
 
 namespace tensorflow {
 namespace {
@@ -674,7 +675,7 @@ string ShapeToPython(const TensorShapeProto& shape) {
 }
 
 string TensorToPython(const TensorProto& proto) {
-  return proto.ShortDebugString();
+  return tsl::LegacyUnredactedShortDebugString(proto);
 }
 
 string AttrListToPython(const AttrValue& value,
diff --git a/tensorflow/python/framework/python_op_gen_main.cc b/tensorflow/python/framework/python_op_gen_main.cc
index 35314d604ffde4..fc6426e9e438bf 100644
--- a/tensorflow/python/framework/python_op_gen_main.cc
+++ b/tensorflow/python/framework/python_op_gen_main.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
@@ -39,7 +40,6 @@ limitations under the License.
 #include "tensorflow/python/framework/python_op_gen.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/str_util.h"
-#include "tsl/util/command_line_flags.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/python/kernel_tests/proto/BUILD b/tensorflow/python/kernel_tests/proto/BUILD
index c8068f02af560d..95a370e19a9c79 100644
--- a/tensorflow/python/kernel_tests/proto/BUILD
+++ b/tensorflow/python/kernel_tests/proto/BUILD
@@ -1,11 +1,12 @@
 # Tests of tf.io.*proto.
 
-# Placeholder: load py_proto_library
 load("//tensorflow:strict.default.bzl", "py_strict_library")
+
+# Placeholder: load py_proto_library
 load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
 load("//tensorflow:tensorflow.default.bzl", "tf_py_strict_test")
-load("//tensorflow/core/platform:build_config_root.bzl", "if_static")
 load("//tensorflow/core/platform:build_config.bzl", "tf_additional_all_protos", "tf_proto_library")
+load("//tensorflow/core/platform:build_config_root.bzl", "if_static")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/python/kernel_tests/signal/BUILD b/tensorflow/python/kernel_tests/signal/BUILD
index 965a2180e04081..b0a5d1d9651bc2 100644
--- a/tensorflow/python/kernel_tests/signal/BUILD
+++ b/tensorflow/python/kernel_tests/signal/BUILD
@@ -1,6 +1,6 @@
 load("//tensorflow:strict.default.bzl", "py_strict_library")
-load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test")
 load("//tensorflow:tensorflow.bzl", "py_test")  # @unused
+load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/python/lib/core/BUILD b/tensorflow/python/lib/core/BUILD
index a6904053c60b76..0ab9f789976e11 100644
--- a/tensorflow/python/lib/core/BUILD
+++ b/tensorflow/python/lib/core/BUILD
@@ -39,8 +39,8 @@ cc_library(
         "//tensorflow/c:tf_datatype_hdrs",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "@local_tsl//tsl/python/lib/core:ml_dtypes_lib",
-        "@local_tsl//tsl/python/lib/core:numpy",
+        "@local_xla//xla/tsl/python/lib/core:ml_dtypes_lib",
+        "@local_xla//xla/tsl/python/lib/core:numpy",
     ],
 )
 
@@ -171,7 +171,7 @@ cc_library(
         "//tensorflow/python/eager:pywrap_tfe_lib",
         "//third_party/py/numpy:headers",
         "//third_party/python_runtime:headers",
-        "@local_tsl//tsl/python/lib/core:numpy",
+        "@local_xla//xla/tsl/python/lib/core:numpy",
     ],
     alwayslink = 1,
 )
@@ -213,7 +213,7 @@ cc_library(
         "//tensorflow/c:headers",
         "//tensorflow/c:safe_ptr_hdr",
         "//tensorflow/c/eager:headers",
-        "@local_tsl//tsl/python/lib/core:numpy_hdr",
+        "@local_xla//xla/tsl/python/lib/core:numpy_hdr",
     ],
     features = [
         "-parse_headers",
@@ -228,7 +228,7 @@ cc_library(
         "//tensorflow/core/common_runtime:core_cpu_headers_lib",
         "//third_party/py/numpy:headers",
         "//third_party/python_runtime:headers",
-        "@local_tsl//tsl/python/lib/core:numpy",
+        "@local_xla//xla/tsl/python/lib/core:numpy",
     ],
 )
 
@@ -249,8 +249,8 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/python/lib/core:safe_pyobject_ptr",
-        "@local_tsl//tsl/python/lib/core:ml_dtypes_lib",
-        "@local_tsl//tsl/python/lib/core:numpy",
+        "@local_xla//xla/tsl/python/lib/core:ml_dtypes_lib",
+        "@local_xla//xla/tsl/python/lib/core:numpy",
     ],
 )
 
@@ -273,7 +273,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//third_party/python_runtime:headers",  # build_cleaner: keep; DNR: b/35864863
-        "@local_tsl//tsl/python/lib/core:numpy",
+        "@local_xla//xla/tsl/python/lib/core:numpy",
     ],
 )
 
diff --git a/tensorflow/python/lib/core/ndarray_tensor.cc b/tensorflow/python/lib/core/ndarray_tensor.cc
index ea38b32266ab63..5096026cbdb1db 100644
--- a/tensorflow/python/lib/core/ndarray_tensor.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor.cc
@@ -15,7 +15,7 @@ limitations under the License.
 // Must be included first
 // clang-format off
 #include "tensorflow/c/tf_datatype.h"
-#include "tsl/python/lib/core/numpy.h" //NOLINT
+#include "xla/tsl/python/lib/core/numpy.h" //NOLINT
 // clang-format on
 
 #include "tensorflow/python/lib/core/ndarray_tensor.h"
@@ -25,6 +25,7 @@ limitations under the License.
 
 #include "tensorflow/c/eager/tfe_context_internal.h"
 #include "tensorflow/c/tf_tensor_internal.h"
+#include "xla/tsl/python/lib/core/ml_dtypes.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -32,7 +33,6 @@ limitations under the License.
 #include "tensorflow/core/util/port.h"
 #include "tensorflow/python/lib/core/ndarray_tensor_bridge.h"
 #include "tensorflow/python/lib/core/safe_pyobject_ptr.h"
-#include "tsl/python/lib/core/ml_dtypes.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
index d45ca6ee0c67d6..c7fa135c82ad7c 100644
--- a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
@@ -16,7 +16,7 @@ limitations under the License.
 // clang-format off
 // Must be included first.
 #include "tensorflow/c/tf_datatype.h"
-#include "tsl/python/lib/core/numpy.h"
+#include "xla/tsl/python/lib/core/numpy.h"
 // clang-format on
 
 #include "tensorflow/python/lib/core/ndarray_tensor_bridge.h"
@@ -24,10 +24,10 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/c/c_api.h"
+#include "xla/tsl/python/lib/core/ml_dtypes.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/python/lib/core/py_util.h"
-#include "tsl/python/lib/core/ml_dtypes.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/python/lib/core/ndarray_tensor_bridge.h b/tensorflow/python/lib/core/ndarray_tensor_bridge.h
index ed2da4afc0c230..fe98f8818d46bb 100644
--- a/tensorflow/python/lib/core/ndarray_tensor_bridge.h
+++ b/tensorflow/python/lib/core/ndarray_tensor_bridge.h
@@ -17,7 +17,7 @@ limitations under the License.
 
 // Must be included first
 // clang-format off
-#include "tsl/python/lib/core/numpy.h" //NOLINT
+#include "xla/tsl/python/lib/core/numpy.h" //NOLINT
 // clang-format on
 
 #include <functional>
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index 9eac4c2c207e97..d1b7986c0a998e 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -17,7 +17,7 @@ limitations under the License.
 // Must be included first.
 #include "tensorflow/python/lib/core/py_func.h"
 
-#include "tsl/python/lib/core/numpy.h"
+#include "xla/tsl/python/lib/core/numpy.h"
 // clang-format: on
 
 #include <Python.h>
diff --git a/tensorflow/python/lib/core/py_seq_tensor.cc b/tensorflow/python/lib/core/py_seq_tensor.cc
index b572244c74b40f..aeac93b3711984 100644
--- a/tensorflow/python/lib/core/py_seq_tensor.cc
+++ b/tensorflow/python/lib/core/py_seq_tensor.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 // Must be included first
 // clang-format off
-#include "tsl/python/lib/core/numpy.h" //NOLINT
+#include "xla/tsl/python/lib/core/numpy.h" //NOLINT
 // clang-format on
 
 #include "tensorflow/python/lib/core/py_seq_tensor.h"
diff --git a/tensorflow/python/lib/io/BUILD b/tensorflow/python/lib/io/BUILD
index d58d2383402e6d..4978399e29d531 100644
--- a/tensorflow/python/lib/io/BUILD
+++ b/tensorflow/python/lib/io/BUILD
@@ -1,8 +1,8 @@
 # python/lib/io package
 
 load("//tensorflow:strict.default.bzl", "py_strict_library")
-load("//tensorflow:tensorflow.default.bzl", "tf_py_strict_test", "tf_python_pybind_extension")
 load("//tensorflow:tensorflow.bzl", "if_oss")
+load("//tensorflow:tensorflow.default.bzl", "tf_py_strict_test", "tf_python_pybind_extension")
 
 # copybara:uncomment_begin(google-only)
 # load("//third_party/zlib:BUILD_defs.bzl", "brittle_test_relying_on_stable_zlib_output")
diff --git a/tensorflow/python/ops/BUILD b/tensorflow/python/ops/BUILD
index 67175cccf64e55..7eaae7d9075c14 100644
--- a/tensorflow/python/ops/BUILD
+++ b/tensorflow/python/ops/BUILD
@@ -36,6 +36,7 @@ visibility = [
     "//third_party/reverb:__subpackages__",
     "//tensorflow_minigo:__subpackages__",
     "//research/graph/fairness/inproc_fair_reg:__subpackages__",
+    "//third_party/py/scenic:__subpackages__",
 ]
 
 package(
diff --git a/tensorflow/python/ops/linalg/sparse/BUILD b/tensorflow/python/ops/linalg/sparse/BUILD
index fa87211b113e3b..0012a19a12d047 100644
--- a/tensorflow/python/ops/linalg/sparse/BUILD
+++ b/tensorflow/python/ops/linalg/sparse/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow:strict.default.bzl", "py_strict_library")
 load("//tensorflow:pytype.default.bzl", "pytype_strict_library")
+load("//tensorflow:strict.default.bzl", "py_strict_library")
 
 # Description: Sparse CSR support for TensorFlow.
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
diff --git a/tensorflow/python/ops/memory_tests/BUILD b/tensorflow/python/ops/memory_tests/BUILD
index 77d88ed25a552e..7ac2e7882f53aa 100644
--- a/tensorflow/python/ops/memory_tests/BUILD
+++ b/tensorflow/python/ops/memory_tests/BUILD
@@ -1,8 +1,8 @@
 # python/ops/memory_tests package
 
 load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test")
-load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_strict_test")
 load("//tensorflow/core/platform:build_config_root.bzl", "tf_additional_xla_deps_py")
+load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_strict_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/python/ops/sobol_ops_test.py b/tensorflow/python/ops/sobol_ops_test.py
index a795cbdcf4f06e..674e2b73e658c2 100644
--- a/tensorflow/python/ops/sobol_ops_test.py
+++ b/tensorflow/python/ops/sobol_ops_test.py
@@ -140,7 +140,7 @@ def test_non_scalar_input(self):
           skip=constant_op.constant([1])))
 
   @test_util.run_in_graph_and_eager_modes
-  def testDimNumResultsOverflow(self):
+  def test_dim_num_results_overflow(self):
     with self.assertRaisesRegex(
         (ValueError, errors.InvalidArgumentError),
         r'num_results\*dim must be less than 2147483647'):
@@ -148,6 +148,18 @@ def testDimNumResultsOverflow(self):
           gen_math_ops.sobol_sample(
               dim=2560, num_results=16384000, skip=0, dtype=dtypes.float32))
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_num_results_skip_overflow(self):
+    with self.assertRaisesRegex(
+        (ValueError, errors.InvalidArgumentError),
+        r'num_results\+skip must be less than 2147483647',
+    ):
+      self.evaluate(
+          gen_math_ops.sobol_sample(
+              dim=1, num_results=1, skip=2147483647, dtype=dtypes.float32
+          )
+      )
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/tfe_wrapper.cc b/tensorflow/python/tfe_wrapper.cc
index a308a621912917..846b8693c227b9 100644
--- a/tensorflow/python/tfe_wrapper.cc
+++ b/tensorflow/python/tfe_wrapper.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
 #include "pybind11/attr.h"  // from @pybind11
-#include "tsl/python/lib/core/numpy.h" //NOLINT
+#include "xla/tsl/python/lib/core/numpy.h" //NOLINT
 // clang-format on
 
 #include "Python.h"
diff --git a/tensorflow/python/tools/api/generator/BUILD b/tensorflow/python/tools/api/generator/BUILD
index 6a40f1a8292aa0..ad98ab315a7228 100644
--- a/tensorflow/python/tools/api/generator/BUILD
+++ b/tensorflow/python/tools/api/generator/BUILD
@@ -1,11 +1,11 @@
 # Description:
 # Scripts used to generate TensorFlow Python API.
 
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test")
+load("//tensorflow/python/tools/api/generator:api_gen.bzl", "TENSORFLOW_API_GEN_PACKAGES")
 load("//tensorflow/python/tools/api/generator:api_init_files.bzl", "KERAS_API_INIT_FILES", "TENSORFLOW_API_INIT_FILES")
 load("//tensorflow/python/tools/api/generator:api_init_files_v1.bzl", "KERAS_API_INIT_FILES_V1", "TENSORFLOW_API_INIT_FILES_V1")
-load("//tensorflow/python/tools/api/generator:api_gen.bzl", "TENSORFLOW_API_GEN_PACKAGES")
-load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/python/tools/api/generator2/BUILD b/tensorflow/python/tools/api/generator2/BUILD
index 8c2c1e4c41b9ad..9b739b45c348d3 100644
--- a/tensorflow/python/tools/api/generator2/BUILD
+++ b/tensorflow/python/tools/api/generator2/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow:pytype.default.bzl", "pytype_strict_library")
 load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
+load("//tensorflow:pytype.default.bzl", "pytype_strict_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/python/tools/api/generator2/generator/BUILD b/tensorflow/python/tools/api/generator2/generator/BUILD
index a8b56d670c353a..8aba8163a63a29 100644
--- a/tensorflow/python/tools/api/generator2/generator/BUILD
+++ b/tensorflow/python/tools/api/generator2/generator/BUILD
@@ -1,5 +1,5 @@
-load("//tensorflow:strict.default.bzl", "py_strict_test")
 load("//tensorflow:pytype.default.bzl", "pytype_strict_binary", "pytype_strict_library")
+load("//tensorflow:strict.default.bzl", "py_strict_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/python/tpu/BUILD b/tensorflow/python/tpu/BUILD
index b1840d29128af6..89e5caf9ad6ef8 100644
--- a/tensorflow/python/tpu/BUILD
+++ b/tensorflow/python/tpu/BUILD
@@ -1,3 +1,5 @@
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
+
 # Description: Operations defined for Cloud TPUs
 load("//tensorflow:pytype.default.bzl", "pytype_strict_library")
 load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test")
@@ -6,7 +8,6 @@ load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test")
 load("//tensorflow:tensorflow.default.bzl", "tf_py_strict_test", "tf_python_pybind_extension")
 load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library")
 load("//tensorflow/python/tpu:tpu.bzl", "internal_create_sanitizer_settings", "tpu_py_strict_test")
-load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 
 # Do not add anymore paths here. You do not need to be in the visibility list
 # to use TPU symbols. They are accessible from tf.contrib.tpu in TF 1.x and
diff --git a/tensorflow/python/training/BUILD b/tensorflow/python/training/BUILD
index dfba766ef5fa23..8b9dc8408cb690 100644
--- a/tensorflow/python/training/BUILD
+++ b/tensorflow/python/training/BUILD
@@ -1,8 +1,9 @@
+load("//tensorflow:strict.default.bzl", "py_strict_library")
+
 # Placeholder: load py_proto_library
-load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library")
 load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test", "tf_py_strict_test")
+load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library")
 load("//tensorflow/core/platform:distribute.bzl", "distribute_py_strict_test")
-load("//tensorflow:strict.default.bzl", "py_strict_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 7536c6ce90692f..5a438ce2d52d05 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -169,7 +169,30 @@ def update_op(self, optimizer, g):
             "Cannot use a constraint function on a sparse variable.")
       return optimizer._resource_apply_sparse_duplicate_indices(
           g.values, self._v, g.indices)
-    update_op = optimizer._resource_apply_dense(g, self._v)
+
+    if context.xla_sharding_for_resource_variables_enabled():
+      # For each slot variable that is annotated with an XLA sharding, we read
+      # the variable and assign the value to itself. This is done to trigger the
+      # creation of an XlaShardingOp when a ReadVariableOp is created upon the
+      # call to `slot_var.read_value()`. This is needed to ensure that slot
+      # variables with XLA sharding are sharded correctly. Please see
+      # b/307541427 for more details.
+      assign_ops = []
+      for variable_dict in optimizer._slots.values():
+        for slot_var in variable_dict.values():
+          if (
+              isinstance(slot_var, resource_variable_ops.BaseResourceVariable)
+              and slot_var._get_xla_sharding() is not None
+          ):
+            assign_ops.append(slot_var.assign(slot_var.read_value()))
+
+      # The assign_ops created above are added as a control dependency for the
+      # update op to make sure these appear before the update_op.
+      with ops.control_dependencies(assign_ops):
+        update_op = optimizer._resource_apply_dense(g, self._v)
+    else:
+      update_op = optimizer._resource_apply_dense(g, self._v)
+
     if self._v.constraint is not None:
       with ops.control_dependencies([update_op]):
         return self._v.assign(self._v.constraint(self._v))
diff --git a/tensorflow/python/util/BUILD b/tensorflow/python/util/BUILD
index e5c0b1b6d29025..01b7664e1c30de 100644
--- a/tensorflow/python/util/BUILD
+++ b/tensorflow/python/util/BUILD
@@ -162,7 +162,7 @@ tf_python_pybind_extension(
         "//third_party/python_runtime:headers",
         "@com_google_absl//absl/memory",
         "@eigen_archive//:eigen3",
-        "@local_tsl//tsl/util:stats_calculator_portable",
+        "@local_xla//xla/tsl/util:stats_calculator_portable",
         "@pybind11",
     ],
 )
diff --git a/tensorflow/python/util/stat_summarizer_wrapper.cc b/tensorflow/python/util/stat_summarizer_wrapper.cc
index e6d00ff355b829..47120b21a24ee9 100644
--- a/tensorflow/python/util/stat_summarizer_wrapper.cc
+++ b/tensorflow/python/util/stat_summarizer_wrapper.cc
@@ -18,11 +18,11 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "pybind11/pybind11.h"  // from @pybind11
 #include "pybind11/pytypes.h"  // from @pybind11
+#include "xla/tsl/util/stat_summarizer_options.h"
+#include "xla/tsl/util/stats_calculator.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/util/stat_summarizer.h"
-#include "tsl/util/stat_summarizer_options.h"
-#include "tsl/util/stats_calculator.h"
 
 namespace py = pybind11;
 
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index d6128779eb5bf6..7dbf684d6a078b 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -3352,7 +3352,6 @@ def tf_python_pybind_extension_opensource(
     """
     extended_deps = deps + if_mkl_ml(["@local_xla//xla/tsl/mkl:intel_binary_blob"])
     extended_deps += [] if dynamic_deps else if_windows([], ["//tensorflow:libtensorflow_framework_import_lib"]) + tf_binary_pybind_deps()
-    extended_deps += ["@local_xla//xla:bazel_issue_21519"]  # buildifier: disable=list-append
     pybind_extension_opensource(
         name,
         srcs,
diff --git a/tensorflow/tools/android/inference_interface/BUILD b/tensorflow/tools/android/inference_interface/BUILD
index 22de79201c4010..4cbc61957f6384 100644
--- a/tensorflow/tools/android/inference_interface/BUILD
+++ b/tensorflow/tools/android/inference_interface/BUILD
@@ -1,13 +1,13 @@
 # Description:
 #   JNI-based Java inference interface for TensorFlow.
 
+load("@build_bazel_rules_android//android:rules.bzl", "android_library")
 load(
     "//tensorflow:tensorflow.bzl",
     "if_android",
     "tf_cc_binary",  # @unused
     "tf_copts",
 )
-load("@build_bazel_rules_android//android:rules.bzl", "android_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/tensorflow/tools/android/inference_interface/jni/run_stats_jni.h b/tensorflow/tools/android/inference_interface/jni/run_stats_jni.h
index 8bbd38692b13c4..7020f63ad449e4 100644
--- a/tensorflow/tools/android/inference_interface/jni/run_stats_jni.h
+++ b/tensorflow/tools/android/inference_interface/jni/run_stats_jni.h
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include <jni.h>
 
+#include "xla/tsl/util/stats_calculator.h"
 #include "tensorflow/core/util/stat_summarizer.h"
-#include "tsl/util/stats_calculator.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
index 05ff5b9941c821..980d5261ed9c8b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
@@ -150,6 +150,18 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_INT32
       }
+      field {
+        name: "populate_pjrt_gpu_client_creation_info"
+        number: 17
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
+      field {
+        name: "node_id"
+        number: 18
+        label: LABEL_OPTIONAL
+        type: TYPE_INT32
+      }
       nested_type {
         name: "VirtualDevices"
         field {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-interpreter.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-interpreter.pbtxt
index 6e48ac1ef1cda3..c102725d9e3e82 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lite.-interpreter.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-interpreter.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_path\', \'model_content\', \'experimental_delegates\', \'num_threads\', \'experimental_op_resolver_type\', \'experimental_preserve_all_tensors\', \'experimental_disable_delegate_clustering\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'OpResolverType.AUTO\', \'False\', \'False\'], "
+    argspec: "args=[\'self\', \'model_path\', \'model_content\', \'experimental_delegates\', \'num_threads\', \'experimental_op_resolver_type\', \'experimental_preserve_all_tensors\', \'experimental_disable_delegate_clustering\', \'experimental_default_delegate_latest_features\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'OpResolverType.AUTO\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "allocate_tensors"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.quantization.experimental.-quantization-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.quantization.experimental.-quantization-options.pbtxt
index 7edb5900b4b5f4..e00e4c66e47900 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.quantization.experimental.-quantization-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.quantization.experimental.-quantization-options.pbtxt
@@ -96,11 +96,11 @@ tf_proto {
       type_name: ".stablehlo.quantization.CalibrationOptions"
     }
     field {
-      name: "debugger_options"
+      name: "debugger_config"
       number: 16
       label: LABEL_OPTIONAL
       type: TYPE_MESSAGE
-      type_name: ".tensorflow.quantization.DebuggerOptions"
+      type_name: ".stablehlo.quantization.DebuggerConfig"
     }
     nested_type {
       name: "RepresentativeDatasetsEntry"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.quantization.experimental.-tf-record-representative-dataset-saver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.quantization.experimental.-tf-record-representative-dataset-saver.pbtxt
index e29e12405deecd..510414a0313081 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.quantization.experimental.-tf-record-representative-dataset-saver.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.quantization.experimental.-tf-record-representative-dataset-saver.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'path_map\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'path_map\', \'expected_input_key_map\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "save"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-interpreter.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-interpreter.pbtxt
index 6e48ac1ef1cda3..c102725d9e3e82 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.-interpreter.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-interpreter.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_path\', \'model_content\', \'experimental_delegates\', \'num_threads\', \'experimental_op_resolver_type\', \'experimental_preserve_all_tensors\', \'experimental_disable_delegate_clustering\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'OpResolverType.AUTO\', \'False\', \'False\'], "
+    argspec: "args=[\'self\', \'model_path\', \'model_content\', \'experimental_delegates\', \'num_threads\', \'experimental_op_resolver_type\', \'experimental_preserve_all_tensors\', \'experimental_disable_delegate_clustering\', \'experimental_default_delegate_latest_features\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'OpResolverType.AUTO\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "allocate_tensors"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.quantization.experimental.-quantization-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.quantization.experimental.-quantization-options.pbtxt
index 7edb5900b4b5f4..e00e4c66e47900 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.quantization.experimental.-quantization-options.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.quantization.experimental.-quantization-options.pbtxt
@@ -96,11 +96,11 @@ tf_proto {
       type_name: ".stablehlo.quantization.CalibrationOptions"
     }
     field {
-      name: "debugger_options"
+      name: "debugger_config"
       number: 16
       label: LABEL_OPTIONAL
       type: TYPE_MESSAGE
-      type_name: ".tensorflow.quantization.DebuggerOptions"
+      type_name: ".stablehlo.quantization.DebuggerConfig"
     }
     nested_type {
       name: "RepresentativeDatasetsEntry"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.quantization.experimental.-tf-record-representative-dataset-saver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.quantization.experimental.-tf-record-representative-dataset-saver.pbtxt
index e29e12405deecd..510414a0313081 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.quantization.experimental.-tf-record-representative-dataset-saver.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.quantization.experimental.-tf-record-representative-dataset-saver.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'path_map\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'path_map\', \'expected_input_key_map\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "save"
diff --git a/tensorflow/tools/ci_build/rel/ubuntu/cpu_arm64_test_build.sh b/tensorflow/tools/ci_build/rel/ubuntu/cpu_arm64_test_build.sh
index ccc8d201da030f..a2567292ca1bca 100644
--- a/tensorflow/tools/ci_build/rel/ubuntu/cpu_arm64_test_build.sh
+++ b/tensorflow/tools/ci_build/rel/ubuntu/cpu_arm64_test_build.sh
@@ -113,9 +113,6 @@ sudo sed -i '/^build --profile/d' /usertools/aarch64_clang.bazelrc
 sudo sed -i '\@^build.*=\"/usr/local/bin/python3\"$@d' /usertools/aarch64_clang.bazelrc
 sed -i '$ aimport /usertools/aarch64_clang.bazelrc' .bazelrc
 
-# Override breaking change in setuptools v60 (https://github.com/pypa/setuptools/pull/2896)
-export SETUPTOOLS_USE_DISTUTILS=stdlib
-
 # Local variables
 WHL_DIR="${KOKORO_ARTIFACTS_DIR}/tensorflow/whl"
 sudo install -o ${CI_BUILD_USER} -g ${CI_BUILD_GROUP} -d ${WHL_DIR}
diff --git a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
index 383021af7dd342..6afd971397e5f7 100644
--- a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
+++ b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
@@ -19,9 +19,8 @@ symbols through this python script.
   * `VS140COMNTOOLS`
 """
 
-load("@bazel_tools//tools/cpp:windows_cc_configure.bzl", "find_vc_path")
-load("@bazel_tools//tools/cpp:windows_cc_configure.bzl", "find_msvc_tool")
 load("@bazel_tools//tools/cpp:lib_cc_configure.bzl", "auto_configure_fail")
+load("@bazel_tools//tools/cpp:windows_cc_configure.bzl", "find_msvc_tool", "find_vc_path")
 
 def _def_file_filter_configure_impl(repository_ctx):
     if repository_ctx.os.name.lower().find("windows") == -1:
diff --git a/tensorflow/tools/def_file_filter/symbols_pybind.txt b/tensorflow/tools/def_file_filter/symbols_pybind.txt
index f1bb21d9100706..78c42c6f454c5c 100644
--- a/tensorflow/tools/def_file_filter/symbols_pybind.txt
+++ b/tensorflow/tools/def_file_filter/symbols_pybind.txt
@@ -59,7 +59,7 @@ tensorflow::tfprof::SerializeToString
 [//tensorflow/core/grappler/graph_analyzer:graph_analyzer_tool] # graph_analyze
 tensorflow::grappler::graph_analyzer::GraphAnalyzerTool
 
-[//external/local_tsl/tsl/python/lib/core:ml_dtypes_lib] # bfloat16, float8
+[//external/local_xla/xla/tsl/python/lib/core:ml_dtypes_lib] # bfloat16, float8
 tsl::ml_dtypes::RegisterTypes
 tsl::ml_dtypes::GetBfloat16Dtype
 tsl::ml_dtypes::GetFloat8E4m3b11fnuzDtype
@@ -307,7 +307,7 @@ tensorflow::AddWhileInputHack
 tensorflow::RecordMutation
 tensorflow::Graph::IsControlEdge
 
-[//external/local_tsl/tsl/python/lib/core:numpy] # tf_session
+[//external/local_xla/xla/tsl/python/lib/core:numpy] # tf_session
 tsl::ImportNumpy
 _tsl_numpy_api
 
@@ -562,7 +562,8 @@ tensorflow::Safe_PyObjectPtr
 tensorflow::quantization::QuantizeQatModel
 tensorflow::quantization::QuantizePtqModelPreCalibration
 tensorflow::quantization::QuantizePtqModelPostCalibration
-tensorflow::quantization::QuantizePtqDynamicRange
+tensorflow::quantization::QuantizeStaticRangePtq
+tensorflow::quantization::QuantizeDynamicRangePtq
 tensorflow::quantization::QuantizeWeightOnly
 
 [//tensorflow/dtensor/cc:dtensor_device_cc] # DTensor
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index 28a821e7dc85f5..45081a2a695599 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -1,8 +1,8 @@
 # Description:
 #   Doc generator
 
-load("//tensorflow:strict.default.bzl", "py_strict_binary", "py_strict_library", "py_strict_test")
 load("//tensorflow:py.default.bzl", "py_library")
+load("//tensorflow:strict.default.bzl", "py_strict_binary", "py_strict_library", "py_strict_test")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
diff --git a/tensorflow/tools/docs/generate2.py b/tensorflow/tools/docs/generate2.py
index 1129c9412492a8..dd20297b07f3d0 100644
--- a/tensorflow/tools/docs/generate2.py
+++ b/tensorflow/tools/docs/generate2.py
@@ -109,12 +109,15 @@
   ```
   """
 
-tf.estimator.Estimator = doc_controls.inheritable_header("""\
-  Warning: TensorFlow 2.15 included the final release of the `tf-estimator` 
-  package. Estimators will not be available in TensorFlow 2.16 or after. See the
-  [migration guide](https://www.tensorflow.org/guide/migrate/migrating_estimator)
-  for more information about how to convert off of Estimators."
-  """)(tf.estimator.Estimator)
+try:
+  tf.estimator.Estimator = doc_controls.inheritable_header(textwrap.dedent("""\
+    Warning: TensorFlow 2.15 included the final release of the `tf-estimator` 
+    package. Estimators will not be available in TensorFlow 2.16 or after. See the
+    [migration guide](https://www.tensorflow.org/guide/migrate/migrating_estimator)
+    for more information about how to convert off of Estimators."
+    """))(tf.estimator.Estimator)
+except AttributeError:
+  pass
 
 
 class RawOpsPageInfo(module_page.ModulePageInfo):
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 4e6cbfdecb501c..c78d03f82b52fd 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -2,9 +2,9 @@
 # This includes the C API, Java API, and protocol buffer files.
 
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("@local_xla//xla/tsl/mkl:build_defs.bzl", "if_enable_mkl", "if_mkl")
 load("@rules_pkg//:pkg.bzl", "pkg_tar", "pkg_zip")
 load("//tensorflow:tensorflow.bzl", "VERSION", "VERSION_MAJOR", "if_macos")
-load("@local_xla//xla/tsl/mkl:build_defs.bzl", "if_enable_mkl", "if_mkl")
 load("//tensorflow/core/platform:build_config_root.bzl", "tf_additional_license_deps")
 
 package(default_visibility = ["//visibility:private"])
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 521f492d46f04f..282b7c1229146c 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -3,8 +3,8 @@
 
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("@local_config_syslibs//:build_defs.bzl", "if_not_system_lib")
-load("//tensorflow:tensorflow.bzl", "if_with_tpu_support", "transitive_hdrs")
 load("@local_xla//xla/tsl/mkl:build_defs.bzl", "if_enable_mkl", "if_mkl", "if_mkl_ml")
+load("//tensorflow:tensorflow.bzl", "if_with_tpu_support", "transitive_hdrs")
 load("//tensorflow/core/platform:build_config_root.bzl", "tf_additional_license_deps")
 load("//tensorflow/tools/pip_package/utils:data_deps.bzl", "collect_data_files")
 load("//tensorflow/tools/pip_package/utils:py_deps.bzl", "transitive_py_deps")
diff --git a/tensorflow/tools/pip_package/build_pip_package.py b/tensorflow/tools/pip_package/build_pip_package.py
index 88f27b25cc5329..9ac5ad2d7b57c8 100644
--- a/tensorflow/tools/pip_package/build_pip_package.py
+++ b/tensorflow/tools/pip_package/build_pip_package.py
@@ -219,14 +219,21 @@ def patch_so(srcs_dir: str) -> None:
     srcs_dir: target directory with .so files to patch.
   """
   to_patch = {
-      "tensorflow/python/_pywrap_tensorflow_internal.so":
-      "$ORIGIN/../../tensorflow/tsl/python/lib/core",
-      ("tensorflow/compiler/mlir/quantization/tensorflow/python/"
-       "pywrap_function_lib.so"): "$ORIGIN/../../../../../python",
-      ("tensorflow/compiler/mlir/quantization/tensorflow/python/"
-       "pywrap_quantize_model.so"): "$ORIGIN/../../../../../python",
-      ("tensorflow/compiler/mlir/quantization/tensorflow/calibrator/"
-       "pywrap_calibration.so"): "$ORIGIN/../../../../../python",
+      "tensorflow/python/_pywrap_tensorflow_internal.so": (
+          "$ORIGIN/../../tensorflow/compiler/xla/tsl/python/lib/core"
+      ),
+      (
+          "tensorflow/compiler/mlir/quantization/tensorflow/python/"
+          "pywrap_function_lib.so"
+      ): "$ORIGIN/../../../../../python",
+      (
+          "tensorflow/compiler/mlir/quantization/tensorflow/python/"
+          "pywrap_quantize_model.so"
+      ): "$ORIGIN/../../../../../python",
+      (
+          "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/"
+          "pywrap_calibration.so"
+      ): "$ORIGIN/../../../../../python",
   }
   for file, path in to_patch.items():
     rpath = subprocess.check_output(
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
deleted file mode 100755
index ff21aadba95be6..00000000000000
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ /dev/null
@@ -1,485 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-
-set -e
-
-# Read the value of VERSION from vercod.bzl
-VERSION=$(grep 'VERSION = ' tensorflow/tensorflow.bzl | sed -E 's/VERSION = "(.*)"/\1/g')
-VERSION_MAJOR=$(echo "$VERSION" | cut -d '.' -f1)
-echo TensorFlow Version: ${VERSION}
-echo TensorFlow Major Version: ${VERSION_MAJOR}
-
-function is_absolute {
-  [[ "$1" = /* ]] || [[ "$1" =~ ^[a-zA-Z]:[/\\].* ]]
-}
-
-function real_path() {
-  is_absolute "$1" && echo "$1" || echo "$PWD/${1#./}"
-}
-
-function cp_external() {
-  local src_dir=$1
-  local dest_dir=$2
-
-  pushd .
-  cd "$src_dir"
-  for f in `find . ! -type d ! -name '*.py' ! -path '*local_config_cuda*' ! -path '*local_config_tensorrt*' ! -path '*pypi*' ! -path '*python_x86_64*' ! -path '*python_aarch64*' ! -path '*local_config_syslibs*' ! -path '*org_tensorflow*' ! -path '*llvm-project/llvm/*' ! -path '*local_tsl*' ! -path '*local_xla*'`; do
-    mkdir -p "${dest_dir}/$(dirname ${f})"
-    cp "${f}" "${dest_dir}/$(dirname ${f})/"
-  done
-  popd
-
-  mkdir -p "${dest_dir}/local_config_cuda/cuda/cuda/"
-  cp "${src_dir}/local_config_cuda/cuda/cuda/cuda_config.h" "${dest_dir}/local_config_cuda/cuda/cuda/"
-}
-
-function cp_local_config_python() {
-  local src_dir=$1
-  local dest_dir=$2
-  pushd .
-  cd "$src_dir"
-  mkdir -p "${dest_dir}/local_config_python/numpy_include/"
-  cp -r "pypi_numpy/site-packages/numpy/core/include/numpy" "${dest_dir}/local_config_python/numpy_include/"
-  mkdir -p "${dest_dir}/local_config_python/python_include/"
-  if is_windows; then
-    cp -r python_*/include/* "${dest_dir}/local_config_python/python_include/"
-  else
-    cp -r python_*/include/python*/* "${dest_dir}/local_config_python/python_include/"
-  fi
-  popd
-}
-
-function copy_xla_aot_runtime_sources() {
-  local src_dir=$1
-  local dst_dir=$2
-
-  local srcs_txt="tensorflow/tools/pip_package/xla_compiled_cpu_runtime_srcs.txt"
-
-  if [ ! -f "${src_dir}/${srcs_txt}" ]; then
-    echo Could not find source list file "${src_dir}/${srcs_txt}". 1>&2
-    return 0
-  fi
-
-  pushd $src_dir
-  for file in $(cat "${srcs_txt}")
-  do
-    # Sometimes $file has a prefix bazel-out/host/ we want to remove.
-    prefix=${file%%tensorflow/*}  # Find the location of "tensorflow/*"
-    candidate_file=${file#$prefix}  # Remove the prefix
-    if [ ! -z "$candidate_file" ]; then
-      file=$candidate_file
-    fi
-
-    # For XLA/TSL, we need to remove the prefix "../local_{xla|tsl}/".
-    dst_file=$file
-    dst_file=${dst_file#"../local_xla/"}
-    dst_file=${dst_file#"../local_tsl/"}
-
-    if test -f "$file"; then
-      mkdir -p "${dst_dir}/$(dirname $dst_file)"
-      cp $file "${dst_dir}/${dst_file}"
-    else
-      echo "Missing xla source file: ${file}" 1>&2
-    fi
-  done
-  cp tensorflow/tools/pip_package/xla_build/CMakeLists.txt "${dst_dir}"
-  popd
-}
-
-function move_to_root_if_exists () {
-  arg_to_move="$1"
-  if [ -e "${arg_to_move}" ]; then
-    mv ${arg_to_move} ./
-  fi
-}
-
-function reorganize_includes() {
-  TMPDIR="${1%/}"
-  pushd "${TMPDIR}/tensorflow/include/"
-
-  move_to_root_if_exists external/com_google_absl/absl
-
-  move_to_root_if_exists external/eigen_archive/Eigen
-  move_to_root_if_exists external/eigen_archive/unsupported
-
-  move_to_root_if_exists external/jsoncpp_git/include
-  rm -rf external/jsoncpp_git
-
-  move_to_root_if_exists external/com_google_protobuf/src/google
-  rm -rf external/com_google_protobuf/python
-
-  cp -R external/ml_dtypes ./
-
-  popd
-}
-
-PLATFORM="$(uname -s | tr 'A-Z' 'a-z')"
-function is_windows() {
-  if [[ "${PLATFORM}" =~ (cygwin|mingw32|mingw64|msys)_nt* ]]; then
-    true
-  else
-    false
-  fi
-}
-
-function is_macos() {
-  if [[ "${PLATFORM}" =~ darwin* ]]; then
-    true
-  else
-    false
-  fi
-}
-
-function prepare_src() {
-  if [ $# -lt 1 ] ; then
-    echo "No destination dir provided"
-    exit 1
-  fi
-
-  TMPDIR="${1%/}"
-  mkdir -p "$TMPDIR"
-  echo TMPDIR: ${TMPDIR}
-  EXTERNAL_INCLUDES="${TMPDIR}/tensorflow/include/external"
-  XLA_AOT_RUNTIME_SOURCES="${TMPDIR}/tensorflow/xla_aot_runtime_src"
-
-  echo $(date) : "=== Preparing sources in dir: ${TMPDIR}"
-
-  if [ ! -d bazel-bin/tensorflow ]; then
-    echo "Could not find bazel-bin.  Did you run from the root of the build tree?"
-    exit 1
-  fi
-
-  if is_windows; then
-    cp -L \
-      bazel-bin/tensorflow/tools/pip_package/build_pip_package.exe.runfiles/org_tensorflow/LICENSE \
-      "${TMPDIR}"
-
-    # Change the format of file path (TMPDIR-->TMPDIR_rsync) which is input to the rsync from
-    # Windows-compatible to Linux-compatible to resolve the error below
-    # error: ssh: Could not resolve hostname c: No such host is known.
-
-    TMPDIR_rsync=`cygpath $TMPDIR`
-    rsync -a \
-      bazel-bin/tensorflow/tools/pip_package/build_pip_package.exe.runfiles/org_tensorflow/tensorflow \
-      "${TMPDIR_rsync}"
-    cp_external \
-      bazel-bin/tensorflow/tools/pip_package/build_pip_package.exe.runfiles \
-      "${EXTERNAL_INCLUDES}/"
-    cp_local_config_python \
-      bazel-bin/tensorflow/tools/pip_package/build_pip_package.exe.runfiles \
-      "${EXTERNAL_INCLUDES}/"
-    copy_xla_aot_runtime_sources \
-      bazel-bin/tensorflow/tools/pip_package/build_pip_package.exe.runfiles/org_tensorflow \
-      "${XLA_AOT_RUNTIME_SOURCES}/"
-    RUNFILES=bazel-bin/tensorflow/tools/pip_package/build_pip_package.exe.runfiles/org_tensorflow
-    # If oneDNN was built with openMP then copy the omp libs over
-    if [ -f "bazel-bin/external/llvm_openmp/libiomp5md.dll" ]; then
-      cp bazel-bin/external/llvm_openmp/libiomp5md.dll ${TMPDIR}/tensorflow/python
-      cp bazel-bin/external/llvm_openmp/libiomp5md.dll.if.lib ${TMPDIR}/tensorflow/python
-    fi
-  else
-    RUNFILES=bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow
-    # Resolved the issue of a missing symlink to libtensorflow_cc.so.2 b/264967822#comment25
-    if is_macos; then
-      if [ ! -L "${RUNFILES}/tensorflow/libtensorflow_cc.${VERSION_MAJOR}.dylib" ]; then
-        ln -s "$(dirname "$(readlink "${RUNFILES}/tensorflow/libtensorflow_cc.${VERSION}.dylib")")/libtensorflow_cc.${VERSION_MAJOR}.dylib" \
-         "${RUNFILES}/tensorflow/libtensorflow_cc.${VERSION_MAJOR}.dylib"
-        echo "Created symlink: $(dirname "$(readlink "${RUNFILES}/tensorflow/libtensorflow_cc.${VERSION}.dylib")")/libtensorflow_cc.${VERSION_MAJOR}.dylib -> \
-          ${RUNFILES}/tensorflow/libtensorflow_cc.${VERSION_MAJOR}.dylib"
-      else
-        echo "Symlink already exists: ${RUNFILES}/tensorflow/libtensorflow_cc.${VERSION_MAJOR}.dylib"
-      fi
-    else
-      # cp -P ${RUNFILES}/tensorflow/libtensorflow_cc.so.${VERSION} ${RUNFILES}/tensorflow/libtensorflow_cc.so.${VERSION_MAJOR}
-      if [ ! -L "${RUNFILES}/tensorflow/libtensorflow_cc.so.${VERSION_MAJOR}" ]; then
-        ln -s "$(dirname "$(readlink "${RUNFILES}/tensorflow/libtensorflow_cc.so.${VERSION}")")/libtensorflow_cc.so.${VERSION_MAJOR}" \
-          "${RUNFILES}/tensorflow/libtensorflow_cc.so.${VERSION_MAJOR}"
-      else
-        echo "Symlink already exists: ${RUNFILES}/tensorflow/libtensorflow_cc.so.${VERSION_MAJOR}"
-      fi
-    fi
-    cp -L \
-      bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/LICENSE \
-      "${TMPDIR}"
-    cp -LR \
-      bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/tensorflow \
-      "${TMPDIR}"
-    # Prevents pip package bloat. See b/228948031#comment17.
-    rm -f ${TMPDIR}/tensorflow/python/lib_pywrap_tensorflow_internal.*
-    if [ -d bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/external ]; then
-      # Old-style runfiles structure (--legacy_external_runfiles).
-      cp_external \
-        bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/external \
-        "${EXTERNAL_INCLUDES}"
-      cp_local_config_python \
-        bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/external \
-        "${EXTERNAL_INCLUDES}"
-    else
-      # New-style runfiles structure (--nolegacy_external_runfiles).
-      cp_external \
-        bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles \
-        "${EXTERNAL_INCLUDES}"
-      cp_local_config_python \
-        bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles \
-        "${EXTERNAL_INCLUDES}"
-    fi
-    copy_xla_aot_runtime_sources \
-      bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow \
-      "${XLA_AOT_RUNTIME_SOURCES}"
-    # Copy MKL libs over so they can be loaded at runtime
-    so_lib_dir=$(ls $RUNFILES | grep solib)
-    if is_macos; then
-      chmod +rw ${TMPDIR}/tensorflow/python/_pywrap_tensorflow_internal.so
-    else
-      chmod +rw ${TMPDIR}/tensorflow/python/_pywrap_tensorflow_internal.so
-      chmod +rw ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.so
-      chmod +rw ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.so
-      chmod +rw ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/pywrap_calibration.so
-      patchelf --set-rpath $(patchelf --print-rpath ${TMPDIR}/tensorflow/python/_pywrap_tensorflow_internal.so):\$ORIGIN/../../tensorflow/tsl/python/lib/core ${TMPDIR}/tensorflow/python/_pywrap_tensorflow_internal.so
-      patchelf --set-rpath $(patchelf --print-rpath ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.so):\$ORIGIN/../../../../../python ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.so
-      patchelf --set-rpath $(patchelf --print-rpath ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.so):\$ORIGIN/../../../../../python ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.so
-      patchelf --set-rpath $(patchelf --print-rpath ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/pywrap_calibration.so):\$ORIGIN/../../../../../python ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/pywrap_calibration.so
-      patchelf --shrink-rpath ${TMPDIR}/tensorflow/python/_pywrap_tensorflow_internal.so
-      patchelf --shrink-rpath ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_function_lib.so
-      patchelf --shrink-rpath ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/python/pywrap_quantize_model.so
-      patchelf --shrink-rpath ${TMPDIR}/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/pywrap_calibration.so
-    fi
-    mkl_so_dir=$(ls ${RUNFILES}/${so_lib_dir} | grep mkl) || true
-    if [ -n "${mkl_so_dir}" ]; then
-      mkdir "${TMPDIR}/${so_lib_dir}"
-      cp -R ${RUNFILES}/${so_lib_dir}/${mkl_so_dir} "${TMPDIR}/${so_lib_dir}"
-    fi
-  fi
-
-  # Move vendored files into proper locations
-  # This is required because TSL/XLA don't publish their own wheels
-  # We copy from bazel-bin/tensorflow instead of bazel-bin/internal to copy
-  # headers from TSL/XLA into tensorflow so that InstallHeaders can move
-  # them back into tensorflow/include
-  if is_windows; then
-    cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.exe.runfiles/local_tsl/tsl/ ${TMPDIR}/tensorflow
-    cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.exe.runfiles/local_xla/xla/ ${TMPDIR}/tensorflow/compiler
-  else
-    cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/local_tsl/tsl ${TMPDIR}/tensorflow
-    cp -RLn bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/local_xla/xla ${TMPDIR}/tensorflow/compiler
-  fi
-  # Fix the proto stubs
-  if is_macos; then
-    find ${TMPDIR}/tensorflow/ -name "*.py" -type f -exec sed -i '' 's/from tsl\./from tensorflow.tsl./' {} \;
-    find ${TMPDIR}/tensorflow/ -name "*.py" -type f -exec sed -i '' 's/from local_xla\.xla/from tensorflow.compiler.xla/' {} \;
-    find ${TMPDIR}/tensorflow/ -name "*.py" -type f -exec sed -i '' 's/from xla/from tensorflow.compiler.xla/' {} \;
-  else
-    find ${TMPDIR}/tensorflow/ -name "*.py" -type f -exec sed -i'' 's/from tsl\./from tensorflow.tsl./' {} \;
-    find ${TMPDIR}/tensorflow/ -name "*.py" -type f -exec sed -i'' 's/from local_xla\.xla/from tensorflow.compiler.xla/' {} \;
-    find ${TMPDIR}/tensorflow/ -name "*.py" -type f -exec sed -i'' 's/from xla/from tensorflow.compiler.xla/' {} \;
-  fi
-
-  mkdir -p ${TMPDIR}/third_party
-  cp -LR $RUNFILES/../local_config_cuda/cuda/_virtual_includes/cuda_headers_virtual/third_party/gpus ${TMPDIR}/third_party
-  cp $RUNFILES/tensorflow/tools/pip_package/THIRD_PARTY_NOTICES.txt "${TMPDIR}/tensorflow"
-
-  reorganize_includes "${TMPDIR}"
-
-  cp tensorflow/tools/pip_package/MANIFEST.in ${TMPDIR}
-  cp tensorflow/tools/pip_package/README ${TMPDIR}/README.md
-  cp tensorflow/tools/pip_package/setup.py ${TMPDIR}
-
-  rm -f ${TMPDIR}/tensorflow/libtensorflow_framework.so
-  rm -f ${TMPDIR}/tensorflow/libtensorflow_framework.so.[0-9].*
-
-  # Copying symlinks with -L duplicates these libraries.
-  rm -f ${TMPDIR}/tensorflow/libtensorflow_framework.dylib
-  rm -f ${TMPDIR}/tensorflow/libtensorflow_framework.[0-9].*.dylib
-  rm -f ${TMPDIR}/tensorflow/libtensorflow_cc.dylib
-  rm -f ${TMPDIR}/tensorflow/libtensorflow_cc.[0-9].*.dylib
-
-  # TODO(annarev): copy over API files from tensorflow/api/_vN to tensorflow/
-  #   except tensorflow/api/_vN/lite/.
-
-  # TODO(b/150440817): support autocomplete for tf.keras
-  # Copy over keras API folder to the root directory
-  # so that autocomplete works as expected for all keras subimports.
-  # if [ -d "${TMPDIR}/tensorflow/_api/v1/" ]
-  # then
-  #   cp -r ${TMPDIR}/tensorflow/python/keras/api/_v1/keras/ ${TMPDIR}/tensorflow/keras/
-  #   sed -i'.original' -e 's/.python.keras.api._v1/tensorflow/g' ${TMPDIR}/tensorflow/__init__.py
-  # else
-  #   cp -r ${TMPDIR}/tensorflow/python/keras/api/_v2/keras/ ${TMPDIR}/tensorflow/keras/
-  #   sed -i'.original' -e 's/.python.keras.api._v2/tensorflow/g' ${TMPDIR}/tensorflow/__init__.py
-  # fi
-}
-
-function build_wheel() {
-  if [ $# -lt 2 ] ; then
-    echo "No src and dest dir provided"
-    exit 1
-  fi
-
-  TMPDIR="$1"
-  DEST="$2"
-  PKG_NAME_FLAG="$3"
-
-  # Before we leave the top-level directory, make sure we know how to
-  # call python.
-  if [[ -e tools/python_bin_path.sh ]]; then
-    source tools/python_bin_path.sh
-  fi
-  if is_windows; then
-    PY_DIR=$(find -L ./bazel-tensorflow/external -maxdepth 1 -type d -name "python_*")
-    FULL_DIR="$(real_path "$PY_DIR")/python"
-    export PYTHONPATH="$PYTHONPATH:$PWD/bazel-tensorflow/external/pypi_wheel/site-packages/"
-  else
-    PY_DIR=$(find ./bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/ -maxdepth 1 -type d -name "python_*")
-    FULL_DIR="$(real_path "$PY_DIR")/bin/python3"
-    export PYTHONPATH="$PYTHONPATH:$PWD/bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/pypi_wheel/site-packages/"
-  fi
-
-  pushd ${TMPDIR} > /dev/null
-
-  rm -f MANIFEST
-  echo $(date) : "=== Building wheel"
-  $FULL_DIR setup.py bdist_wheel ${PKG_NAME_FLAG} >/dev/null
-  mkdir -p ${DEST}
-  cp dist/* ${DEST}
-  popd > /dev/null
-  echo $(date) : "=== Output wheel file is in: ${DEST}"
-}
-
-function usage() {
-  echo "Usage:"
-  echo "$0 [--src srcdir] [--dst dstdir] [options]"
-  echo "$0 dstdir [options]"
-  echo ""
-  echo "    --src                 prepare sources in srcdir"
-  echo "                              will use temporary dir if not specified"
-  echo ""
-  echo "    --dst                 build wheel in dstdir"
-  echo "                              if dstdir is not set do not build, only prepare sources"
-  echo ""
-  echo "  Options:"
-  echo "    --project_name <name> set project name to name"
-  echo "    --cpu                 build tensorflow_cpu"
-  echo "    --tpu                 build tensorflow_tpu"
-  echo "    --gpudirect           build tensorflow_gpudirect"
-  echo "    --rocm                build tensorflow_rocm"
-  echo "    --nightly_flag        build tensorflow nightly"
-  echo ""
-  exit 1
-}
-
-function main() {
-  PKG_NAME_FLAG=""
-  PROJECT_NAME=""
-  CPU_BUILD=0
-  TPU_BUILD=0
-  GPUDIRECT_BUILD=0
-  ROCM_BUILD=0
-  NIGHTLY_BUILD=0
-  SRCDIR=""
-  DSTDIR=""
-  CLEANSRC=1
-  while true; do
-    if [[ "$1" == "--help" ]]; then
-      usage
-      exit 1
-    elif [[ "$1" == "--nightly_flag" ]]; then
-      NIGHTLY_BUILD=1
-    elif [[ "$1" == "--cpu" ]]; then
-      CPU_BUILD=1
-    elif [[ "$1" == "--tpu" ]]; then
-      TPU_BUILD=1
-    elif [[ "$1" == "--gpudirect" ]]; then
-      GPUDIRECT_BUILD=1
-    elif [[ "$1" == "--rocm" ]]; then
-      ROCM_BUILD=1
-    elif [[ "$1" == "--project_name" ]]; then
-      shift
-      if [[ -z "$1" ]]; then
-        break
-      fi
-      PROJECT_NAME="$1"
-    elif [[ "$1" == "--src" ]]; then
-      shift
-      SRCDIR="$(real_path $1)"
-      CLEANSRC=0
-    elif [[ "$1" == "--dst" ]]; then
-      shift
-      DSTDIR="$(real_path $1)"
-    else
-      DSTDIR="$(real_path $1)"
-    fi
-    shift
-
-    if [[ -z "$1" ]]; then
-      break
-    fi
-  done
-
-  if [[ $(( TPU_BUILD + CPU_BUILD + GPUDIRECT_BUILD + ROCM_BUILD )) -gt "1" ]]; then
-    echo "Only one of [--tpu, --cpu, --gpudirect, --rocm] may be provided."
-    usage
-    exit 1
-  fi
-
-  if [[ -z "$DSTDIR" ]] && [[ -z "$SRCDIR" ]]; then
-    echo "No destination dir provided"
-    usage
-    exit 1
-  fi
-
-  if [[ -z "$SRCDIR" ]]; then
-    # make temp srcdir if none set
-    SRCDIR="$(mktemp -d -t tmp.XXXXXXXXXX)"
-  fi
-
-  prepare_src "$SRCDIR"
-
-  if [[ -z "$DSTDIR" ]]; then
-      # only want to prepare sources
-      exit
-  fi
-
-  if [[ -n ${PROJECT_NAME} ]]; then
-    PKG_NAME_FLAG="--project_name ${PROJECT_NAME}"
-  elif [[ ${NIGHTLY_BUILD} == "1" && ${GPUDIRECT_BUILD} == "1" ]]; then
-    PKG_NAME_FLAG="--project_name tf_nightly_gpudirect"
-  elif [[ ${NIGHTLY_BUILD} == "1" && ${ROCM_BUILD} == "1" ]]; then
-    PKG_NAME_FLAG="--project_name tf_nightly_rocm"
-  elif [[ ${NIGHTLY_BUILD} == "1" && ${CPU_BUILD} == "1" ]]; then
-    PKG_NAME_FLAG="--project_name tf_nightly_cpu"
-  elif [[ ${NIGHTLY_BUILD} == "1" && ${TPU_BUILD} == "1" ]]; then
-    PKG_NAME_FLAG="--project_name tf_nightly_tpu"
-  elif [[ ${NIGHTLY_BUILD} == "1" ]]; then
-    PKG_NAME_FLAG="--project_name tf_nightly"
-  elif [[ ${GPUDIRECT_BUILD} == "1" ]]; then
-    PKG_NAME_FLAG="--project_name tensorflow_gpudirect"
-  elif [[ ${ROCM_BUILD} == "1" ]]; then
-    PKG_NAME_FLAG="--project_name tensorflow_rocm"
-  elif [[ ${CPU_BUILD} == "1" ]]; then
-    PKG_NAME_FLAG="--project_name tensorflow_cpu"
-  elif [[ ${TPU_BUILD} == "1" ]]; then
-    PKG_NAME_FLAG="--project_name tensorflow_tpu"
-  fi
-
-  build_wheel "$SRCDIR" "$DSTDIR" "$PKG_NAME_FLAG"
-
-  if [[ $CLEANSRC -ne 0 ]]; then
-    rm -rf "${TMPDIR}"
-  fi
-}
-
-main "$@"
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 161509369633e4..0a53c467a2fe58 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -115,7 +115,7 @@ def standard_or_nightly(standard, nightly):
     # or final). For example, 'keras-nightly ~= 2.14.0.dev' will be replaced by
     # 'keras >= 2.14.0rc0, < 2.15' on the release branch after the branch cut.
     'tb-nightly ~= 2.17.0.a',
-    'keras-nightly ~= 3.1.0.dev',
+    'keras-nightly >= 3.2.0.dev',
 ]
 REQUIRED_PACKAGES = [p for p in REQUIRED_PACKAGES if p is not None]
 
diff --git a/tensorflow/tools/proto_splitter/testdata/BUILD b/tensorflow/tools/proto_splitter/testdata/BUILD
index f3d817b9ae283b..5871675f30c856 100644
--- a/tensorflow/tools/proto_splitter/testdata/BUILD
+++ b/tensorflow/tools/proto_splitter/testdata/BUILD
@@ -1,5 +1,6 @@
-# Placeholder: load py_proto_library
 load("//tensorflow:strict.default.bzl", "py_strict_binary")
+
+# Placeholder: load py_proto_library
 load(
     "//tensorflow/core/platform:build_config.bzl",
     "tf_proto_library",
diff --git a/tensorflow/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl b/tensorflow/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl
index 9261a652f9c367..00cd6983ca3835 100644
--- a/tensorflow/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl
+++ b/tensorflow/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl
@@ -1,8 +1,8 @@
 """Configurations of AARCH64 builds used with Docker container."""
 
 load("//tensorflow/tools/toolchains:cpus/aarch64/aarch64.bzl", "remote_aarch64_configure")
-load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure")
 load("//third_party/py:python_configure.bzl", "remote_python_configure")
+load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure")
 
 def ml2014_tf_aarch64_configs(name_container_map, env):
     for name, container in name_container_map.items():
diff --git a/tensorflow/tools/toolchains/remote_config/rbe_config.bzl b/tensorflow/tools/toolchains/remote_config/rbe_config.bzl
index 9f71a414bf7746..ae776c2a2fd388 100644
--- a/tensorflow/tools/toolchains/remote_config/rbe_config.bzl
+++ b/tensorflow/tools/toolchains/remote_config/rbe_config.bzl
@@ -1,12 +1,12 @@
 """Macro that creates external repositories for remote config."""
 
-load("//third_party/py:python_configure.bzl", "local_python_configure", "remote_python_configure")
+load("//tensorflow/tools/toolchains/remote_config:containers.bzl", "containers")
 load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure")
-load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure")
 load("//third_party/gpus:rocm_configure.bzl", "remote_rocm_configure")
-load("//third_party/tensorrt:tensorrt_configure.bzl", "remote_tensorrt_configure")
-load("//tensorflow/tools/toolchains/remote_config:containers.bzl", "containers")
+load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure")
+load("//third_party/py:python_configure.bzl", "local_python_configure", "remote_python_configure")
 load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure")
+load("//third_party/tensorrt:tensorrt_configure.bzl", "remote_tensorrt_configure")
 
 def _container_image_uri(container_name):
     container = containers[container_name]
diff --git a/tensorflow/tools/toolchains/win/bazel_211/BUILD b/tensorflow/tools/toolchains/win/bazel_211/BUILD
index cc23c8ecb22680..c7484d2ae2efdf 100644
--- a/tensorflow/tools/toolchains/win/bazel_211/BUILD
+++ b/tensorflow/tools/toolchains/win/bazel_211/BUILD
@@ -15,8 +15,8 @@
 # This becomes the BUILD file for @local_config_cc// under Windows.
 
 load("@rules_cc//cc:defs.bzl", "cc_library", "cc_toolchain", "cc_toolchain_suite")
-load(":windows_cc_toolchain_config.bzl", "cc_toolchain_config")
 load(":armeabi_cc_toolchain_config.bzl", "armeabi_cc_toolchain_config")
+load(":windows_cc_toolchain_config.bzl", "cc_toolchain_config")
 
 package(default_visibility = ["//visibility:public"])
 
diff --git a/tensorflow/tools/toolchains/win/bazel_211/windows_cc_toolchain_config.bzl b/tensorflow/tools/toolchains/win/bazel_211/windows_cc_toolchain_config.bzl
index 30571b6a5ace85..9ccc1706e5eca5 100644
--- a/tensorflow/tools/toolchains/win/bazel_211/windows_cc_toolchain_config.bzl
+++ b/tensorflow/tools/toolchains/win/bazel_211/windows_cc_toolchain_config.bzl
@@ -14,6 +14,7 @@
 
 """A Starlark cc_toolchain configuration rule for Windows"""
 
+load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")
 load(
     "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl",
     "action_config",
@@ -29,7 +30,6 @@ load(
     "variable_with_value",
     "with_feature_set",
 )
-load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")
 
 all_compile_actions = [
     ACTION_NAMES.c_compile,
diff --git a/tensorflow/tools/toolchains/win/tf_win_05022023/BUILD b/tensorflow/tools/toolchains/win/tf_win_05022023/BUILD
index f245f6d0789c9d..8a2ae6fe4a9dd3 100644
--- a/tensorflow/tools/toolchains/win/tf_win_05022023/BUILD
+++ b/tensorflow/tools/toolchains/win/tf_win_05022023/BUILD
@@ -15,8 +15,8 @@
 # This becomes the BUILD file for @local_config_cc// under Windows.
 
 load("@rules_cc//cc:defs.bzl", "cc_library", "cc_toolchain", "cc_toolchain_suite")
-load(":windows_cc_toolchain_config.bzl", "cc_toolchain_config")
 load(":armeabi_cc_toolchain_config.bzl", "armeabi_cc_toolchain_config")
+load(":windows_cc_toolchain_config.bzl", "cc_toolchain_config")
 
 package(default_visibility = ["//visibility:public"])
 
diff --git a/tensorflow/tools/toolchains/win/tf_win_05022023/windows_cc_toolchain_config.bzl b/tensorflow/tools/toolchains/win/tf_win_05022023/windows_cc_toolchain_config.bzl
index ba3de607d10451..d6b966b32cecad 100644
--- a/tensorflow/tools/toolchains/win/tf_win_05022023/windows_cc_toolchain_config.bzl
+++ b/tensorflow/tools/toolchains/win/tf_win_05022023/windows_cc_toolchain_config.bzl
@@ -14,6 +14,7 @@
 
 """A Starlark cc_toolchain configuration rule for Windows"""
 
+load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")
 load(
     "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl",
     "action_config",
@@ -28,7 +29,6 @@ load(
     "variable_with_value",
     "with_feature_set",
 )
-load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")
 
 all_compile_actions = [
     ACTION_NAMES.c_compile,
diff --git a/tensorflow/workspace0.bzl b/tensorflow/workspace0.bzl
index d8b53e1ef8b3be..0fdb0342fae676 100644
--- a/tensorflow/workspace0.bzl
+++ b/tensorflow/workspace0.bzl
@@ -1,14 +1,14 @@
 """TensorFlow workspace initialization. Consult the WORKSPACE on how to use it."""
 
-load("//third_party/googleapis:repository_rules.bzl", "config_googleapis")
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 load("@bazel_toolchains//repositories:repositories.bzl", bazel_toolchains_repositories = "repositories")
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("@build_bazel_apple_support//lib:repositories.bzl", "apple_support_dependencies")
 load("@build_bazel_rules_apple//apple:repositories.bzl", "apple_rules_dependencies")
 load("@build_bazel_rules_swift//swift:repositories.bzl", "swift_rules_dependencies")
-load("@build_bazel_apple_support//lib:repositories.bzl", "apple_support_dependencies")
 load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps")
 load("@local_config_android//:android.bzl", "android_workspace")
 load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies")
+load("//third_party/googleapis:repository_rules.bzl", "config_googleapis")
 
 def _tf_bind():
     """Bind targets for some external repositories"""
diff --git a/tensorflow/workspace1.bzl b/tensorflow/workspace1.bzl
index 9b092a10bf3310..812baccbbbe851 100644
--- a/tensorflow/workspace1.bzl
+++ b/tensorflow/workspace1.bzl
@@ -1,11 +1,11 @@
 """TensorFlow workspace initialization. Consult the WORKSPACE on how to use it."""
 
-load("//third_party/android:android_configure.bzl", "android_configure")
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps")
 load("@com_google_benchmark//:bazel/benchmark_deps.bzl", "benchmark_deps")
 load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories")
 load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies")
+load("//third_party/android:android_configure.bzl", "android_configure")
 
 # buildifier: disable=unnamed-macro
 def workspace(with_rules_cc = True):
diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index 730882498221ec..d89168256105a7 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -8,6 +8,13 @@ load("@bazel_skylib//lib:versions.bzl", "versions")
 load("@bazel_tools//tools/build_defs/repo:java.bzl", "java_import_external")
 load("@rules_jvm_external//:defs.bzl", "maven_install")
 load("@tf_runtime//:dependencies.bzl", "tfrt_dependencies")
+load("//tensorflow/tools/def_file_filter:def_file_filter_configure.bzl", "def_file_filter_configure")
+load("//tensorflow/tools/toolchains:cpus/aarch64/aarch64_compiler_configure.bzl", "aarch64_compiler_configure")
+load("//tensorflow/tools/toolchains:cpus/arm/arm_compiler_configure.bzl", "arm_compiler_configure")
+load("//tensorflow/tools/toolchains/clang6:repo.bzl", "clang6_configure")
+load("//tensorflow/tools/toolchains/embedded/arm-linux:arm_linux_toolchain_configure.bzl", "arm_linux_toolchain_configure")
+load("//tensorflow/tools/toolchains/remote:configure.bzl", "remote_execution_configure")
+load("//tensorflow/tools/toolchains/remote_config:configs.bzl", "initialize_rbe_configs")
 load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 load("//third_party/absl:workspace.bzl", absl = "repo")
 load("//third_party/benchmark:workspace.bzl", benchmark = "repo")
@@ -45,13 +52,6 @@ load("//third_party/ruy:workspace.bzl", ruy = "repo")
 load("//third_party/sobol_data:workspace.bzl", sobol_data = "repo")
 load("//third_party/stablehlo:workspace.bzl", stablehlo = "repo")
 load("//third_party/systemlibs:syslibs_configure.bzl", "syslibs_configure")
-load("//tensorflow/tools/def_file_filter:def_file_filter_configure.bzl", "def_file_filter_configure")
-load("//tensorflow/tools/toolchains:cpus/aarch64/aarch64_compiler_configure.bzl", "aarch64_compiler_configure")
-load("//tensorflow/tools/toolchains:cpus/arm/arm_compiler_configure.bzl", "arm_compiler_configure")
-load("//tensorflow/tools/toolchains/clang6:repo.bzl", "clang6_configure")
-load("//tensorflow/tools/toolchains/embedded/arm-linux:arm_linux_toolchain_configure.bzl", "arm_linux_toolchain_configure")
-load("//tensorflow/tools/toolchains/remote:configure.bzl", "remote_execution_configure")
-load("//tensorflow/tools/toolchains/remote_config:configs.bzl", "initialize_rbe_configs")
 load("//third_party/tensorrt:tensorrt_configure.bzl", "tensorrt_configure")
 load("//third_party/tensorrt:workspace.bzl", tensorrt = "repo")
 load("//third_party/triton:workspace.bzl", triton = "repo")
@@ -150,9 +150,9 @@ def _tf_repositories():
     # LINT.IfChange
     tf_http_archive(
         name = "XNNPACK",
-        sha256 = "eba4410af9f455f484b425a80196ff3c3b8cf6f650616a281a815a3bd45bae5f",
-        strip_prefix = "XNNPACK-db84c8c822a70183e10ef6b7402c6730f5d54da5",
-        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/db84c8c822a70183e10ef6b7402c6730f5d54da5.zip"),
+        sha256 = "55827bd7c39a080d4296e84dfbe576240d83a3347df8ad4e10cf9fe400678db7",
+        strip_prefix = "XNNPACK-65ca94a2d3f39e7ff8e4045852b32b65ca6a16b7",
+        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/65ca94a2d3f39e7ff8e4045852b32b65ca6a16b7.zip"),
     )
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake)
 
@@ -181,9 +181,9 @@ def _tf_repositories():
         name = "cudnn_frontend_archive",
         build_file = "//third_party:cudnn_frontend.BUILD",
         patch_file = ["//third_party:cudnn_frontend_header_fix.patch"],
-        sha256 = "c2f5373ddf84e33d289dad5766667f52de652dfbbb1dccb2fada9cfcf2d774cf",
-        strip_prefix = "cudnn-frontend-1.1.0",
-        urls = tf_mirror_urls("https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.1.0.zip"),
+        sha256 = "1bb309af98fe9aad81b6a14fd52acbd6566aacfd322fc5803f9a1b77fc681a27",
+        strip_prefix = "cudnn-frontend-1.2.1",
+        urls = tf_mirror_urls("https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.2.1.zip"),
     )
 
     tf_http_archive(
diff --git a/tensorflow/workspace3.bzl b/tensorflow/workspace3.bzl
index 7d187724bb1e47..d7b32f01c7144d 100644
--- a/tensorflow/workspace3.bzl
+++ b/tensorflow/workspace3.bzl
@@ -1,9 +1,9 @@
 """TensorFlow workspace initialization. Consult the WORKSPACE on how to use it."""
 
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
-load("//third_party/tf_runtime:workspace.bzl", tf_runtime = "repo")
-load("//third_party/llvm:workspace.bzl", llvm = "repo")
 load("//third_party:repo.bzl", "tf_vendored")
+load("//third_party/llvm:workspace.bzl", llvm = "repo")
+load("//third_party/tf_runtime:workspace.bzl", tf_runtime = "repo")
 
 def workspace():
     tf_vendored(name = "local_xla", relpath = "third_party/xla")
diff --git a/third_party/flatbuffers/flatbuffers.BUILD b/third_party/flatbuffers/flatbuffers.BUILD
index ef738212657fdd..2b78d1e03ba47c 100644
--- a/third_party/flatbuffers/flatbuffers.BUILD
+++ b/third_party/flatbuffers/flatbuffers.BUILD
@@ -1,4 +1,5 @@
 load("@build_bazel_rules_android//android:rules.bzl", "android_library")
+load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
 load(":build_defs.bzl", "flatbuffer_py_strip_prefix_srcs")
 
 package(default_visibility = ["//visibility:public"])
@@ -22,8 +23,6 @@ config_setting(
     values = {"cpu": "x64_windows"},
 )
 
-load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
-
 # Public flatc library to compile flatbuffer files at runtime.
 cc_library(
     name = "flatbuffers",
diff --git a/third_party/googleapis/build_rules.bzl b/third_party/googleapis/build_rules.bzl
index 377d74be1adaf2..3715b85da29339 100644
--- a/third_party/googleapis/build_rules.bzl
+++ b/third_party/googleapis/build_rules.bzl
@@ -16,8 +16,8 @@
 Utilities for building grpc and proto libraries from googleapis.
 """
 
-load("@rules_cc//cc:defs.bzl", native_cc_proto_library = "cc_proto_library")
 load("@com_github_grpc_grpc//bazel:generate_cc.bzl", "generate_cc")
+load("@rules_cc//cc:defs.bzl", native_cc_proto_library = "cc_proto_library")
 
 def _tf_cc_headers(ctx):
     if len(ctx.attr.deps) != 1:
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 5bf0504dc91bcc..03a373a0024e3e 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -26,7 +26,6 @@
   * `PYTHON_BIN_PATH`: The python binary path
 """
 
-load("//third_party/clang_toolchain:download_clang.bzl", "download_clang")
 load(
     "@bazel_tools//tools/cpp:lib_cc_configure.bzl",
     "escape_string",
@@ -38,6 +37,7 @@ load(
     "find_vc_path",
     "setup_vc_env_vars",
 )
+load("//third_party/clang_toolchain:download_clang.bzl", "download_clang")
 load(
     "//third_party/remote_config:common.bzl",
     "config_repo_label",
diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl
index bfd5829fc25ab2..fadc6c6e82dbd3 100644
--- a/third_party/gpus/rocm_configure.bzl
+++ b/third_party/gpus/rocm_configure.bzl
@@ -8,13 +8,6 @@
   * `TF_ROCM_AMDGPU_TARGETS`: The AMDGPU targets.
 """
 
-load(
-    ":cuda_configure.bzl",
-    "enable_cuda",
-    "make_copy_dir_rule",
-    "make_copy_files_rule",
-    "to_list_of_strings",
-)
 load(
     "//third_party/remote_config:common.bzl",
     "config_repo_label",
@@ -29,6 +22,13 @@ load(
     "realpath",
     "which",
 )
+load(
+    ":cuda_configure.bzl",
+    "enable_cuda",
+    "make_copy_dir_rule",
+    "make_copy_files_rule",
+    "to_list_of_strings",
+)
 
 
 _GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH"
diff --git a/third_party/hwloc/hwloc.BUILD b/third_party/hwloc/hwloc.BUILD
index a8ed9db0883d41..ff14af8b59e21d 100644
--- a/third_party/hwloc/hwloc.BUILD
+++ b/third_party/hwloc/hwloc.BUILD
@@ -1,5 +1,8 @@
 # hwloc: Portable Hardware Locality Library
 
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+
 package(
     default_visibility = ["//visibility:public"],
 )
@@ -8,9 +11,6 @@ licenses(["notice"])
 
 exports_files(["COPYING"])
 
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
-load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
-
 COMMON_INCLUDE_COPTS = [
     "-I.",
     "-Ihwloc",
diff --git a/third_party/implib_so/workspace.bzl b/third_party/implib_so/workspace.bzl
index 01dad3b169f402..37f36cc135fd6d 100644
--- a/third_party/implib_so/workspace.bzl
+++ b/third_party/implib_so/workspace.bzl
@@ -6,8 +6,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 def repo():
     tf_http_archive(
         name = "implib_so",
-        strip_prefix = "Implib.so-5fb84c2a750434b9df1da67d67b749eb929598f1",
-        sha256 = "10de0a616df24849f2a883747784c115f209708960e44556f5ce384de6f103e8",
-        urls = tf_mirror_urls("https://github.com/yugr/Implib.so/archive/5fb84c2a750434b9df1da67d67b749eb929598f1.tar.gz"),
+        strip_prefix = "Implib.so-2cce6cab8ff2c15f9da858ea0b68646a8d62aef2",
+        sha256 = "4ef3089969d57a5b60bb41b8212c478eaa15c56941f86d4bf5e7f98a3afd24e8",
+        urls = tf_mirror_urls("https://github.com/yugr/Implib.so/archive/2cce6cab8ff2c15f9da858ea0b68646a8d62aef2.tar.gz"),
         build_file = "//third_party/implib_so:implib_so.BUILD",
     )
diff --git a/third_party/jpeg/jpeg.BUILD b/third_party/jpeg/jpeg.BUILD
index 9f61f9e31e5e12..9afb8ba2165651 100644
--- a/third_party/jpeg/jpeg.BUILD
+++ b/third_party/jpeg/jpeg.BUILD
@@ -1,8 +1,8 @@
 # Description:
 #   libjpeg-turbo is a drop in replacement for jpeglib optimized with SIMD.
 
-load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
 load("@bazel_skylib//rules:common_settings.bzl", "string_flag")
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
 
 licenses(["notice"])  # custom notice-style license, see LICENSE.md
 
diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch
index b75801c374943b..509398da979e83 100644
--- a/third_party/llvm/generated.patch
+++ b/third_party/llvm/generated.patch
@@ -1,183 +1 @@
 Auto generated patch. Do not edit or delete it, even if empty.
-diff -ruN --strip-trailing-cr a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h
---- a/llvm/include/llvm/IR/AutoUpgrade.h
-+++ b/llvm/include/llvm/IR/AutoUpgrade.h
-@@ -88,9 +88,6 @@
-   /// info. Return true if module is modified.
-   bool UpgradeDebugInfo(Module &M);
- 
--  /// Copies module attributes to the functions in the module.
--  void CopyModuleAttrToFunctions(Module &M);
--
-   /// Check whether a string looks like an old loop attachment tag.
-   inline bool mayBeOldLoopAttachmentTag(StringRef Name) {
-     return Name.starts_with("llvm.vectorizer.");
-diff -ruN --strip-trailing-cr a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
---- a/llvm/lib/IR/AutoUpgrade.cpp
-+++ b/llvm/lib/IR/AutoUpgrade.cpp
-@@ -5178,72 +5178,6 @@
-     Arg.removeAttrs(AttributeFuncs::typeIncompatible(Arg.getType()));
- }
- 
--// Check if the module attribute is present and not zero.
--static bool isModuleAttributeSet(Module &M, const StringRef &ModAttr) {
--  const auto *Attr =
--      mdconst::extract_or_null<ConstantInt>(M.getModuleFlag(ModAttr));
--  return Attr && Attr->getZExtValue();
--}
--
--// Copy an attribute from module to the function if exists.
--// First value of the pair is used when the module attribute is not zero
--// the second otherwise.
--static void
--CopyModuleAttributeToFunction(Function &F, StringRef FnAttrName,
--                              StringRef ModAttrName,
--                              std::pair<StringRef, StringRef> Values) {
--  if (F.hasFnAttribute(FnAttrName))
--    return;
--  F.addFnAttr(FnAttrName, isModuleAttributeSet(*F.getParent(), ModAttrName)
--                              ? Values.first
--                              : Values.second);
--}
--
--// Copy a boolean attribute from module to the function if exists.
--// Module attribute treated false if zero otherwise true.
--static void CopyModuleAttributeToFunction(Function &F, StringRef AttrName) {
--  CopyModuleAttributeToFunction(
--      F, AttrName, AttrName,
--      std::make_pair<StringRef, StringRef>("true", "false"));
--}
--
--// Copy an attribute from module to the function if exists.
--// First value of the pair is used when the module attribute is not zero
--// the second otherwise.
--static void
--CopyModuleAttributeToFunction(Function &F, StringRef AttrName,
--                              std::pair<StringRef, StringRef> Values) {
--  CopyModuleAttributeToFunction(F, AttrName, AttrName, Values);
--}
--
--void llvm::CopyModuleAttrToFunctions(Module &M) {
--  Triple T(M.getTargetTriple());
--  if (!T.isThumb() && !T.isARM() && !T.isAArch64())
--    return;
--
--  for (Function &F : M.getFunctionList()) {
--    if (F.isDeclaration())
--      continue;
--
--    if (!F.hasFnAttribute("sign-return-address")) {
--      StringRef SignType = "none";
--      if (isModuleAttributeSet(M, "sign-return-address"))
--        SignType = "non-leaf";
--
--      if (isModuleAttributeSet(M, "sign-return-address-all"))
--        SignType = "all";
--
--      F.addFnAttr("sign-return-address", SignType);
--    }
--    CopyModuleAttributeToFunction(F, "branch-target-enforcement");
--    CopyModuleAttributeToFunction(F, "branch-protection-pauth-lr");
--    CopyModuleAttributeToFunction(F, "guarded-control-stack");
--    CopyModuleAttributeToFunction(
--        F, "sign-return-address-key",
--        std::make_pair<StringRef, StringRef>("b_key", "a_key"));
--  }
--}
--
- static bool isOldLoopArgument(Metadata *MD) {
-   auto *T = dyn_cast_or_null<MDTuple>(MD);
-   if (!T)
-diff -ruN --strip-trailing-cr a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
---- a/llvm/lib/Linker/IRMover.cpp
-+++ b/llvm/lib/Linker/IRMover.cpp
-@@ -1606,11 +1606,6 @@
-   // Loop over all of the linked values to compute type mappings.
-   computeTypeMapping();
- 
--  // Convert module level attributes to function level attributes because
--  // after merging modules the attributes might change and would have different
--  // effect on the functions as the original module would have.
--  CopyModuleAttrToFunctions(*SrcM);
--
-   std::reverse(Worklist.begin(), Worklist.end());
-   while (!Worklist.empty()) {
-     GlobalValue *GV = Worklist.back();
-diff -ruN --strip-trailing-cr a/llvm/test/Linker/link-arm-and-thumb.ll b/llvm/test/Linker/link-arm-and-thumb.ll
---- a/llvm/test/Linker/link-arm-and-thumb.ll
-+++ b/llvm/test/Linker/link-arm-and-thumb.ll
-@@ -13,12 +13,11 @@
-   ret i32 %add
- }
- 
--; CHECK: define i32 @main() [[MAIN_ATTRS:#[0-9]+]]
-+; CHECK: define i32 @main() {
- ; CHECK: define i32 @foo(i32 %a, i32 %b) [[ARM_ATTRS:#[0-9]+]]
- ; CHECK: define i32 @bar(i32 %a, i32 %b) [[THUMB_ATTRS:#[0-9]+]]
- 
--; CHECK: attributes [[MAIN_ATTRS]] = { {{.*}} }
--; CHECK: attributes [[ARM_ATTRS]] = { {{.*}} "target-features"="-thumb-mode" }
--; CHECK: attributes [[THUMB_ATTRS]] = { {{.*}} "target-features"="+thumb-mode" }
-+; CHECK: attributes [[ARM_ATTRS]] = { "target-features"="-thumb-mode" }
-+; CHECK: attributes [[THUMB_ATTRS]] = { "target-features"="+thumb-mode" }
- 
- ; STDERR-NOT: warning: Linking two modules of different target triples:
-diff -ruN --strip-trailing-cr a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
---- a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
-+++ b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
-@@ -32,7 +32,6 @@
- ; CHECK-DUMP: <main>:
- ; CHECK-DUMP:      bl      0x8 <main+0x8>
- ; CHECK-DUMP: <foo>:
--; CHECK-DUMP:     paciasp
- 
- ; `main` doesn't support BTI while `foo` does, so in the binary
- ; we should see only PAC which is supported by both.
-diff -ruN --strip-trailing-cr a/llvm/test/LTO/AArch64/link-sign-return-address.ll b/llvm/test/LTO/AArch64/link-sign-return-address.ll
---- a/llvm/test/LTO/AArch64/link-sign-return-address.ll
-+++ b/llvm/test/LTO/AArch64/link-sign-return-address.ll
-@@ -1,43 +0,0 @@
--; Testcase to check that module with different branch-target-enforcement can
--; be mixed.
--;
--; RUN: llvm-as %s -o %t1.bc
--; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc
--; RUN: llvm-lto -exported-symbol main \
--; RUN:          -exported-symbol foo \
--; RUN:          -filetype=obj \
--; RUN:           %t2.bc %t1.bc \
--; RUN:           -o %t1.exe 2>&1
--; RUN: llvm-objdump -d %t1.exe | FileCheck --check-prefix=CHECK-DUMP %s
--; RUN: llvm-readelf -n %t1.exe | FileCheck --allow-empty --check-prefix=CHECK-PROP %s
--
--target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
--target triple = "aarch64-unknown-linux-gnu"
--
--declare i32 @foo();
--
--define i32 @main() {
--entry:
--  %add = call i32 @foo()
--  ret i32 %add
--}
--
--!llvm.module.flags = !{!0, !1, !2, !3 }
--!0 = !{i32 8, !"branch-target-enforcement", i32 0}
--!1 = !{i32 8, !"sign-return-address", i32 0}
--!2 = !{i32 8, !"sign-return-address-all", i32 0}
--!3 = !{i32 8, !"sign-return-address-with-bkey", i32 0}
--
--; CHECK-DUMP: <foo>:
--; CHECK-DUMP:     paciasp
--; CHECK-DUMP:     mov     w0, #0x2a
--; CHECK-DUMP:     autiasp
--; CHECK-DUMP:     ret
--; CHECK-DUMP: <main>:
--; CHECK-DUMP-NOT:  paciasp
--; CHECK-DUMP:      str     x30,
--; CHECK-DUMP:      bl      0x14 <main+0x4>
--
--; `main` doesn't support PAC sign-return-address while `foo` does, so in the binary
--; we should not see anything.
--; CHECK-PROP-NOT:   Properties: aarch64 feature: PAC
-\ No newline at end of file
diff --git a/third_party/llvm/vs2019.patch b/third_party/llvm/vs2019.patch
new file mode 100644
index 00000000000000..d6eff968e5f832
--- /dev/null
+++ b/third_party/llvm/vs2019.patch
@@ -0,0 +1,394 @@
+diff --git a/llvm/include/llvm/Support/FormatAdapters.h b/llvm/include/llvm/Support/FormatAdapters.h
+index 495205d11748..4131e956873e 100644
+--- a/llvm/include/llvm/Support/FormatAdapters.h
++++ b/llvm/include/llvm/Support/FormatAdapters.h
+@@ -16,13 +16,15 @@
+ #include "llvm/Support/raw_ostream.h"
+ 
+ namespace llvm {
+-template <typename T> class FormatAdapter : public detail::format_adapter {
++template <typename T>
++class FormatAdapter : public support::detail::format_adapter {
+ protected:
+   explicit FormatAdapter(T &&Item) : Item(std::forward<T>(Item)) {}
+ 
+   T Item;
+ };
+ 
++namespace support {
+ namespace detail {
+ template <typename T> class AlignAdapter final : public FormatAdapter<T> {
+   AlignStyle Where;
+@@ -80,29 +82,31 @@ public:
+     Stream << Item;
+   }
+ };
+-}
++} // namespace detail
++} // namespace support
+ 
+ template <typename T>
+-detail::AlignAdapter<T> fmt_align(T &&Item, AlignStyle Where, size_t Amount,
+-                                  char Fill = ' ') {
+-  return detail::AlignAdapter<T>(std::forward<T>(Item), Where, Amount, Fill);
++support::detail::AlignAdapter<T> fmt_align(T &&Item, AlignStyle Where,
++                                           size_t Amount, char Fill = ' ') {
++  return support::detail::AlignAdapter<T>(std::forward<T>(Item), Where, Amount,
++                                          Fill);
+ }
+ 
+ template <typename T>
+-detail::PadAdapter<T> fmt_pad(T &&Item, size_t Left, size_t Right) {
+-  return detail::PadAdapter<T>(std::forward<T>(Item), Left, Right);
++support::detail::PadAdapter<T> fmt_pad(T &&Item, size_t Left, size_t Right) {
++  return support::detail::PadAdapter<T>(std::forward<T>(Item), Left, Right);
+ }
+ 
+ template <typename T>
+-detail::RepeatAdapter<T> fmt_repeat(T &&Item, size_t Count) {
+-  return detail::RepeatAdapter<T>(std::forward<T>(Item), Count);
++support::detail::RepeatAdapter<T> fmt_repeat(T &&Item, size_t Count) {
++  return support::detail::RepeatAdapter<T>(std::forward<T>(Item), Count);
+ }
+ 
+ // llvm::Error values must be consumed before being destroyed.
+ // Wrapping an error in fmt_consume explicitly indicates that the formatv_object
+ // should take ownership and consume it.
+-inline detail::ErrorAdapter fmt_consume(Error &&Item) {
+-  return detail::ErrorAdapter(std::move(Item));
++inline support::detail::ErrorAdapter fmt_consume(Error &&Item) {
++  return support::detail::ErrorAdapter(std::move(Item));
+ }
+ }
+ 
+diff --git a/llvm/include/llvm/Support/FormatCommon.h b/llvm/include/llvm/Support/FormatCommon.h
+index 24a40c325e13..326e00936aa7 100644
+--- a/llvm/include/llvm/Support/FormatCommon.h
++++ b/llvm/include/llvm/Support/FormatCommon.h
+@@ -17,13 +17,13 @@ namespace llvm {
+ enum class AlignStyle { Left, Center, Right };
+ 
+ struct FmtAlign {
+-  detail::format_adapter &Adapter;
++  support::detail::format_adapter &Adapter;
+   AlignStyle Where;
+   size_t Amount;
+   char Fill;
+ 
+-  FmtAlign(detail::format_adapter &Adapter, AlignStyle Where, size_t Amount,
+-           char Fill = ' ')
++  FmtAlign(support::detail::format_adapter &Adapter, AlignStyle Where,
++           size_t Amount, char Fill = ' ')
+       : Adapter(Adapter), Where(Where), Amount(Amount), Fill(Fill) {}
+ 
+   void format(raw_ostream &S, StringRef Options) {
+diff --git a/llvm/include/llvm/Support/FormatProviders.h b/llvm/include/llvm/Support/FormatProviders.h
+index aa0773847161..bf489e2bfa07 100644
+--- a/llvm/include/llvm/Support/FormatProviders.h
++++ b/llvm/include/llvm/Support/FormatProviders.h
+@@ -25,6 +25,7 @@
+ #include <type_traits>
+ 
+ namespace llvm {
++namespace support {
+ namespace detail {
+ template <typename T>
+ struct use_integral_formatter
+@@ -98,7 +99,8 @@ protected:
+     return Default;
+   }
+ };
+-}
++} // namespace detail
++} // namespace support
+ 
+ /// Implementation of format_provider<T> for integral arithmetic types.
+ ///
+@@ -125,8 +127,8 @@ protected:
+ 
+ template <typename T>
+ struct format_provider<
+-    T, std::enable_if_t<detail::use_integral_formatter<T>::value>>
+-    : public detail::HelperFunctions {
++    T, std::enable_if_t<support::detail::use_integral_formatter<T>::value>>
++    : public support::detail::HelperFunctions {
+ private:
+ public:
+   static void format(const T &V, llvm::raw_ostream &Stream, StringRef Style) {
+@@ -174,8 +176,8 @@ public:
+ /// cases indicates the minimum number of nibbles to print.
+ template <typename T>
+ struct format_provider<
+-    T, std::enable_if_t<detail::use_pointer_formatter<T>::value>>
+-    : public detail::HelperFunctions {
++    T, std::enable_if_t<support::detail::use_pointer_formatter<T>::value>>
++    : public support::detail::HelperFunctions {
+ private:
+ public:
+   static void format(const T &V, llvm::raw_ostream &Stream, StringRef Style) {
+@@ -199,7 +201,7 @@ public:
+ 
+ template <typename T>
+ struct format_provider<
+-    T, std::enable_if_t<detail::use_string_formatter<T>::value>> {
++    T, std::enable_if_t<support::detail::use_string_formatter<T>::value>> {
+   static void format(const T &V, llvm::raw_ostream &Stream, StringRef Style) {
+     size_t N = StringRef::npos;
+     if (!Style.empty() && Style.getAsInteger(10, N)) {
+@@ -231,8 +233,8 @@ template <> struct format_provider<Twine> {
+ /// character.  Otherwise, it is treated as an integer options string.
+ ///
+ template <typename T>
+-struct format_provider<T,
+-                       std::enable_if_t<detail::use_char_formatter<T>::value>> {
++struct format_provider<
++    T, std::enable_if_t<support::detail::use_char_formatter<T>::value>> {
+   static void format(const char &V, llvm::raw_ostream &Stream,
+                      StringRef Style) {
+     if (Style.empty())
+@@ -297,9 +299,9 @@ template <> struct format_provider<bool> {
+ /// else.
+ 
+ template <typename T>
+-struct format_provider<T,
+-                       std::enable_if_t<detail::use_double_formatter<T>::value>>
+-    : public detail::HelperFunctions {
++struct format_provider<
++    T, std::enable_if_t<support::detail::use_double_formatter<T>::value>>
++    : public support::detail::HelperFunctions {
+   static void format(const T &V, llvm::raw_ostream &Stream, StringRef Style) {
+     FloatStyle S;
+     if (Style.consume_front("P") || Style.consume_front("p"))
+@@ -321,6 +323,7 @@ struct format_provider<T,
+   }
+ };
+ 
++namespace support {
+ namespace detail {
+ template <typename IterT>
+ using IterValue = typename std::iterator_traits<IterT>::value_type;
+@@ -328,8 +331,10 @@ using IterValue = typename std::iterator_traits<IterT>::value_type;
+ template <typename IterT>
+ struct range_item_has_provider
+     : public std::integral_constant<
+-          bool, !uses_missing_provider<IterValue<IterT>>::value> {};
+-}
++          bool,
++          !support::detail::uses_missing_provider<IterValue<IterT>>::value> {};
++} // namespace detail
++} // namespace support
+ 
+ /// Implementation of format_provider<T> for ranges.
+ ///
+@@ -393,7 +398,7 @@ template <typename IterT> class format_provider<llvm::iterator_range<IterT>> {
+   }
+ 
+ public:
+-  static_assert(detail::range_item_has_provider<IterT>::value,
++  static_assert(support::detail::range_item_has_provider<IterT>::value,
+                 "Range value_type does not have a format provider!");
+   static void format(const llvm::iterator_range<IterT> &V,
+                      llvm::raw_ostream &Stream, StringRef Style) {
+@@ -403,18 +408,18 @@ public:
+     auto Begin = V.begin();
+     auto End = V.end();
+     if (Begin != End) {
+-      auto Adapter = detail::build_format_adapter(*Begin);
++      auto Adapter = support::detail::build_format_adapter(*Begin);
+       Adapter.format(Stream, ArgStyle);
+       ++Begin;
+     }
+     while (Begin != End) {
+       Stream << Sep;
+-      auto Adapter = detail::build_format_adapter(*Begin);
++      auto Adapter = support::detail::build_format_adapter(*Begin);
+       Adapter.format(Stream, ArgStyle);
+       ++Begin;
+     }
+   }
+ };
+-}
++} // namespace llvm
+ 
+ #endif
+diff --git a/llvm/include/llvm/Support/FormatVariadic.h b/llvm/include/llvm/Support/FormatVariadic.h
+index ddd80d89f1cd..595f2cf559a4 100644
+--- a/llvm/include/llvm/Support/FormatVariadic.h
++++ b/llvm/include/llvm/Support/FormatVariadic.h
+@@ -66,7 +66,7 @@ struct ReplacementItem {
+ class formatv_object_base {
+ protected:
+   StringRef Fmt;
+-  ArrayRef<detail::format_adapter *> Adapters;
++  ArrayRef<support::detail::format_adapter *> Adapters;
+ 
+   static bool consumeFieldLayout(StringRef &Spec, AlignStyle &Where,
+                                  size_t &Align, char &Pad);
+@@ -75,7 +75,7 @@ protected:
+   splitLiteralAndReplacement(StringRef Fmt);
+ 
+   formatv_object_base(StringRef Fmt,
+-                      ArrayRef<detail::format_adapter *> Adapters)
++                      ArrayRef<support::detail::format_adapter *> Adapters)
+       : Fmt(Fmt), Adapters(Adapters) {}
+ 
+   formatv_object_base(formatv_object_base const &rhs) = delete;
+@@ -130,7 +130,7 @@ template <typename Tuple> class formatv_object : public formatv_object_base {
+   // of the parameters, we have to own the storage for the parameters here, and
+   // have the base class store type-erased pointers into this tuple.
+   Tuple Parameters;
+-  std::array<detail::format_adapter *, std::tuple_size<Tuple>::value>
++  std::array<support::detail::format_adapter *, std::tuple_size<Tuple>::value>
+       ParameterPointers;
+ 
+   // The parameters are stored in a std::tuple, which does not provide runtime
+@@ -142,8 +142,8 @@ template <typename Tuple> class formatv_object : public formatv_object_base {
+   // std::array<Base*>.
+   struct create_adapters {
+     template <typename... Ts>
+-    std::array<detail::format_adapter *, std::tuple_size<Tuple>::value>
+-    operator()(Ts &... Items) {
++    std::array<support::detail::format_adapter *, std::tuple_size<Tuple>::value>
++    operator()(Ts &...Items) {
+       return {{&Items...}};
+     }
+   };
+@@ -248,13 +248,14 @@ public:
+ // the details of what that is are undefined.
+ //
+ template <typename... Ts>
+-inline auto formatv(const char *Fmt, Ts &&... Vals) -> formatv_object<decltype(
+-    std::make_tuple(detail::build_format_adapter(std::forward<Ts>(Vals))...))> {
+-  using ParamTuple = decltype(
+-      std::make_tuple(detail::build_format_adapter(std::forward<Ts>(Vals))...));
++inline auto formatv(const char *Fmt, Ts &&...Vals)
++    -> formatv_object<decltype(std::make_tuple(
++        support::detail::build_format_adapter(std::forward<Ts>(Vals))...))> {
++  using ParamTuple = decltype(std::make_tuple(
++      support::detail::build_format_adapter(std::forward<Ts>(Vals))...));
+   return formatv_object<ParamTuple>(
+-      Fmt,
+-      std::make_tuple(detail::build_format_adapter(std::forward<Ts>(Vals))...));
++      Fmt, std::make_tuple(support::detail::build_format_adapter(
++               std::forward<Ts>(Vals))...));
+ }
+ 
+ } // end namespace llvm
+diff --git a/llvm/include/llvm/Support/FormatVariadicDetails.h b/llvm/include/llvm/Support/FormatVariadicDetails.h
+index 068c327df396..a221fcadbd3c 100644
+--- a/llvm/include/llvm/Support/FormatVariadicDetails.h
++++ b/llvm/include/llvm/Support/FormatVariadicDetails.h
+@@ -19,6 +19,7 @@ namespace llvm {
+ template <typename T, typename Enable = void> struct format_provider {};
+ class Error;
+ 
++namespace support {
+ namespace detail {
+ class format_adapter {
+   virtual void anchor();
+@@ -156,7 +157,8 @@ std::enable_if_t<uses_missing_provider<T>::value, missing_format_adapter<T>>
+ build_format_adapter(T &&) {
+   return missing_format_adapter<T>();
+ }
+-}
+-}
++} // namespace detail
++} // namespace support
++} // namespace llvm
+ 
+ #endif
+diff --git a/llvm/lib/Support/FormatVariadic.cpp b/llvm/lib/Support/FormatVariadic.cpp
+index 91b4c38ade4f..3c07a80a00ae 100644
+--- a/llvm/lib/Support/FormatVariadic.cpp
++++ b/llvm/lib/Support/FormatVariadic.cpp
+@@ -152,4 +152,4 @@ formatv_object_base::parseFormatString(StringRef Fmt) {
+   return Replacements;
+ }
+ 
+-void detail::format_adapter::anchor() { }
++void support::detail::format_adapter::anchor() {}
+
+diff --git a/llvm/unittests/Support/FormatVariadicTest.cpp b/llvm/unittests/Support/FormatVariadicTest.cpp
+index 58c89aad7a85..a78b25c53d7e 100644
+--- a/llvm/unittests/Support/FormatVariadicTest.cpp
++++ b/llvm/unittests/Support/FormatVariadicTest.cpp
+@@ -20,8 +20,8 @@ struct Format : public FormatAdapter<int> {
+   void format(raw_ostream &OS, StringRef Opt) override { OS << "Format"; }
+ };
+ 
+-using detail::uses_format_member;
+-using detail::uses_missing_provider;
++using support::detail::uses_format_member;
++using support::detail::uses_missing_provider;
+ 
+ static_assert(uses_format_member<Format>::value, "");
+ static_assert(uses_format_member<Format &>::value, "");
+diff --git a/mlir/include/mlir/TableGen/Format.h b/mlir/include/mlir/TableGen/Format.h
+index 79d3d26a9d68..e92f6c64eab5 100644
+--- a/mlir/include/mlir/TableGen/Format.h
++++ b/mlir/include/mlir/TableGen/Format.h
+@@ -133,14 +133,15 @@ protected:
+   // std::vector<Base*>.
+   struct CreateAdapters {
+     template <typename... Ts>
+-    std::vector<llvm::detail::format_adapter *> operator()(Ts &...items) {
+-      return std::vector<llvm::detail::format_adapter *>{&items...};
++    std::vector<llvm::support::detail::format_adapter *>
++    operator()(Ts &...items) {
++      return std::vector<llvm::support::detail::format_adapter *>{&items...};
+     }
+   };
+ 
+   StringRef fmt;
+   const FmtContext *context;
+-  std::vector<llvm::detail::format_adapter *> adapters;
++  std::vector<llvm::support::detail::format_adapter *> adapters;
+   std::vector<FmtReplacement> replacements;
+ 
+ public:
+@@ -205,8 +206,8 @@ public:
+ 
+ class FmtStrVecObject : public FmtObjectBase {
+ public:
+-  using StrFormatAdapter =
+-      decltype(llvm::detail::build_format_adapter(std::declval<std::string>()));
++  using StrFormatAdapter = decltype(llvm::support::detail::build_format_adapter(
++      std::declval<std::string>()));
+ 
+   FmtStrVecObject(StringRef fmt, const FmtContext *ctx,
+                   ArrayRef<std::string> params);
+@@ -259,14 +260,15 @@ private:
+ ///    in C++ code generation.
+ template <typename... Ts>
+ inline auto tgfmt(StringRef fmt, const FmtContext *ctx, Ts &&...vals)
+-    -> FmtObject<decltype(std::make_tuple(
+-        llvm::detail::build_format_adapter(std::forward<Ts>(vals))...))> {
++    -> FmtObject<
++        decltype(std::make_tuple(llvm::support::detail::build_format_adapter(
++            std::forward<Ts>(vals))...))> {
+   using ParamTuple = decltype(std::make_tuple(
+-      llvm::detail::build_format_adapter(std::forward<Ts>(vals))...));
++      llvm::support::detail::build_format_adapter(std::forward<Ts>(vals))...));
+   return FmtObject<ParamTuple>(
+       fmt, ctx,
+-      std::make_tuple(
+-          llvm::detail::build_format_adapter(std::forward<Ts>(vals))...));
++      std::make_tuple(llvm::support::detail::build_format_adapter(
++          std::forward<Ts>(vals))...));
+ }
+ 
+ inline FmtStrVecObject tgfmt(StringRef fmt, const FmtContext *ctx,
+diff --git a/mlir/lib/TableGen/Format.cpp b/mlir/lib/TableGen/Format.cpp
+index 03f888b139f8..65f4ad56dd15 100644
+--- a/mlir/lib/TableGen/Format.cpp
++++ b/mlir/lib/TableGen/Format.cpp
+@@ -203,7 +203,8 @@ FmtStrVecObject::FmtStrVecObject(StringRef fmt, const FmtContext *ctx,
+     : FmtObjectBase(fmt, ctx, params.size()) {
+   parameters.reserve(params.size());
+   for (std::string p : params)
+-    parameters.push_back(llvm::detail::build_format_adapter(std::move(p)));
++    parameters.push_back(
++        llvm::support::detail::build_format_adapter(std::move(p)));
+ 
+   adapters.reserve(parameters.size());
+   for (auto &p : parameters)
diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
index 772469ed4698c1..93795513a18c66 100644
--- a/third_party/llvm/workspace.bzl
+++ b/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "3b5e7c83a6e226d5bd7ed2e9b67449b64812074c"
-    LLVM_SHA256 = "7fa7a38aade8b5fa2f7719cd3b6e2f038fed1b00d7369cdb05b490085de79c91"
+    LLVM_COMMIT = "c09b6fac12b0299841bf1bf04974712963736db5"
+    LLVM_SHA256 = "37b63fac6ac8b44c89b1c0b2ba3ed662adfc213dddac3221ebed2c7c0a60e606"
 
     tf_http_archive(
         name = name,
@@ -21,6 +21,7 @@ def repo(name):
             "//third_party/llvm:build.patch",
             "//third_party/llvm:mathextras.patch",
             "//third_party/llvm:toolchains.patch",
+            "//third_party/llvm:vs2019.patch",
             "//third_party/llvm:zstd.patch",
         ],
         link_files = {"//third_party/llvm:run_lit.sh": "mlir/run_lit.sh"},
diff --git a/third_party/llvm_openmp/BUILD b/third_party/llvm_openmp/BUILD
index cd410257058ce1..0d1fad28f9c1ac 100644
--- a/third_party/llvm_openmp/BUILD
+++ b/third_party/llvm_openmp/BUILD
@@ -1,5 +1,6 @@
 # Build file for OpenMP library that is part of llvm
 
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 load(
     "@local_tsl//tsl:tsl.bzl",
     "if_linux_x86_64",
@@ -16,7 +17,6 @@ load(
     "dict_add",
     "libiomp5_cc_binary",
 )
-load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 
 package(
     default_visibility = [
diff --git a/third_party/mkl_dnn/mkldnn_acl.BUILD b/third_party/mkl_dnn/mkldnn_acl.BUILD
index d8373b0eae0f34..619b9deea1fbf1 100644
--- a/third_party/mkl_dnn/mkldnn_acl.BUILD
+++ b/third_party/mkl_dnn/mkldnn_acl.BUILD
@@ -1,7 +1,7 @@
-exports_files(["LICENSE"])
-
 load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
 
+exports_files(["LICENSE"])
+
 _DNNL_COPTS_THREADPOOL = [
     "-fopenmp-simd",
     "-fexceptions",
diff --git a/third_party/mkl_dnn/mkldnn_v1.BUILD b/third_party/mkl_dnn/mkldnn_v1.BUILD
index 5494e718a9c168..370319c5b8ee6f 100644
--- a/third_party/mkl_dnn/mkldnn_v1.BUILD
+++ b/third_party/mkl_dnn/mkldnn_v1.BUILD
@@ -1,7 +1,7 @@
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
 load("@local_tsl//tsl:tsl.bzl", "tf_openmp_copts")
-load("@org_tensorflow//third_party/mkl_dnn:build_defs.bzl", "if_mkldnn_openmp")
 load("@local_xla//xla/tsl/mkl:build_defs.bzl", "if_mkl", "if_mkl_ml")
-load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+load("@org_tensorflow//third_party/mkl_dnn:build_defs.bzl", "if_mkldnn_openmp")
 
 exports_files(["LICENSE"])
 
diff --git a/third_party/mpitrampoline/BUILD b/third_party/mpitrampoline/BUILD
new file mode 100644
index 00000000000000..3c413807167aeb
--- /dev/null
+++ b/third_party/mpitrampoline/BUILD
@@ -0,0 +1 @@
+# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
diff --git a/third_party/mpitrampoline/gen.patch b/third_party/mpitrampoline/gen.patch
new file mode 100644
index 00000000000000..35124db0abb1e3
--- /dev/null
+++ b/third_party/mpitrampoline/gen.patch
@@ -0,0 +1,149 @@
+diff --git a/gen/gen_decl.py b/gen/gen_decl.py
+index 1005b95..696b4e0 100755
+--- a/gen/gen_decl.py
++++ b/gen/gen_decl.py
+@@ -9,8 +9,8 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "mpiabi"))
+ 
+ from mpi_constants import constants
+ from mpi_functions import functions
+-from mpi_constants_fortran import constants_fortran
+-from mpi_functions_fortran import functions_fortran
++# from mpi_constants_fortran import constants_fortran
++# from mpi_functions_fortran import functions_fortran
+ 
+ support_profiling = True
+ have_weak_symbols = False
+@@ -24,7 +24,7 @@ def wrap(line):
+     lines.append(line)
+     return "\n".join(lines)
+ 
+-with open("include/mpi_decl_constants_c.h", "w") as file:
++with open(sys.argv[1], "w") as file:
+     file.write("// Declare C MPI constants\n")
+     file.write("\n")
+     for (tp, nm) in constants:
+@@ -32,7 +32,7 @@ with open("include/mpi_decl_constants_c.h", "w") as file:
+                 'mpi_nm': nm}
+         file.write(Template("extern $mpi_tp MPITRAMPOLINE_CONST $mpi_nm;\n").substitute(subs))
+ 
+-with open("include/mpi_decl_functions_c.h", "w") as file:
++with open(sys.argv[2], "w") as file:
+     file.write("// Declare C MPI functions\n")
+     file.write("\n")
+     for (tp, nm, args, flags) in functions:
+@@ -90,7 +90,7 @@ with open("include/mpi_decl_functions_c.h", "w") as file:
+         file.write(Template("\n".join(tmpl)).substitute(subs))
+         file.write("\n")
+ 
+-with open("include/mpi_decl_constants_fortran.h", "w") as file:
++if False:
+     file.write("!     Declare Fortran MPI constants\n")
+     file.write("\n")
+     for (tp, nm) in constants_fortran:
+@@ -104,7 +104,7 @@ with open("include/mpi_decl_constants_fortran.h", "w") as file:
+         file.write("\n".join(map(lambda line: wrap(Template(line).substitute(subs)), tmpl)))
+         file.write("\n")
+ 
+-with open("include/mpi_decl_functions_fortran.h", "w") as file:
++if False:
+     file.write("!     Declare Fortran MPI functions\n")
+     file.write("\n")
+     for (tp, nm, args) in functions_fortran:
+diff --git a/gen/gen_defn.py b/gen/gen_defn.py
+index bf31f35..318222e 100755
+--- a/gen/gen_defn.py
++++ b/gen/gen_defn.py
+@@ -9,14 +9,14 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "mpiabi"))
+ 
+ from mpi_constants import constants
+ from mpi_functions import functions
+-from mpi_constants_fortran import constants_fortran
+-from mpi_functions_fortran import functions_fortran
++# from mpi_constants_fortran import constants_fortran
++# from mpi_functions_fortran import functions_fortran
+ 
+ support_profiling = True
+ have_weak_symbols = False
+ replace_sentinels = False
+ 
+-with open("src/mpi_defn_constants_c.h", "w") as file:
++with open(sys.argv[1], "w") as file:
+     file.write("// Define C MPI constants")
+     file.write("\n")
+     for (tp, nm) in constants:
+@@ -24,7 +24,7 @@ with open("src/mpi_defn_constants_c.h", "w") as file:
+                 'mpi_nm': nm}
+         file.write(Template("$mpi_tp $mpi_nm = ($mpi_tp)0xdeadbeef;\n").substitute(subs))
+ 
+-with open("src/mpi_defn_functions_c.h", "w") as file:
++with open(sys.argv[2], "w") as file:
+     file.write("// Define C MPI functions\n")
+     file.write("\n")
+     for (tp, nm, args, flags) in functions:
+@@ -89,7 +89,7 @@ with open("src/mpi_defn_functions_c.h", "w") as file:
+         file.write(Template("\n".join(tmpl)).substitute(subs))
+         file.write("\n")
+ 
+-with open("src/mpi_defn_constants_fortran.h", "w") as file:
++if False:
+     file.write("// Define Fortran MPI constants\n")
+     file.write("\n")
+     for (tp, nm) in constants_fortran:
+@@ -98,7 +98,7 @@ with open("src/mpi_defn_constants_fortran.h", "w") as file:
+         # Fortran common blocks with `-march=skylake-avx512` are aligned to 64 bytes
+         file.write(Template("$mpi_tp $abi_nm __attribute__((__aligned__(64))) = (int)0xdeadbeef;\n").substitute(subs))
+ 
+-with open("src/mpi_defn_functions_fortran.h", "w") as file:
++if False:
+     file.write("// Define Fortran MPI functions\n")
+     file.write("\n")
+     for (tp, nm, args) in functions_fortran:
+diff --git a/gen/gen_init.py b/gen/gen_init.py
+index 4939261..0e52822 100755
+--- a/gen/gen_init.py
++++ b/gen/gen_init.py
+@@ -9,14 +9,14 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "mpiabi"))
+ 
+ from mpi_constants import constants
+ from mpi_functions import functions
+-from mpi_constants_fortran import constants_fortran
+-from mpi_functions_fortran import functions_fortran
++# from mpi_constants_fortran import constants_fortran
++# from mpi_functions_fortran import functions_fortran
+ 
+ support_profiling = True
+ have_weak_symbols = False
+ replace_sentinels = False
+ 
+-with open("src/mpi_init_constants_c.h", "w") as file:
++with open(sys.argv[1], "w") as file:
+     file.write("// Initialize C MPI constants")
+     file.write("\n")
+     for (tp, nm) in constants:
+@@ -25,7 +25,7 @@ with open("src/mpi_init_constants_c.h", "w") as file:
+                 'abi_nm': re.sub(r"MPI(X?)_", r"MPI\1ABI_", nm)}
+         file.write(Template("$mpi_nm = *($mpi_tp const *)get_symbol(handle, \"$abi_nm\");\n").substitute(subs))
+ 
+-with open("src/mpi_init_functions_c.h", "w") as file:
++with open(sys.argv[2], "w") as file:
+     file.write("// Initialize C MPI functions\n")
+     file.write("\n")
+     for (tp, nm, args, flags) in functions:
+@@ -39,7 +39,7 @@ with open("src/mpi_init_functions_c.h", "w") as file:
+             subs['anm{0}'.format(i)] = anm
+         file.write(Template("$abi_nm = get_symbol(handle, \"$abi_nm\");\n").substitute(subs))
+ 
+-with open("src/mpi_init_constants_fortran.h", "w") as file:
++if False:
+     file.write("// Initialize Fortran MPI constants\n")
+     file.write("\n")
+     for (tp, nm) in constants_fortran:
+@@ -47,7 +47,7 @@ with open("src/mpi_init_constants_fortran.h", "w") as file:
+                 'abi_nm': re.sub(r"MPI(X?)_", r"MPI\1ABI_", nm).lower() + "_"}
+         file.write(Template("$abi_nm = *($abi_tp const*)get_symbol(handle, \"$abi_nm\");\n").substitute(subs))
+ 
+-with open("src/mpi_init_functions_fortran.h", "w") as file:
++if False:
+     file.write("// Initialize Fortran MPI functions\n")
+     file.write("\n")
+     for (tp, nm, args) in functions_fortran:
diff --git a/third_party/mpitrampoline/mpitrampoline.BUILD b/third_party/mpitrampoline/mpitrampoline.BUILD
new file mode 100644
index 00000000000000..cf8e9c336e4e33
--- /dev/null
+++ b/third_party/mpitrampoline/mpitrampoline.BUILD
@@ -0,0 +1,135 @@
+# Description:
+#  A forwarding MPI implementation that can use any other MPI implementation via an MPI ABI
+
+load("@org_tensorflow//xla:strict.default.bzl", "py_strict_binary")
+load("//third_party/bazel_skylib/rules:expand_template.bzl", "expand_template")
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])
+
+exports_files(["LICENSE.md"])
+
+genrule(
+    name = "mpi_version",
+    srcs = [
+        "CMakeLists.txt",
+        "include/mpi_version.h.in",
+    ],
+    outs = ["include/mpi_version.h"],
+    cmd = """
+      PROJECT_VERSION=`cat $(location CMakeLists.txt) \
+                       | grep "MPItrampoline VERSION" | awk '{print $$NF}'`
+      PROJECT_VERSION_MAJOR=`echo $$PROJECT_VERSION | cut -d. -f1`
+      PROJECT_VERSION_MINOR=`echo $$PROJECT_VERSION | cut -d. -f2`
+      PROJECT_VERSION_PATCH=`echo $$PROJECT_VERSION | cut -d. -f3`
+      sed -e "s/@PROJECT_VERSION@/$${PROJECT_VERSION}/" \
+          -e "s/@PROJECT_VERSION_MAJOR@/$${PROJECT_VERSION_MAJOR}/" \
+          -e "s/@PROJECT_VERSION_MINOR@/$${PROJECT_VERSION_MINOR}/" \
+          -e "s/@PROJECT_VERSION_PATCH@/$${PROJECT_VERSION_PATCH}/" \
+          $(location include/mpi_version.h.in) > $(location include/mpi_version.h)
+      """,
+)
+
+expand_template(
+    name = "mpi_defaults",
+    out = "src/mpi_defaults.h",
+    substitutions = {
+        "@MPITRAMPOLINE_DEFAULT_DELAY_INIT@": "",
+        "@MPITRAMPOLINE_DEFAULT_DLOPEN_BINDING@": "",
+        "@MPITRAMPOLINE_DEFAULT_DLOPEN_MODE@": "",
+        "@MPITRAMPOLINE_DEFAULT_LIB@": "",
+        "@MPITRAMPOLINE_DEFAULT_PRELOAD@": "",
+        "@MPITRAMPOLINE_DEFAULT_VERBOSE@": "",
+    },
+    template = "src/mpi_defaults.h.in",
+)
+
+py_strict_binary(
+    name = "gen_decl",
+    srcs = [
+        "gen/gen_decl.py",
+        "mpiabi/mpi_constants.py",
+        "mpiabi/mpi_functions.py",
+    ],
+)
+
+genrule(
+    name = "decl",
+    outs = [
+        "include/mpi_decl_constants_c.h",
+        "include/mpi_decl_functions_c.h",
+    ],
+    cmd = "$(location :gen_decl) $(location include/mpi_decl_constants_c.h) \
+           $(location include/mpi_decl_functions_c.h)",
+    tools = [":gen_decl"],
+)
+
+py_strict_binary(
+    name = "gen_defn",
+    srcs = [
+        "gen/gen_defn.py",
+        "mpiabi/mpi_constants.py",
+        "mpiabi/mpi_functions.py",
+    ],
+)
+
+genrule(
+    name = "defn",
+    outs = [
+        "include/mpi_defn_constants_c.h",
+        "include/mpi_defn_functions_c.h",
+    ],
+    cmd = "$(location :gen_defn) $(location include/mpi_defn_constants_c.h) \
+           $(location include/mpi_defn_functions_c.h)",
+    tools = [":gen_defn"],
+)
+
+py_strict_binary(
+    name = "gen_init",
+    srcs = [
+        "gen/gen_init.py",
+        "mpiabi/mpi_constants.py",
+        "mpiabi/mpi_functions.py",
+    ],
+)
+
+genrule(
+    name = "init",
+    outs = [
+        "include/mpi_init_constants_c.h",
+        "include/mpi_init_functions_c.h",
+    ],
+    cmd = "$(location :gen_init) $(location include/mpi_init_constants_c.h) \
+           $(location include/mpi_init_functions_c.h)",
+    tools = [":gen_init"],
+)
+
+cc_library(
+    name = "mpitrampoline",
+    srcs = [
+        "src/mpi.c",
+    ],
+    hdrs = [
+        "include/mpi.h",
+        "include/mpi_decl_constants_c.h",
+        "include/mpi_decl_functions_c.h",
+        "include/mpi_defn_constants_c.h",
+        "include/mpi_defn_functions_c.h",
+        "include/mpi_init_constants_c.h",
+        "include/mpi_init_functions_c.h",
+        "include/mpi_version.h",
+        "mpiabi/mpiabi.h",
+        "src/mpi_defaults.h",
+    ],
+    copts = [
+        "-fexceptions",
+    ],
+    includes = [
+        "include",
+        "mpiabi",
+        "src",
+    ],
+)
diff --git a/third_party/mpitrampoline/workspace.bzl b/third_party/mpitrampoline/workspace.bzl
new file mode 100644
index 00000000000000..4748931ae6e368
--- /dev/null
+++ b/third_party/mpitrampoline/workspace.bzl
@@ -0,0 +1,18 @@
+"""Provides the repository macro to import mpitrampoline."""
+
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
+
+def repo():
+    """Imports mpitrampoline."""
+
+    MPITRAMPOLINE_COMMIT = "25efb0f7a4cd00ed82bafb8b1a6285fc50d297ed"
+    MPITRAMPOLINE_SHA256 = "5a36656205c472bdb639bffebb0f014523b32dda0c2cbedd9ce7abfc9e879e84"
+
+    tf_http_archive(
+        name = "mpitrampoline",
+        sha256 = MPITRAMPOLINE_SHA256,
+        strip_prefix = "MPItrampoline-{commit}".format(commit = MPITRAMPOLINE_COMMIT),
+        urls = tf_mirror_urls("https://github.com/eschnett/mpitrampoline/archive/{commit}.tar.gz".format(commit = MPITRAMPOLINE_COMMIT)),
+        patch_file = ["//third_party/mpitrampoline:gen.patch"],
+        build_file = "//third_party/mpitrampoline:mpitrampoline.BUILD",
+    )
diff --git a/third_party/nccl/archive.BUILD b/third_party/nccl/archive.BUILD
index 5d040b95dcd4b8..72f91a68474f97 100644
--- a/third_party/nccl/archive.BUILD
+++ b/third_party/nccl/archive.BUILD
@@ -1,10 +1,6 @@
 # NVIDIA NCCL 2
 # A package of optimized primitives for collective multi-GPU communication.
 
-licenses(["notice"])
-
-exports_files(["LICENSE.txt"])
-
 load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
 load("@bazel_skylib//rules:write_file.bzl", "write_file")
 load(
@@ -20,6 +16,10 @@ load(
     "GENERATED_SOURCES",
 )
 
+licenses(["notice"])
+
+exports_files(["LICENSE.txt"])
+
 NCCL_MAJOR = 2
 
 NCCL_MINOR = 19
diff --git a/third_party/pprof.BUILD b/third_party/pprof.BUILD
index 1b33aa8794ca53..ba25846c709cc2 100644
--- a/third_party/pprof.BUILD
+++ b/third_party/pprof.BUILD
@@ -1,11 +1,11 @@
+load("@local_tsl//tsl/platform/default:build_config.bzl", "py_proto_library")
+
 package(
     default_visibility = ["//visibility:public"],
 )
 
 licenses(["notice"])  # MIT
 
-load("@local_tsl//tsl/platform/default:build_config.bzl", "py_proto_library")
-
 exports_files(["pprof/LICENSE"])
 
 py_proto_library(
diff --git a/third_party/stablehlo/temporary.patch b/third_party/stablehlo/temporary.patch
index ed999bbe7db342..94971c07102a21 100755
--- a/third_party/stablehlo/temporary.patch
+++ b/third_party/stablehlo/temporary.patch
@@ -174,101 +174,10 @@ diff --ruN a/stablehlo/stablehlo/CMakeLists.txt b/stablehlo/stablehlo/CMakeLists
  add_subdirectory(integrations)
  add_subdirectory(reference)
  add_subdirectory(tests)
-diff --ruN a/stablehlo/stablehlo/dialect/TypeInference.cpp b/stablehlo/stablehlo/dialect/TypeInference.cpp
---- stablehlo/stablehlo/dialect/TypeInference.cpp
-+++ stablehlo/stablehlo/dialect/TypeInference.cpp
-@@ -65,25 +65,6 @@
- 
- namespace mlir {
- namespace hlo {
--namespace {
--//===----------------------------------------------------------------------===//
--// Utils for quantization specific verifications
--//===----------------------------------------------------------------------===//
--template <typename T>
--bool allQuantized(ArrayRef<Type> typeRange) {
--  return llvm::all_of(typeRange, [&](Type val) {
--    return val.cast<ShapedType>().getElementType().isa<T>();
--  });
--}
--
--template <typename T>
--bool noneQuantized(ArrayRef<Type> typeRange) {
--  return llvm::all_of(typeRange, [&](Type val) {
--    return !val.cast<ShapedType>().getElementType().isa<T>();
--  });
--}
--
--}  // namespace
- 
- //===----------------------------------------------------------------------===//
- // Utils for shape functions.
-@@ -3472,61 +3453,6 @@
-                              "is incompatible with return type of operation ",
-                              shapedResultType, "");
- 
--  llvm::SmallVector<Type, 3> typeEntries{lhsType, rhsType, resultType};
--  if (noneQuantized<quant::QuantizedType>(typeEntries)) return success();
--  // convolution_c28
--  if (!allQuantized<quant::QuantizedType>(typeEntries)) {
--    return emitOptionalError(location,
--                             "not all of operands and result are quantized");
--  }
--
--  auto lhsQType =
--      getElementTypeOrSelf(lhsType).dyn_cast<quant::QuantizedType>();
--  auto rhsQType =
--      getElementTypeOrSelf(rhsType).dyn_cast<quant::QuantizedType>();
--  auto resultQType =
--      getElementTypeOrSelf(resultType).dyn_cast<quant::QuantizedType>();
--  // convolution_c29
--  if (lhsQType.getStorageType() != rhsQType.getStorageType())
--    return emitOptionalError(location, "mismatched operand storage types ",
--                             lhsQType.getStorageType(), " and ",
--                             rhsQType.getStorageType());
--  // convolution_c30
--  auto expressedType = lhsQType.getExpressedType();
--  if (expressedType != rhsQType.getExpressedType() ||
--      expressedType != resultQType.getExpressedType())
--    return emitOptionalError(location,
--                             "mismatched operands and result expressed types");
--
--  llvm::SmallVector<Type, 2> typeEntriesPerAxis{rhsType, resultType};
--  if (noneQuantized<quant::UniformQuantizedPerAxisType>(typeEntriesPerAxis))
--    return success();
--  // convolution_c31
--  if (!allQuantized<quant::UniformQuantizedPerAxisType>(typeEntriesPerAxis)) {
--    return emitOptionalError(location,
--                             "rhs and result are of mixed per_tensor and "
--                             "per_axis quantized tensor type ",
--                             rhsType, " and ", resultType);
--  }
--
--  auto rhsQPAType = rhsQType.dyn_cast<quant::UniformQuantizedPerAxisType>();
--  auto resultQPAType =
--      resultQType.dyn_cast<quant::UniformQuantizedPerAxisType>();
--  // convolution_c32
--  if (rhsQPAType &&
--      rhsQPAType.getQuantizedDimension() != kernelOutputFeatureDimension)
--    return emitOptionalError(
--        location, "mismatched kernel_output_feature_dimension ",
--        kernelOutputFeatureDimension, " and rhs quantized dimension ",
--        rhsQPAType.getQuantizedDimension());
--  // convolution_c33
--  if (resultQPAType &&
--      resultQPAType.getQuantizedDimension() != outputFeatureDimension)
--    return emitOptionalError(location, "mismatched output_feature_dimension ",
--                             outputFeatureDimension,
--                             " and result quantized dimension ",
--                             resultQPAType.getQuantizedDimension());
--
-   return success();
- }
- 
 diff --ruN a/stablehlo/stablehlo/experimental/BUILD.bazel b/stablehlo/stablehlo/experimental/BUILD.bazel
 --- stablehlo/stablehlo/experimental/BUILD.bazel
 +++ stablehlo/stablehlo/experimental/BUILD.bazel
-@@ -0,0 +1,115 @@
+@@ -0,0 +1,116 @@
 +# Copyright 2023 The StableHLO Authors. All Rights Reserved.
 +#
 +# Licensed under the Apache License, Version 2.0 (the "License");
@@ -342,6 +251,7 @@ diff --ruN a/stablehlo/stablehlo/experimental/BUILD.bazel b/stablehlo/stablehlo/
 +        "transforms/ChloRecomposeOps.cpp",
 +        "transforms/StablehloCanonicalizeDynamism.cpp",
 +        "transforms/StablehloRefineShapes.cpp",
++        "transforms/StablehloTrivialDce.cpp",
 +    ],
 +    hdrs = [
 +        "transforms/Passes.h",
@@ -1994,7 +1904,7 @@ diff --ruN a/stablehlo/stablehlo/experimental/tools/StablehloOptMain.cpp b/stabl
 diff --ruN a/stablehlo/stablehlo/experimental/transforms/CMakeLists.txt b/stablehlo/stablehlo/experimental/transforms/CMakeLists.txt
 --- stablehlo/stablehlo/experimental/transforms/CMakeLists.txt
 +++ stablehlo/stablehlo/experimental/transforms/CMakeLists.txt
-@@ -0,0 +1,40 @@
+@@ -0,0 +1,41 @@
 +# Copyright 2023 The StableHLO Authors.
 +#
 +# Licensed under the Apache License, Version 2.0 (the "License");
@@ -2018,6 +1928,7 @@ diff --ruN a/stablehlo/stablehlo/experimental/transforms/CMakeLists.txt b/stable
 +  ChloRecomposeOps.cpp
 +  StablehloCanonicalizeDynamism.cpp
 +  StablehloRefineShapes.cpp
++  StablehloTrivialDce.cpp
 +
 +  DEPENDS
 +  ExperimentalPassesIncGen
@@ -2038,7 +1949,7 @@ diff --ruN a/stablehlo/stablehlo/experimental/transforms/CMakeLists.txt b/stable
 diff --ruN a/stablehlo/stablehlo/experimental/transforms/ChloRecomposeOps.cpp b/stablehlo/stablehlo/experimental/transforms/ChloRecomposeOps.cpp
 --- stablehlo/stablehlo/experimental/transforms/ChloRecomposeOps.cpp
 +++ stablehlo/stablehlo/experimental/transforms/ChloRecomposeOps.cpp
-@@ -0,0 +1,168 @@
+@@ -0,0 +1,178 @@
 +/* Copyright 2024 The StableHLO Authors.
 +Licensed under the Apache License, Version 2.0 (the "License");
 +you may not use this file except in compliance with the License.
@@ -2061,12 +1972,14 @@ diff --ruN a/stablehlo/stablehlo/experimental/transforms/ChloRecomposeOps.cpp b/
 +#include "mlir/IR/Attributes.h"
 +#include "mlir/IR/BuiltinAttributes.h"
 +#include "mlir/IR/PatternMatch.h"
++#include "mlir/Pass/PassManager.h"
 +#include "mlir/Support/LogicalResult.h"
 +#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 +#include "stablehlo/dialect/ChloOps.h"
 +#include "stablehlo/dialect/StablehloOps.h"
 +#include "stablehlo/experimental/dialect/StablehloOps.h"
 +#include "stablehlo/experimental/transforms/Passes.h"
++#include "stablehlo/transforms/Passes.h"
 +
 +namespace mlir {
 +namespace stablehlo {
@@ -2204,13 +2117,21 @@ diff --ruN a/stablehlo/stablehlo/experimental/transforms/ChloRecomposeOps.cpp b/
 +  }
 +};
 +
++void createChloLegalizeToStablehloPipeline(OpPassManager& pm) {
++  pm.addPass(mlir::stablehlo::experimental::createChloRecomposeOpsPass());
++  pm.addNestedPass<mlir::func::FuncOp>(
++      mlir::stablehlo::createChloLegalizeToStablehloPass());
++  pm.addNestedPass<mlir::func::FuncOp>(
++      mlir::stablehlo::createShapeLegalizeToStablehloPass());
++}
++
 +}  // namespace experimental
 +}  // namespace stablehlo
 +}  // namespace mlir
 diff --ruN a/stablehlo/stablehlo/experimental/transforms/Passes.h b/stablehlo/stablehlo/experimental/transforms/Passes.h
 --- stablehlo/stablehlo/experimental/transforms/Passes.h
 +++ stablehlo/stablehlo/experimental/transforms/Passes.h
-@@ -0,0 +1,36 @@
+@@ -0,0 +1,38 @@
 +/* Copyright 2023 The StableHLO Authors.
 +
 +Licensed under the Apache License, Version 2.0 (the "License");
@@ -2242,6 +2163,8 @@ diff --ruN a/stablehlo/stablehlo/experimental/transforms/Passes.h b/stablehlo/st
 +#define GEN_PASS_REGISTRATION
 +#include "stablehlo/experimental/transforms/Passes.h.inc"
 +
++void createChloLegalizeToStablehloPipeline(OpPassManager &pm);
++
 +}  // namespace experimental
 +}  // namespace stablehlo
 +}  // namespace mlir
@@ -2250,7 +2173,7 @@ diff --ruN a/stablehlo/stablehlo/experimental/transforms/Passes.h b/stablehlo/st
 diff --ruN a/stablehlo/stablehlo/experimental/transforms/Passes.td b/stablehlo/stablehlo/experimental/transforms/Passes.td
 --- stablehlo/stablehlo/experimental/transforms/Passes.td
 +++ stablehlo/stablehlo/experimental/transforms/Passes.td
-@@ -0,0 +1,39 @@
+@@ -0,0 +1,55 @@
 +/* Copyright 2023 The StableHLO Authors.
 +
 +Licensed under the Apache License, Version 2.0 (the "License");
@@ -2290,6 +2213,22 @@ diff --ruN a/stablehlo/stablehlo/experimental/transforms/Passes.td b/stablehlo/s
 +  }];
 +  let dependentDialects = ["chlo::ChloDialect"];
 +}
++
++def StablehloTrivialDcePass : Pass<"experimental-stablehlo-trivial-dce", "ModuleOp"> {
++  let summary = "(Experimental) Performs a single bottom up pass to remove values that are trivially dead.";
++  let description = [{
++    An experimental pass to remove dead values prior to running other passes
++    that may fail to converge otherwise. For example, running shape refinement
++    on a program that has a lot of dead values can fail because shape refinement
++    is top down and removing values causes a new iteration to be triggered, and
++    removing all the dead values with a top down traversal can take a lot of
++    iterations (10+), which is slow.
++
++    Performing a single pass should be fast, and doing it bottom up means that
++    values that are transitively dead can be removed since leaf values will be
++    processed first.
++  }];
++}
 diff --ruN a/stablehlo/stablehlo/experimental/transforms/StablehloCanonicalizeDynamism.cpp b/stablehlo/stablehlo/experimental/transforms/StablehloCanonicalizeDynamism.cpp
 --- stablehlo/stablehlo/experimental/transforms/StablehloCanonicalizeDynamism.cpp
 +++ stablehlo/stablehlo/experimental/transforms/StablehloCanonicalizeDynamism.cpp
@@ -2639,83 +2578,1252 @@ diff --ruN a/stablehlo/stablehlo/experimental/transforms/StablehloRefineShapes.c
 +}  // namespace experimental
 +}  // namespace stablehlo
 +}  // namespace mlir
-diff --ruN a/stablehlo/stablehlo/tests/ops_stablehlo_quantized.mlir b/stablehlo/stablehlo/tests/ops_stablehlo_quantized.mlir
---- stablehlo/stablehlo/tests/ops_stablehlo_quantized.mlir
-+++ stablehlo/stablehlo/tests/ops_stablehlo_quantized.mlir
-@@ -821,75 +821,3 @@
-   %0 = "stablehlo.uniform_dequantize"(%arg0) : (tensor<4x!quant.uniform<si8:f32, 1.000000e+00>>) -> tensor<4xf32>
-   func.return %0 : tensor<4xf32>
- }
--
--// -----
--
--func.func @convolution_c28(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x207x16x!quant.uniform<i8:f32, 5.0:20>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>> {
--  // expected-error@+1 {{not all of operands and result are quantized}}
--  %0 = stablehlo.convolution(%arg0, %arg1)
--         dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f],
--         window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]}
--         {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo<precision DEFAULT>, #stablehlo<precision DEFAULT>]} :
--       (tensor<1x8x8x207xf32>, tensor<3x3x207x16x!quant.uniform<i8:f32, 5.0:20>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>>
--  func.return %0 : tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>>
--}
--
--// -----
--
--func.func @convolution_c29(%arg0: tensor<1x8x8x207x!quant.uniform<i16:f32, 2.0:15>>, %arg1: tensor<3x3x207x16x!quant.uniform<i8:f32, 5.0:20>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>> {
--  // expected-error@+1 {{mismatched operand storage types 'i16' and 'i8'}}
--  %0 = stablehlo.convolution(%arg0, %arg1)
--         dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f],
--         window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]}
--         {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo<precision DEFAULT>, #stablehlo<precision DEFAULT>]} :
--       (tensor<1x8x8x207x!quant.uniform<i16:f32, 2.0:15>>, tensor<3x3x207x16x!quant.uniform<i8:f32, 5.0:20>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>>
--  func.return %0 : tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>>
--}
--
--// -----
--
--func.func @convolution_c30(%arg0: tensor<1x8x8x207x!quant.uniform<i8:f64, 2.0:15>>, %arg1: tensor<3x3x207x16x!quant.uniform<i8:f32, 5.0:20>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>> {
--  // expected-error@+1 {{mismatched operands and result expressed types}}
--  %0 = stablehlo.convolution(%arg0, %arg1)
--         dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f],
--         window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]}
--         {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo<precision DEFAULT>, #stablehlo<precision DEFAULT>]} :
--       (tensor<1x8x8x207x!quant.uniform<i8:f64, 2.0:15>>, tensor<3x3x207x16x!quant.uniform<i8:f32, 5.0:20>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>>
--  func.return %0 : tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>>
--}
--
--// -----
--
--func.func @convolution_c31(%arg0: tensor<1x8x8x207x!quant.uniform<i8:f32, 2.0:15>>, %arg1: tensor<3x3x207x16x!quant.uniform<i8:f32:0, {0.1:-30}>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>> {
--  // expected-error@+1 {{rhs and result are of mixed per_tensor and per_axis quantized tensor type 'tensor<3x3x207x16x!quant.uniform<i8:f32:0, {1.000000e-01:-30}>>' and 'tensor<1x8x8x16x!quant.uniform<i8:f32, 1.000000e+01:50>>'}}
--  %0 = stablehlo.convolution(%arg0, %arg1)
--         dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f],
--         window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]}
--         {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo<precision DEFAULT>, #stablehlo<precision DEFAULT>]} :
--       (tensor<1x8x8x207x!quant.uniform<i8:f32, 2.0:15>>, tensor<3x3x207x16x!quant.uniform<i8:f32:0, {0.1:-30}>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>>
--  func.return %0 : tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>>
--}
+diff --ruN a/stablehlo/stablehlo/experimental/transforms/StablehloTrivialDce.cpp b/stablehlo/stablehlo/experimental/transforms/StablehloTrivialDce.cpp
+--- stablehlo/stablehlo/experimental/transforms/StablehloTrivialDce.cpp
++++ stablehlo/stablehlo/experimental/transforms/StablehloTrivialDce.cpp
+@@ -0,0 +1,63 @@
++/* Copyright 2022 The StableHLO Authors.
++Licensed under the Apache License, Version 2.0 (the "License");
++you may not use this file except in compliance with the License.
++You may obtain a copy of the License at
++
++    http://www.apache.org/licenses/LICENSE-2.0
++
++Unless required by applicable law or agreed to in writing, software
++distributed under the License is distributed on an "AS IS" BASIS,
++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++See the License for the specific language governing permissions and
++limitations under the License.
++==============================================================================*/
++
++#include <cstdint>
++
++#include "llvm/ADT/SmallVector.h"
++#include "mlir/Dialect/Func/IR/FuncOps.h"
++#include "mlir/IR/PatternMatch.h"
++#include "mlir/Support/LogicalResult.h"
++#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
++#include "stablehlo/experimental/transforms/Passes.h"
++
++namespace mlir {
++namespace stablehlo {
++namespace experimental {
++
++#define GEN_PASS_DEF_STABLEHLOTRIVIALDCEPASS
++#include "stablehlo/experimental/transforms/Passes.h.inc"
++
++namespace {
++
++struct StablehloTrivialDcePass
++    : public impl::StablehloTrivialDcePassBase<StablehloTrivialDcePass> {
++  using StablehloTrivialDcePassBase::StablehloTrivialDcePassBase;
++
++  void runOnOperation() override {
++    GreedyRewriteConfig config;
++
++    // Hardcode defaults for stability.
++    config.enableRegionSimplification = true;
++    config.maxNumRewrites = GreedyRewriteConfig::kNoLimit;
++    config.strictMode = GreedyRewriteStrictness::AnyOp;
++
++    // Run a single bottom up pass.
++    config.useTopDownTraversal = false;
++    config.maxIterations = 1;
++
++    // Running a greedy rewrite will cause trivially dead values to be removed.
++    // Doing it without patterns ensures that no other changes are made to the
++    // IR. Doing it bottom-up ensures that values that are transitively dead are
++    // also removed. Although 1 pass should be enough,
++    // applyPatternsAndFoldGreedily will want to run at least 1 more iteration
++    // to confirm convergence, but we don't need to check for convergence, so we
++    // ignore the return value.
++    (void)applyPatternsAndFoldGreedily(getOperation(), RewritePatternSet(&getContext()), config);
++  }
++};
++
++}  // namespace
++}  // namespace experimental
++}  // namespace stablehlo
++}  // namespace mlir
+diff --ruN a/stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir b/stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir
+--- stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir
++++ stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir
+@@ -1283,153 +1283,153 @@
+ func.func @zeta_f16(%arg0: tensor<f16>, %arg1: tensor<f16>) -> tensor<f16> {
+   // CHECK: %[[TMP_0:.*]] = stablehlo.convert %[[X]] : (tensor<f16>) -> tensor<f32>
+   // CHECK: %[[TMP_1:.*]] = stablehlo.convert %[[Q]] : (tensor<f16>) -> tensor<f32>
+-  // CHECK: %[[TMP_2:.*]] = stablehlo.constant dense<0.000000e+00>
+-  // CHECK: %[[TMP_3:.*]] = stablehlo.negate %[[TMP_0]]
+-  // CHECK: %[[TMP_4:.*]] = stablehlo.power %[[TMP_1]], %[[TMP_3]]
+-  // CHECK: %[[TMP_5:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_6:.*]] = stablehlo.add %[[TMP_1]], %[[TMP_5]]
+-  // CHECK: %[[TMP_7:.*]] = stablehlo.power %[[TMP_6]], %[[TMP_3]]
+-  // CHECK: %[[TMP_8:.*]] = stablehlo.add %[[TMP_4]], %[[TMP_7]]
+-  // CHECK: %[[TMP_9:.*]] = stablehlo.add %[[TMP_6]], %[[TMP_5]]
+-  // CHECK: %[[TMP_10:.*]] = stablehlo.power %[[TMP_9]], %[[TMP_3]]
++  // CHECK-DAG: %[[TMP_2:.*]] = stablehlo.constant dense<0.000000e+00>
++  // CHECK-DAG: %[[TMP_3:.*]] = stablehlo.constant dense<1.000000e+00>
++  // CHECK: %[[TMP_4:.*]] = stablehlo.negate %[[TMP_0]]
++  // CHECK: %[[TMP_5:.*]] = stablehlo.power %[[TMP_1]], %[[TMP_4]]
++  // CHECK: %[[TMP_6:.*]] = stablehlo.add %[[TMP_1]], %[[TMP_3]]
++  // CHECK: %[[TMP_7:.*]] = stablehlo.power %[[TMP_6]], %[[TMP_4]]
++  // CHECK: %[[TMP_8:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_7]]
++  // CHECK: %[[TMP_9:.*]] = stablehlo.add %[[TMP_6]], %[[TMP_3]]
++  // CHECK: %[[TMP_10:.*]] = stablehlo.power %[[TMP_9]], %[[TMP_4]]
+   // CHECK: %[[TMP_11:.*]] = stablehlo.add %[[TMP_8]], %[[TMP_10]]
+-  // CHECK: %[[TMP_12:.*]] = stablehlo.add %[[TMP_9]], %[[TMP_5]]
+-  // CHECK: %[[TMP_13:.*]] = stablehlo.power %[[TMP_12]], %[[TMP_3]]
++  // CHECK: %[[TMP_12:.*]] = stablehlo.add %[[TMP_9]], %[[TMP_3]]
++  // CHECK: %[[TMP_13:.*]] = stablehlo.power %[[TMP_12]], %[[TMP_4]]
+   // CHECK: %[[TMP_14:.*]] = stablehlo.add %[[TMP_11]], %[[TMP_13]]
+-  // CHECK: %[[TMP_15:.*]] = stablehlo.add %[[TMP_12]], %[[TMP_5]]
+-  // CHECK: %[[TMP_16:.*]] = stablehlo.power %[[TMP_15]], %[[TMP_3]]
++  // CHECK: %[[TMP_15:.*]] = stablehlo.add %[[TMP_12]], %[[TMP_3]]
++  // CHECK: %[[TMP_16:.*]] = stablehlo.power %[[TMP_15]], %[[TMP_4]]
+   // CHECK: %[[TMP_17:.*]] = stablehlo.add %[[TMP_14]], %[[TMP_16]]
+-  // CHECK: %[[TMP_18:.*]] = stablehlo.add %[[TMP_15]], %[[TMP_5]]
+-  // CHECK: %[[TMP_19:.*]] = stablehlo.power %[[TMP_18]], %[[TMP_3]]
++  // CHECK: %[[TMP_18:.*]] = stablehlo.add %[[TMP_15]], %[[TMP_3]]
++  // CHECK: %[[TMP_19:.*]] = stablehlo.power %[[TMP_18]], %[[TMP_4]]
+   // CHECK: %[[TMP_20:.*]] = stablehlo.add %[[TMP_17]], %[[TMP_19]]
+-  // CHECK: %[[TMP_21:.*]] = stablehlo.add %[[TMP_18]], %[[TMP_5]]
+-  // CHECK: %[[TMP_22:.*]] = stablehlo.power %[[TMP_21]], %[[TMP_3]]
++  // CHECK: %[[TMP_21:.*]] = stablehlo.add %[[TMP_18]], %[[TMP_3]]
++  // CHECK: %[[TMP_22:.*]] = stablehlo.power %[[TMP_21]], %[[TMP_4]]
+   // CHECK: %[[TMP_23:.*]] = stablehlo.add %[[TMP_20]], %[[TMP_22]]
+-  // CHECK: %[[TMP_24:.*]] = stablehlo.add %[[TMP_21]], %[[TMP_5]]
+-  // CHECK: %[[TMP_25:.*]] = stablehlo.power %[[TMP_24]], %[[TMP_3]]
++  // CHECK: %[[TMP_24:.*]] = stablehlo.add %[[TMP_21]], %[[TMP_3]]
++  // CHECK: %[[TMP_25:.*]] = stablehlo.power %[[TMP_24]], %[[TMP_4]]
+   // CHECK: %[[TMP_26:.*]] = stablehlo.add %[[TMP_23]], %[[TMP_25]]
+-  // CHECK: %[[TMP_27:.*]] = stablehlo.add %[[TMP_24]], %[[TMP_5]]
+-  // CHECK: %[[TMP_28:.*]] = stablehlo.power %[[TMP_27]], %[[TMP_3]]
++  // CHECK: %[[TMP_27:.*]] = stablehlo.add %[[TMP_24]], %[[TMP_3]]
++  // CHECK: %[[TMP_28:.*]] = stablehlo.power %[[TMP_27]], %[[TMP_4]]
+   // CHECK: %[[TMP_29:.*]] = stablehlo.add %[[TMP_26]], %[[TMP_28]]
+-  // CHECK: %[[TMP_30:.*]] = stablehlo.add %[[TMP_27]], %[[TMP_5]]
+-  // CHECK: %[[TMP_31:.*]] = stablehlo.power %[[TMP_30]], %[[TMP_3]]
++  // CHECK: %[[TMP_30:.*]] = stablehlo.add %[[TMP_27]], %[[TMP_3]]
++  // CHECK: %[[TMP_31:.*]] = stablehlo.power %[[TMP_30]], %[[TMP_4]]
+   // CHECK: %[[TMP_32:.*]] = stablehlo.add %[[TMP_29]], %[[TMP_31]]
+-  // CHECK: %[[TMP_33:.*]] = stablehlo.add %[[TMP_30]], %[[TMP_5]]
+-  // CHECK: %[[TMP_34:.*]] = stablehlo.power %[[TMP_33]], %[[TMP_3]]
++  // CHECK: %[[TMP_33:.*]] = stablehlo.add %[[TMP_30]], %[[TMP_3]]
++  // CHECK: %[[TMP_34:.*]] = stablehlo.power %[[TMP_33]], %[[TMP_4]]
+   // CHECK: %[[TMP_35:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_36:.*]] = stablehlo.subtract %[[TMP_0]], %[[TMP_35]]
+-  // CHECK: %[[TMP_37:.*]] = stablehlo.multiply %[[TMP_34]], %[[TMP_33]]
+-  // CHECK: %[[TMP_38:.*]] = stablehlo.divide %[[TMP_37]], %[[TMP_36]]
+-  // CHECK: %[[TMP_39:.*]] = stablehlo.add %[[TMP_32]], %[[TMP_38]]
+-  // CHECK: %[[TMP_40:.*]] = stablehlo.multiply %[[TMP_33]], %[[TMP_33]]
+-  // CHECK: %[[TMP_41:.*]] = stablehlo.divide %[[TMP_5]], %[[TMP_40]]
+-  // CHECK: %[[TMP_42:.*]] = stablehlo.constant dense<2.200000e+01>
+-  // CHECK: %[[TMP_43:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_42]]
+-  // CHECK: %[[TMP_44:.*]] = stablehlo.constant dense<2.100000e+01>
+-  // CHECK: %[[TMP_45:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_44]]
+-  // CHECK: %[[TMP_46:.*]] = stablehlo.multiply %[[TMP_43]], %[[TMP_45]]
+-  // CHECK: %[[TMP_47:.*]] = stablehlo.constant dense<-1.39544646E-19>
+-  // CHECK: %[[TMP_48:.*]] = stablehlo.add %[[TMP_2]], %[[TMP_47]]
+-  // CHECK: %[[TMP_49:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_48]]
+-  // CHECK: %[[TMP_50:.*]] = stablehlo.multiply %[[TMP_46]], %[[TMP_49]]
+-  // CHECK: %[[TMP_51:.*]] = stablehlo.constant dense<2.000000e+01>
+-  // CHECK: %[[TMP_52:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_51]]
+-  // CHECK: %[[TMP_53:.*]] = stablehlo.constant dense<1.900000e+01>
+-  // CHECK: %[[TMP_54:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_53]]
+-  // CHECK: %[[TMP_55:.*]] = stablehlo.multiply %[[TMP_52]], %[[TMP_54]]
+-  // CHECK: %[[TMP_56:.*]] = stablehlo.constant dense<5.50900303E-18>
+-  // CHECK: %[[TMP_57:.*]] = stablehlo.add %[[TMP_50]], %[[TMP_56]]
+-  // CHECK: %[[TMP_58:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_57]]
+-  // CHECK: %[[TMP_59:.*]] = stablehlo.multiply %[[TMP_55]], %[[TMP_58]]
+-  // CHECK: %[[TMP_60:.*]] = stablehlo.constant dense<1.800000e+01>
+-  // CHECK: %[[TMP_61:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_60]]
+-  // CHECK: %[[TMP_62:.*]] = stablehlo.constant dense<1.700000e+01>
+-  // CHECK: %[[TMP_63:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_62]]
+-  // CHECK: %[[TMP_64:.*]] = stablehlo.multiply %[[TMP_61]], %[[TMP_63]]
+-  // CHECK: %[[TMP_65:.*]] = stablehlo.constant dense<-2.17486866E-16>
+-  // CHECK: %[[TMP_66:.*]] = stablehlo.add %[[TMP_59]], %[[TMP_65]]
+-  // CHECK: %[[TMP_67:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_66]]
+-  // CHECK: %[[TMP_68:.*]] = stablehlo.multiply %[[TMP_64]], %[[TMP_67]]
+-  // CHECK: %[[TMP_69:.*]] = stablehlo.constant dense<1.600000e+01>
+-  // CHECK: %[[TMP_70:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_69]]
+-  // CHECK: %[[TMP_71:.*]] = stablehlo.constant dense<1.500000e+01>
+-  // CHECK: %[[TMP_72:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_71]]
+-  // CHECK: %[[TMP_73:.*]] = stablehlo.multiply %[[TMP_70]], %[[TMP_72]]
+-  // CHECK: %[[TMP_74:.*]] = stablehlo.constant dense<8.58606213E-15>
+-  // CHECK: %[[TMP_75:.*]] = stablehlo.add %[[TMP_68]], %[[TMP_74]]
+-  // CHECK: %[[TMP_76:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_75]]
+-  // CHECK: %[[TMP_77:.*]] = stablehlo.multiply %[[TMP_73]], %[[TMP_76]]
+-  // CHECK: %[[TMP_78:.*]] = stablehlo.constant dense<1.400000e+01>
+-  // CHECK: %[[TMP_79:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_78]]
+-  // CHECK: %[[TMP_80:.*]] = stablehlo.constant dense<1.300000e+01>
+-  // CHECK: %[[TMP_81:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_80]]
+-  // CHECK: %[[TMP_82:.*]] = stablehlo.multiply %[[TMP_79]], %[[TMP_81]]
+-  // CHECK: %[[TMP_83:.*]] = stablehlo.constant dense<-3.3896803E-13>
+-  // CHECK: %[[TMP_84:.*]] = stablehlo.add %[[TMP_77]], %[[TMP_83]]
+-  // CHECK: %[[TMP_85:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_84]]
+-  // CHECK: %[[TMP_86:.*]] = stablehlo.multiply %[[TMP_82]], %[[TMP_85]]
+-  // CHECK: %[[TMP_87:.*]] = stablehlo.constant dense<1.200000e+01>
+-  // CHECK: %[[TMP_88:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_87]]
+-  // CHECK: %[[TMP_89:.*]] = stablehlo.constant dense<1.100000e+01>
+-  // CHECK: %[[TMP_90:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_89]]
+-  // CHECK: %[[TMP_91:.*]] = stablehlo.multiply %[[TMP_88]], %[[TMP_90]]
+-  // CHECK: %[[TMP_92:.*]] = stablehlo.constant dense<1.33825364E-11>
+-  // CHECK: %[[TMP_93:.*]] = stablehlo.add %[[TMP_86]], %[[TMP_92]]
+-  // CHECK: %[[TMP_94:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_93]]
+-  // CHECK: %[[TMP_95:.*]] = stablehlo.multiply %[[TMP_91]], %[[TMP_94]]
+-  // CHECK: %[[TMP_96:.*]] = stablehlo.constant dense<1.000000e+01>
+-  // CHECK: %[[TMP_97:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_96]]
+-  // CHECK: %[[TMP_98:.*]] = stablehlo.constant dense<9.000000e+00>
+-  // CHECK: %[[TMP_99:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_98]]
+-  // CHECK: %[[TMP_100:.*]] = stablehlo.multiply %[[TMP_97]], %[[TMP_99]]
+-  // CHECK: %[[TMP_101:.*]] = stablehlo.constant dense<-5.28419031E-10>
+-  // CHECK: %[[TMP_102:.*]] = stablehlo.add %[[TMP_95]], %[[TMP_101]]
+-  // CHECK: %[[TMP_103:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_102]]
+-  // CHECK: %[[TMP_104:.*]] = stablehlo.multiply %[[TMP_100]], %[[TMP_103]]
+-  // CHECK: %[[TMP_105:.*]] = stablehlo.constant dense<8.000000e+00>
+-  // CHECK: %[[TMP_106:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_105]]
+-  // CHECK: %[[TMP_107:.*]] = stablehlo.constant dense<7.000000e+00>
+-  // CHECK: %[[TMP_108:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_107]]
+-  // CHECK: %[[TMP_109:.*]] = stablehlo.multiply %[[TMP_106]], %[[TMP_108]]
+-  // CHECK: %[[TMP_110:.*]] = stablehlo.constant dense<2.08767563E-8>
+-  // CHECK: %[[TMP_111:.*]] = stablehlo.add %[[TMP_104]], %[[TMP_110]]
+-  // CHECK: %[[TMP_112:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_111]]
+-  // CHECK: %[[TMP_113:.*]] = stablehlo.multiply %[[TMP_109]], %[[TMP_112]]
+-  // CHECK: %[[TMP_114:.*]] = stablehlo.constant dense<6.000000e+00>
+-  // CHECK: %[[TMP_115:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_114]]
+-  // CHECK: %[[TMP_116:.*]] = stablehlo.constant dense<5.000000e+00>
+-  // CHECK: %[[TMP_117:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_116]]
+-  // CHECK: %[[TMP_118:.*]] = stablehlo.multiply %[[TMP_115]], %[[TMP_117]]
+-  // CHECK: %[[TMP_119:.*]] = stablehlo.constant dense<-8.26719599E-7>
+-  // CHECK: %[[TMP_120:.*]] = stablehlo.add %[[TMP_113]], %[[TMP_119]]
+-  // CHECK: %[[TMP_121:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_120]]
+-  // CHECK: %[[TMP_122:.*]] = stablehlo.multiply %[[TMP_118]], %[[TMP_121]]
+-  // CHECK: %[[TMP_123:.*]] = stablehlo.constant dense<4.000000e+00>
+-  // CHECK: %[[TMP_124:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_123]]
+-  // CHECK: %[[TMP_125:.*]] = stablehlo.constant dense<3.000000e+00>
+-  // CHECK: %[[TMP_126:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_125]]
+-  // CHECK: %[[TMP_127:.*]] = stablehlo.multiply %[[TMP_124]], %[[TMP_126]]
+-  // CHECK: %[[TMP_128:.*]] = stablehlo.constant dense<3.30687835E-5>
+-  // CHECK: %[[TMP_129:.*]] = stablehlo.add %[[TMP_122]], %[[TMP_128]]
+-  // CHECK: %[[TMP_130:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_129]]
+-  // CHECK: %[[TMP_131:.*]] = stablehlo.multiply %[[TMP_127]], %[[TMP_130]]
+-  // CHECK: %[[TMP_132:.*]] = stablehlo.constant dense<2.000000e+00>
+-  // CHECK: %[[TMP_133:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_132]]
+-  // CHECK: %[[TMP_134:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_135:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_134]]
+-  // CHECK: %[[TMP_136:.*]] = stablehlo.multiply %[[TMP_133]], %[[TMP_135]]
+-  // CHECK: %[[TMP_137:.*]] = stablehlo.constant dense<-0.00138888892>
+-  // CHECK: %[[TMP_138:.*]] = stablehlo.add %[[TMP_131]], %[[TMP_137]]
+-  // CHECK: %[[TMP_139:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_138]]
+-  // CHECK: %[[TMP_140:.*]] = stablehlo.multiply %[[TMP_136]], %[[TMP_139]]
+-  // CHECK: %[[TMP_141:.*]] = stablehlo.constant dense<5.000000e-01>
+-  // CHECK: %[[TMP_142:.*]] = stablehlo.divide %[[TMP_0]], %[[TMP_33]]
+-  // CHECK: %[[TMP_143:.*]] = stablehlo.constant dense<0.0833333358>
+-  // CHECK: %[[TMP_144:.*]] = stablehlo.add %[[TMP_143]], %[[TMP_140]]
+-  // CHECK: %[[TMP_145:.*]] = stablehlo.multiply %[[TMP_142]], %[[TMP_144]]
+-  // CHECK: %[[TMP_146:.*]] = stablehlo.add %[[TMP_141]], %[[TMP_145]]
+-  // CHECK: %[[TMP_147:.*]] = stablehlo.multiply %[[TMP_34]], %[[TMP_146]]
+-  // CHECK: %[[TMP_148:.*]] = stablehlo.add %[[TMP_39]], %[[TMP_147]]
++  // CHECK: %[[TMP_36:.*]] = stablehlo.multiply %[[TMP_34]], %[[TMP_33]]
++  // CHECK: %[[TMP_37:.*]] = stablehlo.subtract %[[TMP_0]], %[[TMP_35]]
++  // CHECK: %[[TMP_38:.*]] = stablehlo.divide %[[TMP_36]], %[[TMP_37]]
++  // CHECK: %[[TMP_39:.*]] = stablehlo.multiply %[[TMP_33]], %[[TMP_33]]
++  // CHECK: %[[TMP_40:.*]] = stablehlo.divide %[[TMP_3]], %[[TMP_39]]
++  // CHECK: %[[TMP_41:.*]] = stablehlo.constant dense<2.200000e+01>
++  // CHECK: %[[TMP_42:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_41]]
++  // CHECK: %[[TMP_43:.*]] = stablehlo.constant dense<2.100000e+01>
++  // CHECK: %[[TMP_44:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_43]]
++  // CHECK: %[[TMP_45:.*]] = stablehlo.multiply %[[TMP_42]], %[[TMP_44]]
++  // CHECK: %[[TMP_46:.*]] = stablehlo.constant dense<-1.39544646E-19>
++  // CHECK: %[[TMP_47:.*]] = stablehlo.add %[[TMP_2]], %[[TMP_46]]
++  // CHECK: %[[TMP_48:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_47]]
++  // CHECK: %[[TMP_49:.*]] = stablehlo.multiply %[[TMP_45]], %[[TMP_48]]
++  // CHECK: %[[TMP_50:.*]] = stablehlo.constant dense<2.000000e+01>
++  // CHECK: %[[TMP_51:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_50]]
++  // CHECK: %[[TMP_52:.*]] = stablehlo.constant dense<1.900000e+01>
++  // CHECK: %[[TMP_53:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_52]]
++  // CHECK: %[[TMP_54:.*]] = stablehlo.multiply %[[TMP_51]], %[[TMP_53]]
++  // CHECK: %[[TMP_55:.*]] = stablehlo.constant dense<5.50900303E-18>
++  // CHECK: %[[TMP_56:.*]] = stablehlo.add %[[TMP_49]], %[[TMP_55]]
++  // CHECK: %[[TMP_57:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_56]]
++  // CHECK: %[[TMP_58:.*]] = stablehlo.multiply %[[TMP_54]], %[[TMP_57]]
++  // CHECK: %[[TMP_59:.*]] = stablehlo.constant dense<1.800000e+01>
++  // CHECK: %[[TMP_60:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_59]]
++  // CHECK: %[[TMP_61:.*]] = stablehlo.constant dense<1.700000e+01>
++  // CHECK: %[[TMP_62:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_61]]
++  // CHECK: %[[TMP_63:.*]] = stablehlo.multiply %[[TMP_60]], %[[TMP_62]]
++  // CHECK: %[[TMP_64:.*]] = stablehlo.constant dense<-2.17486866E-16>
++  // CHECK: %[[TMP_65:.*]] = stablehlo.add %[[TMP_58]], %[[TMP_64]]
++  // CHECK: %[[TMP_66:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_65]]
++  // CHECK: %[[TMP_67:.*]] = stablehlo.multiply %[[TMP_63]], %[[TMP_66]]
++  // CHECK: %[[TMP_68:.*]] = stablehlo.constant dense<1.600000e+01>
++  // CHECK: %[[TMP_69:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_68]]
++  // CHECK: %[[TMP_70:.*]] = stablehlo.constant dense<1.500000e+01>
++  // CHECK: %[[TMP_71:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_70]]
++  // CHECK: %[[TMP_72:.*]] = stablehlo.multiply %[[TMP_69]], %[[TMP_71]]
++  // CHECK: %[[TMP_73:.*]] = stablehlo.constant dense<8.58606213E-15>
++  // CHECK: %[[TMP_74:.*]] = stablehlo.add %[[TMP_67]], %[[TMP_73]]
++  // CHECK: %[[TMP_75:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_74]]
++  // CHECK: %[[TMP_76:.*]] = stablehlo.multiply %[[TMP_72]], %[[TMP_75]]
++  // CHECK: %[[TMP_77:.*]] = stablehlo.constant dense<1.400000e+01>
++  // CHECK: %[[TMP_78:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_77]]
++  // CHECK: %[[TMP_79:.*]] = stablehlo.constant dense<1.300000e+01>
++  // CHECK: %[[TMP_80:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_79]]
++  // CHECK: %[[TMP_81:.*]] = stablehlo.multiply %[[TMP_78]], %[[TMP_80]]
++  // CHECK: %[[TMP_82:.*]] = stablehlo.constant dense<-3.3896803E-13>
++  // CHECK: %[[TMP_83:.*]] = stablehlo.add %[[TMP_76]], %[[TMP_82]]
++  // CHECK: %[[TMP_84:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_83]]
++  // CHECK: %[[TMP_85:.*]] = stablehlo.multiply %[[TMP_81]], %[[TMP_84]]
++  // CHECK: %[[TMP_86:.*]] = stablehlo.constant dense<1.200000e+01>
++  // CHECK: %[[TMP_87:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_86]]
++  // CHECK: %[[TMP_88:.*]] = stablehlo.constant dense<1.100000e+01>
++  // CHECK: %[[TMP_89:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_88]]
++  // CHECK: %[[TMP_90:.*]] = stablehlo.multiply %[[TMP_87]], %[[TMP_89]]
++  // CHECK: %[[TMP_91:.*]] = stablehlo.constant dense<1.33825364E-11>
++  // CHECK: %[[TMP_92:.*]] = stablehlo.add %[[TMP_85]], %[[TMP_91]]
++  // CHECK: %[[TMP_93:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_92]]
++  // CHECK: %[[TMP_94:.*]] = stablehlo.multiply %[[TMP_90]], %[[TMP_93]]
++  // CHECK: %[[TMP_95:.*]] = stablehlo.constant dense<1.000000e+01>
++  // CHECK: %[[TMP_96:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_95]]
++  // CHECK: %[[TMP_97:.*]] = stablehlo.constant dense<9.000000e+00>
++  // CHECK: %[[TMP_98:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_97]]
++  // CHECK: %[[TMP_99:.*]] = stablehlo.multiply %[[TMP_96]], %[[TMP_98]]
++  // CHECK: %[[TMP_100:.*]] = stablehlo.constant dense<-5.28419031E-10>
++  // CHECK: %[[TMP_101:.*]] = stablehlo.add %[[TMP_94]], %[[TMP_100]]
++  // CHECK: %[[TMP_102:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_101]]
++  // CHECK: %[[TMP_103:.*]] = stablehlo.multiply %[[TMP_99]], %[[TMP_102]]
++  // CHECK: %[[TMP_104:.*]] = stablehlo.constant dense<8.000000e+00>
++  // CHECK: %[[TMP_105:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_104]]
++  // CHECK: %[[TMP_106:.*]] = stablehlo.constant dense<7.000000e+00>
++  // CHECK: %[[TMP_107:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_106]]
++  // CHECK: %[[TMP_108:.*]] = stablehlo.multiply %[[TMP_105]], %[[TMP_107]]
++  // CHECK: %[[TMP_109:.*]] = stablehlo.constant dense<2.08767563E-8>
++  // CHECK: %[[TMP_110:.*]] = stablehlo.add %[[TMP_103]], %[[TMP_109]]
++  // CHECK: %[[TMP_111:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_110]]
++  // CHECK: %[[TMP_112:.*]] = stablehlo.multiply %[[TMP_108]], %[[TMP_111]]
++  // CHECK: %[[TMP_113:.*]] = stablehlo.constant dense<6.000000e+00>
++  // CHECK: %[[TMP_114:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_113]]
++  // CHECK: %[[TMP_115:.*]] = stablehlo.constant dense<5.000000e+00>
++  // CHECK: %[[TMP_116:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_115]]
++  // CHECK: %[[TMP_117:.*]] = stablehlo.multiply %[[TMP_114]], %[[TMP_116]]
++  // CHECK: %[[TMP_118:.*]] = stablehlo.constant dense<-8.26719599E-7>
++  // CHECK: %[[TMP_119:.*]] = stablehlo.add %[[TMP_112]], %[[TMP_118]]
++  // CHECK: %[[TMP_120:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_119]]
++  // CHECK: %[[TMP_121:.*]] = stablehlo.multiply %[[TMP_117]], %[[TMP_120]]
++  // CHECK: %[[TMP_122:.*]] = stablehlo.constant dense<4.000000e+00>
++  // CHECK: %[[TMP_123:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_122]]
++  // CHECK: %[[TMP_124:.*]] = stablehlo.constant dense<3.000000e+00>
++  // CHECK: %[[TMP_125:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_124]]
++  // CHECK: %[[TMP_126:.*]] = stablehlo.multiply %[[TMP_123]], %[[TMP_125]]
++  // CHECK: %[[TMP_127:.*]] = stablehlo.constant dense<3.30687835E-5>
++  // CHECK: %[[TMP_128:.*]] = stablehlo.add %[[TMP_121]], %[[TMP_127]]
++  // CHECK: %[[TMP_129:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_128]]
++  // CHECK: %[[TMP_130:.*]] = stablehlo.multiply %[[TMP_126]], %[[TMP_129]]
++  // CHECK: %[[TMP_131:.*]] = stablehlo.constant dense<2.000000e+00>
++  // CHECK: %[[TMP_132:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_131]]
++  // CHECK: %[[TMP_133:.*]] = stablehlo.constant dense<1.000000e+00>
++  // CHECK: %[[TMP_134:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_133]]
++  // CHECK: %[[TMP_135:.*]] = stablehlo.multiply %[[TMP_132]], %[[TMP_134]]
++  // CHECK: %[[TMP_136:.*]] = stablehlo.constant dense<-0.00138888892>
++  // CHECK: %[[TMP_137:.*]] = stablehlo.add %[[TMP_130]], %[[TMP_136]]
++  // CHECK: %[[TMP_138:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_137]]
++  // CHECK: %[[TMP_139:.*]] = stablehlo.multiply %[[TMP_135]], %[[TMP_138]]
++  // CHECK: %[[TMP_140:.*]] = stablehlo.constant dense<5.000000e-01>
++  // CHECK: %[[TMP_141:.*]] = stablehlo.divide %[[TMP_0]], %[[TMP_33]]
++  // CHECK: %[[TMP_142:.*]] = stablehlo.constant dense<0.0833333358>
++  // CHECK: %[[TMP_143:.*]] = stablehlo.add %[[TMP_142]], %[[TMP_139]]
++  // CHECK: %[[TMP_144:.*]] = stablehlo.multiply %[[TMP_141]], %[[TMP_143]]
++  // CHECK: %[[TMP_145:.*]] = stablehlo.add %[[TMP_140]], %[[TMP_144]]
++  // CHECK: %[[TMP_146:.*]] = stablehlo.multiply %[[TMP_34]], %[[TMP_145]]
++  // CHECK: %[[TMP_147:.*]] = stablehlo.add %[[TMP_32]], %[[TMP_38]]
++  // CHECK: %[[TMP_148:.*]] = stablehlo.add %[[TMP_147]], %[[TMP_146]]
+   // CHECK: %[[TMP_149:.*]] = stablehlo.abs %[[TMP_34]]
+   // CHECK: %[[TMP_150:.*]] = stablehlo.abs %[[TMP_32]]
+   // CHECK: %[[TMP_151:.*]] = stablehlo.constant dense<1.401300e-45>
+@@ -1456,7 +1456,7 @@
+   // CHECK: %[[TMP_172:.*]] = stablehlo.and %[[TMP_169]], %[[TMP_171]] : tensor<i1>
+   // CHECK: %[[TMP_173:.*]] = stablehlo.select %[[TMP_172]], %[[TMP_163]], %[[TMP_155]]
+   // CHECK: %[[TMP_174:.*]] = stablehlo.select %[[TMP_166]], %[[TMP_173]], %[[TMP_162]]
+-  // CHECK: %[[TMP_175:.*]] = stablehlo.compare EQ, %[[TMP_0]], %[[TMP_5]], NOTYPE
++  // CHECK: %[[TMP_175:.*]] = stablehlo.compare EQ, %[[TMP_0]], %[[TMP_3]], NOTYPE
+   // CHECK: %[[TMP_176:.*]] = stablehlo.select %[[TMP_175]], %[[TMP_163]], %[[TMP_174]]
+   // CHECK: %[[TMP_177:.*]] = stablehlo.convert %[[TMP_176]] : (tensor<f32>) -> tensor<f16>
+   %0 = chlo.zeta %arg0, %arg1 : tensor<f16>, tensor<f16> -> tensor<f16>
+@@ -1465,8 +1465,7 @@
+ 
+ // -----
+ 
 -
--// -----
+-// CHECK-LABEL: @polygamma_f32
++// CHECK: @polygamma_f32
+ // CHECK-SAME: (%[[ARG0:.*]]: tensor<f32>, %[[ARG1:.*]]: tensor<f32>)
+ func.func @polygamma_f32(%lhs : tensor<f32>, %rhs : tensor<f32>) -> tensor<f32> {
+   // CHECK-DAG: %[[TMP_0:.*]] = stablehlo.constant dense<1.000000e+00>
+@@ -1559,153 +1558,153 @@
+   // CHECK: %[[TMP_87:.*]] = stablehlo.constant dense<0x7F800000>
+   // CHECK: %[[TMP_88:.*]] = stablehlo.select %[[TMP_86]], %[[TMP_87]], %[[TMP_83]]
+   // CHECK: %[[TMP_89:.*]] = stablehlo.exponential %[[TMP_88]]
+-  // CHECK: %[[TMP_90:.*]] = stablehlo.constant dense<0.000000e+00>
+-  // CHECK: %[[TMP_91:.*]] = stablehlo.negate %[[TMP_5]]
+-  // CHECK: %[[TMP_92:.*]] = stablehlo.power %[[ARG1]], %[[TMP_91]]
+-  // CHECK: %[[TMP_93:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_94:.*]] = stablehlo.add %[[ARG1]], %[[TMP_93]]
+-  // CHECK: %[[TMP_95:.*]] = stablehlo.power %[[TMP_94]], %[[TMP_91]]
+-  // CHECK: %[[TMP_96:.*]] = stablehlo.add %[[TMP_92]], %[[TMP_95]]
+-  // CHECK: %[[TMP_97:.*]] = stablehlo.add %[[TMP_94]], %[[TMP_93]]
+-  // CHECK: %[[TMP_98:.*]] = stablehlo.power %[[TMP_97]], %[[TMP_91]]
++  // CHECK-DAG: %[[TMP_90:.*]] = stablehlo.constant dense<0.000000e+00>
++  // CHECK-DAG: %[[TMP_91:.*]] = stablehlo.constant dense<1.000000e+00>
++  // CHECK: %[[TMP_92:.*]] = stablehlo.negate %[[TMP_5]]
++  // CHECK: %[[TMP_93:.*]] = stablehlo.power %[[ARG1]], %[[TMP_92]]
++  // CHECK: %[[TMP_94:.*]] = stablehlo.add %[[ARG1]], %[[TMP_91]]
++  // CHECK: %[[TMP_95:.*]] = stablehlo.power %[[TMP_94]], %[[TMP_92]]
++  // CHECK: %[[TMP_96:.*]] = stablehlo.add %[[TMP_93]], %[[TMP_95]]
++  // CHECK: %[[TMP_97:.*]] = stablehlo.add %[[TMP_94]], %[[TMP_91]]
++  // CHECK: %[[TMP_98:.*]] = stablehlo.power %[[TMP_97]], %[[TMP_92]]
+   // CHECK: %[[TMP_99:.*]] = stablehlo.add %[[TMP_96]], %[[TMP_98]]
+-  // CHECK: %[[TMP_100:.*]] = stablehlo.add %[[TMP_97]], %[[TMP_93]]
+-  // CHECK: %[[TMP_101:.*]] = stablehlo.power %[[TMP_100]], %[[TMP_91]]
++  // CHECK: %[[TMP_100:.*]] = stablehlo.add %[[TMP_97]], %[[TMP_91]]
++  // CHECK: %[[TMP_101:.*]] = stablehlo.power %[[TMP_100]], %[[TMP_92]]
+   // CHECK: %[[TMP_102:.*]] = stablehlo.add %[[TMP_99]], %[[TMP_101]]
+-  // CHECK: %[[TMP_103:.*]] = stablehlo.add %[[TMP_100]], %[[TMP_93]]
+-  // CHECK: %[[TMP_104:.*]] = stablehlo.power %[[TMP_103]], %[[TMP_91]]
++  // CHECK: %[[TMP_103:.*]] = stablehlo.add %[[TMP_100]], %[[TMP_91]]
++  // CHECK: %[[TMP_104:.*]] = stablehlo.power %[[TMP_103]], %[[TMP_92]]
+   // CHECK: %[[TMP_105:.*]] = stablehlo.add %[[TMP_102]], %[[TMP_104]]
+-  // CHECK: %[[TMP_106:.*]] = stablehlo.add %[[TMP_103]], %[[TMP_93]]
+-  // CHECK: %[[TMP_107:.*]] = stablehlo.power %[[TMP_106]], %[[TMP_91]]
++  // CHECK: %[[TMP_106:.*]] = stablehlo.add %[[TMP_103]], %[[TMP_91]]
++  // CHECK: %[[TMP_107:.*]] = stablehlo.power %[[TMP_106]], %[[TMP_92]]
+   // CHECK: %[[TMP_108:.*]] = stablehlo.add %[[TMP_105]], %[[TMP_107]]
+-  // CHECK: %[[TMP_109:.*]] = stablehlo.add %[[TMP_106]], %[[TMP_93]]
+-  // CHECK: %[[TMP_110:.*]] = stablehlo.power %[[TMP_109]], %[[TMP_91]]
++  // CHECK: %[[TMP_109:.*]] = stablehlo.add %[[TMP_106]], %[[TMP_91]]
++  // CHECK: %[[TMP_110:.*]] = stablehlo.power %[[TMP_109]], %[[TMP_92]]
+   // CHECK: %[[TMP_111:.*]] = stablehlo.add %[[TMP_108]], %[[TMP_110]]
+-  // CHECK: %[[TMP_112:.*]] = stablehlo.add %[[TMP_109]], %[[TMP_93]]
+-  // CHECK: %[[TMP_113:.*]] = stablehlo.power %[[TMP_112]], %[[TMP_91]]
++  // CHECK: %[[TMP_112:.*]] = stablehlo.add %[[TMP_109]], %[[TMP_91]]
++  // CHECK: %[[TMP_113:.*]] = stablehlo.power %[[TMP_112]], %[[TMP_92]]
+   // CHECK: %[[TMP_114:.*]] = stablehlo.add %[[TMP_111]], %[[TMP_113]]
+-  // CHECK: %[[TMP_115:.*]] = stablehlo.add %[[TMP_112]], %[[TMP_93]]
+-  // CHECK: %[[TMP_116:.*]] = stablehlo.power %[[TMP_115]], %[[TMP_91]]
++  // CHECK: %[[TMP_115:.*]] = stablehlo.add %[[TMP_112]], %[[TMP_91]]
++  // CHECK: %[[TMP_116:.*]] = stablehlo.power %[[TMP_115]], %[[TMP_92]]
+   // CHECK: %[[TMP_117:.*]] = stablehlo.add %[[TMP_114]], %[[TMP_116]]
+-  // CHECK: %[[TMP_118:.*]] = stablehlo.add %[[TMP_115]], %[[TMP_93]]
+-  // CHECK: %[[TMP_119:.*]] = stablehlo.power %[[TMP_118]], %[[TMP_91]]
++  // CHECK: %[[TMP_118:.*]] = stablehlo.add %[[TMP_115]], %[[TMP_91]]
++  // CHECK: %[[TMP_119:.*]] = stablehlo.power %[[TMP_118]], %[[TMP_92]]
+   // CHECK: %[[TMP_120:.*]] = stablehlo.add %[[TMP_117]], %[[TMP_119]]
+-  // CHECK: %[[TMP_121:.*]] = stablehlo.add %[[TMP_118]], %[[TMP_93]]
+-  // CHECK: %[[TMP_122:.*]] = stablehlo.power %[[TMP_121]], %[[TMP_91]]
++  // CHECK: %[[TMP_121:.*]] = stablehlo.add %[[TMP_118]], %[[TMP_91]]
++  // CHECK: %[[TMP_122:.*]] = stablehlo.power %[[TMP_121]], %[[TMP_92]]
+   // CHECK: %[[TMP_123:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_124:.*]] = stablehlo.subtract %[[TMP_5]], %[[TMP_123]]
+-  // CHECK: %[[TMP_125:.*]] = stablehlo.multiply %[[TMP_122]], %[[TMP_121]]
+-  // CHECK: %[[TMP_126:.*]] = stablehlo.divide %[[TMP_125]], %[[TMP_124]]
+-  // CHECK: %[[TMP_127:.*]] = stablehlo.add %[[TMP_120]], %[[TMP_126]]
+-  // CHECK: %[[TMP_128:.*]] = stablehlo.multiply %[[TMP_121]], %[[TMP_121]]
+-  // CHECK: %[[TMP_129:.*]] = stablehlo.divide %[[TMP_93]], %[[TMP_128]]
+-  // CHECK: %[[TMP_130:.*]] = stablehlo.constant dense<2.200000e+01>
+-  // CHECK: %[[TMP_131:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_130]]
+-  // CHECK: %[[TMP_132:.*]] = stablehlo.constant dense<2.100000e+01>
+-  // CHECK: %[[TMP_133:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_132]]
+-  // CHECK: %[[TMP_134:.*]] = stablehlo.multiply %[[TMP_131]], %[[TMP_133]]
+-  // CHECK: %[[TMP_135:.*]] = stablehlo.constant dense<-1.39544646E-19>
+-  // CHECK: %[[TMP_136:.*]] = stablehlo.add %[[TMP_90]], %[[TMP_135]]
+-  // CHECK: %[[TMP_137:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_136]]
+-  // CHECK: %[[TMP_138:.*]] = stablehlo.multiply %[[TMP_134]], %[[TMP_137]]
+-  // CHECK: %[[TMP_139:.*]] = stablehlo.constant dense<2.000000e+01>
+-  // CHECK: %[[TMP_140:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_139]]
+-  // CHECK: %[[TMP_141:.*]] = stablehlo.constant dense<1.900000e+01>
+-  // CHECK: %[[TMP_142:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_141]]
+-  // CHECK: %[[TMP_143:.*]] = stablehlo.multiply %[[TMP_140]], %[[TMP_142]]
+-  // CHECK: %[[TMP_144:.*]] = stablehlo.constant dense<5.50900303E-18>
+-  // CHECK: %[[TMP_145:.*]] = stablehlo.add %[[TMP_138]], %[[TMP_144]]
+-  // CHECK: %[[TMP_146:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_145]]
+-  // CHECK: %[[TMP_147:.*]] = stablehlo.multiply %[[TMP_143]], %[[TMP_146]]
+-  // CHECK: %[[TMP_148:.*]] = stablehlo.constant dense<1.800000e+01>
+-  // CHECK: %[[TMP_149:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_148]]
+-  // CHECK: %[[TMP_150:.*]] = stablehlo.constant dense<1.700000e+01>
+-  // CHECK: %[[TMP_151:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_150]]
+-  // CHECK: %[[TMP_152:.*]] = stablehlo.multiply %[[TMP_149]], %[[TMP_151]]
+-  // CHECK: %[[TMP_153:.*]] = stablehlo.constant dense<-2.17486866E-16>
+-  // CHECK: %[[TMP_154:.*]] = stablehlo.add %[[TMP_147]], %[[TMP_153]]
+-  // CHECK: %[[TMP_155:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_154]]
+-  // CHECK: %[[TMP_156:.*]] = stablehlo.multiply %[[TMP_152]], %[[TMP_155]]
+-  // CHECK: %[[TMP_157:.*]] = stablehlo.constant dense<1.600000e+01>
+-  // CHECK: %[[TMP_158:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_157]]
+-  // CHECK: %[[TMP_159:.*]] = stablehlo.constant dense<1.500000e+01>
+-  // CHECK: %[[TMP_160:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_159]]
+-  // CHECK: %[[TMP_161:.*]] = stablehlo.multiply %[[TMP_158]], %[[TMP_160]]
+-  // CHECK: %[[TMP_162:.*]] = stablehlo.constant dense<8.58606213E-15>
+-  // CHECK: %[[TMP_163:.*]] = stablehlo.add %[[TMP_156]], %[[TMP_162]]
+-  // CHECK: %[[TMP_164:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_163]]
+-  // CHECK: %[[TMP_165:.*]] = stablehlo.multiply %[[TMP_161]], %[[TMP_164]]
+-  // CHECK: %[[TMP_166:.*]] = stablehlo.constant dense<1.400000e+01>
+-  // CHECK: %[[TMP_167:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_166]]
+-  // CHECK: %[[TMP_168:.*]] = stablehlo.constant dense<1.300000e+01>
+-  // CHECK: %[[TMP_169:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_168]]
+-  // CHECK: %[[TMP_170:.*]] = stablehlo.multiply %[[TMP_167]], %[[TMP_169]]
+-  // CHECK: %[[TMP_171:.*]] = stablehlo.constant dense<-3.3896803E-13>
+-  // CHECK: %[[TMP_172:.*]] = stablehlo.add %[[TMP_165]], %[[TMP_171]]
+-  // CHECK: %[[TMP_173:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_172]]
+-  // CHECK: %[[TMP_174:.*]] = stablehlo.multiply %[[TMP_170]], %[[TMP_173]]
+-  // CHECK: %[[TMP_175:.*]] = stablehlo.constant dense<1.200000e+01>
+-  // CHECK: %[[TMP_176:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_175]]
+-  // CHECK: %[[TMP_177:.*]] = stablehlo.constant dense<1.100000e+01>
+-  // CHECK: %[[TMP_178:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_177]]
+-  // CHECK: %[[TMP_179:.*]] = stablehlo.multiply %[[TMP_176]], %[[TMP_178]]
+-  // CHECK: %[[TMP_180:.*]] = stablehlo.constant dense<1.33825364E-11>
+-  // CHECK: %[[TMP_181:.*]] = stablehlo.add %[[TMP_174]], %[[TMP_180]]
+-  // CHECK: %[[TMP_182:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_181]]
+-  // CHECK: %[[TMP_183:.*]] = stablehlo.multiply %[[TMP_179]], %[[TMP_182]]
+-  // CHECK: %[[TMP_184:.*]] = stablehlo.constant dense<1.000000e+01>
+-  // CHECK: %[[TMP_185:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_184]]
+-  // CHECK: %[[TMP_186:.*]] = stablehlo.constant dense<9.000000e+00>
+-  // CHECK: %[[TMP_187:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_186]]
+-  // CHECK: %[[TMP_188:.*]] = stablehlo.multiply %[[TMP_185]], %[[TMP_187]]
+-  // CHECK: %[[TMP_189:.*]] = stablehlo.constant dense<-5.28419031E-10>
+-  // CHECK: %[[TMP_190:.*]] = stablehlo.add %[[TMP_183]], %[[TMP_189]]
+-  // CHECK: %[[TMP_191:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_190]]
+-  // CHECK: %[[TMP_192:.*]] = stablehlo.multiply %[[TMP_188]], %[[TMP_191]]
+-  // CHECK: %[[TMP_193:.*]] = stablehlo.constant dense<8.000000e+00>
+-  // CHECK: %[[TMP_194:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_193]]
+-  // CHECK: %[[TMP_195:.*]] = stablehlo.constant dense<7.000000e+00>
+-  // CHECK: %[[TMP_196:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_195]]
+-  // CHECK: %[[TMP_197:.*]] = stablehlo.multiply %[[TMP_194]], %[[TMP_196]]
+-  // CHECK: %[[TMP_198:.*]] = stablehlo.constant dense<2.08767563E-8>
+-  // CHECK: %[[TMP_199:.*]] = stablehlo.add %[[TMP_192]], %[[TMP_198]]
+-  // CHECK: %[[TMP_200:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_199]]
+-  // CHECK: %[[TMP_201:.*]] = stablehlo.multiply %[[TMP_197]], %[[TMP_200]]
+-  // CHECK: %[[TMP_202:.*]] = stablehlo.constant dense<6.000000e+00>
+-  // CHECK: %[[TMP_203:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_202]]
+-  // CHECK: %[[TMP_204:.*]] = stablehlo.constant dense<5.000000e+00>
+-  // CHECK: %[[TMP_205:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_204]]
+-  // CHECK: %[[TMP_206:.*]] = stablehlo.multiply %[[TMP_203]], %[[TMP_205]]
+-  // CHECK: %[[TMP_207:.*]] = stablehlo.constant dense<-8.26719599E-7>
+-  // CHECK: %[[TMP_208:.*]] = stablehlo.add %[[TMP_201]], %[[TMP_207]]
+-  // CHECK: %[[TMP_209:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_208]]
+-  // CHECK: %[[TMP_210:.*]] = stablehlo.multiply %[[TMP_206]], %[[TMP_209]]
+-  // CHECK: %[[TMP_211:.*]] = stablehlo.constant dense<4.000000e+00>
+-  // CHECK: %[[TMP_212:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_211]]
+-  // CHECK: %[[TMP_213:.*]] = stablehlo.constant dense<3.000000e+00>
+-  // CHECK: %[[TMP_214:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_213]]
+-  // CHECK: %[[TMP_215:.*]] = stablehlo.multiply %[[TMP_212]], %[[TMP_214]]
+-  // CHECK: %[[TMP_216:.*]] = stablehlo.constant dense<3.30687835E-5>
+-  // CHECK: %[[TMP_217:.*]] = stablehlo.add %[[TMP_210]], %[[TMP_216]]
+-  // CHECK: %[[TMP_218:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_217]]
+-  // CHECK: %[[TMP_219:.*]] = stablehlo.multiply %[[TMP_215]], %[[TMP_218]]
+-  // CHECK: %[[TMP_220:.*]] = stablehlo.constant dense<2.000000e+00>
+-  // CHECK: %[[TMP_221:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_220]]
+-  // CHECK: %[[TMP_222:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_223:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_222]]
+-  // CHECK: %[[TMP_224:.*]] = stablehlo.multiply %[[TMP_221]], %[[TMP_223]]
+-  // CHECK: %[[TMP_225:.*]] = stablehlo.constant dense<-0.00138888892>
+-  // CHECK: %[[TMP_226:.*]] = stablehlo.add %[[TMP_219]], %[[TMP_225]]
+-  // CHECK: %[[TMP_227:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_226]]
+-  // CHECK: %[[TMP_228:.*]] = stablehlo.multiply %[[TMP_224]], %[[TMP_227]]
+-  // CHECK: %[[TMP_229:.*]] = stablehlo.constant dense<5.000000e-01>
+-  // CHECK: %[[TMP_230:.*]] = stablehlo.divide %[[TMP_5]], %[[TMP_121]]
+-  // CHECK: %[[TMP_231:.*]] = stablehlo.constant dense<0.0833333358>
+-  // CHECK: %[[TMP_232:.*]] = stablehlo.add %[[TMP_231]], %[[TMP_228]]
+-  // CHECK: %[[TMP_233:.*]] = stablehlo.multiply %[[TMP_230]], %[[TMP_232]]
+-  // CHECK: %[[TMP_234:.*]] = stablehlo.add %[[TMP_229]], %[[TMP_233]]
+-  // CHECK: %[[TMP_235:.*]] = stablehlo.multiply %[[TMP_122]], %[[TMP_234]]
+-  // CHECK: %[[TMP_236:.*]] = stablehlo.add %[[TMP_127]], %[[TMP_235]]
++  // CHECK: %[[TMP_124:.*]] = stablehlo.multiply %[[TMP_122]], %[[TMP_121]]
++  // CHECK: %[[TMP_125:.*]] = stablehlo.subtract %[[TMP_5]], %[[TMP_123]]
++  // CHECK: %[[TMP_126:.*]] = stablehlo.divide %[[TMP_124]], %[[TMP_125]]
++  // CHECK: %[[TMP_127:.*]] = stablehlo.multiply %[[TMP_121]], %[[TMP_121]]
++  // CHECK: %[[TMP_128:.*]] = stablehlo.divide %[[TMP_91]], %[[TMP_127]]
++  // CHECK: %[[TMP_129:.*]] = stablehlo.constant dense<2.200000e+01>
++  // CHECK: %[[TMP_130:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_129]]
++  // CHECK: %[[TMP_131:.*]] = stablehlo.constant dense<2.100000e+01>
++  // CHECK: %[[TMP_132:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_131]]
++  // CHECK: %[[TMP_133:.*]] = stablehlo.multiply %[[TMP_130]], %[[TMP_132]]
++  // CHECK: %[[TMP_134:.*]] = stablehlo.constant dense<-1.39544646E-19>
++  // CHECK: %[[TMP_135:.*]] = stablehlo.add %[[TMP_90]], %[[TMP_134]]
++  // CHECK: %[[TMP_136:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_135]]
++  // CHECK: %[[TMP_137:.*]] = stablehlo.multiply %[[TMP_133]], %[[TMP_136]]
++  // CHECK: %[[TMP_138:.*]] = stablehlo.constant dense<2.000000e+01>
++  // CHECK: %[[TMP_139:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_138]]
++  // CHECK: %[[TMP_140:.*]] = stablehlo.constant dense<1.900000e+01>
++  // CHECK: %[[TMP_141:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_140]]
++  // CHECK: %[[TMP_142:.*]] = stablehlo.multiply %[[TMP_139]], %[[TMP_141]]
++  // CHECK: %[[TMP_143:.*]] = stablehlo.constant dense<5.50900303E-18>
++  // CHECK: %[[TMP_144:.*]] = stablehlo.add %[[TMP_137]], %[[TMP_143]]
++  // CHECK: %[[TMP_145:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_144]]
++  // CHECK: %[[TMP_146:.*]] = stablehlo.multiply %[[TMP_142]], %[[TMP_145]]
++  // CHECK: %[[TMP_147:.*]] = stablehlo.constant dense<1.800000e+01>
++  // CHECK: %[[TMP_148:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_147]]
++  // CHECK: %[[TMP_149:.*]] = stablehlo.constant dense<1.700000e+01>
++  // CHECK: %[[TMP_150:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_149]]
++  // CHECK: %[[TMP_151:.*]] = stablehlo.multiply %[[TMP_148]], %[[TMP_150]]
++  // CHECK: %[[TMP_152:.*]] = stablehlo.constant dense<-2.17486866E-16>
++  // CHECK: %[[TMP_153:.*]] = stablehlo.add %[[TMP_146]], %[[TMP_152]]
++  // CHECK: %[[TMP_154:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_153]]
++  // CHECK: %[[TMP_155:.*]] = stablehlo.multiply %[[TMP_151]], %[[TMP_154]]
++  // CHECK: %[[TMP_156:.*]] = stablehlo.constant dense<1.600000e+01>
++  // CHECK: %[[TMP_157:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_156]]
++  // CHECK: %[[TMP_158:.*]] = stablehlo.constant dense<1.500000e+01>
++  // CHECK: %[[TMP_159:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_158]]
++  // CHECK: %[[TMP_160:.*]] = stablehlo.multiply %[[TMP_157]], %[[TMP_159]]
++  // CHECK: %[[TMP_161:.*]] = stablehlo.constant dense<8.58606213E-15>
++  // CHECK: %[[TMP_162:.*]] = stablehlo.add %[[TMP_155]], %[[TMP_161]]
++  // CHECK: %[[TMP_163:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_162]]
++  // CHECK: %[[TMP_164:.*]] = stablehlo.multiply %[[TMP_160]], %[[TMP_163]]
++  // CHECK: %[[TMP_165:.*]] = stablehlo.constant dense<1.400000e+01>
++  // CHECK: %[[TMP_166:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_165]]
++  // CHECK: %[[TMP_167:.*]] = stablehlo.constant dense<1.300000e+01>
++  // CHECK: %[[TMP_168:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_167]]
++  // CHECK: %[[TMP_169:.*]] = stablehlo.multiply %[[TMP_166]], %[[TMP_168]]
++  // CHECK: %[[TMP_170:.*]] = stablehlo.constant dense<-3.3896803E-13>
++  // CHECK: %[[TMP_171:.*]] = stablehlo.add %[[TMP_164]], %[[TMP_170]]
++  // CHECK: %[[TMP_172:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_171]]
++  // CHECK: %[[TMP_173:.*]] = stablehlo.multiply %[[TMP_169]], %[[TMP_172]]
++  // CHECK: %[[TMP_174:.*]] = stablehlo.constant dense<1.200000e+01>
++  // CHECK: %[[TMP_175:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_174]]
++  // CHECK: %[[TMP_176:.*]] = stablehlo.constant dense<1.100000e+01>
++  // CHECK: %[[TMP_177:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_176]]
++  // CHECK: %[[TMP_178:.*]] = stablehlo.multiply %[[TMP_175]], %[[TMP_177]]
++  // CHECK: %[[TMP_179:.*]] = stablehlo.constant dense<1.33825364E-11>
++  // CHECK: %[[TMP_180:.*]] = stablehlo.add %[[TMP_173]], %[[TMP_179]]
++  // CHECK: %[[TMP_181:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_180]]
++  // CHECK: %[[TMP_182:.*]] = stablehlo.multiply %[[TMP_178]], %[[TMP_181]]
++  // CHECK: %[[TMP_183:.*]] = stablehlo.constant dense<1.000000e+01>
++  // CHECK: %[[TMP_184:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_183]]
++  // CHECK: %[[TMP_185:.*]] = stablehlo.constant dense<9.000000e+00>
++  // CHECK: %[[TMP_186:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_185]]
++  // CHECK: %[[TMP_187:.*]] = stablehlo.multiply %[[TMP_184]], %[[TMP_186]]
++  // CHECK: %[[TMP_188:.*]] = stablehlo.constant dense<-5.28419031E-10>
++  // CHECK: %[[TMP_189:.*]] = stablehlo.add %[[TMP_182]], %[[TMP_188]]
++  // CHECK: %[[TMP_190:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_189]]
++  // CHECK: %[[TMP_191:.*]] = stablehlo.multiply %[[TMP_187]], %[[TMP_190]]
++  // CHECK: %[[TMP_192:.*]] = stablehlo.constant dense<8.000000e+00>
++  // CHECK: %[[TMP_193:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_192]]
++  // CHECK: %[[TMP_194:.*]] = stablehlo.constant dense<7.000000e+00>
++  // CHECK: %[[TMP_195:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_194]]
++  // CHECK: %[[TMP_196:.*]] = stablehlo.multiply %[[TMP_193]], %[[TMP_195]]
++  // CHECK: %[[TMP_197:.*]] = stablehlo.constant dense<2.08767563E-8>
++  // CHECK: %[[TMP_198:.*]] = stablehlo.add %[[TMP_191]], %[[TMP_197]]
++  // CHECK: %[[TMP_199:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_198]]
++  // CHECK: %[[TMP_200:.*]] = stablehlo.multiply %[[TMP_196]], %[[TMP_199]]
++  // CHECK: %[[TMP_201:.*]] = stablehlo.constant dense<6.000000e+00>
++  // CHECK: %[[TMP_202:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_201]]
++  // CHECK: %[[TMP_203:.*]] = stablehlo.constant dense<5.000000e+00>
++  // CHECK: %[[TMP_204:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_203]]
++  // CHECK: %[[TMP_205:.*]] = stablehlo.multiply %[[TMP_202]], %[[TMP_204]]
++  // CHECK: %[[TMP_206:.*]] = stablehlo.constant dense<-8.26719599E-7>
++  // CHECK: %[[TMP_207:.*]] = stablehlo.add %[[TMP_200]], %[[TMP_206]]
++  // CHECK: %[[TMP_208:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_207]]
++  // CHECK: %[[TMP_209:.*]] = stablehlo.multiply %[[TMP_205]], %[[TMP_208]]
++  // CHECK: %[[TMP_210:.*]] = stablehlo.constant dense<4.000000e+00>
++  // CHECK: %[[TMP_211:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_210]]
++  // CHECK: %[[TMP_212:.*]] = stablehlo.constant dense<3.000000e+00>
++  // CHECK: %[[TMP_213:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_212]]
++  // CHECK: %[[TMP_214:.*]] = stablehlo.multiply %[[TMP_211]], %[[TMP_213]]
++  // CHECK: %[[TMP_215:.*]] = stablehlo.constant dense<3.30687835E-5>
++  // CHECK: %[[TMP_216:.*]] = stablehlo.add %[[TMP_209]], %[[TMP_215]]
++  // CHECK: %[[TMP_217:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_216]]
++  // CHECK: %[[TMP_218:.*]] = stablehlo.multiply %[[TMP_214]], %[[TMP_217]]
++  // CHECK: %[[TMP_219:.*]] = stablehlo.constant dense<2.000000e+00>
++  // CHECK: %[[TMP_220:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_219]]
++  // CHECK: %[[TMP_221:.*]] = stablehlo.constant dense<1.000000e+00>
++  // CHECK: %[[TMP_222:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_221]]
++  // CHECK: %[[TMP_223:.*]] = stablehlo.multiply %[[TMP_220]], %[[TMP_222]]
++  // CHECK: %[[TMP_224:.*]] = stablehlo.constant dense<-0.00138888892>
++  // CHECK: %[[TMP_225:.*]] = stablehlo.add %[[TMP_218]], %[[TMP_224]]
++  // CHECK: %[[TMP_226:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_225]]
++  // CHECK: %[[TMP_227:.*]] = stablehlo.multiply %[[TMP_223]], %[[TMP_226]]
++  // CHECK: %[[TMP_228:.*]] = stablehlo.constant dense<5.000000e-01>
++  // CHECK: %[[TMP_229:.*]] = stablehlo.divide %[[TMP_5]], %[[TMP_121]]
++  // CHECK: %[[TMP_230:.*]] = stablehlo.constant dense<0.0833333358>
++  // CHECK: %[[TMP_231:.*]] = stablehlo.add %[[TMP_230]], %[[TMP_227]]
++  // CHECK: %[[TMP_232:.*]] = stablehlo.multiply %[[TMP_229]], %[[TMP_231]]
++  // CHECK: %[[TMP_233:.*]] = stablehlo.add %[[TMP_228]], %[[TMP_232]]
++  // CHECK: %[[TMP_234:.*]] = stablehlo.multiply %[[TMP_122]], %[[TMP_233]]
++  // CHECK: %[[TMP_235:.*]] = stablehlo.add %[[TMP_120]], %[[TMP_126]]
++  // CHECK: %[[TMP_236:.*]] = stablehlo.add %[[TMP_235]], %[[TMP_234]]
+   // CHECK: %[[TMP_237:.*]] = stablehlo.abs %[[TMP_122]]
+   // CHECK: %[[TMP_238:.*]] = stablehlo.abs %[[TMP_120]]
+   // CHECK: %[[TMP_239:.*]] = stablehlo.constant dense<1.401300e-45>
+@@ -1732,7 +1731,7 @@
+   // CHECK: %[[TMP_260:.*]] = stablehlo.and %[[TMP_257]], %[[TMP_259]]
+   // CHECK: %[[TMP_261:.*]] = stablehlo.select %[[TMP_260]], %[[TMP_251]], %[[TMP_243]]
+   // CHECK: %[[TMP_262:.*]] = stablehlo.select %[[TMP_254]], %[[TMP_261]], %[[TMP_250]]
+-  // CHECK: %[[TMP_263:.*]] = stablehlo.compare EQ, %[[TMP_5]], %[[TMP_93]], NOTYPE
++  // CHECK: %[[TMP_263:.*]] = stablehlo.compare EQ, %[[TMP_5]], %[[TMP_91]], NOTYPE
+   // CHECK: %[[TMP_264:.*]] = stablehlo.select %[[TMP_263]], %[[TMP_251]], %[[TMP_262]]
+   // CHECK: %[[TMP_265:.*]] = stablehlo.multiply %[[TMP_4]], %[[TMP_89]]
+   // CHECK: %[[TMP_266:.*]] = stablehlo.multiply %[[TMP_265]], %[[TMP_264]]
+@@ -1853,8 +1852,7 @@
+ 
+ // -----
+ 
 -
--func.func @convolution_c32(%arg0: tensor<1x8x8x207x!quant.uniform<i8:f32, 2.0:15>>, %arg1: tensor<3x3x207x16x!quant.uniform<i8:f32:0, {0.1:-30}>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32:0, {0.1:-30}>> {
--  // expected-error@+1 {{mismatched kernel_output_feature_dimension 3 and rhs quantized dimension 0}}
--  %0 = stablehlo.convolution(%arg0, %arg1)
--         dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f],
--         window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]}
--         {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo<precision DEFAULT>, #stablehlo<precision DEFAULT>]} :
--       (tensor<1x8x8x207x!quant.uniform<i8:f32, 2.0:15>>, tensor<3x3x207x16x!quant.uniform<i8:f32:0, {0.1:-30}>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32:0, {0.1:-30}>>
--  func.return %0 : tensor<1x8x8x16x!quant.uniform<i8:f32:0, {0.1:-30}>>
--}
+-// CHECK-LABEL: @polygamma_f64
++// CHECK: @polygamma_f64
+ // CHECK-SAME: (%[[ARG0:.*]]: tensor<f64>, %[[ARG1:.*]]: tensor<f64>)
+ func.func @polygamma_f64(%lhs : tensor<f64>, %rhs : tensor<f64>) -> tensor<f64> {
+   // CHECK-DAG: %[[TMP_0:.*]] = stablehlo.constant dense<1.000000e+00>
+@@ -1947,153 +1945,153 @@
+   // CHECK: %[[TMP_87:.*]] = stablehlo.constant dense<0x7FF0000000000000>
+   // CHECK: %[[TMP_88:.*]] = stablehlo.select %[[TMP_86]], %[[TMP_87]], %[[TMP_83]]
+   // CHECK: %[[TMP_89:.*]] = stablehlo.exponential %[[TMP_88]]
+-  // CHECK: %[[TMP_90:.*]] = stablehlo.constant dense<0.000000e+00>
+-  // CHECK: %[[TMP_91:.*]] = stablehlo.negate %[[TMP_5]]
+-  // CHECK: %[[TMP_92:.*]] = stablehlo.power %[[ARG1]], %[[TMP_91]]
+-  // CHECK: %[[TMP_93:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_94:.*]] = stablehlo.add %[[ARG1]], %[[TMP_93]]
+-  // CHECK: %[[TMP_95:.*]] = stablehlo.power %[[TMP_94]], %[[TMP_91]]
+-  // CHECK: %[[TMP_96:.*]] = stablehlo.add %[[TMP_92]], %[[TMP_95]]
+-  // CHECK: %[[TMP_97:.*]] = stablehlo.add %[[TMP_94]], %[[TMP_93]]
+-  // CHECK: %[[TMP_98:.*]] = stablehlo.power %[[TMP_97]], %[[TMP_91]]
++  // CHECK-DAG: %[[TMP_90:.*]] = stablehlo.constant dense<0.000000e+00>
++  // CHECK-DAG: %[[TMP_91:.*]] = stablehlo.constant dense<1.000000e+00>
++  // CHECK: %[[TMP_92:.*]] = stablehlo.negate %[[TMP_5]]
++  // CHECK: %[[TMP_93:.*]] = stablehlo.power %[[ARG1]], %[[TMP_92]]
++  // CHECK: %[[TMP_94:.*]] = stablehlo.add %[[ARG1]], %[[TMP_91]]
++  // CHECK: %[[TMP_95:.*]] = stablehlo.power %[[TMP_94]], %[[TMP_92]]
++  // CHECK: %[[TMP_96:.*]] = stablehlo.add %[[TMP_93]], %[[TMP_95]]
++  // CHECK: %[[TMP_97:.*]] = stablehlo.add %[[TMP_94]], %[[TMP_91]]
++  // CHECK: %[[TMP_98:.*]] = stablehlo.power %[[TMP_97]], %[[TMP_92]]
+   // CHECK: %[[TMP_99:.*]] = stablehlo.add %[[TMP_96]], %[[TMP_98]]
+-  // CHECK: %[[TMP_100:.*]] = stablehlo.add %[[TMP_97]], %[[TMP_93]]
+-  // CHECK: %[[TMP_101:.*]] = stablehlo.power %[[TMP_100]], %[[TMP_91]]
++  // CHECK: %[[TMP_100:.*]] = stablehlo.add %[[TMP_97]], %[[TMP_91]]
++  // CHECK: %[[TMP_101:.*]] = stablehlo.power %[[TMP_100]], %[[TMP_92]]
+   // CHECK: %[[TMP_102:.*]] = stablehlo.add %[[TMP_99]], %[[TMP_101]]
+-  // CHECK: %[[TMP_103:.*]] = stablehlo.add %[[TMP_100]], %[[TMP_93]]
+-  // CHECK: %[[TMP_104:.*]] = stablehlo.power %[[TMP_103]], %[[TMP_91]]
++  // CHECK: %[[TMP_103:.*]] = stablehlo.add %[[TMP_100]], %[[TMP_91]]
++  // CHECK: %[[TMP_104:.*]] = stablehlo.power %[[TMP_103]], %[[TMP_92]]
+   // CHECK: %[[TMP_105:.*]] = stablehlo.add %[[TMP_102]], %[[TMP_104]]
+-  // CHECK: %[[TMP_106:.*]] = stablehlo.add %[[TMP_103]], %[[TMP_93]]
+-  // CHECK: %[[TMP_107:.*]] = stablehlo.power %[[TMP_106]], %[[TMP_91]]
++  // CHECK: %[[TMP_106:.*]] = stablehlo.add %[[TMP_103]], %[[TMP_91]]
++  // CHECK: %[[TMP_107:.*]] = stablehlo.power %[[TMP_106]], %[[TMP_92]]
+   // CHECK: %[[TMP_108:.*]] = stablehlo.add %[[TMP_105]], %[[TMP_107]]
+-  // CHECK: %[[TMP_109:.*]] = stablehlo.add %[[TMP_106]], %[[TMP_93]]
+-  // CHECK: %[[TMP_110:.*]] = stablehlo.power %[[TMP_109]], %[[TMP_91]]
++  // CHECK: %[[TMP_109:.*]] = stablehlo.add %[[TMP_106]], %[[TMP_91]]
++  // CHECK: %[[TMP_110:.*]] = stablehlo.power %[[TMP_109]], %[[TMP_92]]
+   // CHECK: %[[TMP_111:.*]] = stablehlo.add %[[TMP_108]], %[[TMP_110]]
+-  // CHECK: %[[TMP_112:.*]] = stablehlo.add %[[TMP_109]], %[[TMP_93]]
+-  // CHECK: %[[TMP_113:.*]] = stablehlo.power %[[TMP_112]], %[[TMP_91]]
++  // CHECK: %[[TMP_112:.*]] = stablehlo.add %[[TMP_109]], %[[TMP_91]]
++  // CHECK: %[[TMP_113:.*]] = stablehlo.power %[[TMP_112]], %[[TMP_92]]
+   // CHECK: %[[TMP_114:.*]] = stablehlo.add %[[TMP_111]], %[[TMP_113]]
+-  // CHECK: %[[TMP_115:.*]] = stablehlo.add %[[TMP_112]], %[[TMP_93]]
+-  // CHECK: %[[TMP_116:.*]] = stablehlo.power %[[TMP_115]], %[[TMP_91]]
++  // CHECK: %[[TMP_115:.*]] = stablehlo.add %[[TMP_112]], %[[TMP_91]]
++  // CHECK: %[[TMP_116:.*]] = stablehlo.power %[[TMP_115]], %[[TMP_92]]
+   // CHECK: %[[TMP_117:.*]] = stablehlo.add %[[TMP_114]], %[[TMP_116]]
+-  // CHECK: %[[TMP_118:.*]] = stablehlo.add %[[TMP_115]], %[[TMP_93]]
+-  // CHECK: %[[TMP_119:.*]] = stablehlo.power %[[TMP_118]], %[[TMP_91]]
++  // CHECK: %[[TMP_118:.*]] = stablehlo.add %[[TMP_115]], %[[TMP_91]]
++  // CHECK: %[[TMP_119:.*]] = stablehlo.power %[[TMP_118]], %[[TMP_92]]
+   // CHECK: %[[TMP_120:.*]] = stablehlo.add %[[TMP_117]], %[[TMP_119]]
+-  // CHECK: %[[TMP_121:.*]] = stablehlo.add %[[TMP_118]], %[[TMP_93]]
+-  // CHECK: %[[TMP_122:.*]] = stablehlo.power %[[TMP_121]], %[[TMP_91]]
++  // CHECK: %[[TMP_121:.*]] = stablehlo.add %[[TMP_118]], %[[TMP_91]]
++  // CHECK: %[[TMP_122:.*]] = stablehlo.power %[[TMP_121]], %[[TMP_92]]
+   // CHECK: %[[TMP_123:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_124:.*]] = stablehlo.subtract %[[TMP_5]], %[[TMP_123]]
+-  // CHECK: %[[TMP_125:.*]] = stablehlo.multiply %[[TMP_122]], %[[TMP_121]]
+-  // CHECK: %[[TMP_126:.*]] = stablehlo.divide %[[TMP_125]], %[[TMP_124]]
+-  // CHECK: %[[TMP_127:.*]] = stablehlo.add %[[TMP_120]], %[[TMP_126]]
+-  // CHECK: %[[TMP_128:.*]] = stablehlo.multiply %[[TMP_121]], %[[TMP_121]]
+-  // CHECK: %[[TMP_129:.*]] = stablehlo.divide %[[TMP_93]], %[[TMP_128]]
+-  // CHECK: %[[TMP_130:.*]] = stablehlo.constant dense<2.200000e+01>
+-  // CHECK: %[[TMP_131:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_130]]
+-  // CHECK: %[[TMP_132:.*]] = stablehlo.constant dense<2.100000e+01>
+-  // CHECK: %[[TMP_133:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_132]]
+-  // CHECK: %[[TMP_134:.*]] = stablehlo.multiply %[[TMP_131]], %[[TMP_133]]
+-  // CHECK: %[[TMP_135:.*]] = stablehlo.constant dense<-1.3954464685812522E-19>
+-  // CHECK: %[[TMP_136:.*]] = stablehlo.add %[[TMP_90]], %[[TMP_135]]
+-  // CHECK: %[[TMP_137:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_136]]
+-  // CHECK: %[[TMP_138:.*]] = stablehlo.multiply %[[TMP_134]], %[[TMP_137]]
+-  // CHECK: %[[TMP_139:.*]] = stablehlo.constant dense<2.000000e+01>
+-  // CHECK: %[[TMP_140:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_139]]
+-  // CHECK: %[[TMP_141:.*]] = stablehlo.constant dense<1.900000e+01>
+-  // CHECK: %[[TMP_142:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_141]]
+-  // CHECK: %[[TMP_143:.*]] = stablehlo.multiply %[[TMP_140]], %[[TMP_142]]
+-  // CHECK: %[[TMP_144:.*]] = stablehlo.constant dense<5.5090028283602295E-18>
+-  // CHECK: %[[TMP_145:.*]] = stablehlo.add %[[TMP_138]], %[[TMP_144]]
+-  // CHECK: %[[TMP_146:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_145]]
+-  // CHECK: %[[TMP_147:.*]] = stablehlo.multiply %[[TMP_143]], %[[TMP_146]]
+-  // CHECK: %[[TMP_148:.*]] = stablehlo.constant dense<1.800000e+01>
+-  // CHECK: %[[TMP_149:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_148]]
+-  // CHECK: %[[TMP_150:.*]] = stablehlo.constant dense<1.700000e+01>
+-  // CHECK: %[[TMP_151:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_150]]
+-  // CHECK: %[[TMP_152:.*]] = stablehlo.multiply %[[TMP_149]], %[[TMP_151]]
+-  // CHECK: %[[TMP_153:.*]] = stablehlo.constant dense<-2.1748686985580617E-16>
+-  // CHECK: %[[TMP_154:.*]] = stablehlo.add %[[TMP_147]], %[[TMP_153]]
+-  // CHECK: %[[TMP_155:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_154]]
+-  // CHECK: %[[TMP_156:.*]] = stablehlo.multiply %[[TMP_152]], %[[TMP_155]]
+-  // CHECK: %[[TMP_157:.*]] = stablehlo.constant dense<1.600000e+01>
+-  // CHECK: %[[TMP_158:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_157]]
+-  // CHECK: %[[TMP_159:.*]] = stablehlo.constant dense<1.500000e+01>
+-  // CHECK: %[[TMP_160:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_159]]
+-  // CHECK: %[[TMP_161:.*]] = stablehlo.multiply %[[TMP_158]], %[[TMP_160]]
+-  // CHECK: %[[TMP_162:.*]] = stablehlo.constant dense<8.5860620562778452E-15>
+-  // CHECK: %[[TMP_163:.*]] = stablehlo.add %[[TMP_156]], %[[TMP_162]]
+-  // CHECK: %[[TMP_164:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_163]]
+-  // CHECK: %[[TMP_165:.*]] = stablehlo.multiply %[[TMP_161]], %[[TMP_164]]
+-  // CHECK: %[[TMP_166:.*]] = stablehlo.constant dense<1.400000e+01>
+-  // CHECK: %[[TMP_167:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_166]]
+-  // CHECK: %[[TMP_168:.*]] = stablehlo.constant dense<1.300000e+01>
+-  // CHECK: %[[TMP_169:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_168]]
+-  // CHECK: %[[TMP_170:.*]] = stablehlo.multiply %[[TMP_167]], %[[TMP_169]]
+-  // CHECK: %[[TMP_171:.*]] = stablehlo.constant dense<-3.3896802963225832E-13>
+-  // CHECK: %[[TMP_172:.*]] = stablehlo.add %[[TMP_165]], %[[TMP_171]]
+-  // CHECK: %[[TMP_173:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_172]]
+-  // CHECK: %[[TMP_174:.*]] = stablehlo.multiply %[[TMP_170]], %[[TMP_173]]
+-  // CHECK: %[[TMP_175:.*]] = stablehlo.constant dense<1.200000e+01>
+-  // CHECK: %[[TMP_176:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_175]]
+-  // CHECK: %[[TMP_177:.*]] = stablehlo.constant dense<1.100000e+01>
+-  // CHECK: %[[TMP_178:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_177]]
+-  // CHECK: %[[TMP_179:.*]] = stablehlo.multiply %[[TMP_176]], %[[TMP_178]]
+-  // CHECK: %[[TMP_180:.*]] = stablehlo.constant dense<1.3382536530684679E-11>
+-  // CHECK: %[[TMP_181:.*]] = stablehlo.add %[[TMP_174]], %[[TMP_180]]
+-  // CHECK: %[[TMP_182:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_181]]
+-  // CHECK: %[[TMP_183:.*]] = stablehlo.multiply %[[TMP_179]], %[[TMP_182]]
+-  // CHECK: %[[TMP_184:.*]] = stablehlo.constant dense<1.000000e+01>
+-  // CHECK: %[[TMP_185:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_184]]
+-  // CHECK: %[[TMP_186:.*]] = stablehlo.constant dense<9.000000e+00>
+-  // CHECK: %[[TMP_187:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_186]]
+-  // CHECK: %[[TMP_188:.*]] = stablehlo.multiply %[[TMP_185]], %[[TMP_187]]
+-  // CHECK: %[[TMP_189:.*]] = stablehlo.constant dense<-5.2841901386874932E-10>
+-  // CHECK: %[[TMP_190:.*]] = stablehlo.add %[[TMP_183]], %[[TMP_189]]
+-  // CHECK: %[[TMP_191:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_190]]
+-  // CHECK: %[[TMP_192:.*]] = stablehlo.multiply %[[TMP_188]], %[[TMP_191]]
+-  // CHECK: %[[TMP_193:.*]] = stablehlo.constant dense<8.000000e+00>
+-  // CHECK: %[[TMP_194:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_193]]
+-  // CHECK: %[[TMP_195:.*]] = stablehlo.constant dense<7.000000e+00>
+-  // CHECK: %[[TMP_196:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_195]]
+-  // CHECK: %[[TMP_197:.*]] = stablehlo.multiply %[[TMP_194]], %[[TMP_196]]
+-  // CHECK: %[[TMP_198:.*]] = stablehlo.constant dense<2.08767569878681E-8>
+-  // CHECK: %[[TMP_199:.*]] = stablehlo.add %[[TMP_192]], %[[TMP_198]]
+-  // CHECK: %[[TMP_200:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_199]]
+-  // CHECK: %[[TMP_201:.*]] = stablehlo.multiply %[[TMP_197]], %[[TMP_200]]
+-  // CHECK: %[[TMP_202:.*]] = stablehlo.constant dense<6.000000e+00>
+-  // CHECK: %[[TMP_203:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_202]]
+-  // CHECK: %[[TMP_204:.*]] = stablehlo.constant dense<5.000000e+00>
+-  // CHECK: %[[TMP_205:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_204]]
+-  // CHECK: %[[TMP_206:.*]] = stablehlo.multiply %[[TMP_203]], %[[TMP_205]]
+-  // CHECK: %[[TMP_207:.*]] = stablehlo.constant dense<-8.2671957671957675E-7>
+-  // CHECK: %[[TMP_208:.*]] = stablehlo.add %[[TMP_201]], %[[TMP_207]]
+-  // CHECK: %[[TMP_209:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_208]]
+-  // CHECK: %[[TMP_210:.*]] = stablehlo.multiply %[[TMP_206]], %[[TMP_209]]
+-  // CHECK: %[[TMP_211:.*]] = stablehlo.constant dense<4.000000e+00>
+-  // CHECK: %[[TMP_212:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_211]]
+-  // CHECK: %[[TMP_213:.*]] = stablehlo.constant dense<3.000000e+00>
+-  // CHECK: %[[TMP_214:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_213]]
+-  // CHECK: %[[TMP_215:.*]] = stablehlo.multiply %[[TMP_212]], %[[TMP_214]]
+-  // CHECK: %[[TMP_216:.*]] = stablehlo.constant dense<3.3068783068783071E-5>
+-  // CHECK: %[[TMP_217:.*]] = stablehlo.add %[[TMP_210]], %[[TMP_216]]
+-  // CHECK: %[[TMP_218:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_217]]
+-  // CHECK: %[[TMP_219:.*]] = stablehlo.multiply %[[TMP_215]], %[[TMP_218]]
+-  // CHECK: %[[TMP_220:.*]] = stablehlo.constant dense<2.000000e+00>
+-  // CHECK: %[[TMP_221:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_220]]
+-  // CHECK: %[[TMP_222:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_223:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_222]]
+-  // CHECK: %[[TMP_224:.*]] = stablehlo.multiply %[[TMP_221]], %[[TMP_223]]
+-  // CHECK: %[[TMP_225:.*]] = stablehlo.constant dense<-0.0013888888888888889>
+-  // CHECK: %[[TMP_226:.*]] = stablehlo.add %[[TMP_219]], %[[TMP_225]]
+-  // CHECK: %[[TMP_227:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_226]]
+-  // CHECK: %[[TMP_228:.*]] = stablehlo.multiply %[[TMP_224]], %[[TMP_227]]
+-  // CHECK: %[[TMP_229:.*]] = stablehlo.constant dense<5.000000e-01>
+-  // CHECK: %[[TMP_230:.*]] = stablehlo.divide %[[TMP_5]], %[[TMP_121]]
+-  // CHECK: %[[TMP_231:.*]] = stablehlo.constant dense<0.083333333333333329>
+-  // CHECK: %[[TMP_232:.*]] = stablehlo.add %[[TMP_231]], %[[TMP_228]]
+-  // CHECK: %[[TMP_233:.*]] = stablehlo.multiply %[[TMP_230]], %[[TMP_232]]
+-  // CHECK: %[[TMP_234:.*]] = stablehlo.add %[[TMP_229]], %[[TMP_233]]
+-  // CHECK: %[[TMP_235:.*]] = stablehlo.multiply %[[TMP_122]], %[[TMP_234]]
+-  // CHECK: %[[TMP_236:.*]] = stablehlo.add %[[TMP_127]], %[[TMP_235]]
++  // CHECK: %[[TMP_124:.*]] = stablehlo.multiply %[[TMP_122]], %[[TMP_121]]
++  // CHECK: %[[TMP_125:.*]] = stablehlo.subtract %[[TMP_5]], %[[TMP_123]]
++  // CHECK: %[[TMP_126:.*]] = stablehlo.divide %[[TMP_124]], %[[TMP_125]]
++  // CHECK: %[[TMP_127:.*]] = stablehlo.multiply %[[TMP_121]], %[[TMP_121]]
++  // CHECK: %[[TMP_128:.*]] = stablehlo.divide %[[TMP_91]], %[[TMP_127]]
++  // CHECK: %[[TMP_129:.*]] = stablehlo.constant dense<2.200000e+01>
++  // CHECK: %[[TMP_130:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_129]]
++  // CHECK: %[[TMP_131:.*]] = stablehlo.constant dense<2.100000e+01>
++  // CHECK: %[[TMP_132:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_131]]
++  // CHECK: %[[TMP_133:.*]] = stablehlo.multiply %[[TMP_130]], %[[TMP_132]]
++  // CHECK: %[[TMP_134:.*]] = stablehlo.constant dense<-1.3954464685812522E-19>
++  // CHECK: %[[TMP_135:.*]] = stablehlo.add %[[TMP_90]], %[[TMP_134]]
++  // CHECK: %[[TMP_136:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_135]]
++  // CHECK: %[[TMP_137:.*]] = stablehlo.multiply %[[TMP_133]], %[[TMP_136]]
++  // CHECK: %[[TMP_138:.*]] = stablehlo.constant dense<2.000000e+01>
++  // CHECK: %[[TMP_139:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_138]]
++  // CHECK: %[[TMP_140:.*]] = stablehlo.constant dense<1.900000e+01>
++  // CHECK: %[[TMP_141:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_140]]
++  // CHECK: %[[TMP_142:.*]] = stablehlo.multiply %[[TMP_139]], %[[TMP_141]]
++  // CHECK: %[[TMP_143:.*]] = stablehlo.constant dense<5.5090028283602295E-18>
++  // CHECK: %[[TMP_144:.*]] = stablehlo.add %[[TMP_137]], %[[TMP_143]]
++  // CHECK: %[[TMP_145:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_144]]
++  // CHECK: %[[TMP_146:.*]] = stablehlo.multiply %[[TMP_142]], %[[TMP_145]]
++  // CHECK: %[[TMP_147:.*]] = stablehlo.constant dense<1.800000e+01>
++  // CHECK: %[[TMP_148:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_147]]
++  // CHECK: %[[TMP_149:.*]] = stablehlo.constant dense<1.700000e+01>
++  // CHECK: %[[TMP_150:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_149]]
++  // CHECK: %[[TMP_151:.*]] = stablehlo.multiply %[[TMP_148]], %[[TMP_150]]
++  // CHECK: %[[TMP_152:.*]] = stablehlo.constant dense<-2.1748686985580617E-16>
++  // CHECK: %[[TMP_153:.*]] = stablehlo.add %[[TMP_146]], %[[TMP_152]]
++  // CHECK: %[[TMP_154:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_153]]
++  // CHECK: %[[TMP_155:.*]] = stablehlo.multiply %[[TMP_151]], %[[TMP_154]]
++  // CHECK: %[[TMP_156:.*]] = stablehlo.constant dense<1.600000e+01>
++  // CHECK: %[[TMP_157:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_156]]
++  // CHECK: %[[TMP_158:.*]] = stablehlo.constant dense<1.500000e+01>
++  // CHECK: %[[TMP_159:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_158]]
++  // CHECK: %[[TMP_160:.*]] = stablehlo.multiply %[[TMP_157]], %[[TMP_159]]
++  // CHECK: %[[TMP_161:.*]] = stablehlo.constant dense<8.5860620562778452E-15>
++  // CHECK: %[[TMP_162:.*]] = stablehlo.add %[[TMP_155]], %[[TMP_161]]
++  // CHECK: %[[TMP_163:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_162]]
++  // CHECK: %[[TMP_164:.*]] = stablehlo.multiply %[[TMP_160]], %[[TMP_163]]
++  // CHECK: %[[TMP_165:.*]] = stablehlo.constant dense<1.400000e+01>
++  // CHECK: %[[TMP_166:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_165]]
++  // CHECK: %[[TMP_167:.*]] = stablehlo.constant dense<1.300000e+01>
++  // CHECK: %[[TMP_168:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_167]]
++  // CHECK: %[[TMP_169:.*]] = stablehlo.multiply %[[TMP_166]], %[[TMP_168]]
++  // CHECK: %[[TMP_170:.*]] = stablehlo.constant dense<-3.3896802963225832E-13>
++  // CHECK: %[[TMP_171:.*]] = stablehlo.add %[[TMP_164]], %[[TMP_170]]
++  // CHECK: %[[TMP_172:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_171]]
++  // CHECK: %[[TMP_173:.*]] = stablehlo.multiply %[[TMP_169]], %[[TMP_172]]
++  // CHECK: %[[TMP_174:.*]] = stablehlo.constant dense<1.200000e+01>
++  // CHECK: %[[TMP_175:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_174]]
++  // CHECK: %[[TMP_176:.*]] = stablehlo.constant dense<1.100000e+01>
++  // CHECK: %[[TMP_177:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_176]]
++  // CHECK: %[[TMP_178:.*]] = stablehlo.multiply %[[TMP_175]], %[[TMP_177]]
++  // CHECK: %[[TMP_179:.*]] = stablehlo.constant dense<1.3382536530684679E-11>
++  // CHECK: %[[TMP_180:.*]] = stablehlo.add %[[TMP_173]], %[[TMP_179]]
++  // CHECK: %[[TMP_181:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_180]]
++  // CHECK: %[[TMP_182:.*]] = stablehlo.multiply %[[TMP_178]], %[[TMP_181]]
++  // CHECK: %[[TMP_183:.*]] = stablehlo.constant dense<1.000000e+01>
++  // CHECK: %[[TMP_184:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_183]]
++  // CHECK: %[[TMP_185:.*]] = stablehlo.constant dense<9.000000e+00>
++  // CHECK: %[[TMP_186:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_185]]
++  // CHECK: %[[TMP_187:.*]] = stablehlo.multiply %[[TMP_184]], %[[TMP_186]]
++  // CHECK: %[[TMP_188:.*]] = stablehlo.constant dense<-5.2841901386874932E-10>
++  // CHECK: %[[TMP_189:.*]] = stablehlo.add %[[TMP_182]], %[[TMP_188]]
++  // CHECK: %[[TMP_190:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_189]]
++  // CHECK: %[[TMP_191:.*]] = stablehlo.multiply %[[TMP_187]], %[[TMP_190]]
++  // CHECK: %[[TMP_192:.*]] = stablehlo.constant dense<8.000000e+00>
++  // CHECK: %[[TMP_193:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_192]]
++  // CHECK: %[[TMP_194:.*]] = stablehlo.constant dense<7.000000e+00>
++  // CHECK: %[[TMP_195:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_194]]
++  // CHECK: %[[TMP_196:.*]] = stablehlo.multiply %[[TMP_193]], %[[TMP_195]]
++  // CHECK: %[[TMP_197:.*]] = stablehlo.constant dense<2.08767569878681E-8>
++  // CHECK: %[[TMP_198:.*]] = stablehlo.add %[[TMP_191]], %[[TMP_197]]
++  // CHECK: %[[TMP_199:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_198]]
++  // CHECK: %[[TMP_200:.*]] = stablehlo.multiply %[[TMP_196]], %[[TMP_199]]
++  // CHECK: %[[TMP_201:.*]] = stablehlo.constant dense<6.000000e+00>
++  // CHECK: %[[TMP_202:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_201]]
++  // CHECK: %[[TMP_203:.*]] = stablehlo.constant dense<5.000000e+00>
++  // CHECK: %[[TMP_204:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_203]]
++  // CHECK: %[[TMP_205:.*]] = stablehlo.multiply %[[TMP_202]], %[[TMP_204]]
++  // CHECK: %[[TMP_206:.*]] = stablehlo.constant dense<-8.2671957671957675E-7>
++  // CHECK: %[[TMP_207:.*]] = stablehlo.add %[[TMP_200]], %[[TMP_206]]
++  // CHECK: %[[TMP_208:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_207]]
++  // CHECK: %[[TMP_209:.*]] = stablehlo.multiply %[[TMP_205]], %[[TMP_208]]
++  // CHECK: %[[TMP_210:.*]] = stablehlo.constant dense<4.000000e+00>
++  // CHECK: %[[TMP_211:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_210]]
++  // CHECK: %[[TMP_212:.*]] = stablehlo.constant dense<3.000000e+00>
++  // CHECK: %[[TMP_213:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_212]]
++  // CHECK: %[[TMP_214:.*]] = stablehlo.multiply %[[TMP_211]], %[[TMP_213]]
++  // CHECK: %[[TMP_215:.*]] = stablehlo.constant dense<3.3068783068783071E-5>
++  // CHECK: %[[TMP_216:.*]] = stablehlo.add %[[TMP_209]], %[[TMP_215]]
++  // CHECK: %[[TMP_217:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_216]]
++  // CHECK: %[[TMP_218:.*]] = stablehlo.multiply %[[TMP_214]], %[[TMP_217]]
++  // CHECK: %[[TMP_219:.*]] = stablehlo.constant dense<2.000000e+00>
++  // CHECK: %[[TMP_220:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_219]]
++  // CHECK: %[[TMP_221:.*]] = stablehlo.constant dense<1.000000e+00>
++  // CHECK: %[[TMP_222:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_221]]
++  // CHECK: %[[TMP_223:.*]] = stablehlo.multiply %[[TMP_220]], %[[TMP_222]]
++  // CHECK: %[[TMP_224:.*]] = stablehlo.constant dense<-0.0013888888888888889>
++  // CHECK: %[[TMP_225:.*]] = stablehlo.add %[[TMP_218]], %[[TMP_224]]
++  // CHECK: %[[TMP_226:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_225]]
++  // CHECK: %[[TMP_227:.*]] = stablehlo.multiply %[[TMP_223]], %[[TMP_226]]
++  // CHECK: %[[TMP_228:.*]] = stablehlo.constant dense<5.000000e-01>
++  // CHECK: %[[TMP_229:.*]] = stablehlo.divide %[[TMP_5]], %[[TMP_121]]
++  // CHECK: %[[TMP_230:.*]] = stablehlo.constant dense<0.083333333333333329>
++  // CHECK: %[[TMP_231:.*]] = stablehlo.add %[[TMP_230]], %[[TMP_227]]
++  // CHECK: %[[TMP_232:.*]] = stablehlo.multiply %[[TMP_229]], %[[TMP_231]]
++  // CHECK: %[[TMP_233:.*]] = stablehlo.add %[[TMP_228]], %[[TMP_232]]
++  // CHECK: %[[TMP_234:.*]] = stablehlo.multiply %[[TMP_122]], %[[TMP_233]]
++  // CHECK: %[[TMP_235:.*]] = stablehlo.add %[[TMP_120]], %[[TMP_126]]
++  // CHECK: %[[TMP_236:.*]] = stablehlo.add %[[TMP_235]], %[[TMP_234]]
+   // CHECK: %[[TMP_237:.*]] = stablehlo.abs %[[TMP_122]]
+   // CHECK: %[[TMP_238:.*]] = stablehlo.abs %[[TMP_120]]
+   // CHECK: %[[TMP_239:.*]] = stablehlo.constant dense<4.940660e-324>
+@@ -2120,7 +2118,7 @@
+   // CHECK: %[[TMP_260:.*]] = stablehlo.and %[[TMP_257]], %[[TMP_259]]
+   // CHECK: %[[TMP_261:.*]] = stablehlo.select %[[TMP_260]], %[[TMP_251]], %[[TMP_243]]
+   // CHECK: %[[TMP_262:.*]] = stablehlo.select %[[TMP_254]], %[[TMP_261]], %[[TMP_250]]
+-  // CHECK: %[[TMP_263:.*]] = stablehlo.compare EQ, %[[TMP_5]], %[[TMP_93]], NOTYPE
++  // CHECK: %[[TMP_263:.*]] = stablehlo.compare EQ, %[[TMP_5]], %[[TMP_91]], NOTYPE
+   // CHECK: %[[TMP_264:.*]] = stablehlo.select %[[TMP_263]], %[[TMP_251]], %[[TMP_262]]
+   // CHECK: %[[TMP_265:.*]] = stablehlo.multiply %[[TMP_4]], %[[TMP_89]]
+   // CHECK: %[[TMP_266:.*]] = stablehlo.multiply %[[TMP_265]], %[[TMP_264]]
+diff --ruN a/stablehlo/stablehlo/transforms/ChloLegalizeToStablehlo.cpp b/stablehlo/stablehlo/transforms/ChloLegalizeToStablehlo.cpp
+--- stablehlo/stablehlo/transforms/ChloLegalizeToStablehlo.cpp
++++ stablehlo/stablehlo/transforms/ChloLegalizeToStablehlo.cpp
+@@ -1575,11 +1575,21 @@
+ 
+ static Value materializeZeta(ConversionPatternRewriter &rewriter, Location loc,
+                              ValueRange args) {
+-  // Code should match XLA's materializeZeta from chlo_legalize_to_hlo.cc
++  // Implementation ported from:
++  // https://github.com/openxla/xla/blob/7a067a7b88d2ffb15b1dc5e3c06f701a15f0391d/xla/client/lib/math.cc#L1912-L1917
++  // Reference: Johansson, Fredrik.
++  // "Rigorous high-precision computation of the Hurwitz zeta function and its
++  // derivatives." Numerical Algorithms 69.2 (2015): 253-270.
++  // https://arxiv.org/abs/1309.2877 - formula (5)
++  // Notation is more or less kept as a reference to the whitepaper.
+   assert(args.size() == 2);
+   Value x = args[0];
+   Value q = args[1];
+-  static const std::array<double, 12> kZetaCoeffs{
++
++  static constexpr auto kTerms = 12;
++  static constexpr auto kIters = 9;
++  static constexpr auto kTwoTermsMinusOne = 2 * kTerms - 1;
++  static constexpr auto kZetaCoeffs = std::array<double, kTerms>{
+       -7.1661652561756670113e18,
+       1.8152105401943546773e17,
+       -4.5979787224074726105e15,
+@@ -1596,131 +1606,134 @@
+ 
+   // For speed we'll always use 9 iterations for the initial series estimate,
+   // and a 12 term expansion for the Euler-Maclaurin formula.
+-  Value a = q;
+-  Value zero = getConstantLike(rewriter, loc, 0.0, a);
+-  Value negPower = zero;
+-  Value negX = rewriter.create<mlir::stablehlo::NegOp>(loc, x);
+-  Value initialSum = rewriter.create<mlir::stablehlo::PowOp>(loc, q, negX);
+-  Value one = getConstantLike(rewriter, loc, 1.0, a);
+-  for (int i = 0; i < 9; ++i) {
+-    a = rewriter.create<mlir::stablehlo::AddOp>(loc, a, one);
+-    negPower = rewriter.create<mlir::stablehlo::PowOp>(loc, a, negX);
+-    initialSum =
+-        rewriter.create<mlir::stablehlo::AddOp>(loc, initialSum, negPower);
+-  }
 -
--// -----
+-  a = rewriter.create<mlir::stablehlo::AddOp>(loc, a, one);
+-  negPower = rewriter.create<mlir::stablehlo::PowOp>(loc, a, negX);
++  Value zero = getConstantLike(rewriter, loc, 0.0, q);
++  Value one = getConstantLike(rewriter, loc, 1.0, q);
++  Value acc = q;
++  Value qNegPower = zero;
++  Value negX = rewriter.create<NegOp>(loc, x);
++  Value powerSum = rewriter.create<PowOp>(loc, q, negX);
++  for (int i = 0; i < kIters; ++i) {
++    acc = rewriter.create<AddOp>(loc, acc, one);
++    qNegPower = rewriter.create<PowOp>(loc, acc, negX);
++    powerSum =
++        rewriter.create<AddOp>(loc, powerSum, qNegPower);
++  }
++  acc = rewriter.create<AddOp>(loc, acc, one);
++  qNegPower = rewriter.create<PowOp>(loc, acc, negX);
+   Value oneLikeX = getConstantLike(rewriter, loc, 1.0, x);
+-  Value xMinusOne =
+-      rewriter.create<mlir::stablehlo::SubtractOp>(loc, x, oneLikeX);
+-  Value negPowerMulA =
+-      rewriter.create<mlir::stablehlo::MulOp>(loc, negPower, a);
+-  Value negPowerMulADivXMinusOne =
+-      rewriter.create<mlir::stablehlo::DivOp>(loc, negPowerMulA, xMinusOne);
+-  Value s = rewriter.create<mlir::stablehlo::AddOp>(loc, initialSum,
+-                                                    negPowerMulADivXMinusOne);
+-  Value aInverseSquare = rewriter.create<mlir::stablehlo::DivOp>(
+-      loc, one, rewriter.create<mlir::stablehlo::MulOp>(loc, a, a));
 -
--func.func @convolution_c33(%arg0: tensor<1x8x8x207x!quant.uniform<i8:f32, 2.0:15>>, %arg1: tensor<3x3x207x16x!quant.uniform<i8:f32:3, {0.1:-30}>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32:0, {2.0:-30}>> {
-- // expected-error@+1 {{mismatched output_feature_dimension 3 and result quantized dimension 0}}
-- %0 = stablehlo.convolution(%arg0, %arg1)
--     dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f],
--     window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]}
--     {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo<precision DEFAULT>, #stablehlo<precision DEFAULT>]} :
--    (tensor<1x8x8x207x!quant.uniform<i8:f32, 2.0:15>>, tensor<3x3x207x16x!quant.uniform<i8:f32:3, {0.1:-30}>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32:0, {2.0:-30}>>
-- func.return %0 : tensor<1x8x8x16x!quant.uniform<i8:f32:0, {2.0:-30}>>
--}
+-  Value hornerSum = zero;
+-  Value factor = one;
++  Value correctionEulerMaclaurin = rewriter.create<DivOp>(
++      loc, rewriter.create<MulOp>(loc, qNegPower, acc),
++      rewriter.create<SubtractOp>(loc, x, oneLikeX));
++
++  // Manual reciprocal of the square root as RsqrtOp produces different results
++  Value rsqrtAcc = rewriter.create<DivOp>(
++      loc, one, rewriter.create<MulOp>(loc, acc, acc));
++
+   // Use Horner's rule for this.
+   // Note this differs from Cephes which does a 'naive' polynomial evaluation.
+   // Using Horner's rule allows to avoid some NaN's and Infs from happening,
+   // resulting in more numerically stable code.
+-  for (int i = 0; i < 11; ++i) {
+-    Value factorLhs = rewriter.create<mlir::stablehlo::AddOp>(
+-        loc, x, getConstantLike(rewriter, loc, 22 - 2 * i, x));
+-    Value factorRhs = rewriter.create<mlir::stablehlo::AddOp>(
+-        loc, x, getConstantLike(rewriter, loc, 21 - 2 * i, x));
+-    factor = rewriter.create<mlir::stablehlo::MulOp>(loc, factorLhs, factorRhs);
+-    hornerSum = rewriter.create<mlir::stablehlo::MulOp>(
+-        loc, factor,
+-        rewriter.create<mlir::stablehlo::MulOp>(
+-            loc, aInverseSquare,
+-            rewriter.create<mlir::stablehlo::AddOp>(
++  Value hornerSum = zero;
++  Value hornerProduct = one;
++
++  for (int i = 0; i < kTerms - 1; ++i) {
++    Value factorLhs = rewriter.create<AddOp>(
++        loc, x,
++        getConstantLike(rewriter, loc, kTwoTermsMinusOne - 1 - 2 * i, x));
++    Value factorRhs = rewriter.create<AddOp>(
++        loc, x,
++        getConstantLike(rewriter, loc, kTwoTermsMinusOne - 2 - 2 * i, x));
++    hornerProduct =
++        rewriter.create<MulOp>(loc, factorLhs, factorRhs);
++    hornerSum = rewriter.create<MulOp>(
++        loc, hornerProduct,
++        rewriter.create<MulOp>(
++            loc, rsqrtAcc,
++            rewriter.create<AddOp>(
+                 loc, hornerSum,
+-                getConstantLike(rewriter, loc, 1. / kZetaCoeffs[i], a))));
+-  }
+-  Value zeroPointFiveLikeNegPower =
+-      getConstantLike(rewriter, loc, .5, negPower);
+-  Value xDivA = rewriter.create<mlir::stablehlo::DivOp>(loc, x, a);
+-  s = rewriter.create<mlir::stablehlo::AddOp>(
+-      loc, s,
+-      rewriter.create<mlir::stablehlo::MulOp>(
+-          loc, negPower,
+-          rewriter.create<mlir::stablehlo::AddOp>(
+-              loc, zeroPointFiveLikeNegPower,
+-              rewriter.create<mlir::stablehlo::MulOp>(
+-                  loc, xDivA,
+-                  rewriter.create<mlir::stablehlo::AddOp>(
+-                      loc,
+-                      getConstantLike(rewriter, loc, 1. / kZetaCoeffs[11], a),
+-                      hornerSum)))));
++                getConstantLike(rewriter, loc, 1. / kZetaCoeffs[i], acc))));
++  }
++  Value zeroPointFiveLikeQNegPower =
++      getConstantLike(rewriter, loc, .5, qNegPower);
++  Value xDivAcc = rewriter.create<DivOp>(loc, x, acc);
++  Value bernoulliTailTerm = rewriter.create<MulOp>(
++      loc, qNegPower,
++      rewriter.create<AddOp>(
++          loc, zeroPointFiveLikeQNegPower,
++          rewriter.create<MulOp>(
++              loc, xDivAcc,
++              rewriter.create<AddOp>(
++                  loc,
++                  getConstantLike(rewriter, loc, 1. / kZetaCoeffs[kTerms - 1],
++                                  acc),
++                  hornerSum))));
++  Value accurateResult = rewriter.create<AddOp>(
++      loc,
++      rewriter.create<AddOp>(loc, powerSum,
++                                              correctionEulerMaclaurin),
++      bernoulliTailTerm);
+ 
+   // Use the initial zeta sum without the correction term coming
+   // from Euler-Maclaurin if it is accurate enough.
+-  Value absNegPower = rewriter.create<mlir::stablehlo::AbsOp>(loc, negPower);
+-  Value absInitialSum =
+-      rewriter.create<mlir::stablehlo::AbsOp>(loc, initialSum);
+-  Value output = rewriter.create<mlir::stablehlo::SelectOp>(
++  Value absQNegPower = rewriter.create<AbsOp>(loc, qNegPower);
++  Value absPowerSum = rewriter.create<AbsOp>(loc, powerSum);
++  Value output = rewriter.create<SelectOp>(
+       loc,
+-      rewriter.create<mlir::stablehlo::CompareOp>(
+-          loc, absNegPower,
+-          rewriter.create<mlir::stablehlo::MulOp>(
+-              loc, absInitialSum,
+-              getConstantLikeSmallestFiniteValue(rewriter, loc, a)),
+-          mlir::stablehlo::ComparisonDirection::LT),
+-      initialSum, s);
++      rewriter.create<CompareOp>(
++          loc, absQNegPower,
++          rewriter.create<MulOp>(
++              loc, absPowerSum,
++              getConstantLikeSmallestFiniteValue(rewriter, loc, acc)),
++          ComparisonDirection::LT),
++      powerSum, accurateResult);
+ 
+   // Function is not defined for x < 1.
+   Value nan = getConstantLike(rewriter, loc,
+                               std::numeric_limits<double>::quiet_NaN(), x);
+-  output = rewriter.create<mlir::stablehlo::SelectOp>(
++  output = rewriter.create<SelectOp>(
+       loc,
+-      rewriter.create<mlir::stablehlo::CompareOp>(
+-          loc, x, oneLikeX, mlir::stablehlo::ComparisonDirection::LT),
++      rewriter.create<CompareOp>(
++          loc, x, oneLikeX, ComparisonDirection::LT),
+       nan, output);
+ 
+   // For q <= 0, x must be an integer.
+-  Value qLeZero = rewriter.create<mlir::stablehlo::CompareOp>(
+-      loc, q, zero, mlir::stablehlo::ComparisonDirection::LE);
+-  Value xNotInt = rewriter.create<mlir::stablehlo::CompareOp>(
+-      loc, x, rewriter.create<mlir::stablehlo::FloorOp>(loc, x),
+-      mlir::stablehlo::ComparisonDirection::NE);
++  Value qLeZero = rewriter.create<CompareOp>(
++      loc, q, zero, ComparisonDirection::LE);
++  Value xNotInt = rewriter.create<CompareOp>(
++      loc, x, rewriter.create<FloorOp>(loc, x),
++      ComparisonDirection::NE);
+   Value xDomainError =
+-      rewriter.create<mlir::stablehlo::AndOp>(loc, qLeZero, xNotInt);
+-  output = rewriter.create<mlir::stablehlo::SelectOp>(loc, xDomainError, nan,
++      rewriter.create<AndOp>(loc, qLeZero, xNotInt);
++  output = rewriter.create<SelectOp>(loc, xDomainError, nan,
+                                                       output);
+ 
+   // For all integer q <= 0, zeta has a pole. The limit is only defined as
+   // +inf if x is and even integer.
+   Value inf = getConstantLike(rewriter, loc,
+                               std::numeric_limits<double>::infinity(), x);
+-  Value qIsInt = rewriter.create<mlir::stablehlo::CompareOp>(
+-      loc, q, rewriter.create<mlir::stablehlo::FloorOp>(loc, q),
+-      mlir::stablehlo::ComparisonDirection::EQ);
+-  Value atPole = rewriter.create<mlir::stablehlo::AndOp>(loc, qLeZero, qIsInt);
++  Value qIsInt = rewriter.create<CompareOp>(
++      loc, q, rewriter.create<FloorOp>(loc, q),
++      ComparisonDirection::EQ);
++  Value atPole = rewriter.create<AndOp>(loc, qLeZero, qIsInt);
+   Value two = getConstantLike(rewriter, loc, 2.0, x);
+-  Value xIsInt = rewriter.create<mlir::stablehlo::CompareOp>(
+-      loc, x, rewriter.create<mlir::stablehlo::FloorOp>(loc, x),
+-      mlir::stablehlo::ComparisonDirection::EQ);
+-  Value xIsEven = rewriter.create<mlir::stablehlo::CompareOp>(
+-      loc, rewriter.create<mlir::stablehlo::RemOp>(loc, x, two), zero,
+-      mlir::stablehlo::ComparisonDirection::EQ);
++  Value xIsInt = rewriter.create<CompareOp>(
++      loc, x, rewriter.create<FloorOp>(loc, x),
++      ComparisonDirection::EQ);
++  Value xIsEven = rewriter.create<CompareOp>(
++      loc, rewriter.create<RemOp>(loc, x, two), zero,
++      ComparisonDirection::EQ);
+   Value xIsEvenInt =
+-      rewriter.create<mlir::stablehlo::AndOp>(loc, xIsInt, xIsEven);
+-  output = rewriter.create<mlir::stablehlo::SelectOp>(
++      rewriter.create<AndOp>(loc, xIsInt, xIsEven);
++  output = rewriter.create<SelectOp>(
+       loc, atPole,
+-      rewriter.create<mlir::stablehlo::SelectOp>(loc, xIsEvenInt, inf, nan),
++      rewriter.create<SelectOp>(loc, xIsEvenInt, inf, nan),
+       output);
+ 
+   // For x = 1, this is the harmonic series and diverges.
+-  output = rewriter.create<mlir::stablehlo::SelectOp>(
++  output = rewriter.create<SelectOp>(
+       loc,
+-      rewriter.create<mlir::stablehlo::CompareOp>(
+-          loc, x, one, mlir::stablehlo::ComparisonDirection::EQ),
++      rewriter.create<CompareOp>(
++          loc, x, one, ComparisonDirection::EQ),
+       inf, output);
+ 
+   return output;
 
diff --git a/third_party/stablehlo/workspace.bzl b/third_party/stablehlo/workspace.bzl
index 674fda41985b9a..ca2f3a937f73ad 100644
--- a/third_party/stablehlo/workspace.bzl
+++ b/third_party/stablehlo/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
     # LINT.IfChange
-    STABLEHLO_COMMIT = "aa69baea1409d7c341705e0e9342ed62802d8a4d"
-    STABLEHLO_SHA256 = "4a95367f09657343704dff27c651bb0e648a205d5b82acbe35cd160b0d87ff35"
+    STABLEHLO_COMMIT = "271e8634de184fbfafd677d3876170feb6d08c97"
+    STABLEHLO_SHA256 = "06db84c751bd4a980dc76249e02f10e119175fceba3eebed008da122cb480bab"
     # LINT.ThenChange(Google-internal path)
 
     tf_http_archive(
diff --git a/third_party/systemlibs/protobuf.BUILD b/third_party/systemlibs/protobuf.BUILD
index 4d05ab28d12e99..c7d940605f9f72 100644
--- a/third_party/systemlibs/protobuf.BUILD
+++ b/third_party/systemlibs/protobuf.BUILD
@@ -1,10 +1,10 @@
-load("@rules_proto//proto:defs.bzl", "proto_library")
 load(
     "@com_google_protobuf//:protobuf.bzl",
     "cc_proto_library",
     "proto_gen",
     "py_proto_library",
 )
+load("@rules_proto//proto:defs.bzl", "proto_library")
 
 licenses(["notice"])
 
diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index 0e4affe9180311..2b789c8cead317 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "9c80654b96f0fcf47cb636500f47edbe1fbc555e"
-    TFRT_SHA256 = "491e224c2d8d4be312ed0de65d6ab5a6912aa1d18b03439f6531f399b72ff385"
+    TFRT_COMMIT = "968eb3e5b0aa2e20301a41af9bb14a48dd1aee40"
+    TFRT_SHA256 = "cd3b1d190625d6ca5ddcf1c9cc0b095928707b623f1b986f1ba333a89d5418ae"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/triton/cl614757739.patch b/third_party/triton/cl614757739.patch
deleted file mode 100644
index 058238e0005310..00000000000000
--- a/third_party/triton/cl614757739.patch
+++ /dev/null
@@ -1,14 +0,0 @@
-==== triton/python/src/ir.cc#8 - /google/src/cloud/shyshkov/mlir_a924da6d4b8733e5bf08098b18dd7ad1a5ba5f46_1710185763/triton/python/src/ir.cc ====
-# action=edit type=text
---- triton/python/src/ir.cc	2024-02-21 21:40:18.000000000 -0800
-+++ triton/python/src/ir.cc	2024-03-11 12:39:44.000000000 -0700
-@@ -205,7 +205,8 @@
-   });
- 
-   py::class_<Type>(m, "type", py::module_local())
--      .def("is_integer", &Type::isInteger)
-+      .def("is_integer",
-+           [](Type &self, unsigned width) { return self.isInteger(width); })
-       .def("is_fp16", &Type::isF16)
-       .def("__str__", [](Type &self) {
-         std::string str;
diff --git a/third_party/triton/cl616113650.patch b/third_party/triton/cl616113650.patch
deleted file mode 100644
index 4dc3675d9a9ea5..00000000000000
--- a/third_party/triton/cl616113650.patch
+++ /dev/null
@@ -1,16 +0,0 @@
-==== triton/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp#30 - triton/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp ====
-# action=edit type=text
---- triton/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp	2024-03-11 11:42:57.000000000 -0700
-+++ triton/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp	2024-03-15 06:57:27.000000000 -0700
-@@ -510,10 +510,8 @@
-                                    ConversionPatternRewriter &rewriter,
-                                    Type elemTy, MultipleOperandsRange operands,
-                                    Location loc) const {
--    auto boolFalse = rewriter.getBoolAttr(false);
--    auto constFalse = rewriter.create<LLVM::ConstantOp>(loc, boolFalse);
-     return {rewriter.create<LLVM::AbsOp>(loc, elemTy, operands[0][0],
--                                         /*is_int_min_poison=*/constFalse)};
-+                                         /*is_int_min_poison=*/false)};
-   }
- };
- 
diff --git a/third_party/triton/cl617812302.patch b/third_party/triton/cl617812302.patch
new file mode 100644
index 00000000000000..c9c8066190e6ce
--- /dev/null
+++ b/third_party/triton/cl617812302.patch
@@ -0,0 +1,12 @@
+==== triton/BUILD#42 - /google/src/cloud/csigg/mlir_83e5a1239242d64110e3dfa96ed3889170ab96b2_1711020969/triton/BUILD ====
+# action=edit type=text
+--- triton/BUILD	2024-03-21 04:26:30.000000000 -0700
++++ triton/BUILD	2024-03-21 05:39:41.000000000 -0700
+@@ -692,6 +692,7 @@
+         "@llvm-project//mlir:SCFDialect",
+         "@llvm-project//mlir:SCFTransforms",
+         "@llvm-project//mlir:SCFUtils",
++        "@llvm-project//mlir:SideEffectInterfaces",
+         "@llvm-project//mlir:Support",
+         "@llvm-project//mlir:TensorDialect",
+         "@llvm-project//mlir:TransformUtils",
diff --git a/third_party/triton/cl619146327.patch b/third_party/triton/cl619146327.patch
new file mode 100644
index 00000000000000..4f9c1e4b971c0a
--- /dev/null
+++ b/third_party/triton/cl619146327.patch
@@ -0,0 +1,52 @@
+This patch can be removed once this commit is included:
+https://github.com/openai/triton/commit/6ea5b56015db9e0bcff45ec7116cfcbfa729a516
+
+diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
++++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+@@ -305,9 +305,10 @@ compared to 1*64 when the hasLeadingOffs
+           int perPhase = 128 / (shapePerCTA[order[0]] * 4 / dotOpEnc.getKWidth());
+           perPhase = std::max<int>(perPhase, 1);
+           std::vector<size_t> matShape = {8, 8, 4 * dotOpEnc.getKWidth()};
+-          // for now, disable swizzle when using transposed int8 tensor cores
+-          if ((32 / typeWidthInBit != dotOpEnc.getKWidth()) && order[0] == inner)
+-            return get(context, 1, 1, 1, order, CTALayout);
++          int vecWidth = 32 / typeWidthInBit;
++          if (vecWidth != dotOpEnc.getKWidth() && order[0] == inner) {
++              perPhase = std::max<int>(perPhase, 2 * vecWidth);
++          }
+           int rank = order.size();
+           // --- handle A operand ---
+           if (opIdx == 0) { // compute swizzling for A operand
+diff --git a/test/TritonGPU/reduce-data-duplication.mlir b/test/TritonGPU/reduce-data-duplication.mlir
+new file mode 100644
+--- /dev/null
++++ b/test/TritonGPU/reduce-data-duplication.mlir
+@@ -0,0 +1,14 @@
++// RUN: triton-opt %s -split-input-file -tritongpu-reduce-data-duplication | FileCheck %s
++
++//       CHECK:   #[[SHARED:.*]] = #triton_gpu.shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [0, 1], hasLeadingOffset = false}
++//       CHECK:   apply_swizzle
++//       CHECK:   %{{.*}} = triton_gpu.local_alloc %{{.*}} : (tensor<16x256xf16, #{{.*}}>) -> !tt.memdesc<16x256xf16, #[[SHARED]]>
++
++#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1]}>
++#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [16, 8]}>
++module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
++  tt.func @apply_swizzle(%arg0: tensor<16x256xf16, #blocked>) {
++    %0 = triton_gpu.convert_layout %arg0 : tensor<16x256xf16, #blocked> -> tensor<16x256xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> 
++    tt.return 
++  } 
++}
+\ No newline at end of file
+diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2.cpp
+--- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2.cpp
++++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2.cpp
+@@ -541,8 +541,6 @@ getLoadMatrixFn(MemDescType descTy, cons
+   const int elemBytes = descTy.getElementTypeBitWidth() / 8;
+   auto order = sharedLayout.getOrder();
+ 
+-  if (kWidth != (4 / elemBytes))
+-    assert(vecPhase == 1 || vecPhase == 4 * kWidth);
+   int nPerWarp =
+       std::max<int>(shapePerCTA[2] / mmaLayout.getWarpsPerCTA()[2], 8);
+ 
diff --git a/third_party/triton/cl619443019.patch b/third_party/triton/cl619443019.patch
new file mode 100644
index 00000000000000..95ce54b6e4d6aa
--- /dev/null
+++ b/third_party/triton/cl619443019.patch
@@ -0,0 +1,76 @@
+==== triton/BUILD#44 - /google/src/cloud/csigg/mlir_transform_utils/triton/BUILD ====
+# action=edit type=text
+--- triton/BUILD	2024-03-22 08:02:38.000000000 -0700
++++ triton/BUILD	2024-03-27 01:34:43.000000000 -0700
+@@ -620,6 +620,7 @@
+         "@llvm-project//mlir:FunctionInterfaces",
+         "@llvm-project//mlir:GPUDialect",
+         "@llvm-project//mlir:IR",
++        "@llvm-project//mlir:InliningUtils",
+         "@llvm-project//mlir:LLVMDialect",
+         "@llvm-project//mlir:MathDialect",
+         "@llvm-project//mlir:SCFDialect",
+@@ -628,6 +629,7 @@
+         # The following is added to make Utility compile
+         ":TritonTools",
+         "@llvm-project//mlir:LLVMCommonConversion",
++        "@llvm-project//mlir:TransformUtils",
+         "@llvm-project//mlir:Transforms",
+     ],
+ )
+@@ -646,6 +648,7 @@
+         "@llvm-project//mlir:IR",
+         "@llvm-project//mlir:Pass",
+         "@llvm-project//mlir:Support",
++        "@llvm-project//mlir:TransformUtils",
+         "@llvm-project//mlir:Transforms",
+     ],
+     alwayslink = True,  # TritonDialect uses getCanonicalizationPatterns().
+@@ -729,6 +732,7 @@
+         "@llvm-project//mlir:LLVMDialect",
+         "@llvm-project//mlir:NVVMDialect",
+         "@llvm-project//mlir:Pass",
++        "@llvm-project//mlir:TransformUtils",
+         "@llvm-project//mlir:Transforms",
+     ],
+ )
+@@ -780,6 +784,7 @@
+         "@llvm-project//mlir:IR",
+         "@llvm-project//mlir:IndexDialect",
+         "@llvm-project//mlir:Pass",
++        "@llvm-project//mlir:TransformUtils",
+         "@llvm-project//mlir:Transforms",
+     ],
+ )
+==== triton/test/BUILD#18 - /google/src/cloud/csigg/mlir_transform_utils/triton/test/BUILD ====
+# action=edit type=text
+--- triton/test/BUILD	2024-03-11 11:42:57.000000000 -0700
++++ triton/test/BUILD	2024-03-27 01:32:04.000000000 -0700
+@@ -53,6 +53,7 @@
+         "@llvm-project//mlir:IR",
+         "@llvm-project//mlir:Pass",
+         "@llvm-project//mlir:SCFToControlFlow",
++        "@llvm-project//mlir:TransformUtils",
+         "@llvm-project//mlir:Transforms",
+         "//:TritonAnalysis",
+         "//:TritonDialects",
+==== triton/third_party/nvidia/BUILD#3 - /google/src/cloud/csigg/mlir_transform_utils/triton/third_party/nvidia/BUILD ====
+# action=edit type=text
+--- triton/third_party/nvidia/BUILD	2024-03-11 11:42:57.000000000 -0700
++++ triton/third_party/nvidia/BUILD	2024-03-27 01:32:46.000000000 -0700
+@@ -66,6 +66,7 @@
+         "@llvm-project//mlir:NVVMDialect",
+         "@llvm-project//mlir:Pass",
+         "@llvm-project//mlir:Support",
++        "@llvm-project//mlir:TransformUtils",
+         "@llvm-project//mlir:Transforms",
+         "//:TritonDialects",
+     ],
+@@ -113,6 +114,7 @@
+         "@llvm-project//mlir:NVVMDialect",
+         "@llvm-project//mlir:Pass",
+         "@llvm-project//mlir:SCFToControlFlow",
++        "@llvm-project//mlir:TransformUtils",
+         "@llvm-project//mlir:Transforms",
+         "//:TritonAnalysis",
+         "//:TritonDialects",
diff --git a/third_party/triton/workspace.bzl b/third_party/triton/workspace.bzl
index 8eabda8c9eddcd..2773b250ac8554 100644
--- a/third_party/triton/workspace.bzl
+++ b/third_party/triton/workspace.bzl
@@ -5,8 +5,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 def repo():
     """Imports Triton."""
 
-    TRITON_COMMIT = "cl612852008"
-    TRITON_SHA256 = "f2e330075469dd230ae5c72e4a2c9765c7851f7f6de5ad10f58578c07fc1dca4"
+    TRITON_COMMIT = "cl617459344"
+    TRITON_SHA256 = "f23e65175a67b1091ab6782720b0bcb969f33c19cae8168bf93eea523dab8a3f"
     tf_http_archive(
         name = "triton",
         sha256 = TRITON_SHA256,
@@ -15,7 +15,8 @@ def repo():
         # For temporary changes which haven't landed upstream yet.
         patch_file = [
             "//third_party/triton:cl607293980.patch",  # long standing :(
-            "//third_party/triton:cl614757739.patch",
-            "//third_party/triton:cl616113650.patch",
+            "//third_party/triton:cl617812302.patch",
+            "//third_party/triton:cl619146327.patch",
+            "//third_party/triton:cl619443019.patch",
         ],
     )
diff --git a/third_party/xla/.bazelrc b/third_party/xla/.bazelrc
index 1e297772408ba6..d8990ac5c12cc5 100644
--- a/third_party/xla/.bazelrc
+++ b/third_party/xla/.bazelrc
@@ -597,8 +597,12 @@ try-import %workspace%/.bazelrc.user
 # Build TensorFlow v2.
 test:release_base --test_size_filters=small,medium
 
+# Ensure release_base is set on linux
+build:release_linux_base --config=release_base
+
 # Target the AVX instruction set
 build:release_linux_base --config=avx_linux
+
 # Enable support for all targets
 build:release_base --config=cpu_cross
 
@@ -679,12 +683,14 @@ build:unsupported_gpu_linux --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gc
 build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain
 
 build:release_cpu_macos --config=avx_linux
-test:release_cpu_macos --config=release_base
 
 # Base build configs for macOS
 build:release_macos_base --action_env  DEVELOPER_DIR=/Applications/Xcode.app/Contents/Developer
 build:release_macos_base --define=no_nccl_support=true --output_filter=^$
 
+# Ensure release_base is set on mac
+build:release_macos_base --config=release_base
+
 # Build configs for macOS x86
 build:release_macos_x86 --config=release_macos_base
 # Build with the AVX instruction set when on macOS x86
@@ -714,10 +720,12 @@ test:release_macos_x86 --config=release_macos_base
 # Test configs for macOS Arm64
 test:release_macos_arm64 --config=release_macos_base
 
+# Ensure release_base is set on windows
+build:release_cpu_windows --config=release_base
+
 # TODO(kanglan): Update windows configs after b/289091160 is fixed
 build:release_cpu_windows --config=avx_win
 build:release_cpu_windows --define=no_tensorflow_py_deps=true
-test:release_cpu_windows --config=release_base
 
 # Exclude TFRT integration for anything but Linux.
 build:android --config=no_tfrt
@@ -922,3 +930,6 @@ build:rbe_cross_compile_macos_x86 --jobs=100
 test:rbe_cross_compile_macos_x86 --jobs=100
 # END MACOS CROSS-COMPILE CONFIGS
 # END CROSS-COMPILE CONFIGS
+
+# Try to load the XLA warnings config if available
+try-import %workspace%/warnings.bazelrc
diff --git a/third_party/xla/.kokoro/linux/build.sh b/third_party/xla/.kokoro/linux/build.sh
index dde19cb3d7e3fb..87bf19cd7ad71f 100644
--- a/third_party/xla/.kokoro/linux/build.sh
+++ b/third_party/xla/.kokoro/linux/build.sh
@@ -93,6 +93,7 @@ docker exec xla bazel \
         --features=layering_check \
         --profile=/tf/pkg/profile.json.gz \
         --flaky_test_attempts=3 \
+        --config=warnings \
         $RBE_FLAGS \
         $ADDITIONAL_FLAGS \
         -- //xla/... //build_tools/... @local_tsl//tsl/... $TARGET_FILTERS
diff --git a/third_party/xla/README.md b/third_party/xla/README.md
index 65e8b8f2211c46..be0325eefc03ba 100644
--- a/third_party/xla/README.md
+++ b/third_party/xla/README.md
@@ -28,7 +28,7 @@ and then see the [developer guide](docs/developer_guide.md).
 
 ## Contacts
 
-*   For questions, contact Thea Lamkin - thealamkin at google.com.
+*   For questions, contact the maintainers - maintainers at openxla.org
 
 ## Resources
 
diff --git a/third_party/xla/WORKSPACE b/third_party/xla/WORKSPACE
index 374838de234e01..7ba74d6276c2e4 100644
--- a/third_party/xla/WORKSPACE
+++ b/third_party/xla/WORKSPACE
@@ -1,3 +1,4 @@
+# buildifier: disable=load-on-top
 workspace(name = "xla")
 
 # Initialize the XLA repository and all dependencies.
diff --git a/third_party/xla/build_tools/configure/BUILD b/third_party/xla/build_tools/configure/BUILD
index 90d63687cfb1b7..3be5b6a0446109 100644
--- a/third_party/xla/build_tools/configure/BUILD
+++ b/third_party/xla/build_tools/configure/BUILD
@@ -13,10 +13,9 @@
 # limitations under the License.
 # ============================================================================
 
-load("//xla:pytype.default.bzl", "pytype_strict_library")
-
 # Placeholder: load py_test
 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library")
+load("//xla:pytype.default.bzl", "pytype_strict_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/docs/build_from_source.md b/third_party/xla/docs/build_from_source.md
index 91ef1e49608818..c273f7f3cdf8c0 100644
--- a/third_party/xla/docs/build_from_source.md
+++ b/third_party/xla/docs/build_from_source.md
@@ -10,13 +10,12 @@ If you did not clone the XLA repository or install Bazel, please check out the
 ### Configure
 
 XLA builds are configured by the `.bazelrc` file in the repository's root
-directory. The `./configure.py` script can be used to adjust
-common settings.
+directory. The `./configure.py` script can be used to adjust common settings.
 
-If you need to change the configuration, run the `./configure.py` script from the
-repository's root directory. This script has flags for the location of XLA
-dependencies and additional build configuration options (compiler
-flags, for example). Refer to the *Sample session* section for details.
+If you need to change the configuration, run the `./configure.py` script from
+the repository's root directory. This script has flags for the location of XLA
+dependencies and additional build configuration options (compiler flags, for
+example). Refer to the *Sample session* section for details.
 
 ### CPU support
 
@@ -27,26 +26,29 @@ We recommend using a suitable docker container to build/test XLA, such as
 docker run --name xla -w /xla -it -d --rm -v $PWD:/xla tensorflow/build:latest-python3.9 bash
 ```
 
-Using a docker container you can build XLA with CPU support using the following commands:
+Using a docker container you can build XLA with CPU support using the following
+commands:
 
 ```
 docker exec xla ./configure.py --backend=CPU
 docker exec xla bazel build //xla/...  --spawn_strategy=sandboxed --test_output=all
 ```
 
-If you want to build XLA targets with CPU support without Docker you need to install clang. XLA currently builds on CI with clang-17, but earlier versions should also work:
+If you want to build XLA targets with CPU support without Docker you need to
+install clang. XLA currently builds on CI with clang-17, but earlier versions
+should also work:
 
 ```
 apt install clang
 ```
 
 Then configure and build targets using the following commands:
-``` ./configure.py --backend=CPU
 
+```sh
+./configure.py --backend=CPU
 bazel build --test_output=all --spawn_strategy=sandboxed //xla/...
 ```
 
-
 ### GPU support
 
 We recommend using the same docker container as above to build XLA with GPU
@@ -76,6 +78,5 @@ Then configure and build targets using the following commands:
 bazel build --test_output=all --spawn_strategy=sandboxed //xla/...
 ```
 
-
 For more details regarding
 [TensorFlow's GPU docker images you can check out this document.](https://www.tensorflow.org/install/source#gpu_support_3)
diff --git a/third_party/xla/docs/indexing.md b/third_party/xla/docs/indexing.md
index f0cd37bc50273e..4ee61f16f62033 100644
--- a/third_party/xla/docs/indexing.md
+++ b/third_party/xla/docs/indexing.md
@@ -123,7 +123,7 @@ Let's study-by-example to understand what's all of the above actually means.
 
 ### Elementwise
 
-For an elementwise ops the indexing map is an identity.
+For elementwise ops the indexing map is an identity.
 
 ```c++
   p0 = f32[10, 20] parameter(0)
@@ -141,7 +141,7 @@ The output to input maps:
 The input to output maps
 
 -   input_i -> output: $(d_0, d_1) \mapsto (d_0, d_1)$ for $\boldsymbol{d} \in
-    {\rm Dom}(output)$
+    {\rm Dom}(input)$
 
 ### [Broadcast](https://openxla.org/xla/operation_semantics#broadcastindim)
 
@@ -160,8 +160,8 @@ The output to input map:
 
 The input to output map
 
--   input -> output: $(d_0) \mapsto (s_0, d_1, s_1)$ for $\boldsymbol{d} \in
-    {\rm Dom}(output)$ and $\boldsymbol{s} \in [0, 9] \times [0, 29]$.
+-   input -> output: $(d_0) \mapsto (s_0, d_0, s_1)$ for $\boldsymbol{d} \in
+    {\rm Dom}(input)$ and $\boldsymbol{s} \in [0, 9] \times [0, 29]$.
 
 Note that now we have $\boldsymbol s$ on the right side for the input-to-output
 mapping. Those are the symbols that represent ranges of values. For example, in
diff --git a/third_party/xla/third_party/implib_so/workspace.bzl b/third_party/xla/third_party/implib_so/workspace.bzl
index 01dad3b169f402..37f36cc135fd6d 100644
--- a/third_party/xla/third_party/implib_so/workspace.bzl
+++ b/third_party/xla/third_party/implib_so/workspace.bzl
@@ -6,8 +6,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 def repo():
     tf_http_archive(
         name = "implib_so",
-        strip_prefix = "Implib.so-5fb84c2a750434b9df1da67d67b749eb929598f1",
-        sha256 = "10de0a616df24849f2a883747784c115f209708960e44556f5ce384de6f103e8",
-        urls = tf_mirror_urls("https://github.com/yugr/Implib.so/archive/5fb84c2a750434b9df1da67d67b749eb929598f1.tar.gz"),
+        strip_prefix = "Implib.so-2cce6cab8ff2c15f9da858ea0b68646a8d62aef2",
+        sha256 = "4ef3089969d57a5b60bb41b8212c478eaa15c56941f86d4bf5e7f98a3afd24e8",
+        urls = tf_mirror_urls("https://github.com/yugr/Implib.so/archive/2cce6cab8ff2c15f9da858ea0b68646a8d62aef2.tar.gz"),
         build_file = "//third_party/implib_so:implib_so.BUILD",
     )
diff --git a/third_party/xla/third_party/llvm_openmp/BUILD b/third_party/xla/third_party/llvm_openmp/BUILD
index 71a21b4e3786bb..31bf29f966c495 100644
--- a/third_party/xla/third_party/llvm_openmp/BUILD
+++ b/third_party/xla/third_party/llvm_openmp/BUILD
@@ -1,5 +1,6 @@
 # Build file for OpenMP library that is part of llvm
 
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 load(
     "@local_tsl//tsl:tsl.bzl",
     "if_linux_x86_64",
@@ -16,7 +17,6 @@ load(
     "dict_add",
     "libiomp5_cc_binary",
 )
-load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 
 package(
     default_visibility = [
diff --git a/third_party/xla/third_party/mpitrampoline/BUILD b/third_party/xla/third_party/mpitrampoline/BUILD
new file mode 100644
index 00000000000000..3c413807167aeb
--- /dev/null
+++ b/third_party/xla/third_party/mpitrampoline/BUILD
@@ -0,0 +1 @@
+# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
diff --git a/third_party/xla/third_party/mpitrampoline/gen.patch b/third_party/xla/third_party/mpitrampoline/gen.patch
new file mode 100644
index 00000000000000..35124db0abb1e3
--- /dev/null
+++ b/third_party/xla/third_party/mpitrampoline/gen.patch
@@ -0,0 +1,149 @@
+diff --git a/gen/gen_decl.py b/gen/gen_decl.py
+index 1005b95..696b4e0 100755
+--- a/gen/gen_decl.py
++++ b/gen/gen_decl.py
+@@ -9,8 +9,8 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "mpiabi"))
+ 
+ from mpi_constants import constants
+ from mpi_functions import functions
+-from mpi_constants_fortran import constants_fortran
+-from mpi_functions_fortran import functions_fortran
++# from mpi_constants_fortran import constants_fortran
++# from mpi_functions_fortran import functions_fortran
+ 
+ support_profiling = True
+ have_weak_symbols = False
+@@ -24,7 +24,7 @@ def wrap(line):
+     lines.append(line)
+     return "\n".join(lines)
+ 
+-with open("include/mpi_decl_constants_c.h", "w") as file:
++with open(sys.argv[1], "w") as file:
+     file.write("// Declare C MPI constants\n")
+     file.write("\n")
+     for (tp, nm) in constants:
+@@ -32,7 +32,7 @@ with open("include/mpi_decl_constants_c.h", "w") as file:
+                 'mpi_nm': nm}
+         file.write(Template("extern $mpi_tp MPITRAMPOLINE_CONST $mpi_nm;\n").substitute(subs))
+ 
+-with open("include/mpi_decl_functions_c.h", "w") as file:
++with open(sys.argv[2], "w") as file:
+     file.write("// Declare C MPI functions\n")
+     file.write("\n")
+     for (tp, nm, args, flags) in functions:
+@@ -90,7 +90,7 @@ with open("include/mpi_decl_functions_c.h", "w") as file:
+         file.write(Template("\n".join(tmpl)).substitute(subs))
+         file.write("\n")
+ 
+-with open("include/mpi_decl_constants_fortran.h", "w") as file:
++if False:
+     file.write("!     Declare Fortran MPI constants\n")
+     file.write("\n")
+     for (tp, nm) in constants_fortran:
+@@ -104,7 +104,7 @@ with open("include/mpi_decl_constants_fortran.h", "w") as file:
+         file.write("\n".join(map(lambda line: wrap(Template(line).substitute(subs)), tmpl)))
+         file.write("\n")
+ 
+-with open("include/mpi_decl_functions_fortran.h", "w") as file:
++if False:
+     file.write("!     Declare Fortran MPI functions\n")
+     file.write("\n")
+     for (tp, nm, args) in functions_fortran:
+diff --git a/gen/gen_defn.py b/gen/gen_defn.py
+index bf31f35..318222e 100755
+--- a/gen/gen_defn.py
++++ b/gen/gen_defn.py
+@@ -9,14 +9,14 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "mpiabi"))
+ 
+ from mpi_constants import constants
+ from mpi_functions import functions
+-from mpi_constants_fortran import constants_fortran
+-from mpi_functions_fortran import functions_fortran
++# from mpi_constants_fortran import constants_fortran
++# from mpi_functions_fortran import functions_fortran
+ 
+ support_profiling = True
+ have_weak_symbols = False
+ replace_sentinels = False
+ 
+-with open("src/mpi_defn_constants_c.h", "w") as file:
++with open(sys.argv[1], "w") as file:
+     file.write("// Define C MPI constants")
+     file.write("\n")
+     for (tp, nm) in constants:
+@@ -24,7 +24,7 @@ with open("src/mpi_defn_constants_c.h", "w") as file:
+                 'mpi_nm': nm}
+         file.write(Template("$mpi_tp $mpi_nm = ($mpi_tp)0xdeadbeef;\n").substitute(subs))
+ 
+-with open("src/mpi_defn_functions_c.h", "w") as file:
++with open(sys.argv[2], "w") as file:
+     file.write("// Define C MPI functions\n")
+     file.write("\n")
+     for (tp, nm, args, flags) in functions:
+@@ -89,7 +89,7 @@ with open("src/mpi_defn_functions_c.h", "w") as file:
+         file.write(Template("\n".join(tmpl)).substitute(subs))
+         file.write("\n")
+ 
+-with open("src/mpi_defn_constants_fortran.h", "w") as file:
++if False:
+     file.write("// Define Fortran MPI constants\n")
+     file.write("\n")
+     for (tp, nm) in constants_fortran:
+@@ -98,7 +98,7 @@ with open("src/mpi_defn_constants_fortran.h", "w") as file:
+         # Fortran common blocks with `-march=skylake-avx512` are aligned to 64 bytes
+         file.write(Template("$mpi_tp $abi_nm __attribute__((__aligned__(64))) = (int)0xdeadbeef;\n").substitute(subs))
+ 
+-with open("src/mpi_defn_functions_fortran.h", "w") as file:
++if False:
+     file.write("// Define Fortran MPI functions\n")
+     file.write("\n")
+     for (tp, nm, args) in functions_fortran:
+diff --git a/gen/gen_init.py b/gen/gen_init.py
+index 4939261..0e52822 100755
+--- a/gen/gen_init.py
++++ b/gen/gen_init.py
+@@ -9,14 +9,14 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "mpiabi"))
+ 
+ from mpi_constants import constants
+ from mpi_functions import functions
+-from mpi_constants_fortran import constants_fortran
+-from mpi_functions_fortran import functions_fortran
++# from mpi_constants_fortran import constants_fortran
++# from mpi_functions_fortran import functions_fortran
+ 
+ support_profiling = True
+ have_weak_symbols = False
+ replace_sentinels = False
+ 
+-with open("src/mpi_init_constants_c.h", "w") as file:
++with open(sys.argv[1], "w") as file:
+     file.write("// Initialize C MPI constants")
+     file.write("\n")
+     for (tp, nm) in constants:
+@@ -25,7 +25,7 @@ with open("src/mpi_init_constants_c.h", "w") as file:
+                 'abi_nm': re.sub(r"MPI(X?)_", r"MPI\1ABI_", nm)}
+         file.write(Template("$mpi_nm = *($mpi_tp const *)get_symbol(handle, \"$abi_nm\");\n").substitute(subs))
+ 
+-with open("src/mpi_init_functions_c.h", "w") as file:
++with open(sys.argv[2], "w") as file:
+     file.write("// Initialize C MPI functions\n")
+     file.write("\n")
+     for (tp, nm, args, flags) in functions:
+@@ -39,7 +39,7 @@ with open("src/mpi_init_functions_c.h", "w") as file:
+             subs['anm{0}'.format(i)] = anm
+         file.write(Template("$abi_nm = get_symbol(handle, \"$abi_nm\");\n").substitute(subs))
+ 
+-with open("src/mpi_init_constants_fortran.h", "w") as file:
++if False:
+     file.write("// Initialize Fortran MPI constants\n")
+     file.write("\n")
+     for (tp, nm) in constants_fortran:
+@@ -47,7 +47,7 @@ with open("src/mpi_init_constants_fortran.h", "w") as file:
+                 'abi_nm': re.sub(r"MPI(X?)_", r"MPI\1ABI_", nm).lower() + "_"}
+         file.write(Template("$abi_nm = *($abi_tp const*)get_symbol(handle, \"$abi_nm\");\n").substitute(subs))
+ 
+-with open("src/mpi_init_functions_fortran.h", "w") as file:
++if False:
+     file.write("// Initialize Fortran MPI functions\n")
+     file.write("\n")
+     for (tp, nm, args) in functions_fortran:
diff --git a/third_party/xla/third_party/mpitrampoline/mpitrampoline.BUILD b/third_party/xla/third_party/mpitrampoline/mpitrampoline.BUILD
new file mode 100644
index 00000000000000..f46e39d762a159
--- /dev/null
+++ b/third_party/xla/third_party/mpitrampoline/mpitrampoline.BUILD
@@ -0,0 +1,135 @@
+# Description:
+#  A forwarding MPI implementation that can use any other MPI implementation via an MPI ABI
+
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+load("@local_xla//xla:strict.default.bzl", "py_strict_binary")
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])
+
+exports_files(["LICENSE.md"])
+
+genrule(
+    name = "mpi_version",
+    srcs = [
+        "CMakeLists.txt",
+        "include/mpi_version.h.in",
+    ],
+    outs = ["include/mpi_version.h"],
+    cmd = """
+      PROJECT_VERSION=`cat $(location CMakeLists.txt) \
+                       | grep "MPItrampoline VERSION" | awk '{print $$NF}'`
+      PROJECT_VERSION_MAJOR=`echo $$PROJECT_VERSION | cut -d. -f1`
+      PROJECT_VERSION_MINOR=`echo $$PROJECT_VERSION | cut -d. -f2`
+      PROJECT_VERSION_PATCH=`echo $$PROJECT_VERSION | cut -d. -f3`
+      sed -e "s/@PROJECT_VERSION@/$${PROJECT_VERSION}/" \
+          -e "s/@PROJECT_VERSION_MAJOR@/$${PROJECT_VERSION_MAJOR}/" \
+          -e "s/@PROJECT_VERSION_MINOR@/$${PROJECT_VERSION_MINOR}/" \
+          -e "s/@PROJECT_VERSION_PATCH@/$${PROJECT_VERSION_PATCH}/" \
+          $(location include/mpi_version.h.in) > $(location include/mpi_version.h)
+      """,
+)
+
+expand_template(
+    name = "mpi_defaults",
+    out = "src/mpi_defaults.h",
+    substitutions = {
+        "@MPITRAMPOLINE_DEFAULT_DELAY_INIT@": "",
+        "@MPITRAMPOLINE_DEFAULT_DLOPEN_BINDING@": "",
+        "@MPITRAMPOLINE_DEFAULT_DLOPEN_MODE@": "",
+        "@MPITRAMPOLINE_DEFAULT_LIB@": "",
+        "@MPITRAMPOLINE_DEFAULT_PRELOAD@": "",
+        "@MPITRAMPOLINE_DEFAULT_VERBOSE@": "",
+    },
+    template = "src/mpi_defaults.h.in",
+)
+
+py_strict_binary(
+    name = "gen_decl",
+    srcs = [
+        "gen/gen_decl.py",
+        "mpiabi/mpi_constants.py",
+        "mpiabi/mpi_functions.py",
+    ],
+)
+
+genrule(
+    name = "decl",
+    outs = [
+        "include/mpi_decl_constants_c.h",
+        "include/mpi_decl_functions_c.h",
+    ],
+    cmd = "$(location :gen_decl) $(location include/mpi_decl_constants_c.h) \
+           $(location include/mpi_decl_functions_c.h)",
+    tools = [":gen_decl"],
+)
+
+py_strict_binary(
+    name = "gen_defn",
+    srcs = [
+        "gen/gen_defn.py",
+        "mpiabi/mpi_constants.py",
+        "mpiabi/mpi_functions.py",
+    ],
+)
+
+genrule(
+    name = "defn",
+    outs = [
+        "include/mpi_defn_constants_c.h",
+        "include/mpi_defn_functions_c.h",
+    ],
+    cmd = "$(location :gen_defn) $(location include/mpi_defn_constants_c.h) \
+           $(location include/mpi_defn_functions_c.h)",
+    tools = [":gen_defn"],
+)
+
+py_strict_binary(
+    name = "gen_init",
+    srcs = [
+        "gen/gen_init.py",
+        "mpiabi/mpi_constants.py",
+        "mpiabi/mpi_functions.py",
+    ],
+)
+
+genrule(
+    name = "init",
+    outs = [
+        "include/mpi_init_constants_c.h",
+        "include/mpi_init_functions_c.h",
+    ],
+    cmd = "$(location :gen_init) $(location include/mpi_init_constants_c.h) \
+           $(location include/mpi_init_functions_c.h)",
+    tools = [":gen_init"],
+)
+
+cc_library(
+    name = "mpitrampoline",
+    srcs = [
+        "src/mpi.c",
+    ],
+    hdrs = [
+        "include/mpi.h",
+        "include/mpi_decl_constants_c.h",
+        "include/mpi_decl_functions_c.h",
+        "include/mpi_defn_constants_c.h",
+        "include/mpi_defn_functions_c.h",
+        "include/mpi_init_constants_c.h",
+        "include/mpi_init_functions_c.h",
+        "include/mpi_version.h",
+        "mpiabi/mpiabi.h",
+        "src/mpi_defaults.h",
+    ],
+    copts = [
+        "-fexceptions",
+    ],
+    includes = [
+        "include",
+        "mpiabi",
+        "src",
+    ],
+)
diff --git a/third_party/xla/third_party/mpitrampoline/workspace.bzl b/third_party/xla/third_party/mpitrampoline/workspace.bzl
new file mode 100644
index 00000000000000..4748931ae6e368
--- /dev/null
+++ b/third_party/xla/third_party/mpitrampoline/workspace.bzl
@@ -0,0 +1,18 @@
+"""Provides the repository macro to import mpitrampoline."""
+
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
+
+def repo():
+    """Imports mpitrampoline."""
+
+    MPITRAMPOLINE_COMMIT = "25efb0f7a4cd00ed82bafb8b1a6285fc50d297ed"
+    MPITRAMPOLINE_SHA256 = "5a36656205c472bdb639bffebb0f014523b32dda0c2cbedd9ce7abfc9e879e84"
+
+    tf_http_archive(
+        name = "mpitrampoline",
+        sha256 = MPITRAMPOLINE_SHA256,
+        strip_prefix = "MPItrampoline-{commit}".format(commit = MPITRAMPOLINE_COMMIT),
+        urls = tf_mirror_urls("https://github.com/eschnett/mpitrampoline/archive/{commit}.tar.gz".format(commit = MPITRAMPOLINE_COMMIT)),
+        patch_file = ["//third_party/mpitrampoline:gen.patch"],
+        build_file = "//third_party/mpitrampoline:mpitrampoline.BUILD",
+    )
diff --git a/third_party/xla/third_party/stablehlo/temporary.patch b/third_party/xla/third_party/stablehlo/temporary.patch
index ed999bbe7db342..94971c07102a21 100755
--- a/third_party/xla/third_party/stablehlo/temporary.patch
+++ b/third_party/xla/third_party/stablehlo/temporary.patch
@@ -174,101 +174,10 @@ diff --ruN a/stablehlo/stablehlo/CMakeLists.txt b/stablehlo/stablehlo/CMakeLists
  add_subdirectory(integrations)
  add_subdirectory(reference)
  add_subdirectory(tests)
-diff --ruN a/stablehlo/stablehlo/dialect/TypeInference.cpp b/stablehlo/stablehlo/dialect/TypeInference.cpp
---- stablehlo/stablehlo/dialect/TypeInference.cpp
-+++ stablehlo/stablehlo/dialect/TypeInference.cpp
-@@ -65,25 +65,6 @@
- 
- namespace mlir {
- namespace hlo {
--namespace {
--//===----------------------------------------------------------------------===//
--// Utils for quantization specific verifications
--//===----------------------------------------------------------------------===//
--template <typename T>
--bool allQuantized(ArrayRef<Type> typeRange) {
--  return llvm::all_of(typeRange, [&](Type val) {
--    return val.cast<ShapedType>().getElementType().isa<T>();
--  });
--}
--
--template <typename T>
--bool noneQuantized(ArrayRef<Type> typeRange) {
--  return llvm::all_of(typeRange, [&](Type val) {
--    return !val.cast<ShapedType>().getElementType().isa<T>();
--  });
--}
--
--}  // namespace
- 
- //===----------------------------------------------------------------------===//
- // Utils for shape functions.
-@@ -3472,61 +3453,6 @@
-                              "is incompatible with return type of operation ",
-                              shapedResultType, "");
- 
--  llvm::SmallVector<Type, 3> typeEntries{lhsType, rhsType, resultType};
--  if (noneQuantized<quant::QuantizedType>(typeEntries)) return success();
--  // convolution_c28
--  if (!allQuantized<quant::QuantizedType>(typeEntries)) {
--    return emitOptionalError(location,
--                             "not all of operands and result are quantized");
--  }
--
--  auto lhsQType =
--      getElementTypeOrSelf(lhsType).dyn_cast<quant::QuantizedType>();
--  auto rhsQType =
--      getElementTypeOrSelf(rhsType).dyn_cast<quant::QuantizedType>();
--  auto resultQType =
--      getElementTypeOrSelf(resultType).dyn_cast<quant::QuantizedType>();
--  // convolution_c29
--  if (lhsQType.getStorageType() != rhsQType.getStorageType())
--    return emitOptionalError(location, "mismatched operand storage types ",
--                             lhsQType.getStorageType(), " and ",
--                             rhsQType.getStorageType());
--  // convolution_c30
--  auto expressedType = lhsQType.getExpressedType();
--  if (expressedType != rhsQType.getExpressedType() ||
--      expressedType != resultQType.getExpressedType())
--    return emitOptionalError(location,
--                             "mismatched operands and result expressed types");
--
--  llvm::SmallVector<Type, 2> typeEntriesPerAxis{rhsType, resultType};
--  if (noneQuantized<quant::UniformQuantizedPerAxisType>(typeEntriesPerAxis))
--    return success();
--  // convolution_c31
--  if (!allQuantized<quant::UniformQuantizedPerAxisType>(typeEntriesPerAxis)) {
--    return emitOptionalError(location,
--                             "rhs and result are of mixed per_tensor and "
--                             "per_axis quantized tensor type ",
--                             rhsType, " and ", resultType);
--  }
--
--  auto rhsQPAType = rhsQType.dyn_cast<quant::UniformQuantizedPerAxisType>();
--  auto resultQPAType =
--      resultQType.dyn_cast<quant::UniformQuantizedPerAxisType>();
--  // convolution_c32
--  if (rhsQPAType &&
--      rhsQPAType.getQuantizedDimension() != kernelOutputFeatureDimension)
--    return emitOptionalError(
--        location, "mismatched kernel_output_feature_dimension ",
--        kernelOutputFeatureDimension, " and rhs quantized dimension ",
--        rhsQPAType.getQuantizedDimension());
--  // convolution_c33
--  if (resultQPAType &&
--      resultQPAType.getQuantizedDimension() != outputFeatureDimension)
--    return emitOptionalError(location, "mismatched output_feature_dimension ",
--                             outputFeatureDimension,
--                             " and result quantized dimension ",
--                             resultQPAType.getQuantizedDimension());
--
-   return success();
- }
- 
 diff --ruN a/stablehlo/stablehlo/experimental/BUILD.bazel b/stablehlo/stablehlo/experimental/BUILD.bazel
 --- stablehlo/stablehlo/experimental/BUILD.bazel
 +++ stablehlo/stablehlo/experimental/BUILD.bazel
-@@ -0,0 +1,115 @@
+@@ -0,0 +1,116 @@
 +# Copyright 2023 The StableHLO Authors. All Rights Reserved.
 +#
 +# Licensed under the Apache License, Version 2.0 (the "License");
@@ -342,6 +251,7 @@ diff --ruN a/stablehlo/stablehlo/experimental/BUILD.bazel b/stablehlo/stablehlo/
 +        "transforms/ChloRecomposeOps.cpp",
 +        "transforms/StablehloCanonicalizeDynamism.cpp",
 +        "transforms/StablehloRefineShapes.cpp",
++        "transforms/StablehloTrivialDce.cpp",
 +    ],
 +    hdrs = [
 +        "transforms/Passes.h",
@@ -1994,7 +1904,7 @@ diff --ruN a/stablehlo/stablehlo/experimental/tools/StablehloOptMain.cpp b/stabl
 diff --ruN a/stablehlo/stablehlo/experimental/transforms/CMakeLists.txt b/stablehlo/stablehlo/experimental/transforms/CMakeLists.txt
 --- stablehlo/stablehlo/experimental/transforms/CMakeLists.txt
 +++ stablehlo/stablehlo/experimental/transforms/CMakeLists.txt
-@@ -0,0 +1,40 @@
+@@ -0,0 +1,41 @@
 +# Copyright 2023 The StableHLO Authors.
 +#
 +# Licensed under the Apache License, Version 2.0 (the "License");
@@ -2018,6 +1928,7 @@ diff --ruN a/stablehlo/stablehlo/experimental/transforms/CMakeLists.txt b/stable
 +  ChloRecomposeOps.cpp
 +  StablehloCanonicalizeDynamism.cpp
 +  StablehloRefineShapes.cpp
++  StablehloTrivialDce.cpp
 +
 +  DEPENDS
 +  ExperimentalPassesIncGen
@@ -2038,7 +1949,7 @@ diff --ruN a/stablehlo/stablehlo/experimental/transforms/CMakeLists.txt b/stable
 diff --ruN a/stablehlo/stablehlo/experimental/transforms/ChloRecomposeOps.cpp b/stablehlo/stablehlo/experimental/transforms/ChloRecomposeOps.cpp
 --- stablehlo/stablehlo/experimental/transforms/ChloRecomposeOps.cpp
 +++ stablehlo/stablehlo/experimental/transforms/ChloRecomposeOps.cpp
-@@ -0,0 +1,168 @@
+@@ -0,0 +1,178 @@
 +/* Copyright 2024 The StableHLO Authors.
 +Licensed under the Apache License, Version 2.0 (the "License");
 +you may not use this file except in compliance with the License.
@@ -2061,12 +1972,14 @@ diff --ruN a/stablehlo/stablehlo/experimental/transforms/ChloRecomposeOps.cpp b/
 +#include "mlir/IR/Attributes.h"
 +#include "mlir/IR/BuiltinAttributes.h"
 +#include "mlir/IR/PatternMatch.h"
++#include "mlir/Pass/PassManager.h"
 +#include "mlir/Support/LogicalResult.h"
 +#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 +#include "stablehlo/dialect/ChloOps.h"
 +#include "stablehlo/dialect/StablehloOps.h"
 +#include "stablehlo/experimental/dialect/StablehloOps.h"
 +#include "stablehlo/experimental/transforms/Passes.h"
++#include "stablehlo/transforms/Passes.h"
 +
 +namespace mlir {
 +namespace stablehlo {
@@ -2204,13 +2117,21 @@ diff --ruN a/stablehlo/stablehlo/experimental/transforms/ChloRecomposeOps.cpp b/
 +  }
 +};
 +
++void createChloLegalizeToStablehloPipeline(OpPassManager& pm) {
++  pm.addPass(mlir::stablehlo::experimental::createChloRecomposeOpsPass());
++  pm.addNestedPass<mlir::func::FuncOp>(
++      mlir::stablehlo::createChloLegalizeToStablehloPass());
++  pm.addNestedPass<mlir::func::FuncOp>(
++      mlir::stablehlo::createShapeLegalizeToStablehloPass());
++}
++
 +}  // namespace experimental
 +}  // namespace stablehlo
 +}  // namespace mlir
 diff --ruN a/stablehlo/stablehlo/experimental/transforms/Passes.h b/stablehlo/stablehlo/experimental/transforms/Passes.h
 --- stablehlo/stablehlo/experimental/transforms/Passes.h
 +++ stablehlo/stablehlo/experimental/transforms/Passes.h
-@@ -0,0 +1,36 @@
+@@ -0,0 +1,38 @@
 +/* Copyright 2023 The StableHLO Authors.
 +
 +Licensed under the Apache License, Version 2.0 (the "License");
@@ -2242,6 +2163,8 @@ diff --ruN a/stablehlo/stablehlo/experimental/transforms/Passes.h b/stablehlo/st
 +#define GEN_PASS_REGISTRATION
 +#include "stablehlo/experimental/transforms/Passes.h.inc"
 +
++void createChloLegalizeToStablehloPipeline(OpPassManager &pm);
++
 +}  // namespace experimental
 +}  // namespace stablehlo
 +}  // namespace mlir
@@ -2250,7 +2173,7 @@ diff --ruN a/stablehlo/stablehlo/experimental/transforms/Passes.h b/stablehlo/st
 diff --ruN a/stablehlo/stablehlo/experimental/transforms/Passes.td b/stablehlo/stablehlo/experimental/transforms/Passes.td
 --- stablehlo/stablehlo/experimental/transforms/Passes.td
 +++ stablehlo/stablehlo/experimental/transforms/Passes.td
-@@ -0,0 +1,39 @@
+@@ -0,0 +1,55 @@
 +/* Copyright 2023 The StableHLO Authors.
 +
 +Licensed under the Apache License, Version 2.0 (the "License");
@@ -2290,6 +2213,22 @@ diff --ruN a/stablehlo/stablehlo/experimental/transforms/Passes.td b/stablehlo/s
 +  }];
 +  let dependentDialects = ["chlo::ChloDialect"];
 +}
++
++def StablehloTrivialDcePass : Pass<"experimental-stablehlo-trivial-dce", "ModuleOp"> {
++  let summary = "(Experimental) Performs a single bottom up pass to remove values that are trivially dead.";
++  let description = [{
++    An experimental pass to remove dead values prior to running other passes
++    that may fail to converge otherwise. For example, running shape refinement
++    on a program that has a lot of dead values can fail because shape refinement
++    is top down and removing values causes a new iteration to be triggered, and
++    removing all the dead values with a top down traversal can take a lot of
++    iterations (10+), which is slow.
++
++    Performing a single pass should be fast, and doing it bottom up means that
++    values that are transitively dead can be removed since leaf values will be
++    processed first.
++  }];
++}
 diff --ruN a/stablehlo/stablehlo/experimental/transforms/StablehloCanonicalizeDynamism.cpp b/stablehlo/stablehlo/experimental/transforms/StablehloCanonicalizeDynamism.cpp
 --- stablehlo/stablehlo/experimental/transforms/StablehloCanonicalizeDynamism.cpp
 +++ stablehlo/stablehlo/experimental/transforms/StablehloCanonicalizeDynamism.cpp
@@ -2639,83 +2578,1252 @@ diff --ruN a/stablehlo/stablehlo/experimental/transforms/StablehloRefineShapes.c
 +}  // namespace experimental
 +}  // namespace stablehlo
 +}  // namespace mlir
-diff --ruN a/stablehlo/stablehlo/tests/ops_stablehlo_quantized.mlir b/stablehlo/stablehlo/tests/ops_stablehlo_quantized.mlir
---- stablehlo/stablehlo/tests/ops_stablehlo_quantized.mlir
-+++ stablehlo/stablehlo/tests/ops_stablehlo_quantized.mlir
-@@ -821,75 +821,3 @@
-   %0 = "stablehlo.uniform_dequantize"(%arg0) : (tensor<4x!quant.uniform<si8:f32, 1.000000e+00>>) -> tensor<4xf32>
-   func.return %0 : tensor<4xf32>
- }
--
--// -----
--
--func.func @convolution_c28(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x207x16x!quant.uniform<i8:f32, 5.0:20>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>> {
--  // expected-error@+1 {{not all of operands and result are quantized}}
--  %0 = stablehlo.convolution(%arg0, %arg1)
--         dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f],
--         window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]}
--         {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo<precision DEFAULT>, #stablehlo<precision DEFAULT>]} :
--       (tensor<1x8x8x207xf32>, tensor<3x3x207x16x!quant.uniform<i8:f32, 5.0:20>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>>
--  func.return %0 : tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>>
--}
--
--// -----
--
--func.func @convolution_c29(%arg0: tensor<1x8x8x207x!quant.uniform<i16:f32, 2.0:15>>, %arg1: tensor<3x3x207x16x!quant.uniform<i8:f32, 5.0:20>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>> {
--  // expected-error@+1 {{mismatched operand storage types 'i16' and 'i8'}}
--  %0 = stablehlo.convolution(%arg0, %arg1)
--         dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f],
--         window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]}
--         {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo<precision DEFAULT>, #stablehlo<precision DEFAULT>]} :
--       (tensor<1x8x8x207x!quant.uniform<i16:f32, 2.0:15>>, tensor<3x3x207x16x!quant.uniform<i8:f32, 5.0:20>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>>
--  func.return %0 : tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>>
--}
--
--// -----
--
--func.func @convolution_c30(%arg0: tensor<1x8x8x207x!quant.uniform<i8:f64, 2.0:15>>, %arg1: tensor<3x3x207x16x!quant.uniform<i8:f32, 5.0:20>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>> {
--  // expected-error@+1 {{mismatched operands and result expressed types}}
--  %0 = stablehlo.convolution(%arg0, %arg1)
--         dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f],
--         window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]}
--         {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo<precision DEFAULT>, #stablehlo<precision DEFAULT>]} :
--       (tensor<1x8x8x207x!quant.uniform<i8:f64, 2.0:15>>, tensor<3x3x207x16x!quant.uniform<i8:f32, 5.0:20>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>>
--  func.return %0 : tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>>
--}
--
--// -----
--
--func.func @convolution_c31(%arg0: tensor<1x8x8x207x!quant.uniform<i8:f32, 2.0:15>>, %arg1: tensor<3x3x207x16x!quant.uniform<i8:f32:0, {0.1:-30}>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>> {
--  // expected-error@+1 {{rhs and result are of mixed per_tensor and per_axis quantized tensor type 'tensor<3x3x207x16x!quant.uniform<i8:f32:0, {1.000000e-01:-30}>>' and 'tensor<1x8x8x16x!quant.uniform<i8:f32, 1.000000e+01:50>>'}}
--  %0 = stablehlo.convolution(%arg0, %arg1)
--         dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f],
--         window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]}
--         {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo<precision DEFAULT>, #stablehlo<precision DEFAULT>]} :
--       (tensor<1x8x8x207x!quant.uniform<i8:f32, 2.0:15>>, tensor<3x3x207x16x!quant.uniform<i8:f32:0, {0.1:-30}>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>>
--  func.return %0 : tensor<1x8x8x16x!quant.uniform<i8:f32, 10.0:50>>
--}
+diff --ruN a/stablehlo/stablehlo/experimental/transforms/StablehloTrivialDce.cpp b/stablehlo/stablehlo/experimental/transforms/StablehloTrivialDce.cpp
+--- stablehlo/stablehlo/experimental/transforms/StablehloTrivialDce.cpp
++++ stablehlo/stablehlo/experimental/transforms/StablehloTrivialDce.cpp
+@@ -0,0 +1,63 @@
++/* Copyright 2022 The StableHLO Authors.
++Licensed under the Apache License, Version 2.0 (the "License");
++you may not use this file except in compliance with the License.
++You may obtain a copy of the License at
++
++    http://www.apache.org/licenses/LICENSE-2.0
++
++Unless required by applicable law or agreed to in writing, software
++distributed under the License is distributed on an "AS IS" BASIS,
++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++See the License for the specific language governing permissions and
++limitations under the License.
++==============================================================================*/
++
++#include <cstdint>
++
++#include "llvm/ADT/SmallVector.h"
++#include "mlir/Dialect/Func/IR/FuncOps.h"
++#include "mlir/IR/PatternMatch.h"
++#include "mlir/Support/LogicalResult.h"
++#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
++#include "stablehlo/experimental/transforms/Passes.h"
++
++namespace mlir {
++namespace stablehlo {
++namespace experimental {
++
++#define GEN_PASS_DEF_STABLEHLOTRIVIALDCEPASS
++#include "stablehlo/experimental/transforms/Passes.h.inc"
++
++namespace {
++
++struct StablehloTrivialDcePass
++    : public impl::StablehloTrivialDcePassBase<StablehloTrivialDcePass> {
++  using StablehloTrivialDcePassBase::StablehloTrivialDcePassBase;
++
++  void runOnOperation() override {
++    GreedyRewriteConfig config;
++
++    // Hardcode defaults for stability.
++    config.enableRegionSimplification = true;
++    config.maxNumRewrites = GreedyRewriteConfig::kNoLimit;
++    config.strictMode = GreedyRewriteStrictness::AnyOp;
++
++    // Run a single bottom up pass.
++    config.useTopDownTraversal = false;
++    config.maxIterations = 1;
++
++    // Running a greedy rewrite will cause trivially dead values to be removed.
++    // Doing it without patterns ensures that no other changes are made to the
++    // IR. Doing it bottom-up ensures that values that are transitively dead are
++    // also removed. Although 1 pass should be enough,
++    // applyPatternsAndFoldGreedily will want to run at least 1 more iteration
++    // to confirm convergence, but we don't need to check for convergence, so we
++    // ignore the return value.
++    (void)applyPatternsAndFoldGreedily(getOperation(), RewritePatternSet(&getContext()), config);
++  }
++};
++
++}  // namespace
++}  // namespace experimental
++}  // namespace stablehlo
++}  // namespace mlir
+diff --ruN a/stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir b/stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir
+--- stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir
++++ stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir
+@@ -1283,153 +1283,153 @@
+ func.func @zeta_f16(%arg0: tensor<f16>, %arg1: tensor<f16>) -> tensor<f16> {
+   // CHECK: %[[TMP_0:.*]] = stablehlo.convert %[[X]] : (tensor<f16>) -> tensor<f32>
+   // CHECK: %[[TMP_1:.*]] = stablehlo.convert %[[Q]] : (tensor<f16>) -> tensor<f32>
+-  // CHECK: %[[TMP_2:.*]] = stablehlo.constant dense<0.000000e+00>
+-  // CHECK: %[[TMP_3:.*]] = stablehlo.negate %[[TMP_0]]
+-  // CHECK: %[[TMP_4:.*]] = stablehlo.power %[[TMP_1]], %[[TMP_3]]
+-  // CHECK: %[[TMP_5:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_6:.*]] = stablehlo.add %[[TMP_1]], %[[TMP_5]]
+-  // CHECK: %[[TMP_7:.*]] = stablehlo.power %[[TMP_6]], %[[TMP_3]]
+-  // CHECK: %[[TMP_8:.*]] = stablehlo.add %[[TMP_4]], %[[TMP_7]]
+-  // CHECK: %[[TMP_9:.*]] = stablehlo.add %[[TMP_6]], %[[TMP_5]]
+-  // CHECK: %[[TMP_10:.*]] = stablehlo.power %[[TMP_9]], %[[TMP_3]]
++  // CHECK-DAG: %[[TMP_2:.*]] = stablehlo.constant dense<0.000000e+00>
++  // CHECK-DAG: %[[TMP_3:.*]] = stablehlo.constant dense<1.000000e+00>
++  // CHECK: %[[TMP_4:.*]] = stablehlo.negate %[[TMP_0]]
++  // CHECK: %[[TMP_5:.*]] = stablehlo.power %[[TMP_1]], %[[TMP_4]]
++  // CHECK: %[[TMP_6:.*]] = stablehlo.add %[[TMP_1]], %[[TMP_3]]
++  // CHECK: %[[TMP_7:.*]] = stablehlo.power %[[TMP_6]], %[[TMP_4]]
++  // CHECK: %[[TMP_8:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_7]]
++  // CHECK: %[[TMP_9:.*]] = stablehlo.add %[[TMP_6]], %[[TMP_3]]
++  // CHECK: %[[TMP_10:.*]] = stablehlo.power %[[TMP_9]], %[[TMP_4]]
+   // CHECK: %[[TMP_11:.*]] = stablehlo.add %[[TMP_8]], %[[TMP_10]]
+-  // CHECK: %[[TMP_12:.*]] = stablehlo.add %[[TMP_9]], %[[TMP_5]]
+-  // CHECK: %[[TMP_13:.*]] = stablehlo.power %[[TMP_12]], %[[TMP_3]]
++  // CHECK: %[[TMP_12:.*]] = stablehlo.add %[[TMP_9]], %[[TMP_3]]
++  // CHECK: %[[TMP_13:.*]] = stablehlo.power %[[TMP_12]], %[[TMP_4]]
+   // CHECK: %[[TMP_14:.*]] = stablehlo.add %[[TMP_11]], %[[TMP_13]]
+-  // CHECK: %[[TMP_15:.*]] = stablehlo.add %[[TMP_12]], %[[TMP_5]]
+-  // CHECK: %[[TMP_16:.*]] = stablehlo.power %[[TMP_15]], %[[TMP_3]]
++  // CHECK: %[[TMP_15:.*]] = stablehlo.add %[[TMP_12]], %[[TMP_3]]
++  // CHECK: %[[TMP_16:.*]] = stablehlo.power %[[TMP_15]], %[[TMP_4]]
+   // CHECK: %[[TMP_17:.*]] = stablehlo.add %[[TMP_14]], %[[TMP_16]]
+-  // CHECK: %[[TMP_18:.*]] = stablehlo.add %[[TMP_15]], %[[TMP_5]]
+-  // CHECK: %[[TMP_19:.*]] = stablehlo.power %[[TMP_18]], %[[TMP_3]]
++  // CHECK: %[[TMP_18:.*]] = stablehlo.add %[[TMP_15]], %[[TMP_3]]
++  // CHECK: %[[TMP_19:.*]] = stablehlo.power %[[TMP_18]], %[[TMP_4]]
+   // CHECK: %[[TMP_20:.*]] = stablehlo.add %[[TMP_17]], %[[TMP_19]]
+-  // CHECK: %[[TMP_21:.*]] = stablehlo.add %[[TMP_18]], %[[TMP_5]]
+-  // CHECK: %[[TMP_22:.*]] = stablehlo.power %[[TMP_21]], %[[TMP_3]]
++  // CHECK: %[[TMP_21:.*]] = stablehlo.add %[[TMP_18]], %[[TMP_3]]
++  // CHECK: %[[TMP_22:.*]] = stablehlo.power %[[TMP_21]], %[[TMP_4]]
+   // CHECK: %[[TMP_23:.*]] = stablehlo.add %[[TMP_20]], %[[TMP_22]]
+-  // CHECK: %[[TMP_24:.*]] = stablehlo.add %[[TMP_21]], %[[TMP_5]]
+-  // CHECK: %[[TMP_25:.*]] = stablehlo.power %[[TMP_24]], %[[TMP_3]]
++  // CHECK: %[[TMP_24:.*]] = stablehlo.add %[[TMP_21]], %[[TMP_3]]
++  // CHECK: %[[TMP_25:.*]] = stablehlo.power %[[TMP_24]], %[[TMP_4]]
+   // CHECK: %[[TMP_26:.*]] = stablehlo.add %[[TMP_23]], %[[TMP_25]]
+-  // CHECK: %[[TMP_27:.*]] = stablehlo.add %[[TMP_24]], %[[TMP_5]]
+-  // CHECK: %[[TMP_28:.*]] = stablehlo.power %[[TMP_27]], %[[TMP_3]]
++  // CHECK: %[[TMP_27:.*]] = stablehlo.add %[[TMP_24]], %[[TMP_3]]
++  // CHECK: %[[TMP_28:.*]] = stablehlo.power %[[TMP_27]], %[[TMP_4]]
+   // CHECK: %[[TMP_29:.*]] = stablehlo.add %[[TMP_26]], %[[TMP_28]]
+-  // CHECK: %[[TMP_30:.*]] = stablehlo.add %[[TMP_27]], %[[TMP_5]]
+-  // CHECK: %[[TMP_31:.*]] = stablehlo.power %[[TMP_30]], %[[TMP_3]]
++  // CHECK: %[[TMP_30:.*]] = stablehlo.add %[[TMP_27]], %[[TMP_3]]
++  // CHECK: %[[TMP_31:.*]] = stablehlo.power %[[TMP_30]], %[[TMP_4]]
+   // CHECK: %[[TMP_32:.*]] = stablehlo.add %[[TMP_29]], %[[TMP_31]]
+-  // CHECK: %[[TMP_33:.*]] = stablehlo.add %[[TMP_30]], %[[TMP_5]]
+-  // CHECK: %[[TMP_34:.*]] = stablehlo.power %[[TMP_33]], %[[TMP_3]]
++  // CHECK: %[[TMP_33:.*]] = stablehlo.add %[[TMP_30]], %[[TMP_3]]
++  // CHECK: %[[TMP_34:.*]] = stablehlo.power %[[TMP_33]], %[[TMP_4]]
+   // CHECK: %[[TMP_35:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_36:.*]] = stablehlo.subtract %[[TMP_0]], %[[TMP_35]]
+-  // CHECK: %[[TMP_37:.*]] = stablehlo.multiply %[[TMP_34]], %[[TMP_33]]
+-  // CHECK: %[[TMP_38:.*]] = stablehlo.divide %[[TMP_37]], %[[TMP_36]]
+-  // CHECK: %[[TMP_39:.*]] = stablehlo.add %[[TMP_32]], %[[TMP_38]]
+-  // CHECK: %[[TMP_40:.*]] = stablehlo.multiply %[[TMP_33]], %[[TMP_33]]
+-  // CHECK: %[[TMP_41:.*]] = stablehlo.divide %[[TMP_5]], %[[TMP_40]]
+-  // CHECK: %[[TMP_42:.*]] = stablehlo.constant dense<2.200000e+01>
+-  // CHECK: %[[TMP_43:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_42]]
+-  // CHECK: %[[TMP_44:.*]] = stablehlo.constant dense<2.100000e+01>
+-  // CHECK: %[[TMP_45:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_44]]
+-  // CHECK: %[[TMP_46:.*]] = stablehlo.multiply %[[TMP_43]], %[[TMP_45]]
+-  // CHECK: %[[TMP_47:.*]] = stablehlo.constant dense<-1.39544646E-19>
+-  // CHECK: %[[TMP_48:.*]] = stablehlo.add %[[TMP_2]], %[[TMP_47]]
+-  // CHECK: %[[TMP_49:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_48]]
+-  // CHECK: %[[TMP_50:.*]] = stablehlo.multiply %[[TMP_46]], %[[TMP_49]]
+-  // CHECK: %[[TMP_51:.*]] = stablehlo.constant dense<2.000000e+01>
+-  // CHECK: %[[TMP_52:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_51]]
+-  // CHECK: %[[TMP_53:.*]] = stablehlo.constant dense<1.900000e+01>
+-  // CHECK: %[[TMP_54:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_53]]
+-  // CHECK: %[[TMP_55:.*]] = stablehlo.multiply %[[TMP_52]], %[[TMP_54]]
+-  // CHECK: %[[TMP_56:.*]] = stablehlo.constant dense<5.50900303E-18>
+-  // CHECK: %[[TMP_57:.*]] = stablehlo.add %[[TMP_50]], %[[TMP_56]]
+-  // CHECK: %[[TMP_58:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_57]]
+-  // CHECK: %[[TMP_59:.*]] = stablehlo.multiply %[[TMP_55]], %[[TMP_58]]
+-  // CHECK: %[[TMP_60:.*]] = stablehlo.constant dense<1.800000e+01>
+-  // CHECK: %[[TMP_61:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_60]]
+-  // CHECK: %[[TMP_62:.*]] = stablehlo.constant dense<1.700000e+01>
+-  // CHECK: %[[TMP_63:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_62]]
+-  // CHECK: %[[TMP_64:.*]] = stablehlo.multiply %[[TMP_61]], %[[TMP_63]]
+-  // CHECK: %[[TMP_65:.*]] = stablehlo.constant dense<-2.17486866E-16>
+-  // CHECK: %[[TMP_66:.*]] = stablehlo.add %[[TMP_59]], %[[TMP_65]]
+-  // CHECK: %[[TMP_67:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_66]]
+-  // CHECK: %[[TMP_68:.*]] = stablehlo.multiply %[[TMP_64]], %[[TMP_67]]
+-  // CHECK: %[[TMP_69:.*]] = stablehlo.constant dense<1.600000e+01>
+-  // CHECK: %[[TMP_70:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_69]]
+-  // CHECK: %[[TMP_71:.*]] = stablehlo.constant dense<1.500000e+01>
+-  // CHECK: %[[TMP_72:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_71]]
+-  // CHECK: %[[TMP_73:.*]] = stablehlo.multiply %[[TMP_70]], %[[TMP_72]]
+-  // CHECK: %[[TMP_74:.*]] = stablehlo.constant dense<8.58606213E-15>
+-  // CHECK: %[[TMP_75:.*]] = stablehlo.add %[[TMP_68]], %[[TMP_74]]
+-  // CHECK: %[[TMP_76:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_75]]
+-  // CHECK: %[[TMP_77:.*]] = stablehlo.multiply %[[TMP_73]], %[[TMP_76]]
+-  // CHECK: %[[TMP_78:.*]] = stablehlo.constant dense<1.400000e+01>
+-  // CHECK: %[[TMP_79:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_78]]
+-  // CHECK: %[[TMP_80:.*]] = stablehlo.constant dense<1.300000e+01>
+-  // CHECK: %[[TMP_81:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_80]]
+-  // CHECK: %[[TMP_82:.*]] = stablehlo.multiply %[[TMP_79]], %[[TMP_81]]
+-  // CHECK: %[[TMP_83:.*]] = stablehlo.constant dense<-3.3896803E-13>
+-  // CHECK: %[[TMP_84:.*]] = stablehlo.add %[[TMP_77]], %[[TMP_83]]
+-  // CHECK: %[[TMP_85:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_84]]
+-  // CHECK: %[[TMP_86:.*]] = stablehlo.multiply %[[TMP_82]], %[[TMP_85]]
+-  // CHECK: %[[TMP_87:.*]] = stablehlo.constant dense<1.200000e+01>
+-  // CHECK: %[[TMP_88:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_87]]
+-  // CHECK: %[[TMP_89:.*]] = stablehlo.constant dense<1.100000e+01>
+-  // CHECK: %[[TMP_90:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_89]]
+-  // CHECK: %[[TMP_91:.*]] = stablehlo.multiply %[[TMP_88]], %[[TMP_90]]
+-  // CHECK: %[[TMP_92:.*]] = stablehlo.constant dense<1.33825364E-11>
+-  // CHECK: %[[TMP_93:.*]] = stablehlo.add %[[TMP_86]], %[[TMP_92]]
+-  // CHECK: %[[TMP_94:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_93]]
+-  // CHECK: %[[TMP_95:.*]] = stablehlo.multiply %[[TMP_91]], %[[TMP_94]]
+-  // CHECK: %[[TMP_96:.*]] = stablehlo.constant dense<1.000000e+01>
+-  // CHECK: %[[TMP_97:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_96]]
+-  // CHECK: %[[TMP_98:.*]] = stablehlo.constant dense<9.000000e+00>
+-  // CHECK: %[[TMP_99:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_98]]
+-  // CHECK: %[[TMP_100:.*]] = stablehlo.multiply %[[TMP_97]], %[[TMP_99]]
+-  // CHECK: %[[TMP_101:.*]] = stablehlo.constant dense<-5.28419031E-10>
+-  // CHECK: %[[TMP_102:.*]] = stablehlo.add %[[TMP_95]], %[[TMP_101]]
+-  // CHECK: %[[TMP_103:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_102]]
+-  // CHECK: %[[TMP_104:.*]] = stablehlo.multiply %[[TMP_100]], %[[TMP_103]]
+-  // CHECK: %[[TMP_105:.*]] = stablehlo.constant dense<8.000000e+00>
+-  // CHECK: %[[TMP_106:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_105]]
+-  // CHECK: %[[TMP_107:.*]] = stablehlo.constant dense<7.000000e+00>
+-  // CHECK: %[[TMP_108:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_107]]
+-  // CHECK: %[[TMP_109:.*]] = stablehlo.multiply %[[TMP_106]], %[[TMP_108]]
+-  // CHECK: %[[TMP_110:.*]] = stablehlo.constant dense<2.08767563E-8>
+-  // CHECK: %[[TMP_111:.*]] = stablehlo.add %[[TMP_104]], %[[TMP_110]]
+-  // CHECK: %[[TMP_112:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_111]]
+-  // CHECK: %[[TMP_113:.*]] = stablehlo.multiply %[[TMP_109]], %[[TMP_112]]
+-  // CHECK: %[[TMP_114:.*]] = stablehlo.constant dense<6.000000e+00>
+-  // CHECK: %[[TMP_115:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_114]]
+-  // CHECK: %[[TMP_116:.*]] = stablehlo.constant dense<5.000000e+00>
+-  // CHECK: %[[TMP_117:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_116]]
+-  // CHECK: %[[TMP_118:.*]] = stablehlo.multiply %[[TMP_115]], %[[TMP_117]]
+-  // CHECK: %[[TMP_119:.*]] = stablehlo.constant dense<-8.26719599E-7>
+-  // CHECK: %[[TMP_120:.*]] = stablehlo.add %[[TMP_113]], %[[TMP_119]]
+-  // CHECK: %[[TMP_121:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_120]]
+-  // CHECK: %[[TMP_122:.*]] = stablehlo.multiply %[[TMP_118]], %[[TMP_121]]
+-  // CHECK: %[[TMP_123:.*]] = stablehlo.constant dense<4.000000e+00>
+-  // CHECK: %[[TMP_124:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_123]]
+-  // CHECK: %[[TMP_125:.*]] = stablehlo.constant dense<3.000000e+00>
+-  // CHECK: %[[TMP_126:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_125]]
+-  // CHECK: %[[TMP_127:.*]] = stablehlo.multiply %[[TMP_124]], %[[TMP_126]]
+-  // CHECK: %[[TMP_128:.*]] = stablehlo.constant dense<3.30687835E-5>
+-  // CHECK: %[[TMP_129:.*]] = stablehlo.add %[[TMP_122]], %[[TMP_128]]
+-  // CHECK: %[[TMP_130:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_129]]
+-  // CHECK: %[[TMP_131:.*]] = stablehlo.multiply %[[TMP_127]], %[[TMP_130]]
+-  // CHECK: %[[TMP_132:.*]] = stablehlo.constant dense<2.000000e+00>
+-  // CHECK: %[[TMP_133:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_132]]
+-  // CHECK: %[[TMP_134:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_135:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_134]]
+-  // CHECK: %[[TMP_136:.*]] = stablehlo.multiply %[[TMP_133]], %[[TMP_135]]
+-  // CHECK: %[[TMP_137:.*]] = stablehlo.constant dense<-0.00138888892>
+-  // CHECK: %[[TMP_138:.*]] = stablehlo.add %[[TMP_131]], %[[TMP_137]]
+-  // CHECK: %[[TMP_139:.*]] = stablehlo.multiply %[[TMP_41]], %[[TMP_138]]
+-  // CHECK: %[[TMP_140:.*]] = stablehlo.multiply %[[TMP_136]], %[[TMP_139]]
+-  // CHECK: %[[TMP_141:.*]] = stablehlo.constant dense<5.000000e-01>
+-  // CHECK: %[[TMP_142:.*]] = stablehlo.divide %[[TMP_0]], %[[TMP_33]]
+-  // CHECK: %[[TMP_143:.*]] = stablehlo.constant dense<0.0833333358>
+-  // CHECK: %[[TMP_144:.*]] = stablehlo.add %[[TMP_143]], %[[TMP_140]]
+-  // CHECK: %[[TMP_145:.*]] = stablehlo.multiply %[[TMP_142]], %[[TMP_144]]
+-  // CHECK: %[[TMP_146:.*]] = stablehlo.add %[[TMP_141]], %[[TMP_145]]
+-  // CHECK: %[[TMP_147:.*]] = stablehlo.multiply %[[TMP_34]], %[[TMP_146]]
+-  // CHECK: %[[TMP_148:.*]] = stablehlo.add %[[TMP_39]], %[[TMP_147]]
++  // CHECK: %[[TMP_36:.*]] = stablehlo.multiply %[[TMP_34]], %[[TMP_33]]
++  // CHECK: %[[TMP_37:.*]] = stablehlo.subtract %[[TMP_0]], %[[TMP_35]]
++  // CHECK: %[[TMP_38:.*]] = stablehlo.divide %[[TMP_36]], %[[TMP_37]]
++  // CHECK: %[[TMP_39:.*]] = stablehlo.multiply %[[TMP_33]], %[[TMP_33]]
++  // CHECK: %[[TMP_40:.*]] = stablehlo.divide %[[TMP_3]], %[[TMP_39]]
++  // CHECK: %[[TMP_41:.*]] = stablehlo.constant dense<2.200000e+01>
++  // CHECK: %[[TMP_42:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_41]]
++  // CHECK: %[[TMP_43:.*]] = stablehlo.constant dense<2.100000e+01>
++  // CHECK: %[[TMP_44:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_43]]
++  // CHECK: %[[TMP_45:.*]] = stablehlo.multiply %[[TMP_42]], %[[TMP_44]]
++  // CHECK: %[[TMP_46:.*]] = stablehlo.constant dense<-1.39544646E-19>
++  // CHECK: %[[TMP_47:.*]] = stablehlo.add %[[TMP_2]], %[[TMP_46]]
++  // CHECK: %[[TMP_48:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_47]]
++  // CHECK: %[[TMP_49:.*]] = stablehlo.multiply %[[TMP_45]], %[[TMP_48]]
++  // CHECK: %[[TMP_50:.*]] = stablehlo.constant dense<2.000000e+01>
++  // CHECK: %[[TMP_51:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_50]]
++  // CHECK: %[[TMP_52:.*]] = stablehlo.constant dense<1.900000e+01>
++  // CHECK: %[[TMP_53:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_52]]
++  // CHECK: %[[TMP_54:.*]] = stablehlo.multiply %[[TMP_51]], %[[TMP_53]]
++  // CHECK: %[[TMP_55:.*]] = stablehlo.constant dense<5.50900303E-18>
++  // CHECK: %[[TMP_56:.*]] = stablehlo.add %[[TMP_49]], %[[TMP_55]]
++  // CHECK: %[[TMP_57:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_56]]
++  // CHECK: %[[TMP_58:.*]] = stablehlo.multiply %[[TMP_54]], %[[TMP_57]]
++  // CHECK: %[[TMP_59:.*]] = stablehlo.constant dense<1.800000e+01>
++  // CHECK: %[[TMP_60:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_59]]
++  // CHECK: %[[TMP_61:.*]] = stablehlo.constant dense<1.700000e+01>
++  // CHECK: %[[TMP_62:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_61]]
++  // CHECK: %[[TMP_63:.*]] = stablehlo.multiply %[[TMP_60]], %[[TMP_62]]
++  // CHECK: %[[TMP_64:.*]] = stablehlo.constant dense<-2.17486866E-16>
++  // CHECK: %[[TMP_65:.*]] = stablehlo.add %[[TMP_58]], %[[TMP_64]]
++  // CHECK: %[[TMP_66:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_65]]
++  // CHECK: %[[TMP_67:.*]] = stablehlo.multiply %[[TMP_63]], %[[TMP_66]]
++  // CHECK: %[[TMP_68:.*]] = stablehlo.constant dense<1.600000e+01>
++  // CHECK: %[[TMP_69:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_68]]
++  // CHECK: %[[TMP_70:.*]] = stablehlo.constant dense<1.500000e+01>
++  // CHECK: %[[TMP_71:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_70]]
++  // CHECK: %[[TMP_72:.*]] = stablehlo.multiply %[[TMP_69]], %[[TMP_71]]
++  // CHECK: %[[TMP_73:.*]] = stablehlo.constant dense<8.58606213E-15>
++  // CHECK: %[[TMP_74:.*]] = stablehlo.add %[[TMP_67]], %[[TMP_73]]
++  // CHECK: %[[TMP_75:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_74]]
++  // CHECK: %[[TMP_76:.*]] = stablehlo.multiply %[[TMP_72]], %[[TMP_75]]
++  // CHECK: %[[TMP_77:.*]] = stablehlo.constant dense<1.400000e+01>
++  // CHECK: %[[TMP_78:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_77]]
++  // CHECK: %[[TMP_79:.*]] = stablehlo.constant dense<1.300000e+01>
++  // CHECK: %[[TMP_80:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_79]]
++  // CHECK: %[[TMP_81:.*]] = stablehlo.multiply %[[TMP_78]], %[[TMP_80]]
++  // CHECK: %[[TMP_82:.*]] = stablehlo.constant dense<-3.3896803E-13>
++  // CHECK: %[[TMP_83:.*]] = stablehlo.add %[[TMP_76]], %[[TMP_82]]
++  // CHECK: %[[TMP_84:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_83]]
++  // CHECK: %[[TMP_85:.*]] = stablehlo.multiply %[[TMP_81]], %[[TMP_84]]
++  // CHECK: %[[TMP_86:.*]] = stablehlo.constant dense<1.200000e+01>
++  // CHECK: %[[TMP_87:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_86]]
++  // CHECK: %[[TMP_88:.*]] = stablehlo.constant dense<1.100000e+01>
++  // CHECK: %[[TMP_89:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_88]]
++  // CHECK: %[[TMP_90:.*]] = stablehlo.multiply %[[TMP_87]], %[[TMP_89]]
++  // CHECK: %[[TMP_91:.*]] = stablehlo.constant dense<1.33825364E-11>
++  // CHECK: %[[TMP_92:.*]] = stablehlo.add %[[TMP_85]], %[[TMP_91]]
++  // CHECK: %[[TMP_93:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_92]]
++  // CHECK: %[[TMP_94:.*]] = stablehlo.multiply %[[TMP_90]], %[[TMP_93]]
++  // CHECK: %[[TMP_95:.*]] = stablehlo.constant dense<1.000000e+01>
++  // CHECK: %[[TMP_96:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_95]]
++  // CHECK: %[[TMP_97:.*]] = stablehlo.constant dense<9.000000e+00>
++  // CHECK: %[[TMP_98:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_97]]
++  // CHECK: %[[TMP_99:.*]] = stablehlo.multiply %[[TMP_96]], %[[TMP_98]]
++  // CHECK: %[[TMP_100:.*]] = stablehlo.constant dense<-5.28419031E-10>
++  // CHECK: %[[TMP_101:.*]] = stablehlo.add %[[TMP_94]], %[[TMP_100]]
++  // CHECK: %[[TMP_102:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_101]]
++  // CHECK: %[[TMP_103:.*]] = stablehlo.multiply %[[TMP_99]], %[[TMP_102]]
++  // CHECK: %[[TMP_104:.*]] = stablehlo.constant dense<8.000000e+00>
++  // CHECK: %[[TMP_105:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_104]]
++  // CHECK: %[[TMP_106:.*]] = stablehlo.constant dense<7.000000e+00>
++  // CHECK: %[[TMP_107:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_106]]
++  // CHECK: %[[TMP_108:.*]] = stablehlo.multiply %[[TMP_105]], %[[TMP_107]]
++  // CHECK: %[[TMP_109:.*]] = stablehlo.constant dense<2.08767563E-8>
++  // CHECK: %[[TMP_110:.*]] = stablehlo.add %[[TMP_103]], %[[TMP_109]]
++  // CHECK: %[[TMP_111:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_110]]
++  // CHECK: %[[TMP_112:.*]] = stablehlo.multiply %[[TMP_108]], %[[TMP_111]]
++  // CHECK: %[[TMP_113:.*]] = stablehlo.constant dense<6.000000e+00>
++  // CHECK: %[[TMP_114:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_113]]
++  // CHECK: %[[TMP_115:.*]] = stablehlo.constant dense<5.000000e+00>
++  // CHECK: %[[TMP_116:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_115]]
++  // CHECK: %[[TMP_117:.*]] = stablehlo.multiply %[[TMP_114]], %[[TMP_116]]
++  // CHECK: %[[TMP_118:.*]] = stablehlo.constant dense<-8.26719599E-7>
++  // CHECK: %[[TMP_119:.*]] = stablehlo.add %[[TMP_112]], %[[TMP_118]]
++  // CHECK: %[[TMP_120:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_119]]
++  // CHECK: %[[TMP_121:.*]] = stablehlo.multiply %[[TMP_117]], %[[TMP_120]]
++  // CHECK: %[[TMP_122:.*]] = stablehlo.constant dense<4.000000e+00>
++  // CHECK: %[[TMP_123:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_122]]
++  // CHECK: %[[TMP_124:.*]] = stablehlo.constant dense<3.000000e+00>
++  // CHECK: %[[TMP_125:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_124]]
++  // CHECK: %[[TMP_126:.*]] = stablehlo.multiply %[[TMP_123]], %[[TMP_125]]
++  // CHECK: %[[TMP_127:.*]] = stablehlo.constant dense<3.30687835E-5>
++  // CHECK: %[[TMP_128:.*]] = stablehlo.add %[[TMP_121]], %[[TMP_127]]
++  // CHECK: %[[TMP_129:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_128]]
++  // CHECK: %[[TMP_130:.*]] = stablehlo.multiply %[[TMP_126]], %[[TMP_129]]
++  // CHECK: %[[TMP_131:.*]] = stablehlo.constant dense<2.000000e+00>
++  // CHECK: %[[TMP_132:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_131]]
++  // CHECK: %[[TMP_133:.*]] = stablehlo.constant dense<1.000000e+00>
++  // CHECK: %[[TMP_134:.*]] = stablehlo.add %[[TMP_0]], %[[TMP_133]]
++  // CHECK: %[[TMP_135:.*]] = stablehlo.multiply %[[TMP_132]], %[[TMP_134]]
++  // CHECK: %[[TMP_136:.*]] = stablehlo.constant dense<-0.00138888892>
++  // CHECK: %[[TMP_137:.*]] = stablehlo.add %[[TMP_130]], %[[TMP_136]]
++  // CHECK: %[[TMP_138:.*]] = stablehlo.multiply %[[TMP_40]], %[[TMP_137]]
++  // CHECK: %[[TMP_139:.*]] = stablehlo.multiply %[[TMP_135]], %[[TMP_138]]
++  // CHECK: %[[TMP_140:.*]] = stablehlo.constant dense<5.000000e-01>
++  // CHECK: %[[TMP_141:.*]] = stablehlo.divide %[[TMP_0]], %[[TMP_33]]
++  // CHECK: %[[TMP_142:.*]] = stablehlo.constant dense<0.0833333358>
++  // CHECK: %[[TMP_143:.*]] = stablehlo.add %[[TMP_142]], %[[TMP_139]]
++  // CHECK: %[[TMP_144:.*]] = stablehlo.multiply %[[TMP_141]], %[[TMP_143]]
++  // CHECK: %[[TMP_145:.*]] = stablehlo.add %[[TMP_140]], %[[TMP_144]]
++  // CHECK: %[[TMP_146:.*]] = stablehlo.multiply %[[TMP_34]], %[[TMP_145]]
++  // CHECK: %[[TMP_147:.*]] = stablehlo.add %[[TMP_32]], %[[TMP_38]]
++  // CHECK: %[[TMP_148:.*]] = stablehlo.add %[[TMP_147]], %[[TMP_146]]
+   // CHECK: %[[TMP_149:.*]] = stablehlo.abs %[[TMP_34]]
+   // CHECK: %[[TMP_150:.*]] = stablehlo.abs %[[TMP_32]]
+   // CHECK: %[[TMP_151:.*]] = stablehlo.constant dense<1.401300e-45>
+@@ -1456,7 +1456,7 @@
+   // CHECK: %[[TMP_172:.*]] = stablehlo.and %[[TMP_169]], %[[TMP_171]] : tensor<i1>
+   // CHECK: %[[TMP_173:.*]] = stablehlo.select %[[TMP_172]], %[[TMP_163]], %[[TMP_155]]
+   // CHECK: %[[TMP_174:.*]] = stablehlo.select %[[TMP_166]], %[[TMP_173]], %[[TMP_162]]
+-  // CHECK: %[[TMP_175:.*]] = stablehlo.compare EQ, %[[TMP_0]], %[[TMP_5]], NOTYPE
++  // CHECK: %[[TMP_175:.*]] = stablehlo.compare EQ, %[[TMP_0]], %[[TMP_3]], NOTYPE
+   // CHECK: %[[TMP_176:.*]] = stablehlo.select %[[TMP_175]], %[[TMP_163]], %[[TMP_174]]
+   // CHECK: %[[TMP_177:.*]] = stablehlo.convert %[[TMP_176]] : (tensor<f32>) -> tensor<f16>
+   %0 = chlo.zeta %arg0, %arg1 : tensor<f16>, tensor<f16> -> tensor<f16>
+@@ -1465,8 +1465,7 @@
+ 
+ // -----
+ 
 -
--// -----
+-// CHECK-LABEL: @polygamma_f32
++// CHECK: @polygamma_f32
+ // CHECK-SAME: (%[[ARG0:.*]]: tensor<f32>, %[[ARG1:.*]]: tensor<f32>)
+ func.func @polygamma_f32(%lhs : tensor<f32>, %rhs : tensor<f32>) -> tensor<f32> {
+   // CHECK-DAG: %[[TMP_0:.*]] = stablehlo.constant dense<1.000000e+00>
+@@ -1559,153 +1558,153 @@
+   // CHECK: %[[TMP_87:.*]] = stablehlo.constant dense<0x7F800000>
+   // CHECK: %[[TMP_88:.*]] = stablehlo.select %[[TMP_86]], %[[TMP_87]], %[[TMP_83]]
+   // CHECK: %[[TMP_89:.*]] = stablehlo.exponential %[[TMP_88]]
+-  // CHECK: %[[TMP_90:.*]] = stablehlo.constant dense<0.000000e+00>
+-  // CHECK: %[[TMP_91:.*]] = stablehlo.negate %[[TMP_5]]
+-  // CHECK: %[[TMP_92:.*]] = stablehlo.power %[[ARG1]], %[[TMP_91]]
+-  // CHECK: %[[TMP_93:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_94:.*]] = stablehlo.add %[[ARG1]], %[[TMP_93]]
+-  // CHECK: %[[TMP_95:.*]] = stablehlo.power %[[TMP_94]], %[[TMP_91]]
+-  // CHECK: %[[TMP_96:.*]] = stablehlo.add %[[TMP_92]], %[[TMP_95]]
+-  // CHECK: %[[TMP_97:.*]] = stablehlo.add %[[TMP_94]], %[[TMP_93]]
+-  // CHECK: %[[TMP_98:.*]] = stablehlo.power %[[TMP_97]], %[[TMP_91]]
++  // CHECK-DAG: %[[TMP_90:.*]] = stablehlo.constant dense<0.000000e+00>
++  // CHECK-DAG: %[[TMP_91:.*]] = stablehlo.constant dense<1.000000e+00>
++  // CHECK: %[[TMP_92:.*]] = stablehlo.negate %[[TMP_5]]
++  // CHECK: %[[TMP_93:.*]] = stablehlo.power %[[ARG1]], %[[TMP_92]]
++  // CHECK: %[[TMP_94:.*]] = stablehlo.add %[[ARG1]], %[[TMP_91]]
++  // CHECK: %[[TMP_95:.*]] = stablehlo.power %[[TMP_94]], %[[TMP_92]]
++  // CHECK: %[[TMP_96:.*]] = stablehlo.add %[[TMP_93]], %[[TMP_95]]
++  // CHECK: %[[TMP_97:.*]] = stablehlo.add %[[TMP_94]], %[[TMP_91]]
++  // CHECK: %[[TMP_98:.*]] = stablehlo.power %[[TMP_97]], %[[TMP_92]]
+   // CHECK: %[[TMP_99:.*]] = stablehlo.add %[[TMP_96]], %[[TMP_98]]
+-  // CHECK: %[[TMP_100:.*]] = stablehlo.add %[[TMP_97]], %[[TMP_93]]
+-  // CHECK: %[[TMP_101:.*]] = stablehlo.power %[[TMP_100]], %[[TMP_91]]
++  // CHECK: %[[TMP_100:.*]] = stablehlo.add %[[TMP_97]], %[[TMP_91]]
++  // CHECK: %[[TMP_101:.*]] = stablehlo.power %[[TMP_100]], %[[TMP_92]]
+   // CHECK: %[[TMP_102:.*]] = stablehlo.add %[[TMP_99]], %[[TMP_101]]
+-  // CHECK: %[[TMP_103:.*]] = stablehlo.add %[[TMP_100]], %[[TMP_93]]
+-  // CHECK: %[[TMP_104:.*]] = stablehlo.power %[[TMP_103]], %[[TMP_91]]
++  // CHECK: %[[TMP_103:.*]] = stablehlo.add %[[TMP_100]], %[[TMP_91]]
++  // CHECK: %[[TMP_104:.*]] = stablehlo.power %[[TMP_103]], %[[TMP_92]]
+   // CHECK: %[[TMP_105:.*]] = stablehlo.add %[[TMP_102]], %[[TMP_104]]
+-  // CHECK: %[[TMP_106:.*]] = stablehlo.add %[[TMP_103]], %[[TMP_93]]
+-  // CHECK: %[[TMP_107:.*]] = stablehlo.power %[[TMP_106]], %[[TMP_91]]
++  // CHECK: %[[TMP_106:.*]] = stablehlo.add %[[TMP_103]], %[[TMP_91]]
++  // CHECK: %[[TMP_107:.*]] = stablehlo.power %[[TMP_106]], %[[TMP_92]]
+   // CHECK: %[[TMP_108:.*]] = stablehlo.add %[[TMP_105]], %[[TMP_107]]
+-  // CHECK: %[[TMP_109:.*]] = stablehlo.add %[[TMP_106]], %[[TMP_93]]
+-  // CHECK: %[[TMP_110:.*]] = stablehlo.power %[[TMP_109]], %[[TMP_91]]
++  // CHECK: %[[TMP_109:.*]] = stablehlo.add %[[TMP_106]], %[[TMP_91]]
++  // CHECK: %[[TMP_110:.*]] = stablehlo.power %[[TMP_109]], %[[TMP_92]]
+   // CHECK: %[[TMP_111:.*]] = stablehlo.add %[[TMP_108]], %[[TMP_110]]
+-  // CHECK: %[[TMP_112:.*]] = stablehlo.add %[[TMP_109]], %[[TMP_93]]
+-  // CHECK: %[[TMP_113:.*]] = stablehlo.power %[[TMP_112]], %[[TMP_91]]
++  // CHECK: %[[TMP_112:.*]] = stablehlo.add %[[TMP_109]], %[[TMP_91]]
++  // CHECK: %[[TMP_113:.*]] = stablehlo.power %[[TMP_112]], %[[TMP_92]]
+   // CHECK: %[[TMP_114:.*]] = stablehlo.add %[[TMP_111]], %[[TMP_113]]
+-  // CHECK: %[[TMP_115:.*]] = stablehlo.add %[[TMP_112]], %[[TMP_93]]
+-  // CHECK: %[[TMP_116:.*]] = stablehlo.power %[[TMP_115]], %[[TMP_91]]
++  // CHECK: %[[TMP_115:.*]] = stablehlo.add %[[TMP_112]], %[[TMP_91]]
++  // CHECK: %[[TMP_116:.*]] = stablehlo.power %[[TMP_115]], %[[TMP_92]]
+   // CHECK: %[[TMP_117:.*]] = stablehlo.add %[[TMP_114]], %[[TMP_116]]
+-  // CHECK: %[[TMP_118:.*]] = stablehlo.add %[[TMP_115]], %[[TMP_93]]
+-  // CHECK: %[[TMP_119:.*]] = stablehlo.power %[[TMP_118]], %[[TMP_91]]
++  // CHECK: %[[TMP_118:.*]] = stablehlo.add %[[TMP_115]], %[[TMP_91]]
++  // CHECK: %[[TMP_119:.*]] = stablehlo.power %[[TMP_118]], %[[TMP_92]]
+   // CHECK: %[[TMP_120:.*]] = stablehlo.add %[[TMP_117]], %[[TMP_119]]
+-  // CHECK: %[[TMP_121:.*]] = stablehlo.add %[[TMP_118]], %[[TMP_93]]
+-  // CHECK: %[[TMP_122:.*]] = stablehlo.power %[[TMP_121]], %[[TMP_91]]
++  // CHECK: %[[TMP_121:.*]] = stablehlo.add %[[TMP_118]], %[[TMP_91]]
++  // CHECK: %[[TMP_122:.*]] = stablehlo.power %[[TMP_121]], %[[TMP_92]]
+   // CHECK: %[[TMP_123:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_124:.*]] = stablehlo.subtract %[[TMP_5]], %[[TMP_123]]
+-  // CHECK: %[[TMP_125:.*]] = stablehlo.multiply %[[TMP_122]], %[[TMP_121]]
+-  // CHECK: %[[TMP_126:.*]] = stablehlo.divide %[[TMP_125]], %[[TMP_124]]
+-  // CHECK: %[[TMP_127:.*]] = stablehlo.add %[[TMP_120]], %[[TMP_126]]
+-  // CHECK: %[[TMP_128:.*]] = stablehlo.multiply %[[TMP_121]], %[[TMP_121]]
+-  // CHECK: %[[TMP_129:.*]] = stablehlo.divide %[[TMP_93]], %[[TMP_128]]
+-  // CHECK: %[[TMP_130:.*]] = stablehlo.constant dense<2.200000e+01>
+-  // CHECK: %[[TMP_131:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_130]]
+-  // CHECK: %[[TMP_132:.*]] = stablehlo.constant dense<2.100000e+01>
+-  // CHECK: %[[TMP_133:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_132]]
+-  // CHECK: %[[TMP_134:.*]] = stablehlo.multiply %[[TMP_131]], %[[TMP_133]]
+-  // CHECK: %[[TMP_135:.*]] = stablehlo.constant dense<-1.39544646E-19>
+-  // CHECK: %[[TMP_136:.*]] = stablehlo.add %[[TMP_90]], %[[TMP_135]]
+-  // CHECK: %[[TMP_137:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_136]]
+-  // CHECK: %[[TMP_138:.*]] = stablehlo.multiply %[[TMP_134]], %[[TMP_137]]
+-  // CHECK: %[[TMP_139:.*]] = stablehlo.constant dense<2.000000e+01>
+-  // CHECK: %[[TMP_140:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_139]]
+-  // CHECK: %[[TMP_141:.*]] = stablehlo.constant dense<1.900000e+01>
+-  // CHECK: %[[TMP_142:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_141]]
+-  // CHECK: %[[TMP_143:.*]] = stablehlo.multiply %[[TMP_140]], %[[TMP_142]]
+-  // CHECK: %[[TMP_144:.*]] = stablehlo.constant dense<5.50900303E-18>
+-  // CHECK: %[[TMP_145:.*]] = stablehlo.add %[[TMP_138]], %[[TMP_144]]
+-  // CHECK: %[[TMP_146:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_145]]
+-  // CHECK: %[[TMP_147:.*]] = stablehlo.multiply %[[TMP_143]], %[[TMP_146]]
+-  // CHECK: %[[TMP_148:.*]] = stablehlo.constant dense<1.800000e+01>
+-  // CHECK: %[[TMP_149:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_148]]
+-  // CHECK: %[[TMP_150:.*]] = stablehlo.constant dense<1.700000e+01>
+-  // CHECK: %[[TMP_151:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_150]]
+-  // CHECK: %[[TMP_152:.*]] = stablehlo.multiply %[[TMP_149]], %[[TMP_151]]
+-  // CHECK: %[[TMP_153:.*]] = stablehlo.constant dense<-2.17486866E-16>
+-  // CHECK: %[[TMP_154:.*]] = stablehlo.add %[[TMP_147]], %[[TMP_153]]
+-  // CHECK: %[[TMP_155:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_154]]
+-  // CHECK: %[[TMP_156:.*]] = stablehlo.multiply %[[TMP_152]], %[[TMP_155]]
+-  // CHECK: %[[TMP_157:.*]] = stablehlo.constant dense<1.600000e+01>
+-  // CHECK: %[[TMP_158:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_157]]
+-  // CHECK: %[[TMP_159:.*]] = stablehlo.constant dense<1.500000e+01>
+-  // CHECK: %[[TMP_160:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_159]]
+-  // CHECK: %[[TMP_161:.*]] = stablehlo.multiply %[[TMP_158]], %[[TMP_160]]
+-  // CHECK: %[[TMP_162:.*]] = stablehlo.constant dense<8.58606213E-15>
+-  // CHECK: %[[TMP_163:.*]] = stablehlo.add %[[TMP_156]], %[[TMP_162]]
+-  // CHECK: %[[TMP_164:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_163]]
+-  // CHECK: %[[TMP_165:.*]] = stablehlo.multiply %[[TMP_161]], %[[TMP_164]]
+-  // CHECK: %[[TMP_166:.*]] = stablehlo.constant dense<1.400000e+01>
+-  // CHECK: %[[TMP_167:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_166]]
+-  // CHECK: %[[TMP_168:.*]] = stablehlo.constant dense<1.300000e+01>
+-  // CHECK: %[[TMP_169:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_168]]
+-  // CHECK: %[[TMP_170:.*]] = stablehlo.multiply %[[TMP_167]], %[[TMP_169]]
+-  // CHECK: %[[TMP_171:.*]] = stablehlo.constant dense<-3.3896803E-13>
+-  // CHECK: %[[TMP_172:.*]] = stablehlo.add %[[TMP_165]], %[[TMP_171]]
+-  // CHECK: %[[TMP_173:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_172]]
+-  // CHECK: %[[TMP_174:.*]] = stablehlo.multiply %[[TMP_170]], %[[TMP_173]]
+-  // CHECK: %[[TMP_175:.*]] = stablehlo.constant dense<1.200000e+01>
+-  // CHECK: %[[TMP_176:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_175]]
+-  // CHECK: %[[TMP_177:.*]] = stablehlo.constant dense<1.100000e+01>
+-  // CHECK: %[[TMP_178:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_177]]
+-  // CHECK: %[[TMP_179:.*]] = stablehlo.multiply %[[TMP_176]], %[[TMP_178]]
+-  // CHECK: %[[TMP_180:.*]] = stablehlo.constant dense<1.33825364E-11>
+-  // CHECK: %[[TMP_181:.*]] = stablehlo.add %[[TMP_174]], %[[TMP_180]]
+-  // CHECK: %[[TMP_182:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_181]]
+-  // CHECK: %[[TMP_183:.*]] = stablehlo.multiply %[[TMP_179]], %[[TMP_182]]
+-  // CHECK: %[[TMP_184:.*]] = stablehlo.constant dense<1.000000e+01>
+-  // CHECK: %[[TMP_185:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_184]]
+-  // CHECK: %[[TMP_186:.*]] = stablehlo.constant dense<9.000000e+00>
+-  // CHECK: %[[TMP_187:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_186]]
+-  // CHECK: %[[TMP_188:.*]] = stablehlo.multiply %[[TMP_185]], %[[TMP_187]]
+-  // CHECK: %[[TMP_189:.*]] = stablehlo.constant dense<-5.28419031E-10>
+-  // CHECK: %[[TMP_190:.*]] = stablehlo.add %[[TMP_183]], %[[TMP_189]]
+-  // CHECK: %[[TMP_191:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_190]]
+-  // CHECK: %[[TMP_192:.*]] = stablehlo.multiply %[[TMP_188]], %[[TMP_191]]
+-  // CHECK: %[[TMP_193:.*]] = stablehlo.constant dense<8.000000e+00>
+-  // CHECK: %[[TMP_194:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_193]]
+-  // CHECK: %[[TMP_195:.*]] = stablehlo.constant dense<7.000000e+00>
+-  // CHECK: %[[TMP_196:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_195]]
+-  // CHECK: %[[TMP_197:.*]] = stablehlo.multiply %[[TMP_194]], %[[TMP_196]]
+-  // CHECK: %[[TMP_198:.*]] = stablehlo.constant dense<2.08767563E-8>
+-  // CHECK: %[[TMP_199:.*]] = stablehlo.add %[[TMP_192]], %[[TMP_198]]
+-  // CHECK: %[[TMP_200:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_199]]
+-  // CHECK: %[[TMP_201:.*]] = stablehlo.multiply %[[TMP_197]], %[[TMP_200]]
+-  // CHECK: %[[TMP_202:.*]] = stablehlo.constant dense<6.000000e+00>
+-  // CHECK: %[[TMP_203:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_202]]
+-  // CHECK: %[[TMP_204:.*]] = stablehlo.constant dense<5.000000e+00>
+-  // CHECK: %[[TMP_205:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_204]]
+-  // CHECK: %[[TMP_206:.*]] = stablehlo.multiply %[[TMP_203]], %[[TMP_205]]
+-  // CHECK: %[[TMP_207:.*]] = stablehlo.constant dense<-8.26719599E-7>
+-  // CHECK: %[[TMP_208:.*]] = stablehlo.add %[[TMP_201]], %[[TMP_207]]
+-  // CHECK: %[[TMP_209:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_208]]
+-  // CHECK: %[[TMP_210:.*]] = stablehlo.multiply %[[TMP_206]], %[[TMP_209]]
+-  // CHECK: %[[TMP_211:.*]] = stablehlo.constant dense<4.000000e+00>
+-  // CHECK: %[[TMP_212:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_211]]
+-  // CHECK: %[[TMP_213:.*]] = stablehlo.constant dense<3.000000e+00>
+-  // CHECK: %[[TMP_214:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_213]]
+-  // CHECK: %[[TMP_215:.*]] = stablehlo.multiply %[[TMP_212]], %[[TMP_214]]
+-  // CHECK: %[[TMP_216:.*]] = stablehlo.constant dense<3.30687835E-5>
+-  // CHECK: %[[TMP_217:.*]] = stablehlo.add %[[TMP_210]], %[[TMP_216]]
+-  // CHECK: %[[TMP_218:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_217]]
+-  // CHECK: %[[TMP_219:.*]] = stablehlo.multiply %[[TMP_215]], %[[TMP_218]]
+-  // CHECK: %[[TMP_220:.*]] = stablehlo.constant dense<2.000000e+00>
+-  // CHECK: %[[TMP_221:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_220]]
+-  // CHECK: %[[TMP_222:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_223:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_222]]
+-  // CHECK: %[[TMP_224:.*]] = stablehlo.multiply %[[TMP_221]], %[[TMP_223]]
+-  // CHECK: %[[TMP_225:.*]] = stablehlo.constant dense<-0.00138888892>
+-  // CHECK: %[[TMP_226:.*]] = stablehlo.add %[[TMP_219]], %[[TMP_225]]
+-  // CHECK: %[[TMP_227:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_226]]
+-  // CHECK: %[[TMP_228:.*]] = stablehlo.multiply %[[TMP_224]], %[[TMP_227]]
+-  // CHECK: %[[TMP_229:.*]] = stablehlo.constant dense<5.000000e-01>
+-  // CHECK: %[[TMP_230:.*]] = stablehlo.divide %[[TMP_5]], %[[TMP_121]]
+-  // CHECK: %[[TMP_231:.*]] = stablehlo.constant dense<0.0833333358>
+-  // CHECK: %[[TMP_232:.*]] = stablehlo.add %[[TMP_231]], %[[TMP_228]]
+-  // CHECK: %[[TMP_233:.*]] = stablehlo.multiply %[[TMP_230]], %[[TMP_232]]
+-  // CHECK: %[[TMP_234:.*]] = stablehlo.add %[[TMP_229]], %[[TMP_233]]
+-  // CHECK: %[[TMP_235:.*]] = stablehlo.multiply %[[TMP_122]], %[[TMP_234]]
+-  // CHECK: %[[TMP_236:.*]] = stablehlo.add %[[TMP_127]], %[[TMP_235]]
++  // CHECK: %[[TMP_124:.*]] = stablehlo.multiply %[[TMP_122]], %[[TMP_121]]
++  // CHECK: %[[TMP_125:.*]] = stablehlo.subtract %[[TMP_5]], %[[TMP_123]]
++  // CHECK: %[[TMP_126:.*]] = stablehlo.divide %[[TMP_124]], %[[TMP_125]]
++  // CHECK: %[[TMP_127:.*]] = stablehlo.multiply %[[TMP_121]], %[[TMP_121]]
++  // CHECK: %[[TMP_128:.*]] = stablehlo.divide %[[TMP_91]], %[[TMP_127]]
++  // CHECK: %[[TMP_129:.*]] = stablehlo.constant dense<2.200000e+01>
++  // CHECK: %[[TMP_130:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_129]]
++  // CHECK: %[[TMP_131:.*]] = stablehlo.constant dense<2.100000e+01>
++  // CHECK: %[[TMP_132:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_131]]
++  // CHECK: %[[TMP_133:.*]] = stablehlo.multiply %[[TMP_130]], %[[TMP_132]]
++  // CHECK: %[[TMP_134:.*]] = stablehlo.constant dense<-1.39544646E-19>
++  // CHECK: %[[TMP_135:.*]] = stablehlo.add %[[TMP_90]], %[[TMP_134]]
++  // CHECK: %[[TMP_136:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_135]]
++  // CHECK: %[[TMP_137:.*]] = stablehlo.multiply %[[TMP_133]], %[[TMP_136]]
++  // CHECK: %[[TMP_138:.*]] = stablehlo.constant dense<2.000000e+01>
++  // CHECK: %[[TMP_139:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_138]]
++  // CHECK: %[[TMP_140:.*]] = stablehlo.constant dense<1.900000e+01>
++  // CHECK: %[[TMP_141:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_140]]
++  // CHECK: %[[TMP_142:.*]] = stablehlo.multiply %[[TMP_139]], %[[TMP_141]]
++  // CHECK: %[[TMP_143:.*]] = stablehlo.constant dense<5.50900303E-18>
++  // CHECK: %[[TMP_144:.*]] = stablehlo.add %[[TMP_137]], %[[TMP_143]]
++  // CHECK: %[[TMP_145:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_144]]
++  // CHECK: %[[TMP_146:.*]] = stablehlo.multiply %[[TMP_142]], %[[TMP_145]]
++  // CHECK: %[[TMP_147:.*]] = stablehlo.constant dense<1.800000e+01>
++  // CHECK: %[[TMP_148:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_147]]
++  // CHECK: %[[TMP_149:.*]] = stablehlo.constant dense<1.700000e+01>
++  // CHECK: %[[TMP_150:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_149]]
++  // CHECK: %[[TMP_151:.*]] = stablehlo.multiply %[[TMP_148]], %[[TMP_150]]
++  // CHECK: %[[TMP_152:.*]] = stablehlo.constant dense<-2.17486866E-16>
++  // CHECK: %[[TMP_153:.*]] = stablehlo.add %[[TMP_146]], %[[TMP_152]]
++  // CHECK: %[[TMP_154:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_153]]
++  // CHECK: %[[TMP_155:.*]] = stablehlo.multiply %[[TMP_151]], %[[TMP_154]]
++  // CHECK: %[[TMP_156:.*]] = stablehlo.constant dense<1.600000e+01>
++  // CHECK: %[[TMP_157:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_156]]
++  // CHECK: %[[TMP_158:.*]] = stablehlo.constant dense<1.500000e+01>
++  // CHECK: %[[TMP_159:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_158]]
++  // CHECK: %[[TMP_160:.*]] = stablehlo.multiply %[[TMP_157]], %[[TMP_159]]
++  // CHECK: %[[TMP_161:.*]] = stablehlo.constant dense<8.58606213E-15>
++  // CHECK: %[[TMP_162:.*]] = stablehlo.add %[[TMP_155]], %[[TMP_161]]
++  // CHECK: %[[TMP_163:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_162]]
++  // CHECK: %[[TMP_164:.*]] = stablehlo.multiply %[[TMP_160]], %[[TMP_163]]
++  // CHECK: %[[TMP_165:.*]] = stablehlo.constant dense<1.400000e+01>
++  // CHECK: %[[TMP_166:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_165]]
++  // CHECK: %[[TMP_167:.*]] = stablehlo.constant dense<1.300000e+01>
++  // CHECK: %[[TMP_168:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_167]]
++  // CHECK: %[[TMP_169:.*]] = stablehlo.multiply %[[TMP_166]], %[[TMP_168]]
++  // CHECK: %[[TMP_170:.*]] = stablehlo.constant dense<-3.3896803E-13>
++  // CHECK: %[[TMP_171:.*]] = stablehlo.add %[[TMP_164]], %[[TMP_170]]
++  // CHECK: %[[TMP_172:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_171]]
++  // CHECK: %[[TMP_173:.*]] = stablehlo.multiply %[[TMP_169]], %[[TMP_172]]
++  // CHECK: %[[TMP_174:.*]] = stablehlo.constant dense<1.200000e+01>
++  // CHECK: %[[TMP_175:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_174]]
++  // CHECK: %[[TMP_176:.*]] = stablehlo.constant dense<1.100000e+01>
++  // CHECK: %[[TMP_177:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_176]]
++  // CHECK: %[[TMP_178:.*]] = stablehlo.multiply %[[TMP_175]], %[[TMP_177]]
++  // CHECK: %[[TMP_179:.*]] = stablehlo.constant dense<1.33825364E-11>
++  // CHECK: %[[TMP_180:.*]] = stablehlo.add %[[TMP_173]], %[[TMP_179]]
++  // CHECK: %[[TMP_181:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_180]]
++  // CHECK: %[[TMP_182:.*]] = stablehlo.multiply %[[TMP_178]], %[[TMP_181]]
++  // CHECK: %[[TMP_183:.*]] = stablehlo.constant dense<1.000000e+01>
++  // CHECK: %[[TMP_184:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_183]]
++  // CHECK: %[[TMP_185:.*]] = stablehlo.constant dense<9.000000e+00>
++  // CHECK: %[[TMP_186:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_185]]
++  // CHECK: %[[TMP_187:.*]] = stablehlo.multiply %[[TMP_184]], %[[TMP_186]]
++  // CHECK: %[[TMP_188:.*]] = stablehlo.constant dense<-5.28419031E-10>
++  // CHECK: %[[TMP_189:.*]] = stablehlo.add %[[TMP_182]], %[[TMP_188]]
++  // CHECK: %[[TMP_190:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_189]]
++  // CHECK: %[[TMP_191:.*]] = stablehlo.multiply %[[TMP_187]], %[[TMP_190]]
++  // CHECK: %[[TMP_192:.*]] = stablehlo.constant dense<8.000000e+00>
++  // CHECK: %[[TMP_193:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_192]]
++  // CHECK: %[[TMP_194:.*]] = stablehlo.constant dense<7.000000e+00>
++  // CHECK: %[[TMP_195:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_194]]
++  // CHECK: %[[TMP_196:.*]] = stablehlo.multiply %[[TMP_193]], %[[TMP_195]]
++  // CHECK: %[[TMP_197:.*]] = stablehlo.constant dense<2.08767563E-8>
++  // CHECK: %[[TMP_198:.*]] = stablehlo.add %[[TMP_191]], %[[TMP_197]]
++  // CHECK: %[[TMP_199:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_198]]
++  // CHECK: %[[TMP_200:.*]] = stablehlo.multiply %[[TMP_196]], %[[TMP_199]]
++  // CHECK: %[[TMP_201:.*]] = stablehlo.constant dense<6.000000e+00>
++  // CHECK: %[[TMP_202:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_201]]
++  // CHECK: %[[TMP_203:.*]] = stablehlo.constant dense<5.000000e+00>
++  // CHECK: %[[TMP_204:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_203]]
++  // CHECK: %[[TMP_205:.*]] = stablehlo.multiply %[[TMP_202]], %[[TMP_204]]
++  // CHECK: %[[TMP_206:.*]] = stablehlo.constant dense<-8.26719599E-7>
++  // CHECK: %[[TMP_207:.*]] = stablehlo.add %[[TMP_200]], %[[TMP_206]]
++  // CHECK: %[[TMP_208:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_207]]
++  // CHECK: %[[TMP_209:.*]] = stablehlo.multiply %[[TMP_205]], %[[TMP_208]]
++  // CHECK: %[[TMP_210:.*]] = stablehlo.constant dense<4.000000e+00>
++  // CHECK: %[[TMP_211:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_210]]
++  // CHECK: %[[TMP_212:.*]] = stablehlo.constant dense<3.000000e+00>
++  // CHECK: %[[TMP_213:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_212]]
++  // CHECK: %[[TMP_214:.*]] = stablehlo.multiply %[[TMP_211]], %[[TMP_213]]
++  // CHECK: %[[TMP_215:.*]] = stablehlo.constant dense<3.30687835E-5>
++  // CHECK: %[[TMP_216:.*]] = stablehlo.add %[[TMP_209]], %[[TMP_215]]
++  // CHECK: %[[TMP_217:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_216]]
++  // CHECK: %[[TMP_218:.*]] = stablehlo.multiply %[[TMP_214]], %[[TMP_217]]
++  // CHECK: %[[TMP_219:.*]] = stablehlo.constant dense<2.000000e+00>
++  // CHECK: %[[TMP_220:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_219]]
++  // CHECK: %[[TMP_221:.*]] = stablehlo.constant dense<1.000000e+00>
++  // CHECK: %[[TMP_222:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_221]]
++  // CHECK: %[[TMP_223:.*]] = stablehlo.multiply %[[TMP_220]], %[[TMP_222]]
++  // CHECK: %[[TMP_224:.*]] = stablehlo.constant dense<-0.00138888892>
++  // CHECK: %[[TMP_225:.*]] = stablehlo.add %[[TMP_218]], %[[TMP_224]]
++  // CHECK: %[[TMP_226:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_225]]
++  // CHECK: %[[TMP_227:.*]] = stablehlo.multiply %[[TMP_223]], %[[TMP_226]]
++  // CHECK: %[[TMP_228:.*]] = stablehlo.constant dense<5.000000e-01>
++  // CHECK: %[[TMP_229:.*]] = stablehlo.divide %[[TMP_5]], %[[TMP_121]]
++  // CHECK: %[[TMP_230:.*]] = stablehlo.constant dense<0.0833333358>
++  // CHECK: %[[TMP_231:.*]] = stablehlo.add %[[TMP_230]], %[[TMP_227]]
++  // CHECK: %[[TMP_232:.*]] = stablehlo.multiply %[[TMP_229]], %[[TMP_231]]
++  // CHECK: %[[TMP_233:.*]] = stablehlo.add %[[TMP_228]], %[[TMP_232]]
++  // CHECK: %[[TMP_234:.*]] = stablehlo.multiply %[[TMP_122]], %[[TMP_233]]
++  // CHECK: %[[TMP_235:.*]] = stablehlo.add %[[TMP_120]], %[[TMP_126]]
++  // CHECK: %[[TMP_236:.*]] = stablehlo.add %[[TMP_235]], %[[TMP_234]]
+   // CHECK: %[[TMP_237:.*]] = stablehlo.abs %[[TMP_122]]
+   // CHECK: %[[TMP_238:.*]] = stablehlo.abs %[[TMP_120]]
+   // CHECK: %[[TMP_239:.*]] = stablehlo.constant dense<1.401300e-45>
+@@ -1732,7 +1731,7 @@
+   // CHECK: %[[TMP_260:.*]] = stablehlo.and %[[TMP_257]], %[[TMP_259]]
+   // CHECK: %[[TMP_261:.*]] = stablehlo.select %[[TMP_260]], %[[TMP_251]], %[[TMP_243]]
+   // CHECK: %[[TMP_262:.*]] = stablehlo.select %[[TMP_254]], %[[TMP_261]], %[[TMP_250]]
+-  // CHECK: %[[TMP_263:.*]] = stablehlo.compare EQ, %[[TMP_5]], %[[TMP_93]], NOTYPE
++  // CHECK: %[[TMP_263:.*]] = stablehlo.compare EQ, %[[TMP_5]], %[[TMP_91]], NOTYPE
+   // CHECK: %[[TMP_264:.*]] = stablehlo.select %[[TMP_263]], %[[TMP_251]], %[[TMP_262]]
+   // CHECK: %[[TMP_265:.*]] = stablehlo.multiply %[[TMP_4]], %[[TMP_89]]
+   // CHECK: %[[TMP_266:.*]] = stablehlo.multiply %[[TMP_265]], %[[TMP_264]]
+@@ -1853,8 +1852,7 @@
+ 
+ // -----
+ 
 -
--func.func @convolution_c32(%arg0: tensor<1x8x8x207x!quant.uniform<i8:f32, 2.0:15>>, %arg1: tensor<3x3x207x16x!quant.uniform<i8:f32:0, {0.1:-30}>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32:0, {0.1:-30}>> {
--  // expected-error@+1 {{mismatched kernel_output_feature_dimension 3 and rhs quantized dimension 0}}
--  %0 = stablehlo.convolution(%arg0, %arg1)
--         dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f],
--         window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]}
--         {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo<precision DEFAULT>, #stablehlo<precision DEFAULT>]} :
--       (tensor<1x8x8x207x!quant.uniform<i8:f32, 2.0:15>>, tensor<3x3x207x16x!quant.uniform<i8:f32:0, {0.1:-30}>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32:0, {0.1:-30}>>
--  func.return %0 : tensor<1x8x8x16x!quant.uniform<i8:f32:0, {0.1:-30}>>
--}
+-// CHECK-LABEL: @polygamma_f64
++// CHECK: @polygamma_f64
+ // CHECK-SAME: (%[[ARG0:.*]]: tensor<f64>, %[[ARG1:.*]]: tensor<f64>)
+ func.func @polygamma_f64(%lhs : tensor<f64>, %rhs : tensor<f64>) -> tensor<f64> {
+   // CHECK-DAG: %[[TMP_0:.*]] = stablehlo.constant dense<1.000000e+00>
+@@ -1947,153 +1945,153 @@
+   // CHECK: %[[TMP_87:.*]] = stablehlo.constant dense<0x7FF0000000000000>
+   // CHECK: %[[TMP_88:.*]] = stablehlo.select %[[TMP_86]], %[[TMP_87]], %[[TMP_83]]
+   // CHECK: %[[TMP_89:.*]] = stablehlo.exponential %[[TMP_88]]
+-  // CHECK: %[[TMP_90:.*]] = stablehlo.constant dense<0.000000e+00>
+-  // CHECK: %[[TMP_91:.*]] = stablehlo.negate %[[TMP_5]]
+-  // CHECK: %[[TMP_92:.*]] = stablehlo.power %[[ARG1]], %[[TMP_91]]
+-  // CHECK: %[[TMP_93:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_94:.*]] = stablehlo.add %[[ARG1]], %[[TMP_93]]
+-  // CHECK: %[[TMP_95:.*]] = stablehlo.power %[[TMP_94]], %[[TMP_91]]
+-  // CHECK: %[[TMP_96:.*]] = stablehlo.add %[[TMP_92]], %[[TMP_95]]
+-  // CHECK: %[[TMP_97:.*]] = stablehlo.add %[[TMP_94]], %[[TMP_93]]
+-  // CHECK: %[[TMP_98:.*]] = stablehlo.power %[[TMP_97]], %[[TMP_91]]
++  // CHECK-DAG: %[[TMP_90:.*]] = stablehlo.constant dense<0.000000e+00>
++  // CHECK-DAG: %[[TMP_91:.*]] = stablehlo.constant dense<1.000000e+00>
++  // CHECK: %[[TMP_92:.*]] = stablehlo.negate %[[TMP_5]]
++  // CHECK: %[[TMP_93:.*]] = stablehlo.power %[[ARG1]], %[[TMP_92]]
++  // CHECK: %[[TMP_94:.*]] = stablehlo.add %[[ARG1]], %[[TMP_91]]
++  // CHECK: %[[TMP_95:.*]] = stablehlo.power %[[TMP_94]], %[[TMP_92]]
++  // CHECK: %[[TMP_96:.*]] = stablehlo.add %[[TMP_93]], %[[TMP_95]]
++  // CHECK: %[[TMP_97:.*]] = stablehlo.add %[[TMP_94]], %[[TMP_91]]
++  // CHECK: %[[TMP_98:.*]] = stablehlo.power %[[TMP_97]], %[[TMP_92]]
+   // CHECK: %[[TMP_99:.*]] = stablehlo.add %[[TMP_96]], %[[TMP_98]]
+-  // CHECK: %[[TMP_100:.*]] = stablehlo.add %[[TMP_97]], %[[TMP_93]]
+-  // CHECK: %[[TMP_101:.*]] = stablehlo.power %[[TMP_100]], %[[TMP_91]]
++  // CHECK: %[[TMP_100:.*]] = stablehlo.add %[[TMP_97]], %[[TMP_91]]
++  // CHECK: %[[TMP_101:.*]] = stablehlo.power %[[TMP_100]], %[[TMP_92]]
+   // CHECK: %[[TMP_102:.*]] = stablehlo.add %[[TMP_99]], %[[TMP_101]]
+-  // CHECK: %[[TMP_103:.*]] = stablehlo.add %[[TMP_100]], %[[TMP_93]]
+-  // CHECK: %[[TMP_104:.*]] = stablehlo.power %[[TMP_103]], %[[TMP_91]]
++  // CHECK: %[[TMP_103:.*]] = stablehlo.add %[[TMP_100]], %[[TMP_91]]
++  // CHECK: %[[TMP_104:.*]] = stablehlo.power %[[TMP_103]], %[[TMP_92]]
+   // CHECK: %[[TMP_105:.*]] = stablehlo.add %[[TMP_102]], %[[TMP_104]]
+-  // CHECK: %[[TMP_106:.*]] = stablehlo.add %[[TMP_103]], %[[TMP_93]]
+-  // CHECK: %[[TMP_107:.*]] = stablehlo.power %[[TMP_106]], %[[TMP_91]]
++  // CHECK: %[[TMP_106:.*]] = stablehlo.add %[[TMP_103]], %[[TMP_91]]
++  // CHECK: %[[TMP_107:.*]] = stablehlo.power %[[TMP_106]], %[[TMP_92]]
+   // CHECK: %[[TMP_108:.*]] = stablehlo.add %[[TMP_105]], %[[TMP_107]]
+-  // CHECK: %[[TMP_109:.*]] = stablehlo.add %[[TMP_106]], %[[TMP_93]]
+-  // CHECK: %[[TMP_110:.*]] = stablehlo.power %[[TMP_109]], %[[TMP_91]]
++  // CHECK: %[[TMP_109:.*]] = stablehlo.add %[[TMP_106]], %[[TMP_91]]
++  // CHECK: %[[TMP_110:.*]] = stablehlo.power %[[TMP_109]], %[[TMP_92]]
+   // CHECK: %[[TMP_111:.*]] = stablehlo.add %[[TMP_108]], %[[TMP_110]]
+-  // CHECK: %[[TMP_112:.*]] = stablehlo.add %[[TMP_109]], %[[TMP_93]]
+-  // CHECK: %[[TMP_113:.*]] = stablehlo.power %[[TMP_112]], %[[TMP_91]]
++  // CHECK: %[[TMP_112:.*]] = stablehlo.add %[[TMP_109]], %[[TMP_91]]
++  // CHECK: %[[TMP_113:.*]] = stablehlo.power %[[TMP_112]], %[[TMP_92]]
+   // CHECK: %[[TMP_114:.*]] = stablehlo.add %[[TMP_111]], %[[TMP_113]]
+-  // CHECK: %[[TMP_115:.*]] = stablehlo.add %[[TMP_112]], %[[TMP_93]]
+-  // CHECK: %[[TMP_116:.*]] = stablehlo.power %[[TMP_115]], %[[TMP_91]]
++  // CHECK: %[[TMP_115:.*]] = stablehlo.add %[[TMP_112]], %[[TMP_91]]
++  // CHECK: %[[TMP_116:.*]] = stablehlo.power %[[TMP_115]], %[[TMP_92]]
+   // CHECK: %[[TMP_117:.*]] = stablehlo.add %[[TMP_114]], %[[TMP_116]]
+-  // CHECK: %[[TMP_118:.*]] = stablehlo.add %[[TMP_115]], %[[TMP_93]]
+-  // CHECK: %[[TMP_119:.*]] = stablehlo.power %[[TMP_118]], %[[TMP_91]]
++  // CHECK: %[[TMP_118:.*]] = stablehlo.add %[[TMP_115]], %[[TMP_91]]
++  // CHECK: %[[TMP_119:.*]] = stablehlo.power %[[TMP_118]], %[[TMP_92]]
+   // CHECK: %[[TMP_120:.*]] = stablehlo.add %[[TMP_117]], %[[TMP_119]]
+-  // CHECK: %[[TMP_121:.*]] = stablehlo.add %[[TMP_118]], %[[TMP_93]]
+-  // CHECK: %[[TMP_122:.*]] = stablehlo.power %[[TMP_121]], %[[TMP_91]]
++  // CHECK: %[[TMP_121:.*]] = stablehlo.add %[[TMP_118]], %[[TMP_91]]
++  // CHECK: %[[TMP_122:.*]] = stablehlo.power %[[TMP_121]], %[[TMP_92]]
+   // CHECK: %[[TMP_123:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_124:.*]] = stablehlo.subtract %[[TMP_5]], %[[TMP_123]]
+-  // CHECK: %[[TMP_125:.*]] = stablehlo.multiply %[[TMP_122]], %[[TMP_121]]
+-  // CHECK: %[[TMP_126:.*]] = stablehlo.divide %[[TMP_125]], %[[TMP_124]]
+-  // CHECK: %[[TMP_127:.*]] = stablehlo.add %[[TMP_120]], %[[TMP_126]]
+-  // CHECK: %[[TMP_128:.*]] = stablehlo.multiply %[[TMP_121]], %[[TMP_121]]
+-  // CHECK: %[[TMP_129:.*]] = stablehlo.divide %[[TMP_93]], %[[TMP_128]]
+-  // CHECK: %[[TMP_130:.*]] = stablehlo.constant dense<2.200000e+01>
+-  // CHECK: %[[TMP_131:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_130]]
+-  // CHECK: %[[TMP_132:.*]] = stablehlo.constant dense<2.100000e+01>
+-  // CHECK: %[[TMP_133:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_132]]
+-  // CHECK: %[[TMP_134:.*]] = stablehlo.multiply %[[TMP_131]], %[[TMP_133]]
+-  // CHECK: %[[TMP_135:.*]] = stablehlo.constant dense<-1.3954464685812522E-19>
+-  // CHECK: %[[TMP_136:.*]] = stablehlo.add %[[TMP_90]], %[[TMP_135]]
+-  // CHECK: %[[TMP_137:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_136]]
+-  // CHECK: %[[TMP_138:.*]] = stablehlo.multiply %[[TMP_134]], %[[TMP_137]]
+-  // CHECK: %[[TMP_139:.*]] = stablehlo.constant dense<2.000000e+01>
+-  // CHECK: %[[TMP_140:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_139]]
+-  // CHECK: %[[TMP_141:.*]] = stablehlo.constant dense<1.900000e+01>
+-  // CHECK: %[[TMP_142:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_141]]
+-  // CHECK: %[[TMP_143:.*]] = stablehlo.multiply %[[TMP_140]], %[[TMP_142]]
+-  // CHECK: %[[TMP_144:.*]] = stablehlo.constant dense<5.5090028283602295E-18>
+-  // CHECK: %[[TMP_145:.*]] = stablehlo.add %[[TMP_138]], %[[TMP_144]]
+-  // CHECK: %[[TMP_146:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_145]]
+-  // CHECK: %[[TMP_147:.*]] = stablehlo.multiply %[[TMP_143]], %[[TMP_146]]
+-  // CHECK: %[[TMP_148:.*]] = stablehlo.constant dense<1.800000e+01>
+-  // CHECK: %[[TMP_149:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_148]]
+-  // CHECK: %[[TMP_150:.*]] = stablehlo.constant dense<1.700000e+01>
+-  // CHECK: %[[TMP_151:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_150]]
+-  // CHECK: %[[TMP_152:.*]] = stablehlo.multiply %[[TMP_149]], %[[TMP_151]]
+-  // CHECK: %[[TMP_153:.*]] = stablehlo.constant dense<-2.1748686985580617E-16>
+-  // CHECK: %[[TMP_154:.*]] = stablehlo.add %[[TMP_147]], %[[TMP_153]]
+-  // CHECK: %[[TMP_155:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_154]]
+-  // CHECK: %[[TMP_156:.*]] = stablehlo.multiply %[[TMP_152]], %[[TMP_155]]
+-  // CHECK: %[[TMP_157:.*]] = stablehlo.constant dense<1.600000e+01>
+-  // CHECK: %[[TMP_158:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_157]]
+-  // CHECK: %[[TMP_159:.*]] = stablehlo.constant dense<1.500000e+01>
+-  // CHECK: %[[TMP_160:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_159]]
+-  // CHECK: %[[TMP_161:.*]] = stablehlo.multiply %[[TMP_158]], %[[TMP_160]]
+-  // CHECK: %[[TMP_162:.*]] = stablehlo.constant dense<8.5860620562778452E-15>
+-  // CHECK: %[[TMP_163:.*]] = stablehlo.add %[[TMP_156]], %[[TMP_162]]
+-  // CHECK: %[[TMP_164:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_163]]
+-  // CHECK: %[[TMP_165:.*]] = stablehlo.multiply %[[TMP_161]], %[[TMP_164]]
+-  // CHECK: %[[TMP_166:.*]] = stablehlo.constant dense<1.400000e+01>
+-  // CHECK: %[[TMP_167:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_166]]
+-  // CHECK: %[[TMP_168:.*]] = stablehlo.constant dense<1.300000e+01>
+-  // CHECK: %[[TMP_169:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_168]]
+-  // CHECK: %[[TMP_170:.*]] = stablehlo.multiply %[[TMP_167]], %[[TMP_169]]
+-  // CHECK: %[[TMP_171:.*]] = stablehlo.constant dense<-3.3896802963225832E-13>
+-  // CHECK: %[[TMP_172:.*]] = stablehlo.add %[[TMP_165]], %[[TMP_171]]
+-  // CHECK: %[[TMP_173:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_172]]
+-  // CHECK: %[[TMP_174:.*]] = stablehlo.multiply %[[TMP_170]], %[[TMP_173]]
+-  // CHECK: %[[TMP_175:.*]] = stablehlo.constant dense<1.200000e+01>
+-  // CHECK: %[[TMP_176:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_175]]
+-  // CHECK: %[[TMP_177:.*]] = stablehlo.constant dense<1.100000e+01>
+-  // CHECK: %[[TMP_178:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_177]]
+-  // CHECK: %[[TMP_179:.*]] = stablehlo.multiply %[[TMP_176]], %[[TMP_178]]
+-  // CHECK: %[[TMP_180:.*]] = stablehlo.constant dense<1.3382536530684679E-11>
+-  // CHECK: %[[TMP_181:.*]] = stablehlo.add %[[TMP_174]], %[[TMP_180]]
+-  // CHECK: %[[TMP_182:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_181]]
+-  // CHECK: %[[TMP_183:.*]] = stablehlo.multiply %[[TMP_179]], %[[TMP_182]]
+-  // CHECK: %[[TMP_184:.*]] = stablehlo.constant dense<1.000000e+01>
+-  // CHECK: %[[TMP_185:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_184]]
+-  // CHECK: %[[TMP_186:.*]] = stablehlo.constant dense<9.000000e+00>
+-  // CHECK: %[[TMP_187:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_186]]
+-  // CHECK: %[[TMP_188:.*]] = stablehlo.multiply %[[TMP_185]], %[[TMP_187]]
+-  // CHECK: %[[TMP_189:.*]] = stablehlo.constant dense<-5.2841901386874932E-10>
+-  // CHECK: %[[TMP_190:.*]] = stablehlo.add %[[TMP_183]], %[[TMP_189]]
+-  // CHECK: %[[TMP_191:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_190]]
+-  // CHECK: %[[TMP_192:.*]] = stablehlo.multiply %[[TMP_188]], %[[TMP_191]]
+-  // CHECK: %[[TMP_193:.*]] = stablehlo.constant dense<8.000000e+00>
+-  // CHECK: %[[TMP_194:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_193]]
+-  // CHECK: %[[TMP_195:.*]] = stablehlo.constant dense<7.000000e+00>
+-  // CHECK: %[[TMP_196:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_195]]
+-  // CHECK: %[[TMP_197:.*]] = stablehlo.multiply %[[TMP_194]], %[[TMP_196]]
+-  // CHECK: %[[TMP_198:.*]] = stablehlo.constant dense<2.08767569878681E-8>
+-  // CHECK: %[[TMP_199:.*]] = stablehlo.add %[[TMP_192]], %[[TMP_198]]
+-  // CHECK: %[[TMP_200:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_199]]
+-  // CHECK: %[[TMP_201:.*]] = stablehlo.multiply %[[TMP_197]], %[[TMP_200]]
+-  // CHECK: %[[TMP_202:.*]] = stablehlo.constant dense<6.000000e+00>
+-  // CHECK: %[[TMP_203:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_202]]
+-  // CHECK: %[[TMP_204:.*]] = stablehlo.constant dense<5.000000e+00>
+-  // CHECK: %[[TMP_205:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_204]]
+-  // CHECK: %[[TMP_206:.*]] = stablehlo.multiply %[[TMP_203]], %[[TMP_205]]
+-  // CHECK: %[[TMP_207:.*]] = stablehlo.constant dense<-8.2671957671957675E-7>
+-  // CHECK: %[[TMP_208:.*]] = stablehlo.add %[[TMP_201]], %[[TMP_207]]
+-  // CHECK: %[[TMP_209:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_208]]
+-  // CHECK: %[[TMP_210:.*]] = stablehlo.multiply %[[TMP_206]], %[[TMP_209]]
+-  // CHECK: %[[TMP_211:.*]] = stablehlo.constant dense<4.000000e+00>
+-  // CHECK: %[[TMP_212:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_211]]
+-  // CHECK: %[[TMP_213:.*]] = stablehlo.constant dense<3.000000e+00>
+-  // CHECK: %[[TMP_214:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_213]]
+-  // CHECK: %[[TMP_215:.*]] = stablehlo.multiply %[[TMP_212]], %[[TMP_214]]
+-  // CHECK: %[[TMP_216:.*]] = stablehlo.constant dense<3.3068783068783071E-5>
+-  // CHECK: %[[TMP_217:.*]] = stablehlo.add %[[TMP_210]], %[[TMP_216]]
+-  // CHECK: %[[TMP_218:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_217]]
+-  // CHECK: %[[TMP_219:.*]] = stablehlo.multiply %[[TMP_215]], %[[TMP_218]]
+-  // CHECK: %[[TMP_220:.*]] = stablehlo.constant dense<2.000000e+00>
+-  // CHECK: %[[TMP_221:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_220]]
+-  // CHECK: %[[TMP_222:.*]] = stablehlo.constant dense<1.000000e+00>
+-  // CHECK: %[[TMP_223:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_222]]
+-  // CHECK: %[[TMP_224:.*]] = stablehlo.multiply %[[TMP_221]], %[[TMP_223]]
+-  // CHECK: %[[TMP_225:.*]] = stablehlo.constant dense<-0.0013888888888888889>
+-  // CHECK: %[[TMP_226:.*]] = stablehlo.add %[[TMP_219]], %[[TMP_225]]
+-  // CHECK: %[[TMP_227:.*]] = stablehlo.multiply %[[TMP_129]], %[[TMP_226]]
+-  // CHECK: %[[TMP_228:.*]] = stablehlo.multiply %[[TMP_224]], %[[TMP_227]]
+-  // CHECK: %[[TMP_229:.*]] = stablehlo.constant dense<5.000000e-01>
+-  // CHECK: %[[TMP_230:.*]] = stablehlo.divide %[[TMP_5]], %[[TMP_121]]
+-  // CHECK: %[[TMP_231:.*]] = stablehlo.constant dense<0.083333333333333329>
+-  // CHECK: %[[TMP_232:.*]] = stablehlo.add %[[TMP_231]], %[[TMP_228]]
+-  // CHECK: %[[TMP_233:.*]] = stablehlo.multiply %[[TMP_230]], %[[TMP_232]]
+-  // CHECK: %[[TMP_234:.*]] = stablehlo.add %[[TMP_229]], %[[TMP_233]]
+-  // CHECK: %[[TMP_235:.*]] = stablehlo.multiply %[[TMP_122]], %[[TMP_234]]
+-  // CHECK: %[[TMP_236:.*]] = stablehlo.add %[[TMP_127]], %[[TMP_235]]
++  // CHECK: %[[TMP_124:.*]] = stablehlo.multiply %[[TMP_122]], %[[TMP_121]]
++  // CHECK: %[[TMP_125:.*]] = stablehlo.subtract %[[TMP_5]], %[[TMP_123]]
++  // CHECK: %[[TMP_126:.*]] = stablehlo.divide %[[TMP_124]], %[[TMP_125]]
++  // CHECK: %[[TMP_127:.*]] = stablehlo.multiply %[[TMP_121]], %[[TMP_121]]
++  // CHECK: %[[TMP_128:.*]] = stablehlo.divide %[[TMP_91]], %[[TMP_127]]
++  // CHECK: %[[TMP_129:.*]] = stablehlo.constant dense<2.200000e+01>
++  // CHECK: %[[TMP_130:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_129]]
++  // CHECK: %[[TMP_131:.*]] = stablehlo.constant dense<2.100000e+01>
++  // CHECK: %[[TMP_132:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_131]]
++  // CHECK: %[[TMP_133:.*]] = stablehlo.multiply %[[TMP_130]], %[[TMP_132]]
++  // CHECK: %[[TMP_134:.*]] = stablehlo.constant dense<-1.3954464685812522E-19>
++  // CHECK: %[[TMP_135:.*]] = stablehlo.add %[[TMP_90]], %[[TMP_134]]
++  // CHECK: %[[TMP_136:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_135]]
++  // CHECK: %[[TMP_137:.*]] = stablehlo.multiply %[[TMP_133]], %[[TMP_136]]
++  // CHECK: %[[TMP_138:.*]] = stablehlo.constant dense<2.000000e+01>
++  // CHECK: %[[TMP_139:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_138]]
++  // CHECK: %[[TMP_140:.*]] = stablehlo.constant dense<1.900000e+01>
++  // CHECK: %[[TMP_141:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_140]]
++  // CHECK: %[[TMP_142:.*]] = stablehlo.multiply %[[TMP_139]], %[[TMP_141]]
++  // CHECK: %[[TMP_143:.*]] = stablehlo.constant dense<5.5090028283602295E-18>
++  // CHECK: %[[TMP_144:.*]] = stablehlo.add %[[TMP_137]], %[[TMP_143]]
++  // CHECK: %[[TMP_145:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_144]]
++  // CHECK: %[[TMP_146:.*]] = stablehlo.multiply %[[TMP_142]], %[[TMP_145]]
++  // CHECK: %[[TMP_147:.*]] = stablehlo.constant dense<1.800000e+01>
++  // CHECK: %[[TMP_148:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_147]]
++  // CHECK: %[[TMP_149:.*]] = stablehlo.constant dense<1.700000e+01>
++  // CHECK: %[[TMP_150:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_149]]
++  // CHECK: %[[TMP_151:.*]] = stablehlo.multiply %[[TMP_148]], %[[TMP_150]]
++  // CHECK: %[[TMP_152:.*]] = stablehlo.constant dense<-2.1748686985580617E-16>
++  // CHECK: %[[TMP_153:.*]] = stablehlo.add %[[TMP_146]], %[[TMP_152]]
++  // CHECK: %[[TMP_154:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_153]]
++  // CHECK: %[[TMP_155:.*]] = stablehlo.multiply %[[TMP_151]], %[[TMP_154]]
++  // CHECK: %[[TMP_156:.*]] = stablehlo.constant dense<1.600000e+01>
++  // CHECK: %[[TMP_157:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_156]]
++  // CHECK: %[[TMP_158:.*]] = stablehlo.constant dense<1.500000e+01>
++  // CHECK: %[[TMP_159:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_158]]
++  // CHECK: %[[TMP_160:.*]] = stablehlo.multiply %[[TMP_157]], %[[TMP_159]]
++  // CHECK: %[[TMP_161:.*]] = stablehlo.constant dense<8.5860620562778452E-15>
++  // CHECK: %[[TMP_162:.*]] = stablehlo.add %[[TMP_155]], %[[TMP_161]]
++  // CHECK: %[[TMP_163:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_162]]
++  // CHECK: %[[TMP_164:.*]] = stablehlo.multiply %[[TMP_160]], %[[TMP_163]]
++  // CHECK: %[[TMP_165:.*]] = stablehlo.constant dense<1.400000e+01>
++  // CHECK: %[[TMP_166:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_165]]
++  // CHECK: %[[TMP_167:.*]] = stablehlo.constant dense<1.300000e+01>
++  // CHECK: %[[TMP_168:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_167]]
++  // CHECK: %[[TMP_169:.*]] = stablehlo.multiply %[[TMP_166]], %[[TMP_168]]
++  // CHECK: %[[TMP_170:.*]] = stablehlo.constant dense<-3.3896802963225832E-13>
++  // CHECK: %[[TMP_171:.*]] = stablehlo.add %[[TMP_164]], %[[TMP_170]]
++  // CHECK: %[[TMP_172:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_171]]
++  // CHECK: %[[TMP_173:.*]] = stablehlo.multiply %[[TMP_169]], %[[TMP_172]]
++  // CHECK: %[[TMP_174:.*]] = stablehlo.constant dense<1.200000e+01>
++  // CHECK: %[[TMP_175:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_174]]
++  // CHECK: %[[TMP_176:.*]] = stablehlo.constant dense<1.100000e+01>
++  // CHECK: %[[TMP_177:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_176]]
++  // CHECK: %[[TMP_178:.*]] = stablehlo.multiply %[[TMP_175]], %[[TMP_177]]
++  // CHECK: %[[TMP_179:.*]] = stablehlo.constant dense<1.3382536530684679E-11>
++  // CHECK: %[[TMP_180:.*]] = stablehlo.add %[[TMP_173]], %[[TMP_179]]
++  // CHECK: %[[TMP_181:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_180]]
++  // CHECK: %[[TMP_182:.*]] = stablehlo.multiply %[[TMP_178]], %[[TMP_181]]
++  // CHECK: %[[TMP_183:.*]] = stablehlo.constant dense<1.000000e+01>
++  // CHECK: %[[TMP_184:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_183]]
++  // CHECK: %[[TMP_185:.*]] = stablehlo.constant dense<9.000000e+00>
++  // CHECK: %[[TMP_186:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_185]]
++  // CHECK: %[[TMP_187:.*]] = stablehlo.multiply %[[TMP_184]], %[[TMP_186]]
++  // CHECK: %[[TMP_188:.*]] = stablehlo.constant dense<-5.2841901386874932E-10>
++  // CHECK: %[[TMP_189:.*]] = stablehlo.add %[[TMP_182]], %[[TMP_188]]
++  // CHECK: %[[TMP_190:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_189]]
++  // CHECK: %[[TMP_191:.*]] = stablehlo.multiply %[[TMP_187]], %[[TMP_190]]
++  // CHECK: %[[TMP_192:.*]] = stablehlo.constant dense<8.000000e+00>
++  // CHECK: %[[TMP_193:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_192]]
++  // CHECK: %[[TMP_194:.*]] = stablehlo.constant dense<7.000000e+00>
++  // CHECK: %[[TMP_195:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_194]]
++  // CHECK: %[[TMP_196:.*]] = stablehlo.multiply %[[TMP_193]], %[[TMP_195]]
++  // CHECK: %[[TMP_197:.*]] = stablehlo.constant dense<2.08767569878681E-8>
++  // CHECK: %[[TMP_198:.*]] = stablehlo.add %[[TMP_191]], %[[TMP_197]]
++  // CHECK: %[[TMP_199:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_198]]
++  // CHECK: %[[TMP_200:.*]] = stablehlo.multiply %[[TMP_196]], %[[TMP_199]]
++  // CHECK: %[[TMP_201:.*]] = stablehlo.constant dense<6.000000e+00>
++  // CHECK: %[[TMP_202:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_201]]
++  // CHECK: %[[TMP_203:.*]] = stablehlo.constant dense<5.000000e+00>
++  // CHECK: %[[TMP_204:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_203]]
++  // CHECK: %[[TMP_205:.*]] = stablehlo.multiply %[[TMP_202]], %[[TMP_204]]
++  // CHECK: %[[TMP_206:.*]] = stablehlo.constant dense<-8.2671957671957675E-7>
++  // CHECK: %[[TMP_207:.*]] = stablehlo.add %[[TMP_200]], %[[TMP_206]]
++  // CHECK: %[[TMP_208:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_207]]
++  // CHECK: %[[TMP_209:.*]] = stablehlo.multiply %[[TMP_205]], %[[TMP_208]]
++  // CHECK: %[[TMP_210:.*]] = stablehlo.constant dense<4.000000e+00>
++  // CHECK: %[[TMP_211:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_210]]
++  // CHECK: %[[TMP_212:.*]] = stablehlo.constant dense<3.000000e+00>
++  // CHECK: %[[TMP_213:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_212]]
++  // CHECK: %[[TMP_214:.*]] = stablehlo.multiply %[[TMP_211]], %[[TMP_213]]
++  // CHECK: %[[TMP_215:.*]] = stablehlo.constant dense<3.3068783068783071E-5>
++  // CHECK: %[[TMP_216:.*]] = stablehlo.add %[[TMP_209]], %[[TMP_215]]
++  // CHECK: %[[TMP_217:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_216]]
++  // CHECK: %[[TMP_218:.*]] = stablehlo.multiply %[[TMP_214]], %[[TMP_217]]
++  // CHECK: %[[TMP_219:.*]] = stablehlo.constant dense<2.000000e+00>
++  // CHECK: %[[TMP_220:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_219]]
++  // CHECK: %[[TMP_221:.*]] = stablehlo.constant dense<1.000000e+00>
++  // CHECK: %[[TMP_222:.*]] = stablehlo.add %[[TMP_5]], %[[TMP_221]]
++  // CHECK: %[[TMP_223:.*]] = stablehlo.multiply %[[TMP_220]], %[[TMP_222]]
++  // CHECK: %[[TMP_224:.*]] = stablehlo.constant dense<-0.0013888888888888889>
++  // CHECK: %[[TMP_225:.*]] = stablehlo.add %[[TMP_218]], %[[TMP_224]]
++  // CHECK: %[[TMP_226:.*]] = stablehlo.multiply %[[TMP_128]], %[[TMP_225]]
++  // CHECK: %[[TMP_227:.*]] = stablehlo.multiply %[[TMP_223]], %[[TMP_226]]
++  // CHECK: %[[TMP_228:.*]] = stablehlo.constant dense<5.000000e-01>
++  // CHECK: %[[TMP_229:.*]] = stablehlo.divide %[[TMP_5]], %[[TMP_121]]
++  // CHECK: %[[TMP_230:.*]] = stablehlo.constant dense<0.083333333333333329>
++  // CHECK: %[[TMP_231:.*]] = stablehlo.add %[[TMP_230]], %[[TMP_227]]
++  // CHECK: %[[TMP_232:.*]] = stablehlo.multiply %[[TMP_229]], %[[TMP_231]]
++  // CHECK: %[[TMP_233:.*]] = stablehlo.add %[[TMP_228]], %[[TMP_232]]
++  // CHECK: %[[TMP_234:.*]] = stablehlo.multiply %[[TMP_122]], %[[TMP_233]]
++  // CHECK: %[[TMP_235:.*]] = stablehlo.add %[[TMP_120]], %[[TMP_126]]
++  // CHECK: %[[TMP_236:.*]] = stablehlo.add %[[TMP_235]], %[[TMP_234]]
+   // CHECK: %[[TMP_237:.*]] = stablehlo.abs %[[TMP_122]]
+   // CHECK: %[[TMP_238:.*]] = stablehlo.abs %[[TMP_120]]
+   // CHECK: %[[TMP_239:.*]] = stablehlo.constant dense<4.940660e-324>
+@@ -2120,7 +2118,7 @@
+   // CHECK: %[[TMP_260:.*]] = stablehlo.and %[[TMP_257]], %[[TMP_259]]
+   // CHECK: %[[TMP_261:.*]] = stablehlo.select %[[TMP_260]], %[[TMP_251]], %[[TMP_243]]
+   // CHECK: %[[TMP_262:.*]] = stablehlo.select %[[TMP_254]], %[[TMP_261]], %[[TMP_250]]
+-  // CHECK: %[[TMP_263:.*]] = stablehlo.compare EQ, %[[TMP_5]], %[[TMP_93]], NOTYPE
++  // CHECK: %[[TMP_263:.*]] = stablehlo.compare EQ, %[[TMP_5]], %[[TMP_91]], NOTYPE
+   // CHECK: %[[TMP_264:.*]] = stablehlo.select %[[TMP_263]], %[[TMP_251]], %[[TMP_262]]
+   // CHECK: %[[TMP_265:.*]] = stablehlo.multiply %[[TMP_4]], %[[TMP_89]]
+   // CHECK: %[[TMP_266:.*]] = stablehlo.multiply %[[TMP_265]], %[[TMP_264]]
+diff --ruN a/stablehlo/stablehlo/transforms/ChloLegalizeToStablehlo.cpp b/stablehlo/stablehlo/transforms/ChloLegalizeToStablehlo.cpp
+--- stablehlo/stablehlo/transforms/ChloLegalizeToStablehlo.cpp
++++ stablehlo/stablehlo/transforms/ChloLegalizeToStablehlo.cpp
+@@ -1575,11 +1575,21 @@
+ 
+ static Value materializeZeta(ConversionPatternRewriter &rewriter, Location loc,
+                              ValueRange args) {
+-  // Code should match XLA's materializeZeta from chlo_legalize_to_hlo.cc
++  // Implementation ported from:
++  // https://github.com/openxla/xla/blob/7a067a7b88d2ffb15b1dc5e3c06f701a15f0391d/xla/client/lib/math.cc#L1912-L1917
++  // Reference: Johansson, Fredrik.
++  // "Rigorous high-precision computation of the Hurwitz zeta function and its
++  // derivatives." Numerical Algorithms 69.2 (2015): 253-270.
++  // https://arxiv.org/abs/1309.2877 - formula (5)
++  // Notation is more or less kept as a reference to the whitepaper.
+   assert(args.size() == 2);
+   Value x = args[0];
+   Value q = args[1];
+-  static const std::array<double, 12> kZetaCoeffs{
++
++  static constexpr auto kTerms = 12;
++  static constexpr auto kIters = 9;
++  static constexpr auto kTwoTermsMinusOne = 2 * kTerms - 1;
++  static constexpr auto kZetaCoeffs = std::array<double, kTerms>{
+       -7.1661652561756670113e18,
+       1.8152105401943546773e17,
+       -4.5979787224074726105e15,
+@@ -1596,131 +1606,134 @@
+ 
+   // For speed we'll always use 9 iterations for the initial series estimate,
+   // and a 12 term expansion for the Euler-Maclaurin formula.
+-  Value a = q;
+-  Value zero = getConstantLike(rewriter, loc, 0.0, a);
+-  Value negPower = zero;
+-  Value negX = rewriter.create<mlir::stablehlo::NegOp>(loc, x);
+-  Value initialSum = rewriter.create<mlir::stablehlo::PowOp>(loc, q, negX);
+-  Value one = getConstantLike(rewriter, loc, 1.0, a);
+-  for (int i = 0; i < 9; ++i) {
+-    a = rewriter.create<mlir::stablehlo::AddOp>(loc, a, one);
+-    negPower = rewriter.create<mlir::stablehlo::PowOp>(loc, a, negX);
+-    initialSum =
+-        rewriter.create<mlir::stablehlo::AddOp>(loc, initialSum, negPower);
+-  }
 -
--// -----
+-  a = rewriter.create<mlir::stablehlo::AddOp>(loc, a, one);
+-  negPower = rewriter.create<mlir::stablehlo::PowOp>(loc, a, negX);
++  Value zero = getConstantLike(rewriter, loc, 0.0, q);
++  Value one = getConstantLike(rewriter, loc, 1.0, q);
++  Value acc = q;
++  Value qNegPower = zero;
++  Value negX = rewriter.create<NegOp>(loc, x);
++  Value powerSum = rewriter.create<PowOp>(loc, q, negX);
++  for (int i = 0; i < kIters; ++i) {
++    acc = rewriter.create<AddOp>(loc, acc, one);
++    qNegPower = rewriter.create<PowOp>(loc, acc, negX);
++    powerSum =
++        rewriter.create<AddOp>(loc, powerSum, qNegPower);
++  }
++  acc = rewriter.create<AddOp>(loc, acc, one);
++  qNegPower = rewriter.create<PowOp>(loc, acc, negX);
+   Value oneLikeX = getConstantLike(rewriter, loc, 1.0, x);
+-  Value xMinusOne =
+-      rewriter.create<mlir::stablehlo::SubtractOp>(loc, x, oneLikeX);
+-  Value negPowerMulA =
+-      rewriter.create<mlir::stablehlo::MulOp>(loc, negPower, a);
+-  Value negPowerMulADivXMinusOne =
+-      rewriter.create<mlir::stablehlo::DivOp>(loc, negPowerMulA, xMinusOne);
+-  Value s = rewriter.create<mlir::stablehlo::AddOp>(loc, initialSum,
+-                                                    negPowerMulADivXMinusOne);
+-  Value aInverseSquare = rewriter.create<mlir::stablehlo::DivOp>(
+-      loc, one, rewriter.create<mlir::stablehlo::MulOp>(loc, a, a));
 -
--func.func @convolution_c33(%arg0: tensor<1x8x8x207x!quant.uniform<i8:f32, 2.0:15>>, %arg1: tensor<3x3x207x16x!quant.uniform<i8:f32:3, {0.1:-30}>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32:0, {2.0:-30}>> {
-- // expected-error@+1 {{mismatched output_feature_dimension 3 and result quantized dimension 0}}
-- %0 = stablehlo.convolution(%arg0, %arg1)
--     dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f],
--     window = {stride = [1, 1], pad = [[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]}
--     {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo<precision DEFAULT>, #stablehlo<precision DEFAULT>]} :
--    (tensor<1x8x8x207x!quant.uniform<i8:f32, 2.0:15>>, tensor<3x3x207x16x!quant.uniform<i8:f32:3, {0.1:-30}>>) -> tensor<1x8x8x16x!quant.uniform<i8:f32:0, {2.0:-30}>>
-- func.return %0 : tensor<1x8x8x16x!quant.uniform<i8:f32:0, {2.0:-30}>>
--}
+-  Value hornerSum = zero;
+-  Value factor = one;
++  Value correctionEulerMaclaurin = rewriter.create<DivOp>(
++      loc, rewriter.create<MulOp>(loc, qNegPower, acc),
++      rewriter.create<SubtractOp>(loc, x, oneLikeX));
++
++  // Manual reciprocal of the square root as RsqrtOp produces different results
++  Value rsqrtAcc = rewriter.create<DivOp>(
++      loc, one, rewriter.create<MulOp>(loc, acc, acc));
++
+   // Use Horner's rule for this.
+   // Note this differs from Cephes which does a 'naive' polynomial evaluation.
+   // Using Horner's rule allows to avoid some NaN's and Infs from happening,
+   // resulting in more numerically stable code.
+-  for (int i = 0; i < 11; ++i) {
+-    Value factorLhs = rewriter.create<mlir::stablehlo::AddOp>(
+-        loc, x, getConstantLike(rewriter, loc, 22 - 2 * i, x));
+-    Value factorRhs = rewriter.create<mlir::stablehlo::AddOp>(
+-        loc, x, getConstantLike(rewriter, loc, 21 - 2 * i, x));
+-    factor = rewriter.create<mlir::stablehlo::MulOp>(loc, factorLhs, factorRhs);
+-    hornerSum = rewriter.create<mlir::stablehlo::MulOp>(
+-        loc, factor,
+-        rewriter.create<mlir::stablehlo::MulOp>(
+-            loc, aInverseSquare,
+-            rewriter.create<mlir::stablehlo::AddOp>(
++  Value hornerSum = zero;
++  Value hornerProduct = one;
++
++  for (int i = 0; i < kTerms - 1; ++i) {
++    Value factorLhs = rewriter.create<AddOp>(
++        loc, x,
++        getConstantLike(rewriter, loc, kTwoTermsMinusOne - 1 - 2 * i, x));
++    Value factorRhs = rewriter.create<AddOp>(
++        loc, x,
++        getConstantLike(rewriter, loc, kTwoTermsMinusOne - 2 - 2 * i, x));
++    hornerProduct =
++        rewriter.create<MulOp>(loc, factorLhs, factorRhs);
++    hornerSum = rewriter.create<MulOp>(
++        loc, hornerProduct,
++        rewriter.create<MulOp>(
++            loc, rsqrtAcc,
++            rewriter.create<AddOp>(
+                 loc, hornerSum,
+-                getConstantLike(rewriter, loc, 1. / kZetaCoeffs[i], a))));
+-  }
+-  Value zeroPointFiveLikeNegPower =
+-      getConstantLike(rewriter, loc, .5, negPower);
+-  Value xDivA = rewriter.create<mlir::stablehlo::DivOp>(loc, x, a);
+-  s = rewriter.create<mlir::stablehlo::AddOp>(
+-      loc, s,
+-      rewriter.create<mlir::stablehlo::MulOp>(
+-          loc, negPower,
+-          rewriter.create<mlir::stablehlo::AddOp>(
+-              loc, zeroPointFiveLikeNegPower,
+-              rewriter.create<mlir::stablehlo::MulOp>(
+-                  loc, xDivA,
+-                  rewriter.create<mlir::stablehlo::AddOp>(
+-                      loc,
+-                      getConstantLike(rewriter, loc, 1. / kZetaCoeffs[11], a),
+-                      hornerSum)))));
++                getConstantLike(rewriter, loc, 1. / kZetaCoeffs[i], acc))));
++  }
++  Value zeroPointFiveLikeQNegPower =
++      getConstantLike(rewriter, loc, .5, qNegPower);
++  Value xDivAcc = rewriter.create<DivOp>(loc, x, acc);
++  Value bernoulliTailTerm = rewriter.create<MulOp>(
++      loc, qNegPower,
++      rewriter.create<AddOp>(
++          loc, zeroPointFiveLikeQNegPower,
++          rewriter.create<MulOp>(
++              loc, xDivAcc,
++              rewriter.create<AddOp>(
++                  loc,
++                  getConstantLike(rewriter, loc, 1. / kZetaCoeffs[kTerms - 1],
++                                  acc),
++                  hornerSum))));
++  Value accurateResult = rewriter.create<AddOp>(
++      loc,
++      rewriter.create<AddOp>(loc, powerSum,
++                                              correctionEulerMaclaurin),
++      bernoulliTailTerm);
+ 
+   // Use the initial zeta sum without the correction term coming
+   // from Euler-Maclaurin if it is accurate enough.
+-  Value absNegPower = rewriter.create<mlir::stablehlo::AbsOp>(loc, negPower);
+-  Value absInitialSum =
+-      rewriter.create<mlir::stablehlo::AbsOp>(loc, initialSum);
+-  Value output = rewriter.create<mlir::stablehlo::SelectOp>(
++  Value absQNegPower = rewriter.create<AbsOp>(loc, qNegPower);
++  Value absPowerSum = rewriter.create<AbsOp>(loc, powerSum);
++  Value output = rewriter.create<SelectOp>(
+       loc,
+-      rewriter.create<mlir::stablehlo::CompareOp>(
+-          loc, absNegPower,
+-          rewriter.create<mlir::stablehlo::MulOp>(
+-              loc, absInitialSum,
+-              getConstantLikeSmallestFiniteValue(rewriter, loc, a)),
+-          mlir::stablehlo::ComparisonDirection::LT),
+-      initialSum, s);
++      rewriter.create<CompareOp>(
++          loc, absQNegPower,
++          rewriter.create<MulOp>(
++              loc, absPowerSum,
++              getConstantLikeSmallestFiniteValue(rewriter, loc, acc)),
++          ComparisonDirection::LT),
++      powerSum, accurateResult);
+ 
+   // Function is not defined for x < 1.
+   Value nan = getConstantLike(rewriter, loc,
+                               std::numeric_limits<double>::quiet_NaN(), x);
+-  output = rewriter.create<mlir::stablehlo::SelectOp>(
++  output = rewriter.create<SelectOp>(
+       loc,
+-      rewriter.create<mlir::stablehlo::CompareOp>(
+-          loc, x, oneLikeX, mlir::stablehlo::ComparisonDirection::LT),
++      rewriter.create<CompareOp>(
++          loc, x, oneLikeX, ComparisonDirection::LT),
+       nan, output);
+ 
+   // For q <= 0, x must be an integer.
+-  Value qLeZero = rewriter.create<mlir::stablehlo::CompareOp>(
+-      loc, q, zero, mlir::stablehlo::ComparisonDirection::LE);
+-  Value xNotInt = rewriter.create<mlir::stablehlo::CompareOp>(
+-      loc, x, rewriter.create<mlir::stablehlo::FloorOp>(loc, x),
+-      mlir::stablehlo::ComparisonDirection::NE);
++  Value qLeZero = rewriter.create<CompareOp>(
++      loc, q, zero, ComparisonDirection::LE);
++  Value xNotInt = rewriter.create<CompareOp>(
++      loc, x, rewriter.create<FloorOp>(loc, x),
++      ComparisonDirection::NE);
+   Value xDomainError =
+-      rewriter.create<mlir::stablehlo::AndOp>(loc, qLeZero, xNotInt);
+-  output = rewriter.create<mlir::stablehlo::SelectOp>(loc, xDomainError, nan,
++      rewriter.create<AndOp>(loc, qLeZero, xNotInt);
++  output = rewriter.create<SelectOp>(loc, xDomainError, nan,
+                                                       output);
+ 
+   // For all integer q <= 0, zeta has a pole. The limit is only defined as
+   // +inf if x is and even integer.
+   Value inf = getConstantLike(rewriter, loc,
+                               std::numeric_limits<double>::infinity(), x);
+-  Value qIsInt = rewriter.create<mlir::stablehlo::CompareOp>(
+-      loc, q, rewriter.create<mlir::stablehlo::FloorOp>(loc, q),
+-      mlir::stablehlo::ComparisonDirection::EQ);
+-  Value atPole = rewriter.create<mlir::stablehlo::AndOp>(loc, qLeZero, qIsInt);
++  Value qIsInt = rewriter.create<CompareOp>(
++      loc, q, rewriter.create<FloorOp>(loc, q),
++      ComparisonDirection::EQ);
++  Value atPole = rewriter.create<AndOp>(loc, qLeZero, qIsInt);
+   Value two = getConstantLike(rewriter, loc, 2.0, x);
+-  Value xIsInt = rewriter.create<mlir::stablehlo::CompareOp>(
+-      loc, x, rewriter.create<mlir::stablehlo::FloorOp>(loc, x),
+-      mlir::stablehlo::ComparisonDirection::EQ);
+-  Value xIsEven = rewriter.create<mlir::stablehlo::CompareOp>(
+-      loc, rewriter.create<mlir::stablehlo::RemOp>(loc, x, two), zero,
+-      mlir::stablehlo::ComparisonDirection::EQ);
++  Value xIsInt = rewriter.create<CompareOp>(
++      loc, x, rewriter.create<FloorOp>(loc, x),
++      ComparisonDirection::EQ);
++  Value xIsEven = rewriter.create<CompareOp>(
++      loc, rewriter.create<RemOp>(loc, x, two), zero,
++      ComparisonDirection::EQ);
+   Value xIsEvenInt =
+-      rewriter.create<mlir::stablehlo::AndOp>(loc, xIsInt, xIsEven);
+-  output = rewriter.create<mlir::stablehlo::SelectOp>(
++      rewriter.create<AndOp>(loc, xIsInt, xIsEven);
++  output = rewriter.create<SelectOp>(
+       loc, atPole,
+-      rewriter.create<mlir::stablehlo::SelectOp>(loc, xIsEvenInt, inf, nan),
++      rewriter.create<SelectOp>(loc, xIsEvenInt, inf, nan),
+       output);
+ 
+   // For x = 1, this is the harmonic series and diverges.
+-  output = rewriter.create<mlir::stablehlo::SelectOp>(
++  output = rewriter.create<SelectOp>(
+       loc,
+-      rewriter.create<mlir::stablehlo::CompareOp>(
+-          loc, x, one, mlir::stablehlo::ComparisonDirection::EQ),
++      rewriter.create<CompareOp>(
++          loc, x, one, ComparisonDirection::EQ),
+       inf, output);
+ 
+   return output;
 
diff --git a/third_party/xla/third_party/stablehlo/workspace.bzl b/third_party/xla/third_party/stablehlo/workspace.bzl
index 674fda41985b9a..ca2f3a937f73ad 100644
--- a/third_party/xla/third_party/stablehlo/workspace.bzl
+++ b/third_party/xla/third_party/stablehlo/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
     # LINT.IfChange
-    STABLEHLO_COMMIT = "aa69baea1409d7c341705e0e9342ed62802d8a4d"
-    STABLEHLO_SHA256 = "4a95367f09657343704dff27c651bb0e648a205d5b82acbe35cd160b0d87ff35"
+    STABLEHLO_COMMIT = "271e8634de184fbfafd677d3876170feb6d08c97"
+    STABLEHLO_SHA256 = "06db84c751bd4a980dc76249e02f10e119175fceba3eebed008da122cb480bab"
     # LINT.ThenChange(Google-internal path)
 
     tf_http_archive(
diff --git a/third_party/xla/third_party/triton/cl614757739.patch b/third_party/xla/third_party/triton/cl614757739.patch
deleted file mode 100644
index 058238e0005310..00000000000000
--- a/third_party/xla/third_party/triton/cl614757739.patch
+++ /dev/null
@@ -1,14 +0,0 @@
-==== triton/python/src/ir.cc#8 - /google/src/cloud/shyshkov/mlir_a924da6d4b8733e5bf08098b18dd7ad1a5ba5f46_1710185763/triton/python/src/ir.cc ====
-# action=edit type=text
---- triton/python/src/ir.cc	2024-02-21 21:40:18.000000000 -0800
-+++ triton/python/src/ir.cc	2024-03-11 12:39:44.000000000 -0700
-@@ -205,7 +205,8 @@
-   });
- 
-   py::class_<Type>(m, "type", py::module_local())
--      .def("is_integer", &Type::isInteger)
-+      .def("is_integer",
-+           [](Type &self, unsigned width) { return self.isInteger(width); })
-       .def("is_fp16", &Type::isF16)
-       .def("__str__", [](Type &self) {
-         std::string str;
diff --git a/third_party/xla/third_party/triton/cl616113650.patch b/third_party/xla/third_party/triton/cl616113650.patch
deleted file mode 100644
index 4dc3675d9a9ea5..00000000000000
--- a/third_party/xla/third_party/triton/cl616113650.patch
+++ /dev/null
@@ -1,16 +0,0 @@
-==== triton/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp#30 - triton/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp ====
-# action=edit type=text
---- triton/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp	2024-03-11 11:42:57.000000000 -0700
-+++ triton/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp	2024-03-15 06:57:27.000000000 -0700
-@@ -510,10 +510,8 @@
-                                    ConversionPatternRewriter &rewriter,
-                                    Type elemTy, MultipleOperandsRange operands,
-                                    Location loc) const {
--    auto boolFalse = rewriter.getBoolAttr(false);
--    auto constFalse = rewriter.create<LLVM::ConstantOp>(loc, boolFalse);
-     return {rewriter.create<LLVM::AbsOp>(loc, elemTy, operands[0][0],
--                                         /*is_int_min_poison=*/constFalse)};
-+                                         /*is_int_min_poison=*/false)};
-   }
- };
- 
diff --git a/third_party/xla/third_party/triton/cl617812302.patch b/third_party/xla/third_party/triton/cl617812302.patch
new file mode 100644
index 00000000000000..c9c8066190e6ce
--- /dev/null
+++ b/third_party/xla/third_party/triton/cl617812302.patch
@@ -0,0 +1,12 @@
+==== triton/BUILD#42 - /google/src/cloud/csigg/mlir_83e5a1239242d64110e3dfa96ed3889170ab96b2_1711020969/triton/BUILD ====
+# action=edit type=text
+--- triton/BUILD	2024-03-21 04:26:30.000000000 -0700
++++ triton/BUILD	2024-03-21 05:39:41.000000000 -0700
+@@ -692,6 +692,7 @@
+         "@llvm-project//mlir:SCFDialect",
+         "@llvm-project//mlir:SCFTransforms",
+         "@llvm-project//mlir:SCFUtils",
++        "@llvm-project//mlir:SideEffectInterfaces",
+         "@llvm-project//mlir:Support",
+         "@llvm-project//mlir:TensorDialect",
+         "@llvm-project//mlir:TransformUtils",
diff --git a/third_party/xla/third_party/triton/cl619146327.patch b/third_party/xla/third_party/triton/cl619146327.patch
new file mode 100644
index 00000000000000..4f9c1e4b971c0a
--- /dev/null
+++ b/third_party/xla/third_party/triton/cl619146327.patch
@@ -0,0 +1,52 @@
+This patch can be removed once this commit is included:
+https://github.com/openai/triton/commit/6ea5b56015db9e0bcff45ec7116cfcbfa729a516
+
+diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
++++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+@@ -305,9 +305,10 @@ compared to 1*64 when the hasLeadingOffs
+           int perPhase = 128 / (shapePerCTA[order[0]] * 4 / dotOpEnc.getKWidth());
+           perPhase = std::max<int>(perPhase, 1);
+           std::vector<size_t> matShape = {8, 8, 4 * dotOpEnc.getKWidth()};
+-          // for now, disable swizzle when using transposed int8 tensor cores
+-          if ((32 / typeWidthInBit != dotOpEnc.getKWidth()) && order[0] == inner)
+-            return get(context, 1, 1, 1, order, CTALayout);
++          int vecWidth = 32 / typeWidthInBit;
++          if (vecWidth != dotOpEnc.getKWidth() && order[0] == inner) {
++              perPhase = std::max<int>(perPhase, 2 * vecWidth);
++          }
+           int rank = order.size();
+           // --- handle A operand ---
+           if (opIdx == 0) { // compute swizzling for A operand
+diff --git a/test/TritonGPU/reduce-data-duplication.mlir b/test/TritonGPU/reduce-data-duplication.mlir
+new file mode 100644
+--- /dev/null
++++ b/test/TritonGPU/reduce-data-duplication.mlir
+@@ -0,0 +1,14 @@
++// RUN: triton-opt %s -split-input-file -tritongpu-reduce-data-duplication | FileCheck %s
++
++//       CHECK:   #[[SHARED:.*]] = #triton_gpu.shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [0, 1], hasLeadingOffset = false}
++//       CHECK:   apply_swizzle
++//       CHECK:   %{{.*}} = triton_gpu.local_alloc %{{.*}} : (tensor<16x256xf16, #{{.*}}>) -> !tt.memdesc<16x256xf16, #[[SHARED]]>
++
++#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 4], order = [0, 1]}>
++#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 4], instrShape = [16, 8]}>
++module attributes {"triton_gpu.compute-capability" = 80 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
++  tt.func @apply_swizzle(%arg0: tensor<16x256xf16, #blocked>) {
++    %0 = triton_gpu.convert_layout %arg0 : tensor<16x256xf16, #blocked> -> tensor<16x256xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> 
++    tt.return 
++  } 
++}
+\ No newline at end of file
+diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2.cpp
+--- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2.cpp
++++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMMAv2.cpp
+@@ -541,8 +541,6 @@ getLoadMatrixFn(MemDescType descTy, cons
+   const int elemBytes = descTy.getElementTypeBitWidth() / 8;
+   auto order = sharedLayout.getOrder();
+ 
+-  if (kWidth != (4 / elemBytes))
+-    assert(vecPhase == 1 || vecPhase == 4 * kWidth);
+   int nPerWarp =
+       std::max<int>(shapePerCTA[2] / mmaLayout.getWarpsPerCTA()[2], 8);
+ 
diff --git a/third_party/xla/third_party/triton/cl619443019.patch b/third_party/xla/third_party/triton/cl619443019.patch
new file mode 100644
index 00000000000000..95ce54b6e4d6aa
--- /dev/null
+++ b/third_party/xla/third_party/triton/cl619443019.patch
@@ -0,0 +1,76 @@
+==== triton/BUILD#44 - /google/src/cloud/csigg/mlir_transform_utils/triton/BUILD ====
+# action=edit type=text
+--- triton/BUILD	2024-03-22 08:02:38.000000000 -0700
++++ triton/BUILD	2024-03-27 01:34:43.000000000 -0700
+@@ -620,6 +620,7 @@
+         "@llvm-project//mlir:FunctionInterfaces",
+         "@llvm-project//mlir:GPUDialect",
+         "@llvm-project//mlir:IR",
++        "@llvm-project//mlir:InliningUtils",
+         "@llvm-project//mlir:LLVMDialect",
+         "@llvm-project//mlir:MathDialect",
+         "@llvm-project//mlir:SCFDialect",
+@@ -628,6 +629,7 @@
+         # The following is added to make Utility compile
+         ":TritonTools",
+         "@llvm-project//mlir:LLVMCommonConversion",
++        "@llvm-project//mlir:TransformUtils",
+         "@llvm-project//mlir:Transforms",
+     ],
+ )
+@@ -646,6 +648,7 @@
+         "@llvm-project//mlir:IR",
+         "@llvm-project//mlir:Pass",
+         "@llvm-project//mlir:Support",
++        "@llvm-project//mlir:TransformUtils",
+         "@llvm-project//mlir:Transforms",
+     ],
+     alwayslink = True,  # TritonDialect uses getCanonicalizationPatterns().
+@@ -729,6 +732,7 @@
+         "@llvm-project//mlir:LLVMDialect",
+         "@llvm-project//mlir:NVVMDialect",
+         "@llvm-project//mlir:Pass",
++        "@llvm-project//mlir:TransformUtils",
+         "@llvm-project//mlir:Transforms",
+     ],
+ )
+@@ -780,6 +784,7 @@
+         "@llvm-project//mlir:IR",
+         "@llvm-project//mlir:IndexDialect",
+         "@llvm-project//mlir:Pass",
++        "@llvm-project//mlir:TransformUtils",
+         "@llvm-project//mlir:Transforms",
+     ],
+ )
+==== triton/test/BUILD#18 - /google/src/cloud/csigg/mlir_transform_utils/triton/test/BUILD ====
+# action=edit type=text
+--- triton/test/BUILD	2024-03-11 11:42:57.000000000 -0700
++++ triton/test/BUILD	2024-03-27 01:32:04.000000000 -0700
+@@ -53,6 +53,7 @@
+         "@llvm-project//mlir:IR",
+         "@llvm-project//mlir:Pass",
+         "@llvm-project//mlir:SCFToControlFlow",
++        "@llvm-project//mlir:TransformUtils",
+         "@llvm-project//mlir:Transforms",
+         "//:TritonAnalysis",
+         "//:TritonDialects",
+==== triton/third_party/nvidia/BUILD#3 - /google/src/cloud/csigg/mlir_transform_utils/triton/third_party/nvidia/BUILD ====
+# action=edit type=text
+--- triton/third_party/nvidia/BUILD	2024-03-11 11:42:57.000000000 -0700
++++ triton/third_party/nvidia/BUILD	2024-03-27 01:32:46.000000000 -0700
+@@ -66,6 +66,7 @@
+         "@llvm-project//mlir:NVVMDialect",
+         "@llvm-project//mlir:Pass",
+         "@llvm-project//mlir:Support",
++        "@llvm-project//mlir:TransformUtils",
+         "@llvm-project//mlir:Transforms",
+         "//:TritonDialects",
+     ],
+@@ -113,6 +114,7 @@
+         "@llvm-project//mlir:NVVMDialect",
+         "@llvm-project//mlir:Pass",
+         "@llvm-project//mlir:SCFToControlFlow",
++        "@llvm-project//mlir:TransformUtils",
+         "@llvm-project//mlir:Transforms",
+         "//:TritonAnalysis",
+         "//:TritonDialects",
diff --git a/third_party/xla/third_party/triton/workspace.bzl b/third_party/xla/third_party/triton/workspace.bzl
index 8eabda8c9eddcd..2773b250ac8554 100644
--- a/third_party/xla/third_party/triton/workspace.bzl
+++ b/third_party/xla/third_party/triton/workspace.bzl
@@ -5,8 +5,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 def repo():
     """Imports Triton."""
 
-    TRITON_COMMIT = "cl612852008"
-    TRITON_SHA256 = "f2e330075469dd230ae5c72e4a2c9765c7851f7f6de5ad10f58578c07fc1dca4"
+    TRITON_COMMIT = "cl617459344"
+    TRITON_SHA256 = "f23e65175a67b1091ab6782720b0bcb969f33c19cae8168bf93eea523dab8a3f"
     tf_http_archive(
         name = "triton",
         sha256 = TRITON_SHA256,
@@ -15,7 +15,8 @@ def repo():
         # For temporary changes which haven't landed upstream yet.
         patch_file = [
             "//third_party/triton:cl607293980.patch",  # long standing :(
-            "//third_party/triton:cl614757739.patch",
-            "//third_party/triton:cl616113650.patch",
+            "//third_party/triton:cl617812302.patch",
+            "//third_party/triton:cl619146327.patch",
+            "//third_party/triton:cl619443019.patch",
         ],
     )
diff --git a/third_party/xla/third_party/tsl/.bazelrc b/third_party/xla/third_party/tsl/.bazelrc
index 1e297772408ba6..d8990ac5c12cc5 100644
--- a/third_party/xla/third_party/tsl/.bazelrc
+++ b/third_party/xla/third_party/tsl/.bazelrc
@@ -597,8 +597,12 @@ try-import %workspace%/.bazelrc.user
 # Build TensorFlow v2.
 test:release_base --test_size_filters=small,medium
 
+# Ensure release_base is set on linux
+build:release_linux_base --config=release_base
+
 # Target the AVX instruction set
 build:release_linux_base --config=avx_linux
+
 # Enable support for all targets
 build:release_base --config=cpu_cross
 
@@ -679,12 +683,14 @@ build:unsupported_gpu_linux --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gc
 build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain
 
 build:release_cpu_macos --config=avx_linux
-test:release_cpu_macos --config=release_base
 
 # Base build configs for macOS
 build:release_macos_base --action_env  DEVELOPER_DIR=/Applications/Xcode.app/Contents/Developer
 build:release_macos_base --define=no_nccl_support=true --output_filter=^$
 
+# Ensure release_base is set on mac
+build:release_macos_base --config=release_base
+
 # Build configs for macOS x86
 build:release_macos_x86 --config=release_macos_base
 # Build with the AVX instruction set when on macOS x86
@@ -714,10 +720,12 @@ test:release_macos_x86 --config=release_macos_base
 # Test configs for macOS Arm64
 test:release_macos_arm64 --config=release_macos_base
 
+# Ensure release_base is set on windows
+build:release_cpu_windows --config=release_base
+
 # TODO(kanglan): Update windows configs after b/289091160 is fixed
 build:release_cpu_windows --config=avx_win
 build:release_cpu_windows --define=no_tensorflow_py_deps=true
-test:release_cpu_windows --config=release_base
 
 # Exclude TFRT integration for anything but Linux.
 build:android --config=no_tfrt
@@ -922,3 +930,6 @@ build:rbe_cross_compile_macos_x86 --jobs=100
 test:rbe_cross_compile_macos_x86 --jobs=100
 # END MACOS CROSS-COMPILE CONFIGS
 # END CROSS-COMPILE CONFIGS
+
+# Try to load the XLA warnings config if available
+try-import %workspace%/warnings.bazelrc
diff --git a/third_party/xla/third_party/tsl/.kokoro/generate_index_html.sh b/third_party/xla/third_party/tsl/.kokoro/generate_index_html.sh
deleted file mode 100755
index 8870bb2818e52d..00000000000000
--- a/third_party/xla/third_party/tsl/.kokoro/generate_index_html.sh
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/bin/bash
-# Copyright 2022 Google LLC All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Generates a handy index.html with a bunch of Kokoro links for GitHub
-# presubmits.
-# Usage: generate_index_html.sh /path/to/output/index.html
-
-tee "$1" <<EOF
-<html>
-<head>
-<title>#$KOKORO_GITHUB_PULL_REQUEST_NUMBER_tsl | $(basename "$KOKORO_JOB_NAME")</title>
-</head>
-<body>
-<h1>TSL Job Logs and Links</h1>
-<h2>Job Details</h2>
-<ul>
-<li>Job name: $KOKORO_JOB_NAME</li>
-<li>Job pool: $KOKORO_JOB_POOL</li>
-<li>Job ID: $KOKORO_BUILD_ID</li>
-<li>Current HEAD Piper Changelist (may be empty): cl/${KOKORO_PIPER_CHANGELIST:-not available}</li>
-<li>Pull Request Number: $KOKORO_GITHUB_PULL_REQUEST_NUMBER_tsl</li>
-<li>Pull Request Link: <a href="$KOKORO_GITHUB_PULL_REQUEST_URL_tsl">$KOKORO_GITHUB_PULL_REQUEST_URL_tsl</a></li>
-<li>Commit: $KOKORO_GIT_COMMIT_tsl</li>
-</ul>
-<h2>Googlers-Only Links</h2>
-<ul>
-<li><a href="http://sponge2/$KOKORO_BUILD_ID">Sponge2</a></li>
-<li><a href="http://sponge/target:$KOKORO_JOB_NAME">Sponge - recent jobs</a></li>
-</ul>
-<h2>Non-Googler Links</h2>
-<ul>
-<li><a href="https://source.cloud.google.com/results/invocations/$KOKORO_BUILD_ID">ResultStore</a></li>
-</ul>
-</body></html>
-EOF
diff --git a/third_party/xla/third_party/tsl/.kokoro/linux/build.sh b/third_party/xla/third_party/tsl/.kokoro/linux/build.sh
deleted file mode 100644
index f05e02bfc6cc7b..00000000000000
--- a/third_party/xla/third_party/tsl/.kokoro/linux/build.sh
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/bin/bash
-# Copyright 2022 Google LLC All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# -e: abort script if one command fails
-# -u: error if undefined variable used
-# -o pipefail: entire command fails if pipe fails. watch out for yes | ...
-# -o history: record shell history
-set -euo pipefail -o history
-
-# Generate a templated results file to make output accessible to everyone
-"$KOKORO_ARTIFACTS_DIR"/github/tsl/.kokoro/generate_index_html.sh "$KOKORO_ARTIFACTS_DIR"/index.html
-
-function is_continuous_job() {
-  [[ "$KOKORO_JOB_NAME" =~ tensorflow/tsl/.*continuous.* ]]
-}
-
-ADDITIONAL_FLAGS=""
-TAGS_FILTER="-no_oss,-oss_excluded,-oss_serial,-gpu,-requires-gpu-nvidia"
-
-if is_continuous_job ; then
-    ADDITIONAL_FLAGS="$ADDITIONAL_FLAGS --google_default_credentials"
-else
-    ADDITIONAL_FLAGS="$ADDITIONAL_FLAGS --remote_upload_local_results=false"
-fi
-
-# Pull the container (in case it was updated since the instance started) and
-# store its SHA in the Sponge log.
-docker pull "$DOCKER_IMAGE"
-echo "TF_INFO_DOCKER_IMAGE,$DOCKER_IMAGE" >> "$KOKORO_ARTIFACTS_DIR/custom_sponge_config.csv"
-echo "TF_INFO_DOCKER_SHA,$(docker pull "$DOCKER_IMAGE" | sed -n '/Digest:/s/Digest: //g p')" >> "$KOKORO_ARTIFACTS_DIR/custom_sponge_config.csv"
-
-# Start a container in the background
-docker run --name tsl -w /tf/tsl -itd --rm \
-    -v "$KOKORO_ARTIFACTS_DIR/github/tsl:/tf/tsl" \
-    "$DOCKER_IMAGE" \
-    bash
-
-# Build TSL
-docker exec tsl bazel --bazelrc=/usertools/cpu.bazelrc build \
-    --output_filter="" \
-    --keep_going \
-    --build_tag_filters=$TAGS_FILTER  \
-    --test_tag_filters=$TAGS_FILTER \
-    --remote_cache="https://storage.googleapis.com/tensorflow-devinfra-bazel-cache/tsl/linux" \
-    $ADDITIONAL_FLAGS \
-    -- //tsl/...
-
-# Test TSL
-docker exec tsl bazel --bazelrc=/usertools/cpu.bazelrc test \
-    --output_filter="" \
-    --keep_going \
-    --flaky_test_attempts=3 \
-    --test_output=errors \
-    --build_tests_only \
-    --build_tag_filters=$TAGS_FILTER  \
-    --test_tag_filters=$TAGS_FILTER \
-    --verbose_failures=true \
-    -- //tsl/...
-
-# Stop container
-docker stop tsl
diff --git a/third_party/xla/third_party/tsl/.kokoro/linux/cpu/build_cpu.cfg b/third_party/xla/third_party/tsl/.kokoro/linux/cpu/build_cpu.cfg
deleted file mode 100644
index 8e105be39e67c0..00000000000000
--- a/third_party/xla/third_party/tsl/.kokoro/linux/cpu/build_cpu.cfg
+++ /dev/null
@@ -1,5 +0,0 @@
-build_file: "tsl/.kokoro/linux/build.sh"
-env_vars: {
-  key: "DOCKER_IMAGE"
-  value: "gcr.io/tensorflow-sigs/build:latest-python3.9"
-}
\ No newline at end of file
diff --git a/third_party/xla/third_party/tsl/.kokoro/linux/cpu/common.cfg b/third_party/xla/third_party/tsl/.kokoro/linux/cpu/common.cfg
deleted file mode 100644
index e23a70ebe5127d..00000000000000
--- a/third_party/xla/third_party/tsl/.kokoro/linux/cpu/common.cfg
+++ /dev/null
@@ -1,11 +0,0 @@
-action {
-  define_artifacts {
-    # Sponge logs
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    # Full test logs to debug
-    regex: "**/*.tar.gz"
-    # Html helper for presubmits
-    regex: "**/*.html"
-  }
-}
\ No newline at end of file
diff --git a/third_party/xla/third_party/tsl/.kokoro/macos/build.sh b/third_party/xla/third_party/tsl/.kokoro/macos/build.sh
deleted file mode 100644
index 5f4a806d87be23..00000000000000
--- a/third_party/xla/third_party/tsl/.kokoro/macos/build.sh
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/bin/bash
-# Copyright 2022 Google LLC All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# -e: abort script if one command fails
-# -u: error if undefined variable used
-# -o pipefail: entire command fails if pipe fails. watch out for yes | ...
-# -o history: record shell history
-set -euo pipefail -o history
-
-cd "${KOKORO_ARTIFACTS_DIR}/github/tsl"
-
-# Install Bazelisk, Bats, Pyenv, Python, upgrade pip, and activate ".tf-venv"
-# virtual environment. We use the "PYENV_VERSION" variable here to decide which
-# Python version to install. In addition, we update $PATH with the PYENV_ROOT
-# environment variable and we set STATIC_DEPS=true for installing lxml for
-# Python. Finally, we set up a symlink to the Python packages directory in
-# ".tf-venv" which is referenced in macos.bazelrc.
-function install_build_env_tools(){
-  # Install Bazelisk; Useful as it would automatically pick the correct
-  # version of Bazel.
-  echo "===== Installing Bazelisk ====="
-  sudo wget --no-verbose -O "/usr/local/bin/bazel" \
-      "https://github.com/bazelbuild/bazelisk/releases/download/v1.11.0/bazelisk-darwin-amd64" \
-      && chmod +x "/usr/local/bin/bazel"
-
-  echo "===== Installing Pyenv ====="
-  # Install pyenv; Set up a virtual environment to control dependencies and their
-  # versions
-  git clone --branch v2.3.17 https://github.com/pyenv/pyenv.git /Users/kbuilder/.tf_pyenv
-  export PYENV_ROOT=/Users/kbuilder/.tf_pyenv
-  export PATH="$PYENV_ROOT/bin:$PATH"    # if `pyenv` is not already on PATH
-  eval "$(pyenv init --path)"
-  eval "$(pyenv init -)"
-
-  echo "===== Installing Python ====="
-  # Install Python and set the local python version
-  pyenv install -s "${TF_PYENV_VERSION}"
-  pyenv rehash
-  pyenv local "${TF_PYENV_VERSION}"
-  # Do a sanity check to make sure that we using the correct Python version
-  echo "===== Python version ====="
-  python --version
-  # Set up virtual environment and activate it
-  python -m venv /Users/kbuilder/.tf-venv && source /Users/kbuilder/.tf-venv/bin/activate
-
-  # Setup links to Python. Referenced in ./macos.bazelrc
-  ln -s /Users/kbuilder/.tf-venv/lib/python* /Users/kbuilder/.tf-venv/lib/python
-
-  echo "===== Upgrading to latest pip ====="
-  python -m pip install --upgrade pip
-}
-
-install_build_env_tools
-
-python -m pip install numpy==1.21.4
-
-# Generate a templated results file to make output accessible to everyone
-"$KOKORO_ARTIFACTS_DIR"/github/tsl/.kokoro/generate_index_html.sh "$KOKORO_ARTIFACTS_DIR"/index.html
-
-function is_continuous_job() {
-  [[ "$KOKORO_JOB_NAME" =~ tensorflow/tsl/.*continuous.* ]]
-}
-
-# Set authentication for reading and writing cache from Google Cloud Storage
-export GOOGLE_APPLICATION_CREDENTIALS="$KOKORO_KEYSTORE_DIR/73361_tensorflow_bazel_cache_writer"
-
-TAGS_FILTER="-no_oss,-oss_excluded,-gpu,-no_mac,-nomac,-mac_excluded"
-ADDITIONAL_FLAGS=""
-
-if is_continuous_job ; then
-    ADDITIONAL_FLAGS="$ADDITIONAL_FLAGS --google_default_credentials"
-else
-    ADDITIONAL_FLAGS="$ADDITIONAL_FLAGS --remote_upload_local_results=false"
-fi
-
-# Build TSL
-bazel build \
-    --output_filter="" \
-    --macos_minimum_os=10.15 \
-    --build_tag_filters=$TAGS_FILTER  \
-    --test_tag_filters=$TAGS_FILTER \
-    --keep_going \
-    --remote_cache="https://storage.googleapis.com/tensorflow-devinfra-bazel-cache/tsl/macos" \
-    $ADDITIONAL_FLAGS \
-    -- //tsl/...
-
-# Test TSL
-bazel test \
-    --output_filter="" \
-    --macos_minimum_os=10.15 \
-    --test_tag_filters=-no_mac,-nomac,-mac_excluded \
-    --keep_going \
-    --test_output=errors \
-    --build_tests_only \
-    --build_tag_filters=$TAGS_FILTER  \
-    --test_tag_filters=$TAGS_FILTER \
-    --verbose_failures=true \
-    --flaky_test_attempts=3 \
-    -- //tsl/...
diff --git a/third_party/xla/third_party/tsl/.kokoro/macos/cpu/common.cfg b/third_party/xla/third_party/tsl/.kokoro/macos/cpu/common.cfg
deleted file mode 100644
index b12f5c22a4f835..00000000000000
--- a/third_party/xla/third_party/tsl/.kokoro/macos/cpu/common.cfg
+++ /dev/null
@@ -1,25 +0,0 @@
-# Not sure how long the timeout should be
-timeout_mins: 720
-
-action {
-  define_artifacts {
-    # Sponge logs
-    regex: "**/sponge_log.xml"
-    regex: "**/sponge_log.log"
-    # Full test logs to debug the log squasher, and libtf.tar.gz
-    regex: "**/*.tar.gz"
-    # Html helper for presubmits
-    regex: "**/*.html"
-  }
-}
-
-before_action {
-  fetch_keystore {
-  # Authentication for reading and writing cache to/from Google Cloud Storage
-    keystore_resource {
-      keystore_config_id: 73361
-      keyname: "tensorflow_bazel_cache_writer"
-      backend: "blade:keystore-fastconfigpush"  # disable-keystore-reliability-check
-    }
-  }
-}
\ No newline at end of file
diff --git a/third_party/xla/third_party/tsl/.kokoro/macos/cpu/cpu_py39_full.cfg b/third_party/xla/third_party/tsl/.kokoro/macos/cpu/cpu_py39_full.cfg
deleted file mode 100644
index 885373fc8c1d0b..00000000000000
--- a/third_party/xla/third_party/tsl/.kokoro/macos/cpu/cpu_py39_full.cfg
+++ /dev/null
@@ -1,5 +0,0 @@
-build_file: "tsl/.kokoro/macos/build.sh"
-env_vars: {
-   key: "TF_PYENV_VERSION"
-   value: "3.9.16"
-}
diff --git a/third_party/xla/third_party/tsl/.kokoro/windows/build.bat b/third_party/xla/third_party/tsl/.kokoro/windows/build.bat
deleted file mode 100644
index 4fb8ddf28ed4b4..00000000000000
--- a/third_party/xla/third_party/tsl/.kokoro/windows/build.bat
+++ /dev/null
@@ -1,20 +0,0 @@
-@REM Copyright 2023 Google LLC
-
-@REM Licensed under the Apache License, Version 2.0 (the "License");
-@REM you may not use this file except in compliance with the License.
-@REM You may obtain a copy of the License at
-
-@REM     https://www.apache.org/licenses/LICENSE-2.0
-
-@REM Unless required by applicable law or agreed to in writing, software
-@REM distributed under the License is distributed on an "AS IS" BASIS,
-@REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@REM See the License for the specific language governing permissions and
-@REM limitations under the License.
-
-SET TMPDIR=T:/tmp
-SET TMP=%TMPDIR%
-SET TEMP=%TMPDIR%
-
-bash -l %0/../windows_build.sh %*
-exit /b %ERRORLEVEL%
diff --git a/third_party/xla/third_party/tsl/.kokoro/windows/cpu/build_cpu_py39.cfg b/third_party/xla/third_party/tsl/.kokoro/windows/cpu/build_cpu_py39.cfg
deleted file mode 100644
index 3a935b23ed8280..00000000000000
--- a/third_party/xla/third_party/tsl/.kokoro/windows/cpu/build_cpu_py39.cfg
+++ /dev/null
@@ -1 +0,0 @@
-build_file: "tsl/.kokoro/windows/build.bat"
diff --git a/third_party/xla/third_party/tsl/.kokoro/windows/cpu/common.cfg b/third_party/xla/third_party/tsl/.kokoro/windows/cpu/common.cfg
deleted file mode 100644
index 8f936b4071a993..00000000000000
--- a/third_party/xla/third_party/tsl/.kokoro/windows/cpu/common.cfg
+++ /dev/null
@@ -1,12 +0,0 @@
-# timeout_mins: 6000
-action {
-  define_artifacts {
-    regex: "**/sponge_log.xml"
-    # regex: "**/.tf_configure.bazelrc"
-    regex: "**/lib_package/*"
-    # regex: "**/java.log"
-    # regex: "**/win_minidumps/*.dmp"
-    # Html helper for presubmits
-    regex: "**/*.html"
-  }
-}
diff --git a/third_party/xla/third_party/tsl/.kokoro/windows/windows_build.sh b/third_party/xla/third_party/tsl/.kokoro/windows/windows_build.sh
deleted file mode 100644
index 4f4b0a0fdf9d31..00000000000000
--- a/third_party/xla/third_party/tsl/.kokoro/windows/windows_build.sh
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/bin/bash
-# Copyright 2022 Google LLC All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# -e: abort script if one command fails
-# -u: error if undefined variable used
-# -o pipefail: entire command fails if pipe fails. watch out for yes | ...
-# Note: set -x <code> +x around anything you want to have logged.
-set -euo pipefail
-
-cd "${KOKORO_ARTIFACTS_DIR}/github/tsl"
-
-# Generate a templated results file to make output accessible to everyone
-"$KOKORO_ARTIFACTS_DIR"/github/tsl/.kokoro/generate_index_html.sh "$KOKORO_ARTIFACTS_DIR"/index.html
-
-function is_continuous_job() {
-  [[ "$KOKORO_JOB_NAME" =~ tensorflow/tsl/.*continuous.* ]]
-}
-
-ADDITIONAL_FLAGS=""
-TAGS_FILTER="-no_oss,-oss_excluded,-gpu,-no_windows,-windows_excluded"
-
-if is_continuous_job ; then
-    ADDITIONAL_FLAGS="$ADDITIONAL_FLAGS --google_default_credentials"
-else
-    ADDITIONAL_FLAGS="$ADDITIONAL_FLAGS --remote_upload_local_results=false"
-fi
-
-export PATH="$PATH:/c/Python38"
-
-# Build TSL
-/c/tools/bazel.exe build \
-  --output_filter="" \
-  --keep_going \
-  --build_tag_filters=$TAGS_FILTER  \
-  --test_tag_filters=$TAGS_FILTER \
-  --remote_cache="https://storage.googleapis.com/tensorflow-devinfra-bazel-cache/tsl/windows" \
-    $ADDITIONAL_FLAGS \
-  -- //tsl/... \
-  || { echo "Bazel Build Failed" && exit 1; }
-
-# Test TSL
-/c/tools/bazel.exe test \
-  --output_filter="" \
-  --flaky_test_attempts=3 \
-  --test_output=errors \
-  --build_tests_only \
-  --verbose_failures=true \
-  --build_tag_filters=$TAGS_FILTER  \
-  --test_tag_filters=$TAGS_FILTER \
-  --keep_going \
-  -- //tsl/... \
-  || { echo "Bazel Test Failed" && exit 1; }
-
-exit 0
diff --git a/third_party/xla/third_party/tsl/README.md b/third_party/xla/third_party/tsl/README.md
index ff61283c5e5351..6f4ab2025257aa 100644
--- a/third_party/xla/third_party/tsl/README.md
+++ b/third_party/xla/third_party/tsl/README.md
@@ -10,16 +10,6 @@ This repo contains base utilities and cross-platform support for projects like
 > [upstream location](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tsl)
 > to make any contributions or report any issues.
 
-## Contacts
-
-Discord TBA
-
-Community proposals TBA
-
-Community meetings TBA
-
-Additional contacts TBA
-
 ## Code of Conduct
 
 While under TensorFlow governance, all community spaces are subject to the
diff --git a/third_party/xla/third_party/tsl/WORKSPACE b/third_party/xla/third_party/tsl/WORKSPACE
index cab76d7b38432a..45fe4fccfcc658 100644
--- a/third_party/xla/third_party/tsl/WORKSPACE
+++ b/third_party/xla/third_party/tsl/WORKSPACE
@@ -1,3 +1,4 @@
+# buildifier: disable=load-on-top
 workspace(name = "tsl")
 
 # Initialize the TSL repository and all dependencies.
diff --git a/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl b/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl
index 89ea8f54a3495e..fefbf081c87e1c 100644
--- a/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl
+++ b/third_party/xla/third_party/tsl/third_party/gpus/cuda_configure.bzl
@@ -26,7 +26,6 @@
   * `PYTHON_BIN_PATH`: The python binary path
 """
 
-load("//third_party/clang_toolchain:download_clang.bzl", "download_clang")
 load(
     "@bazel_tools//tools/cpp:lib_cc_configure.bzl",
     "escape_string",
@@ -38,6 +37,7 @@ load(
     "find_vc_path",
     "setup_vc_env_vars",
 )
+load("//third_party/clang_toolchain:download_clang.bzl", "download_clang")
 load(
     "//third_party/remote_config:common.bzl",
     "config_repo_label",
diff --git a/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl b/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl
index c96ecf4d62eb64..b8b1aa92a5b037 100644
--- a/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl
+++ b/third_party/xla/third_party/tsl/third_party/gpus/rocm_configure.bzl
@@ -8,13 +8,6 @@
   * `TF_ROCM_AMDGPU_TARGETS`: The AMDGPU targets.
 """
 
-load(
-    ":cuda_configure.bzl",
-    "enable_cuda",
-    "make_copy_dir_rule",
-    "make_copy_files_rule",
-    "to_list_of_strings",
-)
 load(
     "//third_party/remote_config:common.bzl",
     "config_repo_label",
@@ -29,6 +22,13 @@ load(
     "realpath",
     "which",
 )
+load(
+    ":cuda_configure.bzl",
+    "enable_cuda",
+    "make_copy_dir_rule",
+    "make_copy_files_rule",
+    "to_list_of_strings",
+)
 
 _GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH"
 _GCC_HOST_COMPILER_PREFIX = "GCC_HOST_COMPILER_PREFIX"
diff --git a/third_party/xla/third_party/tsl/third_party/hwloc/hwloc.BUILD b/third_party/xla/third_party/tsl/third_party/hwloc/hwloc.BUILD
index e90235dd9fcc52..457cef024907ed 100644
--- a/third_party/xla/third_party/tsl/third_party/hwloc/hwloc.BUILD
+++ b/third_party/xla/third_party/tsl/third_party/hwloc/hwloc.BUILD
@@ -1,5 +1,8 @@
 # hwloc: Portable Hardware Locality Library
 
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+
 package(
     default_visibility = ["//visibility:public"],
 )
@@ -8,9 +11,6 @@ licenses(["notice"])
 
 exports_files(["COPYING"])
 
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
-load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
-
 COMMON_INCLUDE_COPTS = [
     "-I.",
     "-Ihwloc",
diff --git a/third_party/xla/third_party/tsl/third_party/implib_so/workspace.bzl b/third_party/xla/third_party/tsl/third_party/implib_so/workspace.bzl
index 01dad3b169f402..37f36cc135fd6d 100644
--- a/third_party/xla/third_party/tsl/third_party/implib_so/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/implib_so/workspace.bzl
@@ -6,8 +6,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 def repo():
     tf_http_archive(
         name = "implib_so",
-        strip_prefix = "Implib.so-5fb84c2a750434b9df1da67d67b749eb929598f1",
-        sha256 = "10de0a616df24849f2a883747784c115f209708960e44556f5ce384de6f103e8",
-        urls = tf_mirror_urls("https://github.com/yugr/Implib.so/archive/5fb84c2a750434b9df1da67d67b749eb929598f1.tar.gz"),
+        strip_prefix = "Implib.so-2cce6cab8ff2c15f9da858ea0b68646a8d62aef2",
+        sha256 = "4ef3089969d57a5b60bb41b8212c478eaa15c56941f86d4bf5e7f98a3afd24e8",
+        urls = tf_mirror_urls("https://github.com/yugr/Implib.so/archive/2cce6cab8ff2c15f9da858ea0b68646a8d62aef2.tar.gz"),
         build_file = "//third_party/implib_so:implib_so.BUILD",
     )
diff --git a/third_party/xla/third_party/tsl/third_party/llvm_openmp/BUILD b/third_party/xla/third_party/tsl/third_party/llvm_openmp/BUILD
index 34ad101bd35036..e60c91bbe376b6 100644
--- a/third_party/xla/third_party/tsl/third_party/llvm_openmp/BUILD
+++ b/third_party/xla/third_party/tsl/third_party/llvm_openmp/BUILD
@@ -1,11 +1,6 @@
 # Build file for OpenMP library that is part of llvm
 
-load(
-    "@local_tsl//tsl:tsl.bzl",
-    "if_linux_x86_64",
-    "if_macos",
-    "if_windows",
-)
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 load(
     "@local_tsl//third_party/llvm_openmp:cmake_vars.bzl",
     "cmake_var_string",
@@ -16,7 +11,12 @@ load(
     "dict_add",
     "libiomp5_cc_binary",
 )
-load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
+load(
+    "@local_tsl//tsl:tsl.bzl",
+    "if_linux_x86_64",
+    "if_macos",
+    "if_windows",
+)
 
 package(
     default_visibility = [
diff --git a/third_party/xla/third_party/tsl/third_party/mkl_dnn/mkldnn_acl.BUILD b/third_party/xla/third_party/tsl/third_party/mkl_dnn/mkldnn_acl.BUILD
index 1c4e4c86fa1134..d67b62a98d2660 100644
--- a/third_party/xla/third_party/tsl/third_party/mkl_dnn/mkldnn_acl.BUILD
+++ b/third_party/xla/third_party/tsl/third_party/mkl_dnn/mkldnn_acl.BUILD
@@ -1,7 +1,7 @@
-exports_files(["LICENSE"])
-
 load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
 
+exports_files(["LICENSE"])
+
 _DNNL_COPTS_THREADPOOL = [
     "-fopenmp-simd",
     "-fexceptions",
diff --git a/third_party/xla/third_party/tsl/third_party/mkl_dnn/mkldnn_v1.BUILD b/third_party/xla/third_party/tsl/third_party/mkl_dnn/mkldnn_v1.BUILD
index abf24705dd3f48..021557f4c2bc76 100644
--- a/third_party/xla/third_party/tsl/third_party/mkl_dnn/mkldnn_v1.BUILD
+++ b/third_party/xla/third_party/tsl/third_party/mkl_dnn/mkldnn_v1.BUILD
@@ -1,7 +1,7 @@
-load("@local_tsl//tsl:tsl.bzl", "tf_openmp_copts")
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
 load("@local_tsl//third_party/mkl_dnn:build_defs.bzl", "if_mkldnn_openmp")
+load("@local_tsl//tsl:tsl.bzl", "tf_openmp_copts")
 load("@local_xla//xla/tsl/mkl:build_defs.bzl", "if_mkl", "if_mkl_ml")
-load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
 
 exports_files(["LICENSE"])
 
diff --git a/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD b/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD
index 5d040b95dcd4b8..72f91a68474f97 100644
--- a/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD
+++ b/third_party/xla/third_party/tsl/third_party/nccl/archive.BUILD
@@ -1,10 +1,6 @@
 # NVIDIA NCCL 2
 # A package of optimized primitives for collective multi-GPU communication.
 
-licenses(["notice"])
-
-exports_files(["LICENSE.txt"])
-
 load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
 load("@bazel_skylib//rules:write_file.bzl", "write_file")
 load(
@@ -20,6 +16,10 @@ load(
     "GENERATED_SOURCES",
 )
 
+licenses(["notice"])
+
+exports_files(["LICENSE.txt"])
+
 NCCL_MAJOR = 2
 
 NCCL_MINOR = 19
diff --git a/third_party/xla/third_party/tsl/third_party/systemlibs/protobuf.BUILD b/third_party/xla/third_party/tsl/third_party/systemlibs/protobuf.BUILD
index 4d05ab28d12e99..c7d940605f9f72 100644
--- a/third_party/xla/third_party/tsl/third_party/systemlibs/protobuf.BUILD
+++ b/third_party/xla/third_party/tsl/third_party/systemlibs/protobuf.BUILD
@@ -1,10 +1,10 @@
-load("@rules_proto//proto:defs.bzl", "proto_library")
 load(
     "@com_google_protobuf//:protobuf.bzl",
     "cc_proto_library",
     "proto_gen",
     "py_proto_library",
 )
+load("@rules_proto//proto:defs.bzl", "proto_library")
 
 licenses(["notice"])
 
diff --git a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
index 0e4affe9180311..2b789c8cead317 100644
--- a/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
+++ b/third_party/xla/third_party/tsl/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "9c80654b96f0fcf47cb636500f47edbe1fbc555e"
-    TFRT_SHA256 = "491e224c2d8d4be312ed0de65d6ab5a6912aa1d18b03439f6531f399b72ff385"
+    TFRT_COMMIT = "968eb3e5b0aa2e20301a41af9bb14a48dd1aee40"
+    TFRT_SHA256 = "cd3b1d190625d6ca5ddcf1c9cc0b095928707b623f1b986f1ba333a89d5418ae"
 
     tf_http_archive(
         name = "tf_runtime",
diff --git a/third_party/xla/third_party/tsl/tools/def_file_filter/def_file_filter_configure.bzl b/third_party/xla/third_party/tsl/tools/def_file_filter/def_file_filter_configure.bzl
index 3c7543cba86f96..a648d3d8d646ac 100644
--- a/third_party/xla/third_party/tsl/tools/def_file_filter/def_file_filter_configure.bzl
+++ b/third_party/xla/third_party/tsl/tools/def_file_filter/def_file_filter_configure.bzl
@@ -19,9 +19,8 @@ symbols through this python script.
   * `VS140COMNTOOLS`
 """
 
-load("@bazel_tools//tools/cpp:windows_cc_configure.bzl", "find_vc_path")
-load("@bazel_tools//tools/cpp:windows_cc_configure.bzl", "find_msvc_tool")
 load("@bazel_tools//tools/cpp:lib_cc_configure.bzl", "auto_configure_fail")
+load("@bazel_tools//tools/cpp:windows_cc_configure.bzl", "find_msvc_tool", "find_vc_path")
 
 def _def_file_filter_configure_impl(repository_ctx):
     if repository_ctx.os.name.lower().find("windows") == -1:
diff --git a/third_party/xla/third_party/tsl/tools/def_file_filter/symbols_pybind.txt b/third_party/xla/third_party/tsl/tools/def_file_filter/symbols_pybind.txt
index f1bb21d9100706..78c42c6f454c5c 100644
--- a/third_party/xla/third_party/tsl/tools/def_file_filter/symbols_pybind.txt
+++ b/third_party/xla/third_party/tsl/tools/def_file_filter/symbols_pybind.txt
@@ -59,7 +59,7 @@ tensorflow::tfprof::SerializeToString
 [//tensorflow/core/grappler/graph_analyzer:graph_analyzer_tool] # graph_analyze
 tensorflow::grappler::graph_analyzer::GraphAnalyzerTool
 
-[//external/local_tsl/tsl/python/lib/core:ml_dtypes_lib] # bfloat16, float8
+[//external/local_xla/xla/tsl/python/lib/core:ml_dtypes_lib] # bfloat16, float8
 tsl::ml_dtypes::RegisterTypes
 tsl::ml_dtypes::GetBfloat16Dtype
 tsl::ml_dtypes::GetFloat8E4m3b11fnuzDtype
@@ -307,7 +307,7 @@ tensorflow::AddWhileInputHack
 tensorflow::RecordMutation
 tensorflow::Graph::IsControlEdge
 
-[//external/local_tsl/tsl/python/lib/core:numpy] # tf_session
+[//external/local_xla/xla/tsl/python/lib/core:numpy] # tf_session
 tsl::ImportNumpy
 _tsl_numpy_api
 
@@ -562,7 +562,8 @@ tensorflow::Safe_PyObjectPtr
 tensorflow::quantization::QuantizeQatModel
 tensorflow::quantization::QuantizePtqModelPreCalibration
 tensorflow::quantization::QuantizePtqModelPostCalibration
-tensorflow::quantization::QuantizePtqDynamicRange
+tensorflow::quantization::QuantizeStaticRangePtq
+tensorflow::quantization::QuantizeDynamicRangePtq
 tensorflow::quantization::QuantizeWeightOnly
 
 [//tensorflow/dtensor/cc:dtensor_device_cc] # DTensor
diff --git a/third_party/xla/third_party/tsl/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl b/third_party/xla/third_party/tsl/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl
index 5f2057a3c14360..4455aea60109fa 100644
--- a/third_party/xla/third_party/tsl/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl
+++ b/third_party/xla/third_party/tsl/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl
@@ -1,8 +1,8 @@
 """Configurations of AARCH64 builds used with Docker container."""
 
-load("//tools/toolchains:cpus/aarch64/aarch64.bzl", "remote_aarch64_configure")
-load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure")
 load("//third_party/py:python_configure.bzl", "remote_python_configure")
+load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure")
+load("//tools/toolchains:cpus/aarch64/aarch64.bzl", "remote_aarch64_configure")
 
 def ml2014_tf_aarch64_configs(name_container_map, env):
     for name, container in name_container_map.items():
diff --git a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/rbe_config.bzl b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/rbe_config.bzl
index b1488584566aa6..18a84d96c39f82 100644
--- a/third_party/xla/third_party/tsl/tools/toolchains/remote_config/rbe_config.bzl
+++ b/third_party/xla/third_party/tsl/tools/toolchains/remote_config/rbe_config.bzl
@@ -1,12 +1,12 @@
 """Macro that creates external repositories for remote config."""
 
-load("//third_party/py:python_configure.bzl", "local_python_configure", "remote_python_configure")
 load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure")
-load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure")
 load("//third_party/gpus:rocm_configure.bzl", "remote_rocm_configure")
+load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure")
+load("//third_party/py:python_configure.bzl", "local_python_configure", "remote_python_configure")
+load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure")
 load("//third_party/tensorrt:tensorrt_configure.bzl", "remote_tensorrt_configure")
 load("//tools/toolchains/remote_config:containers.bzl", "containers")
-load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure")
 
 def _container_image_uri(container_name):
     container = containers[container_name]
diff --git a/third_party/xla/third_party/tsl/tools/toolchains/win/bazel_211/BUILD b/third_party/xla/third_party/tsl/tools/toolchains/win/bazel_211/BUILD
index cc23c8ecb22680..c7484d2ae2efdf 100644
--- a/third_party/xla/third_party/tsl/tools/toolchains/win/bazel_211/BUILD
+++ b/third_party/xla/third_party/tsl/tools/toolchains/win/bazel_211/BUILD
@@ -15,8 +15,8 @@
 # This becomes the BUILD file for @local_config_cc// under Windows.
 
 load("@rules_cc//cc:defs.bzl", "cc_library", "cc_toolchain", "cc_toolchain_suite")
-load(":windows_cc_toolchain_config.bzl", "cc_toolchain_config")
 load(":armeabi_cc_toolchain_config.bzl", "armeabi_cc_toolchain_config")
+load(":windows_cc_toolchain_config.bzl", "cc_toolchain_config")
 
 package(default_visibility = ["//visibility:public"])
 
diff --git a/third_party/xla/third_party/tsl/tools/toolchains/win/bazel_211/windows_cc_toolchain_config.bzl b/third_party/xla/third_party/tsl/tools/toolchains/win/bazel_211/windows_cc_toolchain_config.bzl
index 30571b6a5ace85..9ccc1706e5eca5 100644
--- a/third_party/xla/third_party/tsl/tools/toolchains/win/bazel_211/windows_cc_toolchain_config.bzl
+++ b/third_party/xla/third_party/tsl/tools/toolchains/win/bazel_211/windows_cc_toolchain_config.bzl
@@ -14,6 +14,7 @@
 
 """A Starlark cc_toolchain configuration rule for Windows"""
 
+load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")
 load(
     "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl",
     "action_config",
@@ -29,7 +30,6 @@ load(
     "variable_with_value",
     "with_feature_set",
 )
-load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")
 
 all_compile_actions = [
     ACTION_NAMES.c_compile,
diff --git a/third_party/xla/third_party/tsl/tools/toolchains/win/tf_win_05022023/BUILD b/third_party/xla/third_party/tsl/tools/toolchains/win/tf_win_05022023/BUILD
index f245f6d0789c9d..8a2ae6fe4a9dd3 100644
--- a/third_party/xla/third_party/tsl/tools/toolchains/win/tf_win_05022023/BUILD
+++ b/third_party/xla/third_party/tsl/tools/toolchains/win/tf_win_05022023/BUILD
@@ -15,8 +15,8 @@
 # This becomes the BUILD file for @local_config_cc// under Windows.
 
 load("@rules_cc//cc:defs.bzl", "cc_library", "cc_toolchain", "cc_toolchain_suite")
-load(":windows_cc_toolchain_config.bzl", "cc_toolchain_config")
 load(":armeabi_cc_toolchain_config.bzl", "armeabi_cc_toolchain_config")
+load(":windows_cc_toolchain_config.bzl", "cc_toolchain_config")
 
 package(default_visibility = ["//visibility:public"])
 
diff --git a/third_party/xla/third_party/tsl/tools/toolchains/win/tf_win_05022023/windows_cc_toolchain_config.bzl b/third_party/xla/third_party/tsl/tools/toolchains/win/tf_win_05022023/windows_cc_toolchain_config.bzl
index ba3de607d10451..d6b966b32cecad 100644
--- a/third_party/xla/third_party/tsl/tools/toolchains/win/tf_win_05022023/windows_cc_toolchain_config.bzl
+++ b/third_party/xla/third_party/tsl/tools/toolchains/win/tf_win_05022023/windows_cc_toolchain_config.bzl
@@ -14,6 +14,7 @@
 
 """A Starlark cc_toolchain configuration rule for Windows"""
 
+load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")
 load(
     "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl",
     "action_config",
@@ -28,7 +29,6 @@ load(
     "variable_with_value",
     "with_feature_set",
 )
-load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")
 
 all_compile_actions = [
     ACTION_NAMES.c_compile,
diff --git a/third_party/xla/third_party/tsl/tsl/BUILD b/third_party/xla/third_party/tsl/tsl/BUILD
index 2019dd229631a3..fbd87cb54a5a14 100644
--- a/third_party/xla/third_party/tsl/tsl/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/BUILD
@@ -1,6 +1,6 @@
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 load("@bazel_skylib//lib:selects.bzl", "selects")
 load("@bazel_skylib//rules:common_settings.bzl", "bool_flag", "bool_setting")
-load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 load("tsl.bzl", "if_google", "if_oss")
 
 # copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
@@ -528,6 +528,9 @@ bzl_library(
     srcs = ["tsl.bzl"],
     visibility = ["//visibility:public"],
     deps = [
+        "//third_party/compute_library:build_defs_bzl",
+        "//third_party/mkl_dnn:build_defs_bzl",
+        "//tsl/platform:rules_cc_bzl",
         "@bazel_skylib//lib:new_sets",
         "@local_config_cuda//cuda:build_defs_bzl",
         "@local_config_rocm//rocm:build_defs_bzl",
diff --git a/third_party/xla/third_party/tsl/tsl/concurrency/BUILD b/third_party/xla/third_party/tsl/tsl/concurrency/BUILD
index 4250b3db059897..a6294d99b40e52 100644
--- a/third_party/xla/third_party/tsl/tsl/concurrency/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/concurrency/BUILD
@@ -1,5 +1,5 @@
-load("//tsl:tsl.default.bzl", "get_compatible_with_portable")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//tsl:tsl.default.bzl", "get_compatible_with_portable")
 load("//tsl/platform:build_config.bzl", "tsl_cc_test")
 
 package(
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/BUILD b/third_party/xla/third_party/tsl/tsl/distributed_runtime/BUILD
index 479903d7841ca0..69980ac4a1d7a2 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/BUILD
@@ -2,11 +2,11 @@
 # Distributed runtime modules for machine learning, which allows coordination between multiple
 # processes for distributed operations.
 
-load("//tsl:tsl.bzl", "internal_visibility")
 load(
     "@local_tsl//tsl/platform:rules_cc.bzl",
     "cc_library",
 )
+load("//tsl:tsl.bzl", "internal_visibility")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/BUILD b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/BUILD
index afd7b3c06b78ff..a95081f56bd160 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/BUILD
@@ -1,6 +1,6 @@
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("//tsl:tsl.bzl", "if_oss", "internal_visibility", "tsl_gpu_library")
 load("//tsl/platform:build_config.bzl", "tf_proto_library", "tsl_cc_test")
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -78,13 +78,13 @@ tsl_gpu_library(
         "//tsl/platform:thread_annotations",
         "//tsl/protobuf:coordination_config_proto_cc",
         "//tsl/protobuf:coordination_service_proto_cc",
-        "//tsl/util:device_name_utils",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/time",
+        "@local_xla//xla/tsl/util:device_name_utils",
     ],
     alwayslink = 1,
 )
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service.cc b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service.cc
index 4f55d9540824b0..45dfb972a131f5 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service.cc
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/notification.h"
 #include "absl/time/time.h"
+#include "xla/tsl/util/device_name_utils.h"
 #include "tsl/distributed_runtime/call_options.h"
 #include "tsl/distributed_runtime/coordination/coordination_client.h"
 #include "tsl/distributed_runtime/coordination/coordination_service_error_util.h"
@@ -45,7 +46,6 @@ limitations under the License.
 #include "tsl/platform/thread_annotations.h"
 #include "tsl/protobuf/coordination_config.pb.h"
 #include "tsl/protobuf/coordination_service.pb.h"
-#include "tsl/util/device_name_utils.h"
 
 namespace tsl {
 namespace {
@@ -109,32 +109,33 @@ class CoordinationServiceStandaloneImpl : public CoordinationServiceInterface {
 
   void LogConnectStatusLocked() const TF_EXCLUSIVE_LOCKS_REQUIRED(state_mu_);
 
-  Status RegisterTask(const CoordinatedTask& task,
-                      uint64_t incarnation) override;
+  absl::Status RegisterTask(const CoordinatedTask& task,
+                            uint64_t incarnation) override;
   void WaitForAllTasks(const CoordinatedTask& task, const DeviceInfo& devices,
                        StatusCallback done) override;
   void ShutdownTaskAsync(const CoordinatedTask& task,
                          StatusCallback done) override;
-  Status ResetTask(const CoordinatedTask& task) override;
-  Status RecordHeartbeat(const CoordinatedTask& task,
-                         uint64_t incarnation) override;
-  Status ReportTaskError(const CoordinatedTask& task, Status error) override;
+  absl::Status ResetTask(const CoordinatedTask& task) override;
+  absl::Status RecordHeartbeat(const CoordinatedTask& task,
+                               uint64_t incarnation) override;
+  absl::Status ReportTaskError(const CoordinatedTask& task,
+                               absl::Status error) override;
   std::vector<CoordinatedTaskStateInfo> GetTaskState(
       const std::vector<CoordinatedTask>& task) override;
-  Status InsertKeyValue(const std::string& key,
-                        const std::string& value) override;
+  absl::Status InsertKeyValue(const std::string& key,
+                              const std::string& value) override;
   void GetKeyValueAsync(const std::string& key,
                         StatusOrValueCallback done) override;
   absl::StatusOr<std::string> TryGetKeyValue(const std::string& key) override;
   std::vector<KeyValueEntry> GetKeyValueDir(
       absl::string_view directory_key) override;
-  Status DeleteKeyValue(const std::string& key) override;
+  absl::Status DeleteKeyValue(const std::string& key) override;
   void BarrierAsync(const std::string& barrier_id, absl::Duration timeout,
                     const CoordinatedTask& task,
                     const std::vector<CoordinatedTask>& participating_tasks,
                     StatusCallback done) override;
-  Status CancelBarrier(const std::string& barrier_id,
-                       const CoordinatedTask& task) override;
+  absl::Status CancelBarrier(const std::string& barrier_id,
+                             const CoordinatedTask& task) override;
 
  private:
   const DeviceInfo& ListClusterDevices() override
@@ -144,22 +145,22 @@ class CoordinationServiceStandaloneImpl : public CoordinationServiceInterface {
   void Stop(bool shut_staleness_thread = true);
   // Report service error to a specified task.
   void ReportServiceErrorToTaskAsync(const CoordinatedTask& destination_task,
-                                     Status error);
+                                     absl::Status error);
   // Report error from a task to all other connected tasks if the task is not
   // recoverable.
   // Note: SetTaskError() must be called before propagating its error.
   void PropagateError(const CoordinatedTask& source_task,
                       bool is_reported_by_task = false)
       TF_LOCKS_EXCLUDED(state_mu_);
-  void SetTaskError(absl::string_view task_name, Status error)
+  void SetTaskError(absl::string_view task_name, absl::Status error)
       TF_EXCLUSIVE_LOCKS_REQUIRED(state_mu_);
   void AggregateClusterDevices() TF_EXCLUSIVE_LOCKS_REQUIRED(state_mu_);
-  Status DisconnectTask(const CoordinatedTask& task)
+  absl::Status DisconnectTask(const CoordinatedTask& task)
       TF_EXCLUSIVE_LOCKS_REQUIRED(state_mu_);
 
   struct BarrierState {
     bool passed = false;
-    Status result = errors::Unknown(
+    absl::Status result = errors::Unknown(
         "Invalid barrier result.");  // Only valid if `passed` is true.
     uint64_t deadline_in_micros = 0;
     int num_pending_tasks = 0;
@@ -169,7 +170,7 @@ class CoordinationServiceStandaloneImpl : public CoordinationServiceInterface {
         tasks_at_barrier;
     std::vector<StatusCallback> done_callbacks;
   };
-  void PassBarrier(absl::string_view barrier_id, Status result,
+  void PassBarrier(absl::string_view barrier_id, absl::Status result,
                    BarrierState* barrier)
       TF_EXCLUSIVE_LOCKS_REQUIRED(state_mu_);
   // Check if participating tasks are specified correctly across barrier calls.
@@ -193,17 +194,17 @@ class CoordinationServiceStandaloneImpl : public CoordinationServiceInterface {
     // tasks in the cluster.
 
     CoordinatedTaskState GetState() { return state_; }
-    Status GetStatus() { return status_; }
+    absl::Status GetStatus() { return status_; }
     uint64_t GetTaskIncarnation() { return task_incarnation_; }
     void SetConnected(uint64_t task_incarnation);
     void Disconnect(uint64_t grace_period_duration_us);
-    Status RecordHeartbeat(uint64_t task_incarnation);
+    absl::Status RecordHeartbeat(uint64_t task_incarnation);
     int64_t TimeSinceLastHeartbeatMs();
     // This denotes the deadline after which we stop accepting heartbeats from a
     // disconnected task. This grace period accounts for the lag time between
     // the service recording the state change and the agent stopping heartbeats.
     uint64_t GetDisconnectedGracePeriodMicros();
-    void SetError(Status status);
+    void SetError(absl::Status status);
     DeviceInfo GetDeviceInfo() { return devices_; }
     void CollectDeviceInfo(const DeviceInfo& devices) { devices_ = devices; }
     // Checks if task has called WaitForAllTasks() previously, which gathers the
@@ -219,7 +220,7 @@ class CoordinationServiceStandaloneImpl : public CoordinationServiceInterface {
     uint64_t task_incarnation_ = 0;
 
     CoordinatedTaskState state_ = CoordinatedTaskState::TASKSTATE_DISCONNECTED;
-    Status status_;
+    absl::Status status_;
     mutex last_heartbeat_mu_;
     uint64_t last_heartbeat_us_ TF_GUARDED_BY(last_heartbeat_mu_);
     // This denotes the deadline after which we stop accepting heartbeats from a
@@ -282,7 +283,7 @@ class CoordinationServiceStandaloneImpl : public CoordinationServiceInterface {
 void CoordinationServiceStandaloneImpl::TaskState::SetConnected(
     uint64_t task_incarnation) {
   state_ = CoordinatedTaskState::TASKSTATE_CONNECTED;
-  status_ = OkStatus();
+  status_ = absl::OkStatus();
   task_incarnation_ = task_incarnation;
   mutex_lock l(last_heartbeat_mu_);
   last_heartbeat_us_ = Env::Default()->NowMicros();
@@ -293,17 +294,17 @@ void CoordinationServiceStandaloneImpl::TaskState::Disconnect(
   disconnect_grace_period_us_ =
       Env::Default()->NowMicros() + grace_period_duration_us;
   state_ = CoordinatedTaskState::TASKSTATE_DISCONNECTED;
-  status_ = OkStatus();
+  status_ = absl::OkStatus();
 }
 
 void CoordinationServiceStandaloneImpl::TaskState::SetError(
-    const Status status) {
+    const absl::Status status) {
   if (state_ == CoordinatedTaskState::TASKSTATE_ERROR) return;
   state_ = CoordinatedTaskState::TASKSTATE_ERROR;
   status_ = status;
 }
 
-Status CoordinationServiceStandaloneImpl::TaskState::RecordHeartbeat(
+absl::Status CoordinationServiceStandaloneImpl::TaskState::RecordHeartbeat(
     uint64_t task_incarnation) {
   if (!status_.ok()) return status_;
   if (task_incarnation != task_incarnation_) {
@@ -313,7 +314,7 @@ Status CoordinationServiceStandaloneImpl::TaskState::RecordHeartbeat(
   }
   mutex_lock l(last_heartbeat_mu_);
   last_heartbeat_us_ = Env::Default()->NowMicros();
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 int64_t
@@ -392,7 +393,7 @@ void CoordinationServiceStandaloneImpl::StartCheckStaleness() {
             }
           }
           // Heartbeat check.
-          Status status = OkStatus();
+          absl::Status status = absl::OkStatus();
           {
             mutex_lock l(state_mu_);
             for (const auto& [task_name, task_state] : cluster_state_) {
@@ -463,7 +464,7 @@ void CoordinationServiceStandaloneImpl::StartCheckStaleness() {
                   }
                 }
               }
-              const Status error =
+              const absl::Status error =
                   MakeCoordinationError(errors::DeadlineExceeded(absl::StrCat(
                       "Barrier timed out. Barrier_id: ", barrier_id,
                       ". Timed out task names:\n", pending_tasks)));
@@ -504,7 +505,7 @@ void CoordinationServiceStandaloneImpl::Stop(bool shut_staleness_thread) {
     mutex_lock l(state_mu_);
     for (auto& [barrier_id, barrier] : barriers_) {
       if (!barrier.passed) {
-        Status error = MakeCoordinationError(errors::Aborted(absl::StrCat(
+        absl::Status error = MakeCoordinationError(errors::Aborted(absl::StrCat(
             "Barrier failed because service is shutting down. Barrier_id: ",
             barrier_id)));
         PassBarrier(barrier_id, error, &barrier);
@@ -545,11 +546,11 @@ void CoordinationServiceStandaloneImpl::LogConnectStatusLocked() const {
   }
 }
 
-Status CoordinationServiceStandaloneImpl::RegisterTask(
+absl::Status CoordinationServiceStandaloneImpl::RegisterTask(
     const CoordinatedTask& task, uint64_t incarnation) {
   const std::string& task_name = GetTaskName(task);
 
-  Status error;
+  absl::Status error;
   std::string error_message;
   {
     mutex_lock l(state_mu_);
@@ -580,7 +581,7 @@ Status CoordinationServiceStandaloneImpl::RegisterTask(
                 << " has connected to coordination service. Incarnation: "
                 << incarnation;
       LogConnectStatusLocked();
-      return OkStatus();
+      return absl::OkStatus();
     } else if (task_state == CoordinatedTaskState::TASKSTATE_CONNECTED) {
       // This may happen if the service processes the initial RegisterTask(),
       // but the agent did not receive the response so the agent retries again.
@@ -593,7 +594,7 @@ Status CoordinationServiceStandaloneImpl::RegisterTask(
                   << " has connected to coordination service with the same "
                   << "incarnation again: " << incarnation;
         LogConnectStatusLocked();
-        return OkStatus();
+        return absl::OkStatus();
       } else {
         error_message =
             absl::StrCat(task_name,
@@ -643,7 +644,7 @@ void CoordinationServiceStandaloneImpl::ShutdownTaskAsync(
     BarrierAsync(shutdown_barrier_id_, shutdown_barrier_timeout_, task, {},
                  done);
   } else {
-    Status status;
+    absl::Status status;
     {
       mutex_lock l(state_mu_);
       // Disconnect task from service individually.
@@ -653,13 +654,13 @@ void CoordinationServiceStandaloneImpl::ShutdownTaskAsync(
   }
 }
 
-Status CoordinationServiceStandaloneImpl::ResetTask(
+absl::Status CoordinationServiceStandaloneImpl::ResetTask(
     const CoordinatedTask& task) {
   mutex_lock l(state_mu_);
   return DisconnectTask(task);
 }
 
-Status CoordinationServiceStandaloneImpl::DisconnectTask(
+absl::Status CoordinationServiceStandaloneImpl::DisconnectTask(
     const CoordinatedTask& task) {
   const std::string task_name = GetTaskName(task);
   // Check if task is valid and not already disconnected.
@@ -677,14 +678,14 @@ Status CoordinationServiceStandaloneImpl::DisconnectTask(
       /*grace_period_duration_us=*/heartbeat_timeout_ms_ * 1000);
   for (const auto& barrier_id :
        cluster_state_[task_name]->GetOngoingBarriers()) {
-    Status error = MakeCoordinationError(errors::Internal(absl::StrCat(
+    absl::Status error = MakeCoordinationError(errors::Internal(absl::StrCat(
         "Barrier failed from a disconnected task. Barrier Id: ", barrier_id,
         ", Task: ", task_name)));
     PassBarrier(barrier_id, error, &barriers_[barrier_id]);
   }
 
   LOG(INFO) << task_name << " has disconnected from coordination service.";
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 const DeviceInfo& CoordinationServiceStandaloneImpl::ListClusterDevices() {
@@ -695,8 +696,8 @@ uint64_t CoordinationServiceStandaloneImpl::GetServiceIncarnation() {
   return service_incarnation_;
 }
 
-Status CoordinationServiceStandaloneImpl::ReportTaskError(
-    const CoordinatedTask& task, Status error) {
+absl::Status CoordinationServiceStandaloneImpl::ReportTaskError(
+    const CoordinatedTask& task, absl::Status error) {
   const std::string& task_name = GetTaskName(task);
   {
     mutex_lock l(state_mu_);
@@ -712,7 +713,7 @@ Status CoordinationServiceStandaloneImpl::ReportTaskError(
     }
   }
   PropagateError(task, /*is_reported_by_task=*/true);
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 std::vector<CoordinatedTaskStateInfo>
@@ -722,7 +723,7 @@ CoordinationServiceStandaloneImpl::GetTaskState(
   for (const auto& task : tasks) {
     const std::string task_name = GetTaskName(task);
     auto& state_info = states_info.emplace_back();
-    Status error;
+    absl::Status error;
     {
       mutex_lock l(state_mu_);
       state_info.set_state(cluster_state_[task_name]->GetState());
@@ -739,10 +740,10 @@ CoordinationServiceStandaloneImpl::GetTaskState(
   return states_info;
 }
 
-Status CoordinationServiceStandaloneImpl::RecordHeartbeat(
+absl::Status CoordinationServiceStandaloneImpl::RecordHeartbeat(
     const CoordinatedTask& task, uint64_t incarnation) {
   const std::string& task_name = GetTaskName(task);
-  Status s = OkStatus();
+  absl::Status s = absl::OkStatus();
   {
     mutex_lock l(state_mu_);
     if (!cluster_state_.contains(task_name)) {
@@ -782,7 +783,7 @@ Status CoordinationServiceStandaloneImpl::RecordHeartbeat(
 }
 
 void CoordinationServiceStandaloneImpl::ReportServiceErrorToTaskAsync(
-    const CoordinatedTask& destination_task, Status error) {
+    const CoordinatedTask& destination_task, absl::Status error) {
   assert(!error.ok());
 
   // Don't report error if there is no service-to-client connection.
@@ -805,7 +806,7 @@ void CoordinationServiceStandaloneImpl::ReportServiceErrorToTaskAsync(
   CoordinationClient* client = client_cache_->GetClient(task_name);
   client->ReportErrorToTaskAsync(
       call_opts.get(), request.get(), response.get(),
-      [request, response, task_name, call_opts](Status s) {
+      [request, response, task_name, call_opts](absl::Status s) {
         if (!s.ok()) {
           LOG(ERROR) << "Encountered another error while reporting to "
                      << task_name << ": " << s;
@@ -818,7 +819,7 @@ void CoordinationServiceStandaloneImpl::PropagateError(
   // If the error task is recoverable, do not propagate the error to other
   // connected tasks.
   if (isRecoverableJob(source_task.job_name())) return;
-  Status error;
+  absl::Status error;
   {
     mutex_lock l(state_mu_);
     error = cluster_state_[GetTaskName(source_task)]->GetStatus();
@@ -864,7 +865,8 @@ void CoordinationServiceStandaloneImpl::PropagateError(
     auto response = std::make_shared<ReportErrorToTaskResponse>();
     auto n = std::make_shared<absl::Notification>();
     client->ReportErrorToTaskAsync(
-        &call_opts, &request, response.get(), [response, n, task](Status s) {
+        &call_opts, &request, response.get(),
+        [response, n, task](absl::Status s) {
           if (!s.ok()) {
             LOG(ERROR) << "Encountered another error while reporting to "
                        << task << ": " << s;
@@ -906,7 +908,7 @@ std::string NormalizeKey(const StringPiece orig_key) {
   return norm_key;
 }
 
-Status CoordinationServiceStandaloneImpl::InsertKeyValue(
+absl::Status CoordinationServiceStandaloneImpl::InsertKeyValue(
     const std::string& key, const std::string& value) {
   VLOG(3) << "InsertKeyValue(): " << key << ": " << value;
   const std::string& norm_key = NormalizeKey(key);
@@ -923,7 +925,7 @@ Status CoordinationServiceStandaloneImpl::InsertKeyValue(
     }
     get_cb_.erase(iter);
   }
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 void CoordinationServiceStandaloneImpl::GetKeyValueAsync(
@@ -984,7 +986,7 @@ std::vector<KeyValueEntry> CoordinationServiceStandaloneImpl::GetKeyValueDir(
   return kvs_in_directory;
 }
 
-Status CoordinationServiceStandaloneImpl::DeleteKeyValue(
+absl::Status CoordinationServiceStandaloneImpl::DeleteKeyValue(
     const std::string& key) {
   VLOG(3) << "DeleteKeyValue(): " << key;
   const std::string& norm_key = NormalizeKey(key);
@@ -1003,15 +1005,15 @@ Status CoordinationServiceStandaloneImpl::DeleteKeyValue(
   if (iter != kv_store_.end()) {
     kv_store_.erase(iter);
   }
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 void CoordinationServiceStandaloneImpl::SetTaskError(
-    absl::string_view task_name, Status error) {
+    absl::string_view task_name, absl::Status error) {
   cluster_state_[task_name]->SetError(error);
   for (const auto& barrier_id :
        cluster_state_[task_name]->GetOngoingBarriers()) {
-    Status error = MakeCoordinationError(errors::Internal(absl::StrCat(
+    absl::Status error = MakeCoordinationError(errors::Internal(absl::StrCat(
         "Barrier failed from a task error. Barrier Id: ", barrier_id,
         ", Task: ", task_name)));
     PassBarrier(barrier_id, error, &barriers_[barrier_id]);
@@ -1049,7 +1051,7 @@ void CoordinationServiceStandaloneImpl::BarrierAsync(
         // barrier.
         const std::string task_name = GetTaskName(task);
         if (!cluster_state_.contains(task_name)) {
-          Status error = MakeCoordinationError(errors::InvalidArgument(
+          absl::Status error = MakeCoordinationError(errors::InvalidArgument(
               absl::StrCat("Unexpected task (", task_name,
                            ") that is not in the cluster called the barrier. "
                            "Barrier Id: ",
@@ -1068,7 +1070,7 @@ void CoordinationServiceStandaloneImpl::BarrierAsync(
       const std::string task_name = GetTaskName(pending_task.first);
       if (cluster_state_[task_name]->GetState() ==
           CoordinatedTaskState::TASKSTATE_ERROR) {
-        Status error = MakeCoordinationError(errors::Internal(
+        absl::Status error = MakeCoordinationError(errors::Internal(
             absl::StrCat("Task (", task_name,
                          ") is already in error before the barrier "
                          "was called. Barrier Id: ",
@@ -1099,7 +1101,7 @@ void CoordinationServiceStandaloneImpl::BarrierAsync(
   if (barrier->passed) {
     // Special hook for shutdown barrier to disconnect task.
     if (barrier_id == shutdown_barrier_id_) {
-      Status s = DisconnectTask(task);
+      absl::Status s = DisconnectTask(task);
       // Return any errors from the disconnect attempt, otherwise return the
       // barrier status outside of this hook.
       if (!s.ok()) {
@@ -1118,7 +1120,7 @@ void CoordinationServiceStandaloneImpl::BarrierAsync(
   // Check if caller task is participating in the barrier.
   if (!barrier->tasks_at_barrier.contains(task)) {
     // Unexpected barrier call from a task not participating in the barrier.
-    Status error = MakeCoordinationError(errors::InvalidArgument(
+    absl::Status error = MakeCoordinationError(errors::InvalidArgument(
         absl::StrCat("A non-participating task (", GetTaskName(task),
                      ") called the barrier: ", barrier_id)));
     PassBarrier(barrier_id, error, barrier);
@@ -1128,8 +1130,9 @@ void CoordinationServiceStandaloneImpl::BarrierAsync(
   // Check if task args are specified consistently across barrier calls.
   if (!ValidateTaskArgs(participating_tasks, barrier->tasks_at_barrier,
                         cluster_state_.size())) {
-    Status error = MakeCoordinationError(errors::InvalidArgument(absl::StrCat(
-        "Conflicting tasks specified for the same barrier: ", barrier_id)));
+    absl::Status error =
+        MakeCoordinationError(errors::InvalidArgument(absl::StrCat(
+            "Conflicting tasks specified for the same barrier: ", barrier_id)));
     PassBarrier(barrier_id, error, barrier);
     return;
   }
@@ -1141,13 +1144,13 @@ void CoordinationServiceStandaloneImpl::BarrierAsync(
     --barrier->num_pending_tasks;
 
     if (barrier->num_pending_tasks == 0) {
-      PassBarrier(barrier_id, OkStatus(), barrier);
+      PassBarrier(barrier_id, absl::OkStatus(), barrier);
       return;
     }
   }
 }
 
-Status CoordinationServiceStandaloneImpl::CancelBarrier(
+absl::Status CoordinationServiceStandaloneImpl::CancelBarrier(
     const std::string& barrier_id, const CoordinatedTask& task) {
   mutex_lock l(state_mu_);
   auto [it, inserted] = barriers_.try_emplace(barrier_id);
@@ -1165,17 +1168,17 @@ Status CoordinationServiceStandaloneImpl::CancelBarrier(
   }
 
   // Cancel barrier.
-  Status cancelled = MakeCoordinationError(errors::Cancelled(absl::StrCat(
+  absl::Status cancelled = MakeCoordinationError(errors::Cancelled(absl::StrCat(
       "Barrier (", barrier_id, ") is cancelled by task: ", GetTaskName(task))));
   PassBarrier(barrier_id, cancelled, barrier);
 
   VLOG(3) << "Barrier (" << barrier_id << ") is cancelled.";
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 // Mark barrier as passed.
 void CoordinationServiceStandaloneImpl::PassBarrier(
-    absl::string_view barrier_id, Status result, BarrierState* barrier) {
+    absl::string_view barrier_id, absl::Status result, BarrierState* barrier) {
   barrier->passed = true;
   barrier->result = result;
   VLOG(3) << "Barrier(" << barrier_id << ") has passed with status: " << result;
@@ -1201,14 +1204,14 @@ void CoordinationServiceStandaloneImpl::PassBarrier(
                     "crashed early or too slow / hanging. Check the logs for "
                     "an earlier error to identify the root cause.";
     }
-    Status shutdown_error = MakeCoordinationError(errors::Internal(
+    absl::Status shutdown_error = MakeCoordinationError(errors::Internal(
         absl::StrCat("Shutdown barrier has been passed with status: '",
                      barrier->result.ToString(),
                      "', but this task is not at the barrier yet.")));
     for (const auto& [task, at_barrier] : barrier->tasks_at_barrier) {
       if (at_barrier) {
         // Disconnect tasks that reached the barrier.
-        Status disconnect_status = DisconnectTask(task);
+        absl::Status disconnect_status = DisconnectTask(task);
         if (!disconnect_status.ok()) {
           LOG(ERROR) << disconnect_status;
         }
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service.h b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service.h
index 64c96ce009777c..b82261e6d30bc6 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service.h
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service.h
@@ -116,8 +116,8 @@ class CoordinationServiceInterface {
   //   - InvalidArgument: Unexpected task request.
   //   - Aborted: (1) task is in error state, or (2) task is in connected state
   //       with a different incarnation, indicating that it restarted.
-  virtual Status RegisterTask(const tensorflow::CoordinatedTask& task,
-                              uint64_t incarnation) = 0;
+  virtual absl::Status RegisterTask(const tensorflow::CoordinatedTask& task,
+                                    uint64_t incarnation) = 0;
 
   // Wait for all tasks to be up and running, and register local device
   // info. The callback is invoked when all tasks are up and registered, or some
@@ -141,16 +141,16 @@ class CoordinationServiceInterface {
   // Possible service errors:
   //   - InvalidArgument: Unexpected task request.
   //   - FailedPrecondition: task has already disconnected.
-  virtual Status ResetTask(const tensorflow::CoordinatedTask& task) = 0;
+  virtual absl::Status ResetTask(const tensorflow::CoordinatedTask& task) = 0;
 
   // Update the heartbeat timestamp of a task. This should only be invoked on
   // the leader of the cluster.
-  virtual Status RecordHeartbeat(const tensorflow::CoordinatedTask& task,
-                                 uint64_t incarnation) = 0;
+  virtual absl::Status RecordHeartbeat(const tensorflow::CoordinatedTask& task,
+                                       uint64_t incarnation) = 0;
 
   // Set a task in error state permanently.
-  virtual Status ReportTaskError(const tensorflow::CoordinatedTask& task,
-                                 Status error) = 0;
+  virtual absl::Status ReportTaskError(const tensorflow::CoordinatedTask& task,
+                                       absl::Status error) = 0;
 
   // Get the state and the error status of the tasks.
   virtual std::vector<tensorflow::CoordinatedTaskStateInfo> GetTaskState(
@@ -159,8 +159,8 @@ class CoordinationServiceInterface {
   // Insert a configuration key-value in the coordination service.
   // For now, a key-value can only be inserted once and cannot be updated.
   // The key-values are not persisted and will be lost if the leader fails.
-  virtual Status InsertKeyValue(const std::string& key,
-                                const std::string& value) = 0;
+  virtual absl::Status InsertKeyValue(const std::string& key,
+                                      const std::string& value) = 0;
 
   // Get a configuration key-value from the coordination service. The `done`
   // callback is invoked when the key-value becomes available.
@@ -181,7 +181,7 @@ class CoordinationServiceInterface {
 
   // Delete configuration key-value. If key is a directory, recursively clean
   // up all key-values under the directory.
-  virtual Status DeleteKeyValue(const std::string& key) = 0;
+  virtual absl::Status DeleteKeyValue(const std::string& key) = 0;
 
   // Blocks until all (or a subset of) tasks are at the barrier or the barrier
   // fails.
@@ -223,8 +223,9 @@ class CoordinationServiceInterface {
   // CANCELLED error status.
   // Possible service errors:
   //   - FailedPrecondition: Barrier has already been passed.
-  virtual Status CancelBarrier(const std::string& barrier_id,
-                               const tensorflow::CoordinatedTask& task) = 0;
+  virtual absl::Status CancelBarrier(
+      const std::string& barrier_id,
+      const tensorflow::CoordinatedTask& task) = 0;
 
  private:
   friend class CoordinationServiceRpcHandler;
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.cc b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.cc
index 275b8b18575d56..5f65f8e861bd92 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.cc
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.cc
@@ -72,30 +72,30 @@ class CoordinationServiceAgentImpl : public CoordinationServiceAgent {
  public:
   CoordinationServiceAgentImpl() = default;
   ~CoordinationServiceAgentImpl() override {
-    Status s = Shutdown();
+    absl::Status s = Shutdown();
     VLOG(3) << "Coordination agent dtor failed with status: " << s;
   }
-  Status Initialize(Env* env, std::string_view job_name, int task_id,
-                    const CoordinationServiceConfig& configs,
-                    std::unique_ptr<CoordinationClient> leader_client,
-                    StatusCallback error_fn) override;
-  Status Initialize(Env* env, const CoordinatedTask& task,
-                    const CoordinationServiceConfig& configs,
-                    std::unique_ptr<CoordinationClient> leader_client,
-                    StatusCallback error_fn) override;
+  absl::Status Initialize(Env* env, std::string_view job_name, int task_id,
+                          const CoordinationServiceConfig& configs,
+                          std::unique_ptr<CoordinationClient> leader_client,
+                          StatusCallback error_fn) override;
+  absl::Status Initialize(Env* env, const CoordinatedTask& task,
+                          const CoordinationServiceConfig& configs,
+                          std::unique_ptr<CoordinationClient> leader_client,
+                          StatusCallback error_fn) override;
   bool IsInitialized() override;
   bool IsConnected() override;
   bool IsError() override;
 
-  Status Connect() override;
-  Status WaitForAllTasks(const DeviceInfo& local_devices) override;
+  absl::Status Connect() override;
+  absl::Status WaitForAllTasks(const DeviceInfo& local_devices) override;
   const DeviceInfo& GetClusterDeviceInfo() override;
   absl::StatusOr<CoordinatedTask> GetOwnTask() override;
   absl::StatusOr<std::vector<CoordinatedTaskStateInfo>> GetTaskState(
       const std::vector<CoordinatedTask>& task) override;
-  Status ReportError(const Status& error) override;
-  Status Shutdown() override;
-  Status Reset() override;
+  absl::Status ReportError(const absl::Status& error) override;
+  absl::Status Shutdown() override;
+  absl::Status Reset() override;
 
   absl::StatusOr<std::string> GetKeyValue(std::string_view key) override;
   absl::StatusOr<std::string> GetKeyValue(std::string_view key,
@@ -107,31 +107,34 @@ class CoordinationServiceAgentImpl : public CoordinationServiceAgent {
       std::string_view key) override;
   void GetKeyValueDirAsync(std::string_view key,
                            StatusOrValueDirCallback done) override;
-  Status InsertKeyValue(std::string_view key, std::string_view value) override;
-  Status DeleteKeyValue(std::string_view key) override;
-  Status UpdateKeyValue(std::string_view key, std::string_view value) override;
-
-  Status StartWatchKey(std::string_view key,
-                       ChangedKeyValuesCallback on_change) override;
-  Status StopWatchKey(std::string_view key) override;
-  Status WaitAtBarrier(std::string_view barrier_id, absl::Duration timeout,
-                       const std::vector<CoordinatedTask>& tasks) override;
+  absl::Status InsertKeyValue(std::string_view key,
+                              std::string_view value) override;
+  absl::Status DeleteKeyValue(std::string_view key) override;
+  absl::Status UpdateKeyValue(std::string_view key,
+                              std::string_view value) override;
+
+  absl::Status StartWatchKey(std::string_view key,
+                             ChangedKeyValuesCallback on_change) override;
+  absl::Status StopWatchKey(std::string_view key) override;
+  absl::Status WaitAtBarrier(
+      std::string_view barrier_id, absl::Duration timeout,
+      const std::vector<CoordinatedTask>& tasks) override;
   void WaitAtBarrierAsync(std::string_view barrier_id, absl::Duration timeout,
                           const std::vector<CoordinatedTask>& tasks,
                           StatusCallback done) override;
-  Status CancelBarrier(std::string_view barrier_id) override;
+  absl::Status CancelBarrier(std::string_view barrier_id) override;
   void CancelBarrierAsync(std::string_view barrier_id,
                           StatusCallback done) override;
 
   absl::StatusOr<Env*> GetEnv() override;
 
  protected:
-  void SetError(const Status& error) override;
-  Status ActivateWatch(std::string_view key,
-                       const std::map<std::string, std::string>&) override;
+  void SetError(const absl::Status& error) override;
+  absl::Status ActivateWatch(
+      std::string_view key, const std::map<std::string, std::string>&) override;
   // Returns an error if agent is not running. If `allow_disconnected` is true,
   // returns OK even if the agent is in DISCONNECTED state.
-  Status ValidateRunningAgent(bool allow_disconnected = false);
+  absl::Status ValidateRunningAgent(bool allow_disconnected = false);
   void StopHeartbeat();
 
  private:
@@ -144,7 +147,7 @@ class CoordinationServiceAgentImpl : public CoordinationServiceAgent {
   mutable mutex state_mu_;
   CoordinatedTaskState state_ TF_GUARDED_BY(state_mu_) =
       CoordinatedTaskState::TASKSTATE_UNINITIALIZED;
-  Status status_ TF_GUARDED_BY(state_mu_) = OkStatus();
+  absl::Status status_ TF_GUARDED_BY(state_mu_) = absl::OkStatus();
   // Note: this set grows without bounds. For now, this is okay as most users
   // require < 100 barriers. If there is a use case that requires many barriers,
   // consider using a monotonic sequence number to track instead.
@@ -166,7 +169,7 @@ class CoordinationServiceAgentImpl : public CoordinationServiceAgent {
   void operator=(const CoordinationServiceAgentImpl&) = delete;
 };
 
-Status CoordinationServiceAgentImpl::Initialize(
+absl::Status CoordinationServiceAgentImpl::Initialize(
     Env* env, std::string_view job_name, int task_id,
     const CoordinationServiceConfig& configs,
     std::unique_ptr<CoordinationClient> leader_client,
@@ -177,7 +180,7 @@ Status CoordinationServiceAgentImpl::Initialize(
   return Initialize(env, task, configs, std::move(leader_client), error_fn);
 }
 
-Status CoordinationServiceAgentImpl::Initialize(
+absl::Status CoordinationServiceAgentImpl::Initialize(
     Env* env, const CoordinatedTask& task,
     const CoordinationServiceConfig& configs,
     std::unique_ptr<CoordinationClient> leader_client,
@@ -203,7 +206,7 @@ Status CoordinationServiceAgentImpl::Initialize(
   }
   error_fn_ = error_fn;
   state_ = CoordinatedTaskState::TASKSTATE_DISCONNECTED;
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 bool CoordinationServiceAgentImpl::IsInitialized() {
@@ -230,7 +233,7 @@ void CoordinationServiceAgentImpl::StopHeartbeat() {
   heartbeat_thread_.reset();
 }
 
-Status CoordinationServiceAgentImpl::Connect() {
+absl::Status CoordinationServiceAgentImpl::Connect() {
   VLOG(3) << "Agent has started trying to Connect().";
   {
     mutex_lock l(state_mu_);
@@ -239,7 +242,8 @@ Status CoordinationServiceAgentImpl::Connect() {
           "Coordination service agent is not in DISCONNECTED state."));
     }
   }
-  Status connect_status = absl::UnknownError("Connection not attempted yet.");
+  absl::Status connect_status =
+      absl::UnknownError("Connection not attempted yet.");
   RegisterTaskRequest request;
   *request.mutable_source_task() = task_;
   request.set_incarnation(incarnation_id_);
@@ -261,7 +265,7 @@ Status CoordinationServiceAgentImpl::Connect() {
     call_opts.SetTimeout(absl::ToInt64Milliseconds(deadline - absl::Now()));
     absl::Notification n;
     leader_client_->RegisterTaskAsync(
-        &call_opts, &request, &response, [&](Status s) {
+        &call_opts, &request, &response, [&](absl::Status s) {
           if (s.ok()) {
             leader_incarnation_ = response.leader_incarnation();
             {
@@ -315,13 +319,13 @@ Status CoordinationServiceAgentImpl::Connect() {
         call_opts.SetTimeout(heartbeat_interval_ms);
 
         while (true) {
-          Status status;
+          absl::Status status;
           absl::Notification n;
           // Heartbeat RPC implementation automatically retries to tolerate
           // transient network failures.
           VLOG(10) << "HeartbeatRequest: " << request.DebugString();
           leader_client_->HeartbeatAsync(&call_opts, &request, &response,
-                                         [&](Status s) {
+                                         [&](absl::Status s) {
                                            status = s;
                                            n.Notify();
                                          });
@@ -355,12 +359,12 @@ Status CoordinationServiceAgentImpl::Connect() {
           }
         }
       }));
-  return OkStatus();
+  return absl::OkStatus();
 }
 
-Status CoordinationServiceAgentImpl::WaitForAllTasks(
+absl::Status CoordinationServiceAgentImpl::WaitForAllTasks(
     const DeviceInfo& local_devices) {
-  Status agent_running_status = ValidateRunningAgent();
+  absl::Status agent_running_status = ValidateRunningAgent();
   if (!agent_running_status.ok()) {
     return agent_running_status;
   }
@@ -369,12 +373,13 @@ Status CoordinationServiceAgentImpl::WaitForAllTasks(
   *request.mutable_device_info() = local_devices;
   VLOG(3) << "WaitForAllTasksRequest: " << request.DebugString();
   WaitForAllTasksResponse response;
-  Status status;
+  absl::Status status;
   absl::Notification n;
-  leader_client_->WaitForAllTasksAsync(&request, &response, [&](Status s) {
-    status = s;
-    n.Notify();
-  });
+  leader_client_->WaitForAllTasksAsync(&request, &response,
+                                       [&](absl::Status s) {
+                                         status = s;
+                                         n.Notify();
+                                       });
   n.WaitForNotification();
   if (!status.ok()) {
     VLOG(3) << "WaitForAllTasksResponse: " << status;
@@ -383,7 +388,7 @@ Status CoordinationServiceAgentImpl::WaitForAllTasks(
   }
   VLOG(3) << "WaitForAllTasksResponse: " << response.DebugString();
   cluster_devices_ = response.device_info();
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 const DeviceInfo& CoordinationServiceAgentImpl::GetClusterDeviceInfo() {
@@ -407,21 +412,23 @@ CoordinationServiceAgentImpl::GetTaskState(
   GetTaskStateResponse response;
   absl::Notification n;
   absl::StatusOr<std::vector<CoordinatedTaskStateInfo>> result;
-  leader_client_->GetTaskStateAsync(&request, &response, [&](const Status& s) {
-    if (s.ok()) {
-      result = std::vector<CoordinatedTaskStateInfo>(
-          std::make_move_iterator(response.task_state().begin()),
-          std::make_move_iterator(response.task_state().end()));
-    } else {
-      result = s;
-    }
-    n.Notify();
-  });
+  leader_client_->GetTaskStateAsync(
+      &request, &response, [&](const absl::Status& s) {
+        if (s.ok()) {
+          result = std::vector<CoordinatedTaskStateInfo>(
+              std::make_move_iterator(response.task_state().begin()),
+              std::make_move_iterator(response.task_state().end()));
+        } else {
+          result = s;
+        }
+        n.Notify();
+      });
   n.WaitForNotification();
   return result;
 }
 
-Status CoordinationServiceAgentImpl::ReportError(const Status& error) {
+absl::Status CoordinationServiceAgentImpl::ReportError(
+    const absl::Status& error) {
   {
     mutex_lock l(state_mu_);
     if (state_ == CoordinatedTaskState::TASKSTATE_UNINITIALIZED) {
@@ -444,24 +451,26 @@ Status CoordinationServiceAgentImpl::ReportError(const Status& error) {
   ReportErrorToServiceResponse response;
 
   absl::Notification n;
-  leader_client_->ReportErrorToServiceAsync(&request, &response, [&](Status s) {
-    VLOG(5) << "ReportErrorToServiceResponse: " << s;
-    if (!s.ok()) {
-      LOG(ERROR) << "Encountered another error when reporting error to "
-                    "coordination service: "
-                 << s
-                 << "\nThis is usually caused by an earlier error during "
-                    "execution. Check the logs (this task or the leader) for "
-                    "an earlier error to debug further.";
-    }
-    n.Notify();
-  });
+  leader_client_->ReportErrorToServiceAsync(
+      &request, &response, [&](absl::Status s) {
+        VLOG(5) << "ReportErrorToServiceResponse: " << s;
+        if (!s.ok()) {
+          LOG(ERROR)
+              << "Encountered another error when reporting error to "
+                 "coordination service: "
+              << s
+              << "\nThis is usually caused by an earlier error during "
+                 "execution. Check the logs (this task or the leader) for "
+                 "an earlier error to debug further.";
+        }
+        n.Notify();
+      });
   n.WaitForNotification();
-  return OkStatus();
+  return absl::OkStatus();
 }
 
-Status CoordinationServiceAgentImpl::Shutdown() {
-  Status status = OkStatus();
+absl::Status CoordinationServiceAgentImpl::Shutdown() {
+  absl::Status status = absl::OkStatus();
   bool is_connected = false;
   {
     mutex_lock l(state_mu_);
@@ -482,7 +491,7 @@ Status CoordinationServiceAgentImpl::Shutdown() {
 
     absl::Notification n;
     leader_client_->ShutdownTaskAsync(&call_opts, &request, &response,
-                                      [&status, &n](Status s) {
+                                      [&status, &n](absl::Status s) {
                                         status = s;
                                         n.Notify();
                                       });
@@ -524,7 +533,7 @@ Status CoordinationServiceAgentImpl::Shutdown() {
   return status;
 }
 
-Status CoordinationServiceAgentImpl::Reset() {
+absl::Status CoordinationServiceAgentImpl::Reset() {
   {
     mutex_lock l(state_mu_);
     if (state_ != CoordinatedTaskState::TASKSTATE_ERROR) {
@@ -538,12 +547,13 @@ Status CoordinationServiceAgentImpl::Reset() {
   VLOG(3) << "ResetTaskRequest: " << request.DebugString();
   ResetTaskResponse response;
 
-  Status status;
+  absl::Status status;
   absl::Notification n;
-  leader_client_->ResetTaskAsync(&request, &response, [&status, &n](Status s) {
-    status = s;
-    n.Notify();
-  });
+  leader_client_->ResetTaskAsync(&request, &response,
+                                 [&status, &n](absl::Status s) {
+                                   status = s;
+                                   n.Notify();
+                                 });
   n.WaitForNotification();
   VLOG(3) << "ResetTaskResponse: " << status;
   if (!status.ok()) {
@@ -609,7 +619,7 @@ std::shared_ptr<CallOptions> CoordinationServiceAgentImpl::GetKeyValueAsync(
   leader_client_->GetKeyValueAsync(
       call_opts.get(), request.get(), response.get(),
       [call_opts, request, response, done = std::move(done),
-       &cm = cancellation_manager_, token](const Status& s) {
+       &cm = cancellation_manager_, token](const absl::Status& s) {
         // RPC call has completed (no longer needs to be cancelled if agent is
         // destroyed).
         cm.TryDeregisterCallback(token);
@@ -635,7 +645,7 @@ absl::StatusOr<std::string> CoordinationServiceAgentImpl::TryGetKeyValue(
   VLOG(3) << "TryGetKeyValueRequest: " << request.DebugString();
   TryGetKeyValueResponse response;
   leader_client_->TryGetKeyValueAsync(
-      &request, &response, [&](const Status& s) {
+      &request, &response, [&](const absl::Status& s) {
         if (s.ok()) {
           result = response.kv().value();
           VLOG(3) << "TryGetKeyValueResponse: " << result.value();
@@ -673,7 +683,7 @@ void CoordinationServiceAgentImpl::GetKeyValueDirAsync(
   auto response = std::make_shared<GetKeyValueDirResponse>();
   leader_client_->GetKeyValueDirAsync(
       request.get(), response.get(),
-      [request, response, done = std::move(done)](const Status& s) {
+      [request, response, done = std::move(done)](const absl::Status& s) {
         if (!s.ok()) {
           done(s);
           VLOG(3) << "GetKeyValueDirResponse: " << s;
@@ -687,17 +697,17 @@ void CoordinationServiceAgentImpl::GetKeyValueDirAsync(
       });
 }
 
-Status CoordinationServiceAgentImpl::InsertKeyValue(std::string_view key,
-                                                    std::string_view value) {
+absl::Status CoordinationServiceAgentImpl::InsertKeyValue(
+    std::string_view key, std::string_view value) {
   InsertKeyValueRequest request;
   request.mutable_kv()->set_key(key.data(), key.size());
   request.mutable_kv()->set_value(value.data(), value.size());
   VLOG(3) << "InsertKeyValueRequest: " << request.DebugString();
   InsertKeyValueResponse response;
 
-  Status status;
+  absl::Status status;
   absl::Notification n;
-  leader_client_->InsertKeyValueAsync(&request, &response, [&](Status s) {
+  leader_client_->InsertKeyValueAsync(&request, &response, [&](absl::Status s) {
     status = s;
     n.Notify();
   });
@@ -706,43 +716,44 @@ Status CoordinationServiceAgentImpl::InsertKeyValue(std::string_view key,
   return status;
 }
 
-Status CoordinationServiceAgentImpl::DeleteKeyValue(std::string_view key) {
+absl::Status CoordinationServiceAgentImpl::DeleteKeyValue(
+    std::string_view key) {
   DeleteKeyValueRequest request;
   request.set_key(key.data(), key.size());
   request.set_is_directory(true);
   VLOG(3) << "DeleteKeyValueRequest: " << request.DebugString();
   DeleteKeyValueResponse response;
 
-  Status status;
+  absl::Status status;
   absl::Notification n;
-  leader_client_->DeleteKeyValueAsync(&request, &response, [&](Status s) {
+  leader_client_->DeleteKeyValueAsync(&request, &response, [&](absl::Status s) {
     status = s;
     n.Notify();
   });
   n.WaitForNotification();
   VLOG(3) << "DeleteKeyValueResponse " << status;
-  return OkStatus();
+  return absl::OkStatus();
 }
 
-Status CoordinationServiceAgentImpl::UpdateKeyValue(std::string_view key,
-                                                    std::string_view value) {
+absl::Status CoordinationServiceAgentImpl::UpdateKeyValue(
+    std::string_view key, std::string_view value) {
   return MakeCoordinationError(absl::UnimplementedError(
       "CoordinationServiceAgent::UpdateKeyValue is not implemented."));
 }
 
-Status CoordinationServiceAgentImpl::StartWatchKey(
+absl::Status CoordinationServiceAgentImpl::StartWatchKey(
     std::string_view key,
     CoordinationServiceAgentImpl::ChangedKeyValuesCallback on_change) {
   return MakeCoordinationError(absl::UnimplementedError(
       "CoordinationServiceAgent::StartWatchKey is not implemented."));
 }
 
-Status CoordinationServiceAgentImpl::StopWatchKey(std::string_view key) {
+absl::Status CoordinationServiceAgentImpl::StopWatchKey(std::string_view key) {
   return MakeCoordinationError(absl::UnimplementedError(
       "CoordinationServiceAgent::StopWatchKey is not implemented."));
 }
 
-void CoordinationServiceAgentImpl::SetError(const Status& error) {
+void CoordinationServiceAgentImpl::SetError(const absl::Status& error) {
   assert(!error.ok());
   mutex_lock l(state_mu_);
   if (state_ == CoordinatedTaskState::TASKSTATE_ERROR) return;
@@ -753,18 +764,18 @@ void CoordinationServiceAgentImpl::SetError(const Status& error) {
   error_fn_(error);
 }
 
-Status CoordinationServiceAgentImpl::ActivateWatch(
+absl::Status CoordinationServiceAgentImpl::ActivateWatch(
     std::string_view key, const std::map<std::string, std::string>& kvs) {
   return MakeCoordinationError(absl::UnimplementedError(
       "CoordinationServiceAgent::ActivateWatch is not implemented."));
 }
 
-Status CoordinationServiceAgentImpl::WaitAtBarrier(
+absl::Status CoordinationServiceAgentImpl::WaitAtBarrier(
     std::string_view barrier_id, absl::Duration timeout,
     const std::vector<CoordinatedTask>& tasks) {
-  Status status;
+  absl::Status status;
   absl::Notification n;
-  WaitAtBarrierAsync(barrier_id, timeout, tasks, [&](Status s) {
+  WaitAtBarrierAsync(barrier_id, timeout, tasks, [&](absl::Status s) {
     status = s;
     n.Notify();
   });
@@ -775,7 +786,7 @@ Status CoordinationServiceAgentImpl::WaitAtBarrier(
 void CoordinationServiceAgentImpl::WaitAtBarrierAsync(
     std::string_view barrier_id, absl::Duration timeout,
     const std::vector<CoordinatedTask>& tasks, StatusCallback done) {
-  Status agent_running_status =
+  absl::Status agent_running_status =
       ValidateRunningAgent(/*allow_disconnected=*/true);
   if (!agent_running_status.ok()) {
     done(agent_running_status);
@@ -801,17 +812,17 @@ void CoordinationServiceAgentImpl::WaitAtBarrierAsync(
   VLOG(3) << "WaitAtBarrierRequest: " << request->DebugString();
   leader_client_->BarrierAsync(
       request.get(), response.get(),
-      [request, response, done = std::move(done)](const Status& s) {
+      [request, response, done = std::move(done)](const absl::Status& s) {
         done(s);
         VLOG(3) << "WaitAtBarrierResponse: " << s;
       });
 }
 
-Status CoordinationServiceAgentImpl::CancelBarrier(
+absl::Status CoordinationServiceAgentImpl::CancelBarrier(
     std::string_view barrier_id) {
-  Status status;
+  absl::Status status;
   absl::Notification n;
-  CancelBarrierAsync(barrier_id, [&](const Status& s) {
+  CancelBarrierAsync(barrier_id, [&](const absl::Status& s) {
     status = s;
     n.Notify();
   });
@@ -821,7 +832,7 @@ Status CoordinationServiceAgentImpl::CancelBarrier(
 
 void CoordinationServiceAgentImpl::CancelBarrierAsync(
     std::string_view barrier_id, StatusCallback done) {
-  Status agent_running_status =
+  absl::Status agent_running_status =
       ValidateRunningAgent(/*allow_disconnected=*/true);
   if (!agent_running_status.ok()) {
     done(agent_running_status);
@@ -834,26 +845,26 @@ void CoordinationServiceAgentImpl::CancelBarrierAsync(
   VLOG(3) << "CancelBarrierRequest: " << request->DebugString();
   leader_client_->CancelBarrierAsync(
       request.get(), response.get(),
-      [request, response, done = std::move(done)](const Status& s) {
+      [request, response, done = std::move(done)](const absl::Status& s) {
         done(s);
         VLOG(3) << "CancelBarrierResponse: " << s;
       });
 }
 
 // Returns an error if agent is not running.
-Status CoordinationServiceAgentImpl::ValidateRunningAgent(
+absl::Status CoordinationServiceAgentImpl::ValidateRunningAgent(
     bool allow_disconnected) {
   mutex_lock l(state_mu_);
   switch (state_) {
     case CoordinatedTaskState::TASKSTATE_CONNECTED:
-      return OkStatus();
+      return absl::OkStatus();
 
     case CoordinatedTaskState::TASKSTATE_UNINITIALIZED:
       return MakeCoordinationError(absl::FailedPreconditionError(
           "Agent must be in CONNECTED state. It is currently UNINITIALIZED."));
 
     case CoordinatedTaskState::TASKSTATE_DISCONNECTED:
-      if (allow_disconnected) return OkStatus();
+      if (allow_disconnected) return absl::OkStatus();
       return MakeCoordinationError(absl::FailedPreconditionError(
           "Agent must be in CONNECTED state. It is currently DISCONNECTED."));
 
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.h b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.h
index 9ba405c84872eb..6c31eccffd10f0 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.h
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent.h
@@ -75,12 +75,12 @@ class CoordinationServiceAgent {
   virtual ~CoordinationServiceAgent() = default;
 
   // Initialize coordination service agent.
-  virtual Status Initialize(
+  virtual absl::Status Initialize(
       tsl::Env* env, std::string_view job_name, int task_id,
       const tensorflow::CoordinationServiceConfig& configs,
       std::unique_ptr<CoordinationClient> leader_client,
       StatusCallback error_fn) = 0;
-  virtual Status Initialize(
+  virtual absl::Status Initialize(
       tsl::Env* env, const tensorflow::CoordinatedTask& task,
       const tensorflow::CoordinationServiceConfig& configs,
       std::unique_ptr<CoordinationClient> leader_client,
@@ -105,14 +105,14 @@ class CoordinationServiceAgent {
   //   - InvalidArgument: Unexpected task registration
   //   - Aborted: Duplicate task registration (agent will retry connecting until
   //              the configured timeout)
-  virtual Status Connect() = 0;
+  virtual absl::Status Connect() = 0;
 
   // Wait for all tasks to be up and registered. The call blocks until all tasks
   // in the cluster are up, or some error occurs.
   // Possible service errors:
   //   - FailedPrecondition: Agent is not in CONNECTED state.
   //   - InvalidArgument: Unexpected task request
-  virtual Status WaitForAllTasks(
+  virtual absl::Status WaitForAllTasks(
       const tensorflow::DeviceInfo& local_devices) = 0;
 
   // Get the device attributes of tasks from remote tasks in the cluster.
@@ -139,7 +139,7 @@ class CoordinationServiceAgent {
   // Possible service errors:
   //   - FailedPrecondition: Uninitialized/disconnected/already in error state.
   //   - InvalidArgument: Unexpected task request
-  virtual Status ReportError(const Status& error) = 0;
+  virtual absl::Status ReportError(const absl::Status& error) = 0;
 
   // Shuts down by disconnecting from the service. Should only be called if
   // agent is connected and no further agent calls (except the destructor) are
@@ -151,14 +151,14 @@ class CoordinationServiceAgent {
   //   - InvalidArgument: Unexpected task request.
   //   - FailedPrecondition: Task was in error state (note: agent is still
   //                         shut down forcefully).
-  virtual Status Shutdown() = 0;
+  virtual absl::Status Shutdown() = 0;
 
   // Disconnect from the service, and clean up the internal error status.
   // Possible service errors:
   //   - InvalidArgument: Unexpected task request.
   //   - FailedPrecondition: task is not in error state/has already
   //       disconnected.
-  virtual Status Reset() = 0;
+  virtual absl::Status Reset() = 0;
 
   // Key-value store API.
   // The agent does not need to be connected to utilize the key-value store.
@@ -193,21 +193,21 @@ class CoordinationServiceAgent {
 
   // Insert config key-value to the service.
   //   - AlreadyExists: key is already set.
-  virtual Status InsertKeyValue(std::string_view key,
-                                std::string_view value) = 0;
+  virtual absl::Status InsertKeyValue(std::string_view key,
+                                      std::string_view value) = 0;
 
   // Delete config keys in the coordination service.
-  virtual Status DeleteKeyValue(std::string_view key) = 0;
+  virtual absl::Status DeleteKeyValue(std::string_view key) = 0;
 
   // Update the value of a config key.
-  virtual Status UpdateKeyValue(std::string_view key,
-                                std::string_view value) = 0;
+  virtual absl::Status UpdateKeyValue(std::string_view key,
+                                      std::string_view value) = 0;
 
   // Register a callback that will be invoked when the key or keys under the key
   // directory are changed (inserted, deleted, or updated).
-  virtual Status StartWatchKey(std::string_view key,
-                               ChangedKeyValuesCallback on_change) = 0;
-  virtual Status StopWatchKey(std::string_view key) = 0;
+  virtual absl::Status StartWatchKey(std::string_view key,
+                                     ChangedKeyValuesCallback on_change) = 0;
+  virtual absl::Status StopWatchKey(std::string_view key) = 0;
 
   // Blocks until all (or a subset of) tasks are at the barrier or the barrier
   // fails.
@@ -241,7 +241,7 @@ class CoordinationServiceAgent {
   //       list of participating tasks.
   //   - FailedPrecondition: Agent is in UNINITIALIZED or ERROR state. Or the
   //       same barrier_id was already used previously.
-  virtual Status WaitAtBarrier(
+  virtual absl::Status WaitAtBarrier(
       std::string_view barrier_id, absl::Duration timeout,
       const std::vector<tensorflow::CoordinatedTask>& tasks) = 0;
 
@@ -255,7 +255,7 @@ class CoordinationServiceAgent {
   // CANCELLED error status.
   // Possible service errors:
   //   - FailedPrecondition: Barrier has already been passed.
-  virtual Status CancelBarrier(std::string_view barrier_id) = 0;
+  virtual absl::Status CancelBarrier(std::string_view barrier_id) = 0;
   virtual void CancelBarrierAsync(std::string_view barrier_id,
                                   StatusCallback done) = 0;
 
@@ -266,11 +266,11 @@ class CoordinationServiceAgent {
   // Set the service agent to error status and invoke the error callback.
   // Note: different from ReportError, this does not report the error status to
   // remote coordination service.
-  virtual void SetError(const Status& error) = 0;
+  virtual void SetError(const absl::Status& error) = 0;
 
   // Activate the key-value callback watch.
-  virtual Status ActivateWatch(std::string_view,
-                               const std::map<std::string, std::string>&) = 0;
+  virtual absl::Status ActivateWatch(
+      std::string_view, const std::map<std::string, std::string>&) = 0;
 
  private:
   friend class CoordinationServiceRpcHandler;
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent_test.cc b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent_test.cc
index 85de73b5bf323b..60b18a033dd343 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent_test.cc
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_agent_test.cc
@@ -158,19 +158,19 @@ class CoordinationServiceAgentTest : public ::testing::Test {
  public:
   void SetUp() override {
     ON_CALL(*client_, RegisterTaskAsync(_, _, _, _))
-        .WillByDefault(InvokeArgument<3>(OkStatus()));
+        .WillByDefault(InvokeArgument<3>(absl::OkStatus()));
     ON_CALL(*client_, HeartbeatAsync(_, _, _, _))
-        .WillByDefault(InvokeArgument<3>(OkStatus()));
+        .WillByDefault(InvokeArgument<3>(absl::OkStatus()));
     ON_CALL(*client_, ShutdownTaskAsync(_, _, _, _))
-        .WillByDefault(InvokeArgument<3>(OkStatus()));
+        .WillByDefault(InvokeArgument<3>(absl::OkStatus()));
     ON_CALL(*client_, ReportErrorToServiceAsync(_, _, _))
-        .WillByDefault(InvokeArgument<2>(OkStatus()));
+        .WillByDefault(InvokeArgument<2>(absl::OkStatus()));
     ON_CALL(*client_, ResetTaskAsync(_, _, _))
-        .WillByDefault(InvokeArgument<2>(OkStatus()));
+        .WillByDefault(InvokeArgument<2>(absl::OkStatus()));
     ON_CALL(*client_, BarrierAsync(_, _, _))
-        .WillByDefault(InvokeArgument<2>(OkStatus()));
+        .WillByDefault(InvokeArgument<2>(absl::OkStatus()));
     ON_CALL(*client_, GetTaskStateAsync(_, _, _))
-        .WillByDefault(InvokeArgument<2>(OkStatus()));
+        .WillByDefault(InvokeArgument<2>(absl::OkStatus()));
   }
 
   // Should be called after mocking service responses, before testing the agent.
@@ -179,7 +179,7 @@ class CoordinationServiceAgentTest : public ::testing::Test {
     TF_ASSERT_OK(agent_->Initialize(
         Env::Default(), /*job_name=*/"test_job",
         /*task_id=*/0, config, std::move(client_),
-        /*error_fn=*/[](Status s) {
+        /*error_fn=*/[](absl::Status s) {
           LOG(ERROR) << "Coordination agent is set to error: " << s;
         }));
   }
@@ -208,7 +208,7 @@ TEST_F(CoordinationServiceAgentTest, GetKeyValue_Simple_Success) {
   kv->set_value(test_value);
   ON_CALL(*GetClient(), GetKeyValueAsync(_, _, _, _))
       .WillByDefault(DoAll(SetArgPointee<2>(mocked_response),
-                           InvokeArgument<3>(OkStatus())));
+                           InvokeArgument<3>(absl::OkStatus())));
   // Initialize coordination agent.
   InitializeAgent();
 
@@ -228,7 +228,7 @@ TEST_F(CoordinationServiceAgentTest, GetKeyValue_WithTimeout_Success) {
   kv->set_value(test_value);
   ON_CALL(*GetClient(), GetKeyValueAsync(_, _, _, _))
       .WillByDefault(DoAll(SetArgPointee<2>(mocked_response),
-                           InvokeArgument<3>(OkStatus())));
+                           InvokeArgument<3>(absl::OkStatus())));
   // Initialize coordination agent.
   InitializeAgent();
 
@@ -281,7 +281,7 @@ TEST_F(CoordinationServiceAgentTest,
   auto kv = owned_response->mutable_kv();
   kv->set_key(test_key);
   kv->set_value(test_value);
-  owned_done(OkStatus());
+  owned_done(absl::OkStatus());
   // No explicit test, but used to verify there is no stack-use-after-return
   // or other memory-related errors.
 }
@@ -313,7 +313,7 @@ TEST_F(CoordinationServiceAgentTest,
                   auto kv = owned_response->mutable_kv();
                   kv->set_key(test_key);
                   kv->set_value(test_value);
-                  owned_done(OkStatus());
+                  owned_done(absl::OkStatus());
                 }));
           }));
   InitializeAgent();
@@ -336,7 +336,7 @@ TEST_F(CoordinationServiceAgentTest, CancelGetKeyValue_Success) {
           }));
   InitializeAgent();
 
-  Status status;
+  absl::Status status;
   std::shared_ptr<CallOptions> get_kv_call_opts = agent_->GetKeyValueAsync(
       test_key, [&status](const absl::StatusOr<std::string>& result) {
         status = result.status();
@@ -359,7 +359,7 @@ TEST_F(CoordinationServiceAgentTest, TryGetKeyValue_Simple_Success) {
   kv->set_value(test_value);
   ON_CALL(*GetClient(), TryGetKeyValueAsync(_, _, _))
       .WillByDefault(DoAll(SetArgPointee<1>(mocked_response),
-                           InvokeArgument<2>(OkStatus())));
+                           InvokeArgument<2>(absl::OkStatus())));
 
   // Initialize coordination agent.
   InitializeAgent();
@@ -379,7 +379,7 @@ TEST_F(CoordinationServiceAgentTest, GetKeyValueDir_Simple_Success) {
   *mocked_response.mutable_kv() = {test_values.begin(), test_values.end()};
   ON_CALL(*GetClient(), GetKeyValueDirAsync(_, _, _))
       .WillByDefault(DoAll(SetArgPointee<1>(mocked_response),
-                           InvokeArgument<2>(OkStatus())));
+                           InvokeArgument<2>(absl::OkStatus())));
   // Initialize coordination agent.
   InitializeAgent();
 
@@ -396,7 +396,7 @@ TEST_F(CoordinationServiceAgentTest, ShutdownInErrorShouldReturnError) {
   TF_ASSERT_OK(agent_->ReportError(absl::InternalError("Test Error.")));
 
   // Shutdown should return error.
-  Status s = agent_->Shutdown();
+  absl::Status s = agent_->Shutdown();
 
   EXPECT_TRUE(absl::IsFailedPrecondition(s));
 }
@@ -428,14 +428,14 @@ TEST_F(CoordinationServiceAgentTest, ResetCanBeRetried) {
   // Mock reset error failing for the first time.
   EXPECT_CALL(*GetClient(), ResetTaskAsync(_, _, _))
       .WillOnce(InvokeArgument<2>(absl::InternalError("Reset error")))
-      .WillOnce(InvokeArgument<2>(OkStatus()));
+      .WillOnce(InvokeArgument<2>(absl::OkStatus()));
   // Connect coordination agent and set it to error.
   InitializeAgent();
   TF_ASSERT_OK(agent_->Connect());
   TF_ASSERT_OK(agent_->ReportError(absl::InternalError("Test Error.")));
 
   // Reset error fails for the first time.
-  Status reset_status = agent_->Reset();
+  absl::Status reset_status = agent_->Reset();
   EXPECT_TRUE(absl::IsInternal(reset_status));
 
   // Agent should be able to attempt resetting again.
@@ -497,7 +497,7 @@ TEST_F(CoordinationServiceAgentTest, Connect_AbortedErrorShouldBeRetried) {
           InvokeArgument<3>(absl::AbortedError("DuplicateTaskRegistration")))
       .WillOnce(
           InvokeArgument<3>(absl::AbortedError("DuplicateTaskRegistration")))
-      .WillOnce(InvokeArgument<3>(OkStatus()));
+      .WillOnce(InvokeArgument<3>(absl::OkStatus()));
   InitializeAgent();
 
   TF_EXPECT_OK(agent_->Connect());
@@ -515,7 +515,7 @@ TEST_F(CoordinationServiceAgentTest, Connect_AbortedErrorShouldFailEventually) {
       absl::ToInt64Milliseconds(absl::Seconds(3)));
   InitializeAgent(config);
 
-  Status s = agent_->Connect();
+  absl::Status s = agent_->Connect();
 
   EXPECT_TRUE(absl::IsAborted(s));
 }
@@ -526,7 +526,7 @@ TEST_F(CoordinationServiceAgentTest, Connect_InternalErrorShouldBeRetried) {
           absl::InternalError("Coordination service is not enabled.")))
       .WillOnce(InvokeArgument<3>(
           absl::InternalError("Coordination service is not enabled.")))
-      .WillOnce(InvokeArgument<3>(OkStatus()));
+      .WillOnce(InvokeArgument<3>(absl::OkStatus()));
   InitializeAgent();
 
   TF_EXPECT_OK(agent_->Connect());
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_rpc_handler.cc b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_rpc_handler.cc
index 140c85df4403b0..2f2f87687d6feb 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_rpc_handler.cc
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_rpc_handler.cc
@@ -78,13 +78,13 @@ void CoordinationServiceRpcHandler::HeartbeatAsync(
   const CoordinatedTask& task = request->source_task();
   const uint64_t incarnation = request->incarnation();
   const uint64_t leader_incarnation = service_->GetServiceIncarnation();
-  Status s = service_->RecordHeartbeat(task, incarnation);
+  absl::Status s = service_->RecordHeartbeat(task, incarnation);
   if (!s.ok()) {
     done(s);
     return;
   }
   response->set_leader_incarnation(leader_incarnation);
-  done(OkStatus());
+  done(absl::OkStatus());
 }
 
 void CoordinationServiceRpcHandler::WaitForAllTasksAsync(
@@ -98,7 +98,7 @@ void CoordinationServiceRpcHandler::WaitForAllTasksAsync(
   }
   service_->WaitForAllTasks(
       request->source_task(), request->device_info(),
-      [response, service = service_, done = std::move(done)](Status s) {
+      [response, service = service_, done = std::move(done)](absl::Status s) {
         if (s.ok()) {
           *response->mutable_device_info() = service->ListClusterDevices();
         }
@@ -116,7 +116,7 @@ void CoordinationServiceRpcHandler::ShutdownTaskAsync(
     return;
   }
   service_->ShutdownTaskAsync(request->source_task(),
-                              [done](Status s) { done(s); });
+                              [done](absl::Status s) { done(s); });
 }
 
 void CoordinationServiceRpcHandler::ResetTaskAsync(
@@ -141,14 +141,15 @@ void CoordinationServiceRpcHandler::ReportErrorToTaskAsync(
     return;
   }
   const CoordinationServiceError& error_payload = request->error_payload();
-  Status error(static_cast<absl::StatusCode>(request->error_code()),
-               strings::StrCat("Error reported from /job:",
-                               error_payload.source_task().job_name(),
-                               "/task:", error_payload.source_task().task_id(),
-                               ": ", request->error_message()));
+  absl::Status error(
+      static_cast<absl::StatusCode>(request->error_code()),
+      strings::StrCat(
+          "Error reported from /job:", error_payload.source_task().job_name(),
+          "/task:", error_payload.source_task().task_id(), ": ",
+          request->error_message()));
   error = MakeCoordinationError(error, error_payload);
   agent_->SetError(error);
-  done(OkStatus());
+  done(absl::OkStatus());
 }
 
 void CoordinationServiceRpcHandler::ReportErrorToServiceAsync(
@@ -163,8 +164,8 @@ void CoordinationServiceRpcHandler::ReportErrorToServiceAsync(
   done(service_->ReportTaskError(
       request->error_origin(),
       MakeCoordinationError(
-          Status{static_cast<absl::StatusCode>(request->error_code()),
-                 request->error_message()},
+          absl::Status{static_cast<absl::StatusCode>(request->error_code()),
+                       request->error_message()},
           request->error_origin(),
           /*is_reported_error=*/true)));
 }
@@ -182,7 +183,7 @@ void CoordinationServiceRpcHandler::GetTaskStateAsync(
       {request->source_task().begin(), request->source_task().end()});
   absl::c_move(result,
                RepeatedFieldBackInserter(response->mutable_task_state()));
-  done(OkStatus());
+  done(absl::OkStatus());
 }
 
 void CoordinationServiceRpcHandler::InsertKeyValueAsync(
@@ -233,7 +234,7 @@ void CoordinationServiceRpcHandler::TryGetKeyValueAsync(
   }
   response->mutable_kv()->set_key(request->key());
   response->mutable_kv()->set_value(result.value());
-  done(OkStatus());
+  done(absl::OkStatus());
 }
 
 void CoordinationServiceRpcHandler::GetKeyValueDirAsync(
@@ -249,7 +250,7 @@ void CoordinationServiceRpcHandler::GetKeyValueDirAsync(
       service_->GetKeyValueDir(request->directory_key());
   *response->mutable_kv() = {std::make_move_iterator(results.begin()),
                              std::make_move_iterator(results.end())};
-  done(OkStatus());
+  done(absl::OkStatus());
 }
 
 void CoordinationServiceRpcHandler::DeleteKeyValueAsync(
@@ -279,7 +280,7 @@ void CoordinationServiceRpcHandler::BarrierAsync(const BarrierRequest* request,
       request->barrier_id(),
       absl::Milliseconds(request->barrier_timeout_in_ms()),
       request->source_task(), tasks,
-      [done = std::move(done)](const Status& status) { done(status); });
+      [done = std::move(done)](const absl::Status& status) { done(status); });
 }
 
 void CoordinationServiceRpcHandler::CancelBarrierAsync(
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_test.cc b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_test.cc
index eaaa05d4b9813a..b111a6235c8cbf 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_test.cc
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/coordination/coordination_service_test.cc
@@ -80,7 +80,7 @@ class TestCoordinationClient : public CoordinationClient {
  public:
   TestCoordinationClient() = default;
 
-  Status GetStatus() {
+  absl::Status GetStatus() {
     mutex_lock l(mu_);
     return status_;
   }
@@ -88,7 +88,7 @@ class TestCoordinationClient : public CoordinationClient {
   void RegisterTaskAsync(CallOptions* opts, const RegisterTaskRequest* request,
                          RegisterTaskResponse* response,
                          StatusCallback done) override {
-    done(OkStatus());
+    done(absl::OkStatus());
   }
 
   void ReportErrorToTaskAsync(CallOptions* call_opts,
@@ -96,9 +96,9 @@ class TestCoordinationClient : public CoordinationClient {
                               ReportErrorToTaskResponse* response,
                               StatusCallback done) override {
     mutex_lock l(mu_);
-    status_ = Status(static_cast<absl::StatusCode>(request->error_code()),
-                     request->error_message());
-    done(OkStatus());
+    status_ = absl::Status(static_cast<absl::StatusCode>(request->error_code()),
+                           request->error_message());
+    done(absl::OkStatus());
   }
 
 #define UNIMPLEMENTED(method)                                         \
@@ -134,7 +134,7 @@ class TestCoordinationClient : public CoordinationClient {
 
  private:
   mutex mu_;
-  Status status_ TF_GUARDED_BY(mu_);
+  absl::Status status_ TF_GUARDED_BY(mu_);
 };
 
 class TestCoordinationClientCache : public CoordinationClientCache {
@@ -183,7 +183,8 @@ class CoordinationBarrierTest : public ::testing::Test {
         Env::Default(), config, std::move(client_cache));
     // Register the tasks.
     for (int i = 0; i < num_tasks; ++i) {
-      Status s = coord_service_->RegisterTask(tasks_[i], /*incarnation=*/0);
+      absl::Status s =
+          coord_service_->RegisterTask(tasks_[i], /*incarnation=*/0);
       if (!s.ok()) {
         LOG(FATAL) << "RegisterTask() failed in CoordinationBarrierTest(): "
                    << s;
@@ -278,7 +279,7 @@ TEST_F(CoordinateTwoTasksTest, TestStandaloneService) {
 
   TF_ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   absl::Notification wait_for_all;
-  coord_service_->WaitForAllTasks(task_0_, {}, [&](Status s) {
+  coord_service_->WaitForAllTasks(task_0_, {}, [&](absl::Status s) {
     TF_ASSERT_OK(s);
     wait_for_all.Notify();
   });
@@ -286,7 +287,7 @@ TEST_F(CoordinateTwoTasksTest, TestStandaloneService) {
   ASSERT_FALSE(wait_for_all.HasBeenNotified());
   TF_ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
   coord_service_->WaitForAllTasks(task_1_, {},
-                                  [&](Status s) { TF_ASSERT_OK(s); });
+                                  [&](absl::Status s) { TF_ASSERT_OK(s); });
   // All tasks have registered.
   wait_for_all.WaitForNotification();
 
@@ -341,19 +342,19 @@ TEST(CoordinationServiceTest, TestCoordinatedJobs) {
   // Each coordinated task registers and waits for other tasks.
   absl::Notification register_chief;
   TF_ASSERT_OK(coord_service->RegisterTask(chief, /*incarnation=*/0));
-  coord_service->WaitForAllTasks(chief, {}, [&](Status s) {
+  coord_service->WaitForAllTasks(chief, {}, [&](absl::Status s) {
     TF_ASSERT_OK(s);
     register_chief.Notify();
   });
   absl::Notification register_task0;
   TF_ASSERT_OK(coord_service->RegisterTask(task_0, /*incarnation=*/0));
-  coord_service->WaitForAllTasks(task_0, {}, [&](Status s) {
+  coord_service->WaitForAllTasks(task_0, {}, [&](absl::Status s) {
     TF_ASSERT_OK(s);
     register_task0.Notify();
   });
   absl::Notification register_task1;
   TF_ASSERT_OK(coord_service->RegisterTask(task_1, /*incarnation=*/0));
-  coord_service->WaitForAllTasks(task_1, {}, [&](Status s) {
+  coord_service->WaitForAllTasks(task_1, {}, [&](absl::Status s) {
     TF_ASSERT_OK(s);
     register_task1.Notify();
   });
@@ -363,7 +364,8 @@ TEST(CoordinationServiceTest, TestCoordinatedJobs) {
   register_task1.WaitForNotification();
 
   // Registering the evaluator task is unexpected
-  Status status = coord_service->RegisterTask(evaluator, /*incarnation=*/0);
+  absl::Status status =
+      coord_service->RegisterTask(evaluator, /*incarnation=*/0);
   EXPECT_TRUE(absl::IsInvalidArgument(status)) << status;
   EXPECT_TRUE(!status.message().empty());
 }
@@ -385,7 +387,8 @@ TEST(CoordinationServiceTest, RegisterTask_AlreadyConnected_Succeeds) {
   TF_ASSERT_OK(coord_service->RegisterTask(task_0, /*incarnation=*/0));
 
   // Registration should succeed since it is the same task.
-  const Status status = coord_service->RegisterTask(task_0, /*incarnation=*/0);
+  const absl::Status status =
+      coord_service->RegisterTask(task_0, /*incarnation=*/0);
 
   TF_EXPECT_OK(status) << status;
 }
@@ -407,7 +410,8 @@ TEST(CoordinationServiceTest,
   // Registration should fail since task already registered previously with a
   // different incarnation. Note that incarnation usually changes if an agent
   // restarts.
-  const Status status = coord_service->RegisterTask(task_0, /*incarnation=*/1);
+  const absl::Status status =
+      coord_service->RegisterTask(task_0, /*incarnation=*/1);
 
   EXPECT_TRUE(absl::IsAborted(status)) << status;
   EXPECT_TRUE(!status.message().empty());
@@ -430,7 +434,8 @@ TEST(CoordinationServiceTest, RegisterTask_AlreadyInError_Fails) {
       coord_service->ReportTaskError(task_0, errors::Internal("test_error")));
 
   // Registration should fail since task already registered previously.
-  const Status status = coord_service->RegisterTask(task_0, /*incarnation=*/0);
+  const absl::Status status =
+      coord_service->RegisterTask(task_0, /*incarnation=*/0);
 
   EXPECT_TRUE(absl::IsAborted(status)) << status;
   EXPECT_TRUE(!status.message().empty());
@@ -474,7 +479,7 @@ TEST_F(CoordinateTwoTasksTest, TestTaskRestart) {
   TF_ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
 
   // Simulate task restart scenario: trying to register to cluster again.
-  Status s =
+  absl::Status s =
       coord_service_->RegisterTask(task_1_, /*incarnation=*/random::New64());
   EXPECT_TRUE(absl::IsAborted(s)) << s;
   // Aborted error is also propagated to other tasks in cluster.
@@ -664,7 +669,7 @@ TEST(CoordinationServiceTest, ListClusterDevices_TfDevice) {
   CoordinatedTask task_2;
   task_2.set_job_name("worker");
   task_2.set_task_id(2);
-  Status status = OkStatus();
+  absl::Status status = absl::OkStatus();
   auto client_cache = std::make_unique<TestCoordinationClientCache>();
   std::unique_ptr<CoordinationServiceInterface> coord_service =
       CoordinationServiceInterface::EnableCoordinationService(
@@ -686,10 +691,10 @@ TEST(CoordinationServiceTest, ListClusterDevices_TfDevice) {
   // Each task sends its device info.
   DeviceInfo cluster_devices;
   coord_service->WaitForAllTasks(task_0, local_devices_0,
-                                 [&](Status s) { TF_ASSERT_OK(s); });
+                                 [&](absl::Status s) { TF_ASSERT_OK(s); });
   coord_service->WaitForAllTasks(task_1, local_devices_1,
-                                 [&](Status s) { TF_ASSERT_OK(s); });
-  coord_service->WaitForAllTasks(task_2, local_devices_2, [&](Status s) {
+                                 [&](absl::Status s) { TF_ASSERT_OK(s); });
+  coord_service->WaitForAllTasks(task_2, local_devices_2, [&](absl::Status s) {
     TF_ASSERT_OK(s);
     // Gather the cluster device info.
     cluster_devices = coord_service->ListClusterDevices();
@@ -720,7 +725,7 @@ TEST(CoordinationServiceTest, ListClusterDevices_XlaDevice) {
   CoordinatedTask task_2;
   task_2.set_job_name("worker");
   task_2.set_task_id(2);
-  Status status = OkStatus();
+  absl::Status status = absl::OkStatus();
   auto client_cache = std::make_unique<TestCoordinationClientCache>();
   std::unique_ptr<CoordinationServiceInterface> coord_service =
       CoordinationServiceInterface::EnableCoordinationService(
@@ -761,10 +766,10 @@ TEST(CoordinationServiceTest, ListClusterDevices_XlaDevice) {
   // Make sure that cluster device order is deterministic even if devices are
   // sent out of order.
   coord_service->WaitForAllTasks(task_1, local_devices_1,
-                                 [&](Status s) { TF_ASSERT_OK(s); });
+                                 [&](absl::Status s) { TF_ASSERT_OK(s); });
   coord_service->WaitForAllTasks(task_0, local_devices_0,
-                                 [&](Status s) { TF_ASSERT_OK(s); });
-  coord_service->WaitForAllTasks(task_2, local_devices_2, [&](Status s) {
+                                 [&](absl::Status s) { TF_ASSERT_OK(s); });
+  coord_service->WaitForAllTasks(task_2, local_devices_2, [&](absl::Status s) {
     TF_ASSERT_OK(s);
     // Gather the cluster device info.
     cluster_devices = coord_service->ListClusterDevices();
@@ -798,7 +803,7 @@ TEST(CoordinationServiceTest, ListClusterDevices_DevicesAreNotAddedTwice) {
   CoordinatedTask task_1;
   task_1.set_job_name("worker");
   task_1.set_task_id(1);
-  Status status = OkStatus();
+  absl::Status status = absl::OkStatus();
   auto client_cache = std::make_unique<TestCoordinationClientCache>();
   std::unique_ptr<CoordinationServiceInterface> coord_service =
       CoordinationServiceInterface::EnableCoordinationService(
@@ -816,19 +821,20 @@ TEST(CoordinationServiceTest, ListClusterDevices_DevicesAreNotAddedTwice) {
   // Task0 sends device info.
   DeviceInfo cluster_devices;
   coord_service->WaitForAllTasks(task_0, local_devices_0,
-                                 [](Status s) { TF_ASSERT_OK(s); });
+                                 [](absl::Status s) { TF_ASSERT_OK(s); });
 
   // Task0 sends device info sgain.
   coord_service->WaitForAllTasks(task_0, local_devices_0,
-                                 [](Status s) { TF_ASSERT_OK(s); });
-  coord_service->WaitForAllTasks(
-      task_1, local_devices_1,
-      [coord_service = coord_service.get(), &cluster_devices, &n](Status s) {
-        TF_ASSERT_OK(s);
-        // Gather the cluster device info.
-        cluster_devices = coord_service->ListClusterDevices();
-        n.Notify();
-      });
+                                 [](absl::Status s) { TF_ASSERT_OK(s); });
+  coord_service->WaitForAllTasks(task_1, local_devices_1,
+                                 [coord_service = coord_service.get(),
+                                  &cluster_devices, &n](absl::Status s) {
+                                   TF_ASSERT_OK(s);
+                                   // Gather the cluster device info.
+                                   cluster_devices =
+                                       coord_service->ListClusterDevices();
+                                   n.Notify();
+                                 });
   n.WaitForNotification();
 
   // No duplicates found.
@@ -844,37 +850,37 @@ TEST(CoordinationServiceTest, ListClusterDevices_DevicesAreNotAddedTwice) {
 TEST_F(CoordinationBarrierTest, Barrier) {
   const std::string barrier_id = "barrier_id";
   absl::Duration timeout = absl::Seconds(5);
-  Status barrier_status_0;
-  Status barrier_status_1;
-  Status barrier_status_2;
+  absl::Status barrier_status_0;
+  absl::Status barrier_status_1;
+  absl::Status barrier_status_2;
   absl::Notification n_0;
   absl::Notification n_1;
   absl::Notification n_2;
 
-  GetCoordinationService()->BarrierAsync(barrier_id, timeout, GetTask(0),
-                                         /*participating_tasks=*/{},
-                                         [&barrier_status_0, &n_0](Status s) {
-                                           barrier_status_0 = s;
-                                           n_0.Notify();
-                                         });
-  GetCoordinationService()->BarrierAsync(barrier_id, timeout, GetTask(1),
-                                         /*participating_tasks=*/{},
-                                         [&barrier_status_1, &n_1](Status s) {
-                                           barrier_status_1 = s;
-                                           n_1.Notify();
-                                         });
+  GetCoordinationService()->BarrierAsync(
+      barrier_id, timeout, GetTask(0),
+      /*participating_tasks=*/{}, [&barrier_status_0, &n_0](absl::Status s) {
+        barrier_status_0 = s;
+        n_0.Notify();
+      });
+  GetCoordinationService()->BarrierAsync(
+      barrier_id, timeout, GetTask(1),
+      /*participating_tasks=*/{}, [&barrier_status_1, &n_1](absl::Status s) {
+        barrier_status_1 = s;
+        n_1.Notify();
+      });
   // Make sure barrier has not been exited prematurely.
   EXPECT_FALSE(n_0.HasBeenNotified());
   EXPECT_FALSE(n_1.HasBeenNotified());
   EXPECT_FALSE(n_2.HasBeenNotified());
 
   // Last task calls the barrier.
-  GetCoordinationService()->BarrierAsync(barrier_id, timeout, GetTask(2),
-                                         /*participating_tasks=*/{},
-                                         [&barrier_status_2, &n_2](Status s) {
-                                           barrier_status_2 = s;
-                                           n_2.Notify();
-                                         });
+  GetCoordinationService()->BarrierAsync(
+      barrier_id, timeout, GetTask(2),
+      /*participating_tasks=*/{}, [&barrier_status_2, &n_2](absl::Status s) {
+        barrier_status_2 = s;
+        n_2.Notify();
+      });
 
   EXPECT_TRUE(n_0.HasBeenNotified());
   EXPECT_TRUE(n_1.HasBeenNotified());
@@ -887,22 +893,22 @@ TEST_F(CoordinationBarrierTest, Barrier) {
 TEST_F(CoordinationBarrierTest, BarrierWithSubsetOfTasks) {
   const std::string barrier_id = "barrier_id";
   absl::Duration timeout = absl::Seconds(5);
-  Status barrier_status_0;
-  Status barrier_status_1;
+  absl::Status barrier_status_0;
+  absl::Status barrier_status_1;
   absl::Notification n_0;
   absl::Notification n_1;
 
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(0),
       /*participating_tasks=*/{GetTask(0), GetTask(1)},
-      [&barrier_status_0, &n_0](Status s) {
+      [&barrier_status_0, &n_0](absl::Status s) {
         barrier_status_0 = s;
         n_0.Notify();
       });
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(1),
       /*participating_tasks=*/{GetTask(0), GetTask(1)},
-      [&barrier_status_1, &n_1](Status s) {
+      [&barrier_status_1, &n_1](absl::Status s) {
         barrier_status_1 = s;
         n_1.Notify();
       });
@@ -917,19 +923,19 @@ TEST_F(CoordinationBarrierTest, BarrierWithSubsetOfTasks) {
 TEST_F(CoordinationBarrierTest, BarrierWithMismatchedTasks) {
   const std::string barrier_id = "barrier_id";
   absl::Duration timeout = absl::Seconds(5);
-  Status barrier_status_0;
-  Status barrier_status_1;
+  absl::Status barrier_status_0;
+  absl::Status barrier_status_1;
 
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(0),
       /*participating_tasks=*/{GetTask(0), GetTask(1)},
-      [&barrier_status_0](Status s) { barrier_status_0 = s; });
+      [&barrier_status_0](absl::Status s) { barrier_status_0 = s; });
   // task_1's barrier call specified a conflicting set of tasks (task_2 instead
   // of task_0).
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(1),
       /*participating_tasks=*/{GetTask(1), GetTask(2)},
-      [&barrier_status_1](Status s) { barrier_status_1 = s; });
+      [&barrier_status_1](absl::Status s) { barrier_status_1 = s; });
 
   EXPECT_TRUE(absl::IsInvalidArgument(barrier_status_0));
   EXPECT_TRUE(absl::IsInvalidArgument(barrier_status_1));
@@ -938,20 +944,20 @@ TEST_F(CoordinationBarrierTest, BarrierWithMismatchedTasks) {
 TEST_F(CoordinationBarrierTest, BarrierByNonParticipatingTask) {
   const std::string barrier_id = "barrier_id";
   absl::Duration timeout = absl::Seconds(5);
-  Status barrier_status_0;
-  Status barrier_status_1;
+  absl::Status barrier_status_0;
+  absl::Status barrier_status_1;
   absl::Notification n_0;
   absl::Notification n_1;
 
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(0),
       /*participating_tasks=*/{GetTask(0), GetTask(1)},
-      [&barrier_status_0](Status s) { barrier_status_0 = s; });
+      [&barrier_status_0](absl::Status s) { barrier_status_0 = s; });
   // Task 2 unexpectedly calls a barrier that it is not participating in.
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(2),
       /*participating_tasks=*/{GetTask(0), GetTask(1)},
-      [&barrier_status_1](Status s) { barrier_status_1 = s; });
+      [&barrier_status_1](absl::Status s) { barrier_status_1 = s; });
 
   // Barrier should fail for all tasks with the unexpected call.
   EXPECT_TRUE(absl::IsInvalidArgument(barrier_status_0));
@@ -961,7 +967,7 @@ TEST_F(CoordinationBarrierTest, BarrierByNonParticipatingTask) {
 TEST_F(CoordinationBarrierTest, BarrierByNonClusterTask) {
   const std::string barrier_id = "barrier_id";
   absl::Duration timeout = absl::Seconds(5);
-  Status barrier_status_0;
+  absl::Status barrier_status_0;
   absl::Notification n_0;
   CoordinatedTask unspecified_task;
   unspecified_task.set_job_name("task_from_another_cluster");
@@ -970,7 +976,7 @@ TEST_F(CoordinationBarrierTest, BarrierByNonClusterTask) {
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(0),
       /*participating_tasks=*/{GetTask(0), unspecified_task},
-      [&barrier_status_0, &n_0](Status s) {
+      [&barrier_status_0, &n_0](absl::Status s) {
         barrier_status_0 = s;
         n_0.Notify();
       });
@@ -983,15 +989,15 @@ TEST_F(CoordinationBarrierTest, BarrierByNonClusterTask) {
 TEST_F(CoordinationBarrierTest, BarrierTimeout) {
   const std::string barrier_id = "barrier_id";
   absl::Duration timeout = absl::Seconds(1);
-  Status barrier_status_0;
+  absl::Status barrier_status_0;
   absl::Notification n_0;
 
-  GetCoordinationService()->BarrierAsync(barrier_id, timeout, GetTask(0),
-                                         /*participating_tasks=*/{},
-                                         [&barrier_status_0, &n_0](Status s) {
-                                           barrier_status_0 = s;
-                                           n_0.Notify();
-                                         });
+  GetCoordinationService()->BarrierAsync(
+      barrier_id, timeout, GetTask(0),
+      /*participating_tasks=*/{}, [&barrier_status_0, &n_0](absl::Status s) {
+        barrier_status_0 = s;
+        n_0.Notify();
+      });
 
   // Block until user-specified timeout.
   n_0.WaitForNotification();
@@ -1007,16 +1013,16 @@ TEST_F(CoordinationBarrierTest, BarrierTimeout) {
 TEST_F(CoordinationBarrierTest, BarrierReturnsPreviousError) {
   const std::string barrier_id = "barrier_id";
   absl::Duration timeout = absl::Seconds(1);
-  Status barrier_status_0;
-  Status barrier_status_1;
+  absl::Status barrier_status_0;
+  absl::Status barrier_status_1;
   absl::Notification n_0;
 
-  GetCoordinationService()->BarrierAsync(barrier_id, timeout, GetTask(0),
-                                         /*participating_tasks=*/{},
-                                         [&barrier_status_0, &n_0](Status s) {
-                                           barrier_status_0 = s;
-                                           n_0.Notify();
-                                         });
+  GetCoordinationService()->BarrierAsync(
+      barrier_id, timeout, GetTask(0),
+      /*participating_tasks=*/{}, [&barrier_status_0, &n_0](absl::Status s) {
+        barrier_status_0 = s;
+        n_0.Notify();
+      });
   TF_ASSERT_OK(GetCoordinationService()->ReportTaskError(
       GetTask(0), errors::Internal("test_error")));
   // Block until barrier has failed due to task error.
@@ -1025,7 +1031,7 @@ TEST_F(CoordinationBarrierTest, BarrierReturnsPreviousError) {
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(1),
       /*participating_tasks=*/{},
-      [&barrier_status_1](Status s) { barrier_status_1 = s; });
+      [&barrier_status_1](absl::Status s) { barrier_status_1 = s; });
 
   EXPECT_TRUE(absl::IsInternal(barrier_status_0));
   EXPECT_TRUE(absl::IsInternal(barrier_status_1));
@@ -1034,13 +1040,13 @@ TEST_F(CoordinationBarrierTest, BarrierReturnsPreviousError) {
 TEST_F(CoordinationBarrierTest, BarrierCancelled) {
   const std::string barrier_id = "barrier_id";
   absl::Duration timeout = absl::Seconds(5);
-  Status barrier_status;
+  absl::Status barrier_status;
 
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(0),
       /*participating_tasks=*/{},
-      [&barrier_status](Status s) { barrier_status = s; });
-  Status cancelled_status =
+      [&barrier_status](absl::Status s) { barrier_status = s; });
+  absl::Status cancelled_status =
       GetCoordinationService()->CancelBarrier(barrier_id, GetTask(0));
 
   EXPECT_TRUE(absl::IsCancelled(barrier_status));
@@ -1050,7 +1056,7 @@ TEST_F(CoordinationBarrierTest, BarrierCancelled) {
 TEST_F(CoordinationBarrierTest, CancelNonExistentBarrier_FutureBarrierFails) {
   const std::string barrier_id = "cancelled_barrier_id";
   absl::Duration timeout = absl::Seconds(1);
-  Status barrier_status;
+  absl::Status barrier_status;
 
   // Cancel barrier should still succeed.
   TF_ASSERT_OK(GetCoordinationService()->CancelBarrier(barrier_id, GetTask(0)));
@@ -1058,7 +1064,7 @@ TEST_F(CoordinationBarrierTest, CancelNonExistentBarrier_FutureBarrierFails) {
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(0),
       /*participating_tasks=*/{},
-      [&barrier_status](Status s) { barrier_status = s; });
+      [&barrier_status](absl::Status s) { barrier_status = s; });
 
   EXPECT_TRUE(absl::IsCancelled(barrier_status)) << barrier_status;
 }
@@ -1066,24 +1072,24 @@ TEST_F(CoordinationBarrierTest, CancelNonExistentBarrier_FutureBarrierFails) {
 TEST_F(CoordinationBarrierTest, CancelAfterBarrierHasPassed) {
   const std::string barrier_id = "barrier_id";
   absl::Duration timeout = absl::Seconds(5);
-  Status barrier_status_0;
-  Status barrier_status_1;
-  Status barrier_status_2;
+  absl::Status barrier_status_0;
+  absl::Status barrier_status_1;
+  absl::Status barrier_status_2;
 
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(0),
       /*participating_tasks=*/{},
-      [&barrier_status_0](Status s) { barrier_status_0 = s; });
+      [&barrier_status_0](absl::Status s) { barrier_status_0 = s; });
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(1),
       /*participating_tasks=*/{},
-      [&barrier_status_1](Status s) { barrier_status_1 = s; });
+      [&barrier_status_1](absl::Status s) { barrier_status_1 = s; });
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(2),
       /*participating_tasks=*/{},
-      [&barrier_status_2](Status s) { barrier_status_2 = s; });
+      [&barrier_status_2](absl::Status s) { barrier_status_2 = s; });
   // Cancel barrier should fail if barrier has already been passed.
-  Status cancelled_status =
+  absl::Status cancelled_status =
       GetCoordinationService()->CancelBarrier(barrier_id, GetTask(0));
 
   EXPECT_TRUE(absl::IsFailedPrecondition(cancelled_status));
@@ -1095,38 +1101,38 @@ TEST_F(CoordinationBarrierTest, CancelAfterBarrierHasPassed) {
 TEST_F(CoordinationBarrierTest, PassedBarrierReturnsImmediately) {
   const std::string barrier_id = "barrier_id";
   absl::Duration timeout = absl::Seconds(5);
-  Status barrier_status_0;
-  Status barrier_status_1;
-  Status barrier_status_2;
-  Status barrier_status_repeat;
+  absl::Status barrier_status_0;
+  absl::Status barrier_status_1;
+  absl::Status barrier_status_2;
+  absl::Status barrier_status_repeat;
   absl::Notification n0;
   absl::Notification n1;
   absl::Notification n2;
   absl::Notification n_repeat;
 
-  GetCoordinationService()->BarrierAsync(barrier_id, timeout, GetTask(0),
-                                         /*participating_tasks=*/{},
-                                         [&barrier_status_0, &n0](Status s) {
-                                           barrier_status_0 = s;
-                                           n0.Notify();
-                                         });
-  GetCoordinationService()->BarrierAsync(barrier_id, timeout, GetTask(1),
-                                         /*participating_tasks=*/{},
-                                         [&barrier_status_1, &n1](Status s) {
-                                           barrier_status_1 = s;
-                                           n1.Notify();
-                                         });
-  GetCoordinationService()->BarrierAsync(barrier_id, timeout, GetTask(2),
-                                         /*participating_tasks=*/{},
-                                         [&barrier_status_2, &n2](Status s) {
-                                           barrier_status_2 = s;
-                                           n2.Notify();
-                                         });
+  GetCoordinationService()->BarrierAsync(
+      barrier_id, timeout, GetTask(0),
+      /*participating_tasks=*/{}, [&barrier_status_0, &n0](absl::Status s) {
+        barrier_status_0 = s;
+        n0.Notify();
+      });
+  GetCoordinationService()->BarrierAsync(
+      barrier_id, timeout, GetTask(1),
+      /*participating_tasks=*/{}, [&barrier_status_1, &n1](absl::Status s) {
+        barrier_status_1 = s;
+        n1.Notify();
+      });
+  GetCoordinationService()->BarrierAsync(
+      barrier_id, timeout, GetTask(2),
+      /*participating_tasks=*/{}, [&barrier_status_2, &n2](absl::Status s) {
+        barrier_status_2 = s;
+        n2.Notify();
+      });
   // Repeated call should return the same result.
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(1),
       /*participating_tasks=*/{},
-      [&barrier_status_repeat, &n_repeat](Status s) {
+      [&barrier_status_repeat, &n_repeat](absl::Status s) {
         barrier_status_repeat = s;
         n_repeat.Notify();
       });
@@ -1147,12 +1153,12 @@ TEST_F(CoordinationBarrierTest, BarrierFailsIfTaskIsAlreadyInError) {
   // Set task 0 to error state.
   TF_ASSERT_OK(GetCoordinationService()->ReportTaskError(
       GetTask(0), errors::Internal("test_error")));
-  Status barrier_status;
+  absl::Status barrier_status;
 
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(1),
       /*participating_tasks=*/{},
-      [&barrier_status](Status s) { barrier_status = s; });
+      [&barrier_status](absl::Status s) { barrier_status = s; });
 
   EXPECT_TRUE(absl::IsInternal(barrier_status));
 }
@@ -1161,14 +1167,14 @@ TEST_F(CoordinationBarrierTest, BarrierFailsUponTaskError) {
   const std::string barrier_id = "barrier_id";
   absl::Duration timeout = absl::Seconds(5);
   absl::Notification n0;
-  Status barrier_status;
-
-  GetCoordinationService()->BarrierAsync(barrier_id, timeout, GetTask(0),
-                                         /*participating_tasks=*/{},
-                                         [&barrier_status, &n0](Status s) {
-                                           barrier_status = s;
-                                           n0.Notify();
-                                         });
+  absl::Status barrier_status;
+
+  GetCoordinationService()->BarrierAsync(
+      barrier_id, timeout, GetTask(0),
+      /*participating_tasks=*/{}, [&barrier_status, &n0](absl::Status s) {
+        barrier_status = s;
+        n0.Notify();
+      });
   TF_ASSERT_OK(GetCoordinationService()->ReportTaskError(
       GetTask(0), errors::Internal("test_error")));
   n0.WaitForNotification();
@@ -1180,9 +1186,9 @@ TEST_F(CoordinationBarrierTest,
        BarrierStillBlocksIfSameTaskCallsOngoingBarrierRepeatedly) {
   const std::string barrier_id = "barrier_id";
   absl::Duration timeout = absl::Seconds(5);
-  Status barrier_status_0;
-  Status barrier_status_1;
-  Status barrier_status_2;
+  absl::Status barrier_status_0;
+  absl::Status barrier_status_1;
+  absl::Status barrier_status_2;
   absl::Notification n_0;
   absl::Notification n_1;
   absl::Notification n_2;
@@ -1190,7 +1196,7 @@ TEST_F(CoordinationBarrierTest,
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(0),
       /*participating_tasks=*/{GetTask(0), GetTask(1)},
-      [&barrier_status_0, &n_0](Status s) {
+      [&barrier_status_0, &n_0](absl::Status s) {
         barrier_status_0 = s;
         n_0.Notify();
       });
@@ -1198,7 +1204,7 @@ TEST_F(CoordinationBarrierTest,
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(0),
       /*participating_tasks=*/{GetTask(0), GetTask(1)},
-      [&barrier_status_1, &n_1](Status s) {
+      [&barrier_status_1, &n_1](absl::Status s) {
         barrier_status_1 = s;
         n_1.Notify();
       });
@@ -1209,7 +1215,7 @@ TEST_F(CoordinationBarrierTest,
   GetCoordinationService()->BarrierAsync(
       barrier_id, timeout, GetTask(1),
       /*participating_tasks=*/{GetTask(0), GetTask(1)},
-      [&barrier_status_2, &n_2](Status s) {
+      [&barrier_status_2, &n_2](absl::Status s) {
         barrier_status_2 = s;
         n_2.Notify();
       });
@@ -1248,14 +1254,15 @@ TEST_F(CoordinateTwoTasksTest, Reset_FailsOngoingBarrier) {
   EnableCoordinationService(/*has_service_to_client_connection=*/true,
                             /*enable_shutdown_barrier=*/false);
   TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
-  Status barrier_status;
+  absl::Status barrier_status;
   absl::Notification barrier_n;
-  coord_service_->BarrierAsync(
-      "ongoing_barrier", absl::InfiniteDuration(), task_0_,
-      /*participating_tasks=*/{}, [&barrier_status, &barrier_n](Status s) {
-        barrier_status = s;
-        barrier_n.Notify();
-      });
+  coord_service_->BarrierAsync("ongoing_barrier", absl::InfiniteDuration(),
+                               task_0_,
+                               /*participating_tasks=*/{},
+                               [&barrier_status, &barrier_n](absl::Status s) {
+                                 barrier_status = s;
+                                 barrier_n.Notify();
+                               });
 
   TF_EXPECT_OK(coord_service_->ResetTask(task_0_));
 
@@ -1270,7 +1277,7 @@ TEST_F(CoordinateTwoTasksTest, Shutdown_HeartbeatsAreAcceptedForAGracePeriod) {
   TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
 
   absl::Notification n;
-  coord_service_->ShutdownTaskAsync(task_0_, [&n](Status s) {
+  coord_service_->ShutdownTaskAsync(task_0_, [&n](absl::Status s) {
     TF_EXPECT_OK(s);
     n.Notify();
   });
@@ -1291,17 +1298,18 @@ TEST_F(CoordinateTwoTasksTest, Shutdown_FailsOngoingBarrier) {
   EnableCoordinationService(/*has_service_to_client_connection=*/true,
                             /*enable_shutdown_barrier=*/false);
   TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
-  Status barrier_status;
+  absl::Status barrier_status;
   absl::Notification barrier_n;
-  coord_service_->BarrierAsync(
-      "ongoing_barrier", absl::InfiniteDuration(), task_0_,
-      /*participating_tasks=*/{}, [&barrier_status, &barrier_n](Status s) {
-        barrier_status = s;
-        barrier_n.Notify();
-      });
+  coord_service_->BarrierAsync("ongoing_barrier", absl::InfiniteDuration(),
+                               task_0_,
+                               /*participating_tasks=*/{},
+                               [&barrier_status, &barrier_n](absl::Status s) {
+                                 barrier_status = s;
+                                 barrier_n.Notify();
+                               });
 
   absl::Notification shutdown_n;
-  coord_service_->ShutdownTaskAsync(task_0_, [&shutdown_n](Status s) {
+  coord_service_->ShutdownTaskAsync(task_0_, [&shutdown_n](absl::Status s) {
     TF_EXPECT_OK(s);
     shutdown_n.Notify();
   });
@@ -1317,13 +1325,13 @@ TEST_F(CoordinateTwoTasksTest, ShutdownWithBarrier_BarrierSucceeds) {
                             /*enable_shutdown_barrier=*/true);
   TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   TF_EXPECT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
-  Status barrier_status;
-  Status barrier_status_2;
+  absl::Status barrier_status;
+  absl::Status barrier_status_2;
 
   coord_service_->ShutdownTaskAsync(
-      task_0_, [&barrier_status](Status s) { barrier_status = s; });
+      task_0_, [&barrier_status](absl::Status s) { barrier_status = s; });
   coord_service_->ShutdownTaskAsync(
-      task_1_, [&barrier_status_2](Status s) { barrier_status_2 = s; });
+      task_1_, [&barrier_status_2](absl::Status s) { barrier_status_2 = s; });
 
   TF_EXPECT_OK(barrier_status);
   TF_EXPECT_OK(barrier_status_2);
@@ -1341,13 +1349,14 @@ TEST_F(CoordinateTwoTasksTest,
                             /*enable_shutdown_barrier=*/true);
   TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   TF_EXPECT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
-  Status barrier_status;
+  absl::Status barrier_status;
 
   absl::Notification n;
-  coord_service_->ShutdownTaskAsync(task_0_, [&n, &barrier_status](Status s) {
-    barrier_status = s;
-    n.Notify();
-  });
+  coord_service_->ShutdownTaskAsync(task_0_,
+                                    [&n, &barrier_status](absl::Status s) {
+                                      barrier_status = s;
+                                      n.Notify();
+                                    });
   // Block until barrier times out.
   n.WaitForNotification();
 
@@ -1358,7 +1367,7 @@ TEST_F(CoordinateTwoTasksTest,
   TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
 
   // Other task is alerted that shutdown has been initiated without it.
-  Status other_task_status = client_1_.GetStatus();
+  absl::Status other_task_status = client_1_.GetStatus();
   EXPECT_TRUE(absl::IsInternal(other_task_status)) << other_task_status;
 }
 
@@ -1368,13 +1377,14 @@ TEST_F(CoordinateTwoTasksTest,
                             /*enable_shutdown_barrier=*/true);
   TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   TF_EXPECT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
-  Status barrier_status;
+  absl::Status barrier_status;
 
   absl::Notification n;
-  coord_service_->ShutdownTaskAsync(task_0_, [&n, &barrier_status](Status s) {
-    barrier_status = s;
-    n.Notify();
-  });
+  coord_service_->ShutdownTaskAsync(task_0_,
+                                    [&n, &barrier_status](absl::Status s) {
+                                      barrier_status = s;
+                                      n.Notify();
+                                    });
   // Block until barrier times out.
   n.WaitForNotification();
   // Provide time for coordination service to shut down after barrier timeout.
@@ -1387,7 +1397,7 @@ TEST_F(CoordinateTwoTasksTest,
   // error propagation.
   // Task 1 still sends unexpected heartbeat because it doesn't know that
   // service has stopped yet, which should fail.
-  Status s = coord_service_->RecordHeartbeat(task_1_, incarnation_1_);
+  absl::Status s = coord_service_->RecordHeartbeat(task_1_, incarnation_1_);
 
   EXPECT_TRUE(absl::IsInvalidArgument(s)) << s;
 }
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/preemption/BUILD b/third_party/xla/third_party/tsl/tsl/distributed_runtime/preemption/BUILD
index aca10c8ca69783..4acb411bf32f00 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/preemption/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/preemption/BUILD
@@ -1,7 +1,7 @@
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("//tsl:tsl.bzl", "internal_visibility")
 load("//tsl:tsl.default.bzl", "get_compatible_with_portable", "tsl_grpc_cc_dependencies")
 load("//tsl/platform:build_config.bzl", "tsl_cc_test")
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/BUILD b/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/BUILD
index 9effe4beb3e2e8..205029043be73f 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/BUILD
@@ -1,10 +1,10 @@
 # Description:
 #   RPC communication interfaces and implementations for TensorFlow.
 
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("//tsl:tsl.bzl", "internal_visibility")
 load("//tsl:tsl.default.bzl", "tsl_grpc_cc_dependencies")
 load("//tsl/platform:build_config.bzl", "tf_proto_library", "tsl_cc_test")
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -95,8 +95,8 @@ cc_library(
         "//tsl/platform:thread_annotations",
         "//tsl/platform:types",
         "//tsl/protobuf:rpc_options_proto_cc",
-        "//tsl/util:device_name_utils",
         "@com_google_absl//absl/strings",
+        "@local_xla//xla/tsl/util:device_name_utils",
     ] + tsl_grpc_cc_dependencies(),
 )
 
@@ -114,7 +114,7 @@ tsl_cc_test(
         "//tsl/platform:test",
         "//tsl/platform:test_main",
         "//tsl/protobuf:rpc_options_proto_cc_impl",
-        "//tsl/util:device_name_utils",
+        "@local_xla//xla/tsl/util:device_name_utils",
     ],
 )
 
@@ -129,8 +129,8 @@ cc_library(
         "//tsl/platform:errors",
         "//tsl/platform:status",
         "//tsl/platform:strcat",
-        "//tsl/util:env_var",
         "@com_google_absl//absl/status",
+        "@local_xla//xla/tsl/util:env_var",
     ] + tsl_grpc_cc_dependencies(),
 )
 
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/coordination/BUILD b/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/coordination/BUILD
index ba8ec2333d197d..7eb439e018a4bd 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/coordination/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/coordination/BUILD
@@ -1,6 +1,6 @@
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("//tsl:tsl.bzl", "internal_visibility")
 load("//tsl:tsl.default.bzl", "tsl_grpc_cc_dependencies")
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_channel.cc b/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_channel.cc
index 492c984e12f13a..ba12449f03bf2f 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_channel.cc
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_channel.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "absl/strings/escaping.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_split.h"
+#include "xla/tsl/util/device_name_utils.h"
 #include "tsl/distributed_runtime/rpc/grpc_channel_common.h"
 #include "tsl/lib/gtl/map_util.h"
 #include "tsl/platform/errors.h"
@@ -38,7 +39,6 @@ limitations under the License.
 #include "tsl/platform/thread_annotations.h"
 #include "tsl/platform/types.h"
 #include "tsl/protobuf/rpc_options.pb.h"
-#include "tsl/util/device_name_utils.h"
 
 namespace tsl {
 
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_channel_test.cc b/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_channel_test.cc
index 6b2d330cb1d57a..adc0df2b89ddef 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_channel_test.cc
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_channel_test.cc
@@ -18,11 +18,11 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "xla/tsl/util/device_name_utils.h"
 #include "tsl/lib/core/status_test_util.h"
 #include "tsl/platform/strcat.h"
 #include "tsl/platform/test.h"
 #include "tsl/protobuf/rpc_options.pb.h"
-#include "tsl/util/device_name_utils.h"
 
 namespace tsl {
 #define IsSameAddrSp DeviceNameUtils::IsSameAddressSpace
diff --git a/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_state.h b/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_state.h
index 893e1b0192f694..21d8f2df5099e3 100644
--- a/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_state.h
+++ b/third_party/xla/third_party/tsl/tsl/distributed_runtime/rpc/grpc_state.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "grpcpp/generic/generic_stub.h"
 #include "grpcpp/grpcpp.h"
 #include "absl/status/status.h"
+#include "xla/tsl/util/env_var.h"
 #include "tsl/distributed_runtime/call_options.h"
 #include "tsl/distributed_runtime/rpc/grpc_client_cq_tag.h"
 #include "tsl/distributed_runtime/rpc/grpc_util.h"
@@ -30,7 +31,6 @@ limitations under the License.
 #include "tsl/platform/status.h"
 #include "tsl/platform/strcat.h"
 #include "tsl/platform/threadpool.h"
-#include "tsl/util/env_var.h"
 
 namespace tsl {
 
diff --git a/third_party/xla/third_party/tsl/tsl/framework/BUILD b/third_party/xla/third_party/tsl/tsl/framework/BUILD
index a71b67464e511f..bca669a616f0ec 100644
--- a/third_party/xla/third_party/tsl/tsl/framework/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/framework/BUILD
@@ -4,6 +4,10 @@
 #   The libraries in this package are not allowed to have ANY dependencies
 #   to other TF components outside of TSL.
 
+load(
+    "@local_tsl//tsl/platform:rules_cc.bzl",
+    "cc_library",
+)
 load("//tsl:tsl.bzl", "internal_visibility")
 load("//tsl:tsl.default.bzl", "filegroup", "get_compatible_with_portable")
 load(
@@ -14,10 +18,6 @@ load(
     "//tsl/platform:build_config_root.bzl",
     "if_static",
 )
-load(
-    "@local_tsl//tsl/platform:rules_cc.bzl",
-    "cc_library",
-)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -273,10 +273,10 @@ cc_library(
         "//tsl/platform:status",
         "//tsl/platform:statusor",
         "//tsl/platform:str_util",
-        "//tsl/util:device_name_utils",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
+        "@local_xla//xla/tsl/util:device_name_utils",
     ],
 )
 
@@ -465,7 +465,7 @@ tsl_cc_test(
         "//tsl/platform:status_matchers",
         "//tsl/platform:test_main",
         "//tsl/protobuf:error_codes_proto_impl_cc",
-        "//tsl/util:device_name_utils",
+        "@local_xla//xla/tsl/util:device_name_utils",
     ],
 )
 
diff --git a/third_party/xla/third_party/tsl/tsl/framework/contraction/BUILD b/third_party/xla/third_party/tsl/tsl/framework/contraction/BUILD
index e265644dce3d79..47bf5f20aa81dc 100644
--- a/third_party/xla/third_party/tsl/tsl/framework/contraction/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/framework/contraction/BUILD
@@ -1,7 +1,7 @@
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
+load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("//tsl:tsl.default.bzl", "get_compatible_with_portable")
-load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
-load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/third_party/tsl/tsl/framework/device_id_utils.h b/third_party/xla/third_party/tsl/tsl/framework/device_id_utils.h
index c2479aded5fe0a..e814e68c8530a8 100644
--- a/third_party/xla/third_party/tsl/tsl/framework/device_id_utils.h
+++ b/third_party/xla/third_party/tsl/tsl/framework/device_id_utils.h
@@ -20,11 +20,11 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
+#include "xla/tsl/util/device_name_utils.h"
 #include "tsl/framework/device_id.h"
 #include "tsl/framework/device_type.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
-#include "tsl/util/device_name_utils.h"
 
 namespace tsl {
 
diff --git a/third_party/xla/third_party/tsl/tsl/framework/device_id_utils_test.cc b/third_party/xla/third_party/tsl/tsl/framework/device_id_utils_test.cc
index ddf7cdd479935b..21e574f95c1b2c 100644
--- a/third_party/xla/third_party/tsl/tsl/framework/device_id_utils_test.cc
+++ b/third_party/xla/third_party/tsl/tsl/framework/device_id_utils_test.cc
@@ -17,10 +17,10 @@ limitations under the License.
 #include <string_view>
 #include <vector>
 
+#include "xla/tsl/util/device_name_utils.h"
 #include "tsl/framework/device_id_manager.h"
 #include "tsl/lib/core/status_test_util.h"
 #include "tsl/platform/status_matchers.h"
-#include "tsl/util/device_name_utils.h"
 
 namespace tsl {
 namespace {
diff --git a/third_party/xla/third_party/tsl/tsl/framework/fixedpoint/BUILD b/third_party/xla/third_party/tsl/tsl/framework/fixedpoint/BUILD
index 310080f5285a60..5818820ad654be 100644
--- a/third_party/xla/third_party/tsl/tsl/framework/fixedpoint/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/framework/fixedpoint/BUILD
@@ -1,6 +1,6 @@
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("//tsl:tsl.bzl", "internal_visibility")
 load("//tsl:tsl.default.bzl", "get_compatible_with_portable")
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/third_party/tsl/tsl/lib/core/BUILD b/third_party/xla/third_party/tsl/tsl/lib/core/BUILD
index eb5bbda819c7c3..c5b5830535dd0e 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/core/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/lib/core/BUILD
@@ -4,13 +4,13 @@
 #   The libraries in this package are not allowed to have ANY dependencies
 #   to other TF components outside of TSL.
 
-load("//tsl:tsl.bzl", "internal_visibility")
-load("//tsl:tsl.default.bzl", "get_compatible_with_portable")
-load("//tsl/platform:build_config.bzl", "tsl_cc_test")
 load(
     "@local_tsl//tsl/platform:rules_cc.bzl",
     "cc_library",
 )
+load("//tsl:tsl.bzl", "internal_visibility")
+load("//tsl:tsl.default.bzl", "get_compatible_with_portable")
+load("//tsl/platform:build_config.bzl", "tsl_cc_test")
 
 # TODO(rdzhabarov): Tighten visibility after migration is complete.
 package(
diff --git a/third_party/xla/third_party/tsl/tsl/lib/gtl/BUILD b/third_party/xla/third_party/tsl/tsl/lib/gtl/BUILD
index 306360d7cd6b16..8a6185374c92bf 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/gtl/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/lib/gtl/BUILD
@@ -1,13 +1,13 @@
+load(
+    "@local_tsl//tsl/platform:rules_cc.bzl",
+    "cc_library",
+)
 load("//tsl:tsl.bzl", "internal_visibility")
 load("//tsl:tsl.default.bzl", "filegroup")
 load(
     "//tsl/platform:build_config.bzl",
     "tsl_cc_test",
 )
-load(
-    "@local_tsl//tsl/platform:rules_cc.bzl",
-    "cc_library",
-)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/third_party/tsl/tsl/lib/hash/BUILD b/third_party/xla/third_party/tsl/tsl/lib/hash/BUILD
index 20b335bb0110c0..2145beac6e8dda 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/hash/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/lib/hash/BUILD
@@ -1,3 +1,7 @@
+load(
+    "@local_tsl//tsl/platform:rules_cc.bzl",
+    "cc_library",
+)
 load(
     "//tsl:tsl.bzl",
     "if_linux_x86_64",
@@ -9,10 +13,6 @@ load(
     "//tsl/platform:build_config.bzl",
     "tsl_cc_test",
 )
-load(
-    "@local_tsl//tsl/platform:rules_cc.bzl",
-    "cc_library",
-)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/third_party/tsl/tsl/lib/histogram/BUILD b/third_party/xla/third_party/tsl/tsl/lib/histogram/BUILD
index 0093ad1b5274da..3f0565eaf9ae63 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/histogram/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/lib/histogram/BUILD
@@ -1,13 +1,13 @@
+load(
+    "@local_tsl//tsl/platform:rules_cc.bzl",
+    "cc_library",
+)
 load("//tsl:tsl.bzl", "internal_visibility")
 load("//tsl:tsl.default.bzl", "filegroup")
 load(
     "//tsl/platform:build_config.bzl",
     "tsl_cc_test",
 )
-load(
-    "@local_tsl//tsl/platform:rules_cc.bzl",
-    "cc_library",
-)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/third_party/tsl/tsl/lib/io/BUILD b/third_party/xla/third_party/tsl/tsl/lib/io/BUILD
index 5b45d10a620a1e..9d6e02ec835ef1 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/io/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/lib/io/BUILD
@@ -1,10 +1,10 @@
-load("//tsl:tsl.bzl", "internal_visibility")
-load("//tsl:tsl.default.bzl", "filegroup")
-load("//tsl/platform:build_config.bzl", "tsl_cc_test")
 load(
     "@local_tsl//tsl/platform:rules_cc.bzl",
     "cc_library",
 )
+load("//tsl:tsl.bzl", "internal_visibility")
+load("//tsl:tsl.default.bzl", "filegroup")
+load("//tsl/platform:build_config.bzl", "tsl_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/third_party/tsl/tsl/lib/io/random_inputstream.h b/third_party/xla/third_party/tsl/tsl/lib/io/random_inputstream.h
index 9313ca591757ea..e1608ce3ec2b9b 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/io/random_inputstream.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/io/random_inputstream.h
@@ -31,7 +31,7 @@ class RandomAccessInputStream : public InputStreamInterface {
   // must outlive *this.
   RandomAccessInputStream(RandomAccessFile* file, bool owns_file = false);
 
-  ~RandomAccessInputStream();
+  ~RandomAccessInputStream() override;
 
   Status ReadNBytes(int64_t bytes_to_read, tstring* result) override;
 
diff --git a/third_party/xla/third_party/tsl/tsl/lib/io/zlib_inputstream.h b/third_party/xla/third_party/tsl/tsl/lib/io/zlib_inputstream.h
index e19843c812d350..d009cbfd0baf54 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/io/zlib_inputstream.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/io/zlib_inputstream.h
@@ -55,7 +55,7 @@ class ZlibInputStream : public InputStreamInterface {
                   size_t output_buffer_bytes,
                   const ZlibCompressionOptions& zlib_options);
 
-  ~ZlibInputStream();
+  ~ZlibInputStream() override;
 
   // Reads bytes_to_read bytes into *result, overwriting *result.
   //
diff --git a/third_party/xla/third_party/tsl/tsl/lib/io/zlib_outputbuffer.h b/third_party/xla/third_party/tsl/tsl/lib/io/zlib_outputbuffer.h
index 3e4236ac1e44fd..8f0793c985bae5 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/io/zlib_outputbuffer.h
+++ b/third_party/xla/third_party/tsl/tsl/lib/io/zlib_outputbuffer.h
@@ -49,7 +49,7 @@ class ZlibOutputBuffer : public WritableFile {
       int32_t output_buffer_bytes,  // size of z_stream.next_out buffer
       const ZlibCompressionOptions& zlib_options);
 
-  ~ZlibOutputBuffer();
+  ~ZlibOutputBuffer() override;
 
   // Initializes some state necessary for the output buffer. This call is
   // required before any other operation on the buffer.
diff --git a/third_party/xla/third_party/tsl/tsl/lib/monitoring/BUILD b/third_party/xla/third_party/tsl/tsl/lib/monitoring/BUILD
index 5b739e5605c4a3..06af0dc45a5ac1 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/monitoring/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/lib/monitoring/BUILD
@@ -1,9 +1,9 @@
-load("//tsl:tsl.bzl", "internal_visibility")
-load("//tsl:tsl.default.bzl", "filegroup")
 load(
     "@local_tsl//tsl/platform:rules_cc.bzl",
     "cc_library",
 )
+load("//tsl:tsl.bzl", "internal_visibility")
+load("//tsl:tsl.default.bzl", "filegroup")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/third_party/tsl/tsl/lib/random/BUILD b/third_party/xla/third_party/tsl/tsl/lib/random/BUILD
index 0dbd76ef3023ed..169d41b6f6c969 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/random/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/lib/random/BUILD
@@ -1,10 +1,10 @@
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("//tsl:tsl.bzl", "internal_visibility")
 load("//tsl:tsl.default.bzl", "filegroup", "get_compatible_with_portable")
 load(
     "//tsl/platform:build_config.bzl",
     "tsl_cc_test",
 )
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 
 default_visibility = [
     "//tsl/lib/io:__pkg__",
diff --git a/third_party/xla/third_party/tsl/tsl/lib/strings/BUILD b/third_party/xla/third_party/tsl/tsl/lib/strings/BUILD
index b19d298365ae15..6593d44eb6c087 100644
--- a/third_party/xla/third_party/tsl/tsl/lib/strings/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/lib/strings/BUILD
@@ -1,9 +1,9 @@
-load("//tsl:tsl.bzl", "internal_visibility")
-load("//tsl:tsl.default.bzl", "filegroup")
 load(
     "@local_tsl//tsl/platform:rules_cc.bzl",
     "cc_library",
 )
+load("//tsl:tsl.bzl", "internal_visibility")
+load("//tsl:tsl.default.bzl", "filegroup")
 
 # copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
 
diff --git a/third_party/xla/third_party/tsl/tsl/platform/BUILD b/third_party/xla/third_party/tsl/tsl/platform/BUILD
index 182133edd69bb5..d2c3c68c783d09 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/platform/BUILD
@@ -4,6 +4,14 @@
 #   The libraries in this package are not allowed to have ANY dependencies
 #   to other TF components outside of TSL.
 
+load(
+    "@bazel_skylib//:bzl_library.bzl",
+    "bzl_library",
+)
+load(
+    "@local_tsl//tsl/platform:rules_cc.bzl",
+    "cc_library",
+)
 load(
     "//tsl:tsl.bzl",
     "if_not_fuchsia",
@@ -29,14 +37,6 @@ load(
     "tsl_protobuf_deps",
 )
 load("//tsl/platform:build_config_root.bzl", "if_static")
-load(
-    "@local_tsl//tsl/platform:rules_cc.bzl",
-    "cc_library",
-)
-load(
-    "@bazel_skylib//:bzl_library.bzl",
-    "bzl_library",
-)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/third_party/tsl/tsl/platform/cloud/BUILD b/third_party/xla/third_party/tsl/tsl/platform/cloud/BUILD
index e9588213ddf932..21e13663a4b4cf 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/cloud/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/platform/cloud/BUILD
@@ -1,6 +1,7 @@
 # Description:
 # Cloud file system implementation.
 
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load(
     "//tsl:tsl.bzl",
     "if_windows",
@@ -8,7 +9,6 @@ load(
     "tsl_copts",
 )
 load("//tsl/platform:build_config.bzl", "tsl_cc_test")
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -217,8 +217,8 @@ cc_library(
         "//tsl/platform:str_util",
         "//tsl/platform:stringpiece",
         "//tsl/platform:types",
-        "//tsl/util:env_var",
         "@curl",
+        "@local_xla//xla/tsl/util:env_var",
     ],
 )
 
diff --git a/third_party/xla/third_party/tsl/tsl/platform/cloud/curl_http_request.cc b/third_party/xla/third_party/tsl/tsl/platform/cloud/curl_http_request.cc
index a7e6a65e37335d..c41f967c04b055 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/cloud/curl_http_request.cc
+++ b/third_party/xla/third_party/tsl/tsl/platform/cloud/curl_http_request.cc
@@ -17,13 +17,13 @@ limitations under the License.
 
 #include <algorithm>
 
+#include "xla/tsl/util/env_var.h"
 #include "tsl/lib/gtl/map_util.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/macros.h"
 #include "tsl/platform/scanner.h"
 #include "tsl/platform/str_util.h"
 #include "tsl/platform/types.h"
-#include "tsl/util/env_var.h"
 
 #define CHECK_CURL_OK(expr) CHECK_EQ(expr, CURLE_OK)
 
diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/BUILD b/third_party/xla/third_party/tsl/tsl/platform/default/BUILD
index 8e6a52db7d1854..bac3898fef57a5 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/default/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/platform/default/BUILD
@@ -1,3 +1,6 @@
+# Tensorflow default + linux implementations of tensorflow/core/platform libraries.
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load(
     "//tsl:tsl.bzl",
     "if_not_fuchsia",
@@ -6,10 +9,6 @@ load(
     "tsl_copts",
 )
 load("//tsl:tsl.default.bzl", "filegroup", "tsl_grpc_cc_dependencies")
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
-
-# Tensorflow default + linux implementations of tensorflow/core/platform libraries.
-load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/build_config.bzl b/third_party/xla/third_party/tsl/tsl/platform/default/build_config.bzl
index 3c35e4cd37aa57..d8ba2d02903691 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/default/build_config.bzl
+++ b/third_party/xla/third_party/tsl/tsl/platform/default/build_config.bzl
@@ -1,6 +1,7 @@
 # Platform-specific build configurations.
 
 load("@com_github_grpc_grpc//bazel:generate_cc.bzl", "generate_cc")
+load("@com_google_protobuf//:protobuf.bzl", "proto_gen")
 load(
     "//tsl:tsl.bzl",
     "clean_dep",
@@ -8,7 +9,6 @@ load(
     "if_tsl_link_protobuf",
 )
 load("//tsl/platform:build_config_root.bzl", "if_static")
-load("@com_google_protobuf//:protobuf.bzl", "proto_gen")
 
 def well_known_proto_libs():
     """Set of standard protobuf protos, like Any and Timestamp.
diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/cuda_libdevice_path.cc b/third_party/xla/third_party/tsl/tsl/platform/default/cuda_libdevice_path.cc
index ed2ffece58819b..46321e74b5dc38 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/default/cuda_libdevice_path.cc
+++ b/third_party/xla/third_party/tsl/tsl/platform/default/cuda_libdevice_path.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "tsl/platform/path.h"
 #include "tsl/platform/platform.h"
 
 #if defined(PLATFORM_POSIX) && !defined(__APPLE__)
@@ -44,15 +45,14 @@ std::vector<std::string> CandidateCudaRoots() {
   Dl_info info;
 
   if (dladdr(&__FUNCTION__, &info)) {
-    auto lib = std::vector<char>{info.dli_fname,
-                                 info.dli_fname + strlen(info.dli_fname)};
-    auto dir = dirname(lib.data());
+    auto lib = std::string(info.dli_fname);
+    auto dir = io::Dirname(lib);
 
     // TF lib binaries are located in both the package's root dir and within a
     // 'python' subdirectory (for pywrap libs). So we check two possible paths
     // relative to the current binary for the wheel-based nvcc package.
-    for (auto path : {"/../nvidia/cuda_nvcc", "/../../nvidia/cuda_nvcc"})
-      roots.emplace_back(std::string(dir) + path);
+    for (auto path : {"../nvidia/cuda_nvcc", "../../nvidia/cuda_nvcc"})
+      roots.emplace_back(io::JoinPath(dir, path));
   }
 #endif  // defined(PLATFORM_POSIX) && !defined(__APPLE__)
 
diff --git a/third_party/xla/third_party/tsl/tsl/platform/default/rules_cc.bzl b/third_party/xla/third_party/tsl/tsl/platform/default/rules_cc.bzl
index 054460a83e19a0..60b1bdfa5ec130 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/default/rules_cc.bzl
+++ b/third_party/xla/third_party/tsl/tsl/platform/default/rules_cc.bzl
@@ -2,12 +2,29 @@
 
 _cc_binary = native.cc_binary
 _cc_import = native.cc_import
-_cc_library = native.cc_library
 _cc_shared_library = native.cc_shared_library
 _cc_test = native.cc_test
 
 cc_binary = _cc_binary
 cc_import = _cc_import
-cc_library = _cc_library
 cc_shared_library = _cc_shared_library
 cc_test = _cc_test
+
+def cc_library(name, deps = None, **kwargs):
+    """cc_library that hides side effects of https://github.com/bazelbuild/bazel/issues/21519.
+
+    Args:
+      name: name of target.
+      deps: deps with `xla:bazel_issue_21519` added.
+      **kwargs: passed to native.cc_library.
+    """
+
+    if deps == None:
+        deps = []
+
+    # Horrifying, but needed to prevent a cycle, as `bazel_issue_21519` is an
+    # alias of `empty`.
+    if name != "empty":
+        deps = deps + ["@local_xla//xla:bazel_issue_21519"]  # buildifier: disable=list-append
+
+    native.cc_library(name = name, deps = deps, **kwargs)
diff --git a/third_party/xla/third_party/tsl/tsl/platform/env.h b/third_party/xla/third_party/tsl/tsl/platform/env.h
index fe3354c765a06f..35b446a99445a5 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/env.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/env.h
@@ -54,8 +54,12 @@ struct ThreadOptions;
 /// Callers may wish to provide a custom Env object to get fine grain
 /// control.
 ///
-/// All Env implementations are safe for concurrent access from
-/// multiple threads without any external synchronization.
+/// All Env implementations of file-system modifying functionality are safe
+/// for concurrent access from multiple threads without any external
+/// synchronization, *however*, Envs and their underlying file systems are
+/// global objects, and therefore, if any thread modifies options, the modified
+/// options take effect process-wide. The SetOption functions themselves are
+/// also *not* thread safe.
 class Env {
  public:
   Env();
diff --git a/third_party/xla/third_party/tsl/tsl/platform/ml_dtypes.h b/third_party/xla/third_party/tsl/tsl/platform/ml_dtypes.h
index c25efc2f865b70..504085af8518ee 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/ml_dtypes.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/ml_dtypes.h
@@ -20,16 +20,16 @@ limitations under the License.
 #include "ml_dtypes/include/int4.h"  // from @ml_dtypes
 
 namespace tsl {
-using float8_e4m3fn = ml_dtypes::float8_e4m3fn;
-using float8_e4m3fnuz = ml_dtypes::float8_e4m3fnuz;
-using float8_e4m3b11fnuz = ml_dtypes::float8_e4m3b11fnuz;
+using float8_e4m3fn = ::ml_dtypes::float8_e4m3fn;
+using float8_e4m3fnuz = ::ml_dtypes::float8_e4m3fnuz;
+using float8_e4m3b11fnuz = ::ml_dtypes::float8_e4m3b11fnuz;
 using float8_e4m3b11 = float8_e4m3b11fnuz;  // Deprecated: old name for
                                             // backward-compatibility only.
-using float8_e5m2 = ml_dtypes::float8_e5m2;
-using float8_e5m2fnuz = ml_dtypes::float8_e5m2fnuz;
+using float8_e5m2 = ::ml_dtypes::float8_e5m2;
+using float8_e5m2fnuz = ::ml_dtypes::float8_e5m2fnuz;
 
-using int4 = ml_dtypes::int4;
-using uint4 = ml_dtypes::uint4;
+using int4 = ::ml_dtypes::int4;
+using uint4 = ::ml_dtypes::uint4;
 }  // namespace tsl
 
 #endif  // TENSORFLOW_TSL_PLATFORM_ML_DTYPES_H_
diff --git a/third_party/xla/third_party/tsl/tsl/platform/path.cc b/third_party/xla/third_party/tsl/tsl/platform/path.cc
index b33af3eb7c311a..580aacde900c1a 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/path.cc
+++ b/third_party/xla/third_party/tsl/tsl/platform/path.cc
@@ -407,5 +407,12 @@ bool ResolveTestPrefixes(tsl::StringPiece path, string& resolved_path) {
   }
 }
 
+[[maybe_unused]] std::string& AppendDotExeIfWindows(std::string& path) {
+#ifdef PLATFORM_WINDOWS
+  path.append(".exe");
+#endif  // PLATFORM_WINDOWS
+  return path;
+}
+
 }  // namespace io
 }  // namespace tsl
diff --git a/third_party/xla/third_party/tsl/tsl/platform/path.h b/third_party/xla/third_party/tsl/tsl/platform/path.h
index 451addc60b465c..f0a5b87d135c2a 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/path.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/path.h
@@ -126,6 +126,9 @@ bool GetTestUndeclaredOutputsDir(std::string* dir);
 // be resolved.
 bool ResolveTestPrefixes(tsl::StringPiece path, std::string& resolved_path);
 
+// Appends `.exe` if `PLATFORM_WINDOWS` is defined.
+[[maybe_unused]] std::string& AppendDotExeIfWindows(std::string& path);
+
 }  // namespace io
 }  // namespace tsl
 
diff --git a/third_party/xla/third_party/tsl/tsl/platform/profile_utils/BUILD b/third_party/xla/third_party/tsl/tsl/platform/profile_utils/BUILD
index 1421f3c7e143f4..5285e49bcb1df3 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/profile_utils/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/platform/profile_utils/BUILD
@@ -1,16 +1,12 @@
 # Description:
 # profile_utils targets.
 
-load("//tsl:tsl.bzl", "internal_visibility")
-load("//tsl:tsl.default.bzl", "filegroup")
 load(
     "@local_tsl//tsl/platform:rules_cc.bzl",
     "cc_library",
 )
-load(
-    "//tsl:tsl.bzl",
-    "tsl_copts",
-)
+load("//tsl:tsl.bzl", "internal_visibility", "tsl_copts")
+load("//tsl:tsl.default.bzl", "filegroup")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/third_party/tsl/tsl/platform/protobuf.h b/third_party/xla/third_party/tsl/tsl/platform/protobuf.h
index 791524f6b00a72..e6180168662250 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/protobuf.h
+++ b/third_party/xla/third_party/tsl/tsl/platform/protobuf.h
@@ -125,6 +125,12 @@ class TStringOutputStream : public protobuf::io::ZeroCopyOutputStream {
 
   tstring* target_;
 };
+
+std::string LegacyUnredactedDebugString(const tsl::protobuf::Message& message);
+std::string LegacyUnredactedDebugString(
+    const tsl::protobuf::MessageLite& message);
+std::string LegacyUnredactedShortDebugString(
+    const tsl::protobuf::Message& message);
 }  // namespace tsl
 
 #endif  // TENSORFLOW_TSL_PLATFORM_PROTOBUF_H_
diff --git a/third_party/xla/third_party/tsl/tsl/platform/protobuf_util.cc b/third_party/xla/third_party/tsl/tsl/platform/protobuf_util.cc
index 837bf112ed0ec6..0bab6131327970 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/protobuf_util.cc
+++ b/third_party/xla/third_party/tsl/tsl/platform/protobuf_util.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <string>
+
 #include "tsl/platform/protobuf.h"
 
 namespace tsl {
@@ -27,4 +29,32 @@ bool ParseProtoUnlimited(protobuf::MessageLite* proto, const void* serialized,
   return proto->ParseFromArray(serialized, size);
 }
 
+std::string LegacyUnredactedDebugString(const tsl::protobuf::Message& message) {
+  std::string debug_string;
+  tsl::protobuf::TextFormat::Printer printer;
+  printer.SetExpandAny(true);
+
+  printer.PrintToString(message, &debug_string);
+  return debug_string;
+}
+
+std::string LegacyUnredactedDebugString(
+    const tsl::protobuf::MessageLite& message) {
+  return message.DebugString();
+}
+
+std::string LegacyUnredactedShortDebugString(
+    const tsl::protobuf::Message& message) {
+  std::string debug_string;
+  tsl::protobuf::TextFormat::Printer printer;
+  printer.SetSingleLineMode(true);
+  printer.SetExpandAny(true);
+
+  printer.PrintToString(message, &debug_string);
+  if (!debug_string.empty() && debug_string.back() == ' ') {
+    debug_string.pop_back();
+  }
+  return debug_string;
+}
+
 }  // namespace tsl
diff --git a/third_party/xla/third_party/tsl/tsl/platform/subprocess_test.cc b/third_party/xla/third_party/tsl/tsl/platform/subprocess_test.cc
index 4e36b0d790da2f..1b1bbcb3113e17 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/subprocess_test.cc
+++ b/third_party/xla/third_party/tsl/tsl/platform/subprocess_test.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include "tsl/platform/subprocess.h"
 
 #include <stdlib.h>
-#include <string.h>
 
 #include <algorithm>
+#include <string>
 
 #include "tsl/lib/core/status_test_util.h"
 #include "tsl/platform/path.h"
@@ -36,48 +36,43 @@ limitations under the License.
 namespace tsl {
 namespace {
 
-static string GetDataFilePath(const string& relative_path) {
-#ifdef PLATFORM_WINDOWS
-  // While CreateProcess on windows is resilient to not having ".exe" suffix,
-  // Bazel_tools has to have the exact file path to return the resource.
-  return strings::StrCat(relative_path, ".exe");
-#else
-  return relative_path;
-#endif
-}
 
 string EchoProgram() {
-  return io::JoinPath(testing::TslSrcRoot(), "platform", "testdata",
-                      "test_echo");
+  std::string path =
+      io::JoinPath(testing::TslSrcRoot(), "platform", "testdata", "test_echo");
+  return tsl::io::AppendDotExeIfWindows(path);
 }
 
 string EchoArgv1Program() {
-  return io::JoinPath(testing::TslSrcRoot(), "platform", "testdata",
-                      "test_echo_argv_1");
+  std::string path = io::JoinPath(testing::TslSrcRoot(), "platform", "testdata",
+                                  "test_echo_argv_1");
+  return tsl::io::AppendDotExeIfWindows(path);
 }
 
 string NoopProgram() {
-  return io::JoinPath(testing::TslSrcRoot(), "platform", "testdata",
-                      "test_noop");
+  std::string path =
+      io::JoinPath(testing::TslSrcRoot(), "platform", "testdata", "test_noop");
+  return tsl::io::AppendDotExeIfWindows(path);
 }
 
 string StdErrProgram() {
-  return io::JoinPath(testing::TslSrcRoot(), "platform", "testdata",
-                      "test_stderr");
+  std::string path = io::JoinPath(testing::TslSrcRoot(), "platform", "testdata",
+                                  "test_stderr");
+  return tsl::io::AppendDotExeIfWindows(path);
 }
 
 class SubProcessTest : public ::testing::Test {};
 
 TEST_F(SubProcessTest, NoOutputNoComm) {
   tsl::SubProcess proc;
-  proc.SetProgram(GetDataFilePath(NoopProgram()).c_str(), {NoopProgram()});
+  proc.SetProgram(NoopProgram().c_str(), {NoopProgram()});
   EXPECT_TRUE(proc.Start());
   EXPECT_TRUE(proc.Wait());
 }
 
 TEST_F(SubProcessTest, NoOutput) {
   tsl::SubProcess proc;
-  proc.SetProgram(GetDataFilePath(NoopProgram()).c_str(), {NoopProgram()});
+  proc.SetProgram(NoopProgram().c_str(), {NoopProgram()});
   proc.SetChannelAction(CHAN_STDOUT, ACTION_PIPE);
   proc.SetChannelAction(CHAN_STDERR, ACTION_PIPE);
   EXPECT_TRUE(proc.Start());
@@ -93,7 +88,7 @@ TEST_F(SubProcessTest, NoOutput) {
 TEST_F(SubProcessTest, Stdout) {
   tsl::SubProcess proc;
   const char test_string[] = "hello_world";
-  proc.SetProgram(GetDataFilePath(EchoArgv1Program()).c_str(),
+  proc.SetProgram(EchoArgv1Program().c_str(),
                   {EchoArgv1Program(), test_string});
   proc.SetChannelAction(CHAN_STDOUT, ACTION_PIPE);
   proc.SetChannelAction(CHAN_STDERR, ACTION_PIPE);
@@ -110,7 +105,7 @@ TEST_F(SubProcessTest, Stdout) {
 TEST_F(SubProcessTest, StdoutIgnored) {
   tsl::SubProcess proc;
   const char test_string[] = "hello_world";
-  proc.SetProgram(GetDataFilePath(EchoArgv1Program()).c_str(),
+  proc.SetProgram(EchoArgv1Program().c_str(),
                   {EchoArgv1Program(), test_string});
   proc.SetChannelAction(CHAN_STDOUT, ACTION_PIPE);
   proc.SetChannelAction(CHAN_STDERR, ACTION_PIPE);
@@ -124,8 +119,7 @@ TEST_F(SubProcessTest, StdoutIgnored) {
 TEST_F(SubProcessTest, Stderr) {
   tsl::SubProcess proc;
   const char test_string[] = "muh_failure!";
-  proc.SetProgram(GetDataFilePath(StdErrProgram()).c_str(),
-                  {StdErrProgram(), test_string});
+  proc.SetProgram(StdErrProgram().c_str(), {StdErrProgram(), test_string});
   proc.SetChannelAction(CHAN_STDOUT, ACTION_PIPE);
   proc.SetChannelAction(CHAN_STDERR, ACTION_PIPE);
   EXPECT_TRUE(proc.Start());
@@ -141,8 +135,7 @@ TEST_F(SubProcessTest, Stderr) {
 TEST_F(SubProcessTest, StderrIgnored) {
   tsl::SubProcess proc;
   const char test_string[] = "muh_failure!";
-  proc.SetProgram(GetDataFilePath(StdErrProgram()).c_str(),
-                  {StdErrProgram(), test_string});
+  proc.SetProgram(StdErrProgram().c_str(), {StdErrProgram(), test_string});
   proc.SetChannelAction(CHAN_STDOUT, ACTION_PIPE);
   proc.SetChannelAction(CHAN_STDERR, ACTION_PIPE);
   EXPECT_TRUE(proc.Start());
@@ -154,7 +147,7 @@ TEST_F(SubProcessTest, StderrIgnored) {
 
 TEST_F(SubProcessTest, Stdin) {
   tsl::SubProcess proc;
-  proc.SetProgram(GetDataFilePath(EchoProgram()).c_str(), {EchoProgram()});
+  proc.SetProgram(EchoProgram().c_str(), {EchoProgram()});
   proc.SetChannelAction(CHAN_STDIN, ACTION_PIPE);
   EXPECT_TRUE(proc.Start());
 
@@ -166,7 +159,7 @@ TEST_F(SubProcessTest, Stdin) {
 
 TEST_F(SubProcessTest, StdinStdout) {
   tsl::SubProcess proc;
-  proc.SetProgram(GetDataFilePath(EchoProgram()).c_str(), {EchoProgram()});
+  proc.SetProgram(EchoProgram().c_str(), {EchoProgram()});
   proc.SetChannelAction(CHAN_STDIN, ACTION_PIPE);
   proc.SetChannelAction(CHAN_STDOUT, ACTION_PIPE);
   EXPECT_TRUE(proc.Start());
@@ -183,7 +176,7 @@ TEST_F(SubProcessTest, StdinStdout) {
 
 TEST_F(SubProcessTest, StdinChildExit) {
   tsl::SubProcess proc;
-  proc.SetProgram(GetDataFilePath(NoopProgram()).c_str(), {NoopProgram()});
+  proc.SetProgram(NoopProgram().c_str(), {NoopProgram()});
   proc.SetChannelAction(CHAN_STDIN, ACTION_PIPE);
   EXPECT_TRUE(proc.Start());
 
@@ -202,7 +195,7 @@ TEST_F(SubProcessTest, StdinChildExit) {
 
 TEST_F(SubProcessTest, StdinStdoutOverlap) {
   tsl::SubProcess proc;
-  proc.SetProgram(GetDataFilePath(EchoProgram()).c_str(), {EchoProgram()});
+  proc.SetProgram(EchoProgram().c_str(), {EchoProgram()});
   proc.SetChannelAction(CHAN_STDIN, ACTION_PIPE);
   proc.SetChannelAction(CHAN_STDOUT, ACTION_PIPE);
   EXPECT_TRUE(proc.Start());
@@ -226,7 +219,7 @@ TEST_F(SubProcessTest, StdinStdoutOverlap) {
 
 TEST_F(SubProcessTest, KillProc) {
   tsl::SubProcess proc;
-  proc.SetProgram(GetDataFilePath(EchoProgram()).c_str(), {EchoProgram()});
+  proc.SetProgram(EchoProgram().c_str(), {EchoProgram()});
   proc.SetChannelAction(CHAN_STDIN, ACTION_PIPE);
   proc.SetChannelAction(CHAN_STDOUT, ACTION_PIPE);
   EXPECT_TRUE(proc.Start());
diff --git a/third_party/xla/third_party/tsl/tsl/platform/windows/BUILD b/third_party/xla/third_party/tsl/tsl/platform/windows/BUILD
index fda40c1e211f02..c8f4ed0d14b6fb 100644
--- a/third_party/xla/third_party/tsl/tsl/platform/windows/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/platform/windows/BUILD
@@ -1,3 +1,8 @@
+load(
+    "@local_tsl//tsl/platform:rules_cc.bzl",
+    "cc_library",
+)
+
 # Tensorflow windows-specific implementations of tensorflow/core/platform libraries.
 load(
     "//tsl:tsl.bzl",
@@ -5,10 +10,6 @@ load(
     "tsl_copts",
 )
 load("//tsl:tsl.default.bzl", "filegroup")
-load(
-    "@local_tsl//tsl/platform:rules_cc.bzl",
-    "cc_library",
-)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/BUILD b/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/BUILD
index c2dc119d6bb8c9..7c7d9e7f036ab5 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/profiler/backends/cpu/BUILD
@@ -1,8 +1,8 @@
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("//tsl:tsl.bzl", "internal_visibility")
+load("//tsl/platform:build_config.bzl", "tsl_cc_test")
 load("//tsl/platform:build_config_root.bzl", "if_static")
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("//tsl/profiler/builds:build_config.bzl", "tf_profiler_copts")
-load("//tsl/platform:build_config.bzl", "tsl_cc_test")
 
 # copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
 
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/builds/build_config.bzl b/third_party/xla/third_party/tsl/tsl/profiler/builds/build_config.bzl
index 22df6844da1a08..72e2e53537794d 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/builds/build_config.bzl
+++ b/third_party/xla/third_party/tsl/tsl/profiler/builds/build_config.bzl
@@ -1,11 +1,11 @@
 """Provides a redirection point for platform specific implementations of Starlark utilities."""
 
+load("//tsl:tsl.bzl", "clean_dep")
 load(
     "//tsl/profiler/builds/oss:build_config.bzl",
     _tf_profiler_alias = "tf_profiler_alias",
     _tf_profiler_pybind_cc_library_wrapper = "tf_profiler_pybind_cc_library_wrapper",
 )
-load("//tsl:tsl.bzl", "clean_dep")
 
 tf_profiler_pybind_cc_library_wrapper = _tf_profiler_pybind_cc_library_wrapper
 tf_profiler_alias = _tf_profiler_alias
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/convert/BUILD b/third_party/xla/third_party/tsl/tsl/profiler/convert/BUILD
index 12273bf96f1964..e6ab8e6b5ab874 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/convert/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/profiler/convert/BUILD
@@ -1,9 +1,9 @@
-load("//tsl:tsl.bzl", "internal_visibility")
-load("//tsl/platform:build_config.bzl", "tsl_cc_test")
 load(
     "@local_tsl//tsl/platform:rules_cc.bzl",
     "cc_library",
 )
+load("//tsl:tsl.bzl", "internal_visibility")
+load("//tsl/platform:build_config.bzl", "tsl_cc_test")
 load("//tsl/profiler/builds:build_config.bzl", "tf_profiler_copts")
 
 package(
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/lib/BUILD b/third_party/xla/third_party/tsl/tsl/profiler/lib/BUILD
index b598210bff3c36..84f6a1439711aa 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/lib/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/profiler/lib/BUILD
@@ -1,8 +1,8 @@
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("//tsl:tsl.bzl", "if_not_android", "internal_visibility", "nvtx_headers")
 load("//tsl:tsl.default.bzl", "filegroup")
 load("//tsl/platform:build_config.bzl", "tsl_cc_test")
 load("//tsl/platform:build_config_root.bzl", "if_static")
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load(
     "//tsl/platform/default:cuda_build_defs.bzl",
     "if_cuda_is_configured",
@@ -31,6 +31,8 @@ cc_library(
 filegroup(
     name = "mobile_srcs_no_runtime",
     srcs = [
+        "nvtx_utils.h",
+        "nvtx_utils_stub.cc",  #Include the stub implementation here since CUDA isn't relevant to Android.
         "scoped_annotation.h",
         "scoped_memory_debug_annotation.cc",
         "scoped_memory_debug_annotation.h",
@@ -147,8 +149,8 @@ cc_library(
         "//tsl/platform:errors",
         "//tsl/platform:macros",
         "//tsl/platform:statusor",
-        "//tsl/util:env_var",
         "@com_google_absl//absl/strings:string_view",
+        "@local_xla//xla/tsl/util:env_var",
     ],
 )
 
@@ -269,27 +271,28 @@ cc_library(
 
 cc_library(
     name = "nvtx_utils",
+    srcs = if_cuda_is_configured(
+        ["nvtx_utils.cc"],
+        ["nvtx_utils_stub.cc"],
+    ),
     hdrs = ["nvtx_utils.h"],
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
     visibility = ["//visibility:public"],
-    deps = [
-        "//tsl/platform:logging",
-        "//tsl/platform:macros",
-        "//tsl/platform:types",
-        "@com_google_absl//absl/strings",
-    ] + if_cuda_is_configured(nvtx_headers()),
+    deps = if_cuda_is_configured(nvtx_headers()),
 )
 
 cc_library(
     name = "scoped_annotation",
-    hdrs = ["scoped_annotation.h"],
+    hdrs = [
+        "nvtx_utils.h",
+        "scoped_annotation.h",
+    ],
     visibility = ["//visibility:public"],
     deps = [
+        ":nvtx_utils",
         "//tsl/platform:macros",
         "//tsl/platform:types",
         "@com_google_absl//absl/strings",
     ] + if_not_android([
-        ":nvtx_utils",
         "//tsl/profiler/backends/cpu:annotation_stack",
     ]),
 )
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/lib/connected_traceme.h b/third_party/xla/third_party/tsl/tsl/profiler/lib/connected_traceme.h
index e6e5bfed1493cc..a4b01ae517f650 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/lib/connected_traceme.h
+++ b/third_party/xla/third_party/tsl/tsl/profiler/lib/connected_traceme.h
@@ -15,6 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_TSL_PROFILER_LIB_CONNECTED_TRACEME_H_
 #define TENSORFLOW_TSL_PROFILER_LIB_CONNECTED_TRACEME_H_
 
+#include <optional>
 #include <string>
 #include <utility>
 
@@ -79,7 +80,7 @@ class TraceMeProducer : public TraceMe {
   template <typename NameT>
   explicit TraceMeProducer(NameT&& name,
                            ContextType context_type = ContextType::kGeneric,
-                           absl::optional<uint64> context_id = absl::nullopt,
+                           std::optional<uint64> context_id = std::nullopt,
                            int level = 2)
       : TraceMe(std::forward<NameT>(name), level),
         context_id_(context_id.has_value() ? context_id.value()
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/lib/nvtx_utils.cc b/third_party/xla/third_party/tsl/tsl/profiler/lib/nvtx_utils.cc
new file mode 100644
index 00000000000000..b122c6e12dfc19
--- /dev/null
+++ b/third_party/xla/third_party/tsl/tsl/profiler/lib/nvtx_utils.cc
@@ -0,0 +1,84 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tsl/profiler/lib/nvtx_utils.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <type_traits>
+
+#include "nvtx3/nvToolsExt.h"
+#include "nvtx3/nvToolsExtPayload.h"
+
+namespace tsl::profiler {
+static_assert(std::is_pointer_v<nvtxDomainHandle_t>);
+static_assert(std::is_pointer_v<nvtxStringHandle_t>);
+
+ProfilerDomainHandle DefaultProfilerDomain() {
+  static ProfilerDomainHandle domain =
+      reinterpret_cast<ProfilerDomainHandle>(nvtxDomainCreateA("TSL"));
+  return domain;
+}
+
+void RangePop(ProfilerDomainHandle domain) {
+  nvtxDomainRangePop(reinterpret_cast<nvtxDomainHandle_t>(domain));
+}
+
+void RangePush(ProfilerDomainHandle domain, const char* ascii) {
+  nvtxEventAttributes_t attrs{};
+  attrs.version = NVTX_VERSION;
+  attrs.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+  attrs.messageType = NVTX_MESSAGE_TYPE_ASCII;
+  attrs.message.ascii = ascii;
+  nvtxDomainRangePushEx(reinterpret_cast<nvtxDomainHandle_t>(domain), &attrs);
+}
+
+namespace detail {
+void RangePush(ProfilerDomainHandle domain, StringHandle title,
+               uint64_t schema_id, const void* payload, size_t payload_size) {
+  nvtxEventAttributes_t attrs{};
+  attrs.version = NVTX_VERSION;
+  attrs.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+  attrs.messageType = NVTX_MESSAGE_TYPE_REGISTERED;
+  attrs.message.registered = reinterpret_cast<nvtxStringHandle_t>(title);
+  NVTX_PAYLOAD_EVTATTR_SET(attrs, schema_id, payload, payload_size);
+  nvtxDomainRangePushEx(reinterpret_cast<nvtxDomainHandle_t>(domain), &attrs);
+}
+}  // namespace detail
+
+uint64_t RegisterSchema(ProfilerDomainHandle domain, const void* schemaAttr) {
+  return nvtxPayloadSchemaRegister(
+      reinterpret_cast<nvtxDomainHandle_t>(domain),
+      static_cast<const nvtxPayloadSchemaAttr_t*>(schemaAttr));
+}
+
+StringHandle RegisterString(ProfilerDomainHandle domain,
+                            const std::string& str) {
+  const auto impl = [domain](const char* c_str) {
+    return reinterpret_cast<StringHandle>(nvtxDomainRegisterStringA(
+        reinterpret_cast<nvtxDomainHandle_t>(domain), c_str));
+  };
+  constexpr auto max_length = 65330;
+  if (str.size() <= max_length) {
+    return impl(str.c_str());
+  }
+  // nvbugs 4340868
+  std::string_view suffix{"\n[truncated]\n"};
+  std::string buffer(str.data(), max_length - suffix.size());
+  buffer.append(suffix);
+  return impl(buffer.c_str());
+}
+}  // namespace tsl::profiler
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/lib/nvtx_utils.h b/third_party/xla/third_party/tsl/tsl/profiler/lib/nvtx_utils.h
index 8713550b3a20c8..0727072d06390d 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/lib/nvtx_utils.h
+++ b/third_party/xla/third_party/tsl/tsl/profiler/lib/nvtx_utils.h
@@ -16,73 +16,54 @@ limitations under the License.
 #ifndef TENSORFLOW_TSL_PROFILER_LIB_NVTX_UTILS_H_
 #define TENSORFLOW_TSL_PROFILER_LIB_NVTX_UTILS_H_
 
-#include <optional>
+#include <stddef.h>
+
+#include <cstdint>
 #include <string>
 
-#if GOOGLE_CUDA
-#include "nvtx3/nvToolsExt.h"
-#include "nvtx3/nvToolsExtPayload.h"
-#else
-// Some typedef to help build without NVTX.
-typedef void* nvtxDomainHandle_t;
-typedef void* nvtxStringHandle_t;
-#endif
+namespace tsl::profiler {
+struct String;
+// Opaque handle to a string that has been pre-registered with the profiler/NVTX
+// implementation
+using StringHandle = String*;
 
-namespace tsl {
-namespace profiler {
+struct ProfilerDomain;
+// Opaque handle to a domain in the profiler/NVTX implementation
+using ProfilerDomainHandle = ProfilerDomain*;
 
-// A helper function that return the domains to use if NVTX profiling
-// is enabled.
-inline std::optional<nvtxDomainHandle_t> GetNVTXDomain() {
-#if GOOGLE_CUDA
-  static nvtxDomainHandle_t domain = nvtxDomainCreateA("TSL");
-  if (domain != nullptr) return domain;
-#endif
-  return std::nullopt;
-}
+// Get the "TSL" domain if NVTX profiling is enabled, otherwise null
+ProfilerDomainHandle DefaultProfilerDomain();
 
-// A helper function to decide whether to enable CUDA NVTX profiling ranges.
-inline bool RangesEnabled() {
-#if GOOGLE_CUDA
-  return GetNVTXDomain().has_value();
-#else
-  return false;
-#endif
-}
+// Register a string with the profiler/NVTX implementation for faster use
+StringHandle RegisterString(ProfilerDomainHandle, const std::string&);
+
+// End a range that was created on this thread by RangePush
+void RangePop(ProfilerDomainHandle);
 
 // Older/simpler version; NVTX implementation copies a C-style string each time
-inline void RangePush(nvtxDomainHandle_t domain, const char* ascii) {
-#if GOOGLE_CUDA
-  nvtxEventAttributes_t attrs{};
-  attrs.version = NVTX_VERSION;
-  attrs.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
-  attrs.messageType = NVTX_MESSAGE_TYPE_ASCII;
-  attrs.message.ascii = ascii;
-  ::nvtxDomainRangePushEx(domain, &attrs);
-#endif
-}
-inline void RangePush(nvtxDomainHandle_t domain, const std::string& str) {
+void RangePush(ProfilerDomainHandle domain, const char*);
+inline void RangePush(ProfilerDomainHandle domain, const std::string& str) {
   RangePush(domain, str.c_str());
 }
 
-// More powerful version: pass a registered string instead of a C-style string,
-// and attach a generic payload. The Annotation type must implement a method
-// called NvtxSchemaId() that allows the NVTX backend to interpret the payload.
+namespace detail {
+void RangePush(ProfilerDomainHandle domain, StringHandle title,
+               uint64_t schema_id, const void* payload, size_t payload_size);
+}
+
+// More powerful version: pass a registered string instead of a C-style
+// string, and attach a generic payload. The Annotation type must implement a
+// method called NvtxSchemaId() that allows the NVTX backend to interpret the
+// payload.
 template <typename Annotation>
-void RangePush(nvtxDomainHandle_t domain, nvtxStringHandle_t handle,
+void RangePush(ProfilerDomainHandle domain, StringHandle title,
                const Annotation& annotation) {
-#if GOOGLE_CUDA
-  nvtxEventAttributes_t attrs{};
-  attrs.version = NVTX_VERSION;
-  attrs.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
-  attrs.messageType = NVTX_MESSAGE_TYPE_REGISTERED;
-  attrs.message.registered = handle;
-  NVTX_PAYLOAD_EVTATTR_SET(attrs, annotation.NvtxSchemaId(), &annotation,
-                           sizeof(Annotation));
-  ::nvtxDomainRangePushEx(domain, &attrs);
-#endif
+  return detail::RangePush(domain, title, annotation.NvtxSchemaId(),
+                           &annotation, sizeof(Annotation));
 }
 
-}  // namespace profiler
-}  // namespace tsl
+// Register the schema of a custom payload type, for use with the more powerful
+// version of RangePush
+uint64_t RegisterSchema(ProfilerDomainHandle domain, const void* schemaAttr);
+}  // namespace tsl::profiler
 #endif  // TENSORFLOW_TSL_PROFILER_LIB_NVTX_UTILS_H_
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/lib/nvtx_utils_stub.cc b/third_party/xla/third_party/tsl/tsl/profiler/lib/nvtx_utils_stub.cc
new file mode 100644
index 00000000000000..c887af77ec8b11
--- /dev/null
+++ b/third_party/xla/third_party/tsl/tsl/profiler/lib/nvtx_utils_stub.cc
@@ -0,0 +1,29 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tsl/profiler/lib/nvtx_utils.h"
+
+namespace tsl::profiler {
+ProfilerDomainHandle DefaultProfilerDomain() { return {}; }
+void RangePop(ProfilerDomainHandle) {}
+void RangePush(ProfilerDomainHandle, const char*) {}
+namespace detail {
+void RangePush(ProfilerDomainHandle, StringHandle, uint64_t, const void*,
+               size_t) {}
+}  // namespace detail
+uint64_t RegisterSchema(ProfilerDomainHandle, const void*) { return 0; }
+StringHandle RegisterString(ProfilerDomainHandle, const std::string&) {
+  return {};
+}
+}  // namespace tsl::profiler
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/lib/profiler_lock.cc b/third_party/xla/third_party/tsl/tsl/profiler/lib/profiler_lock.cc
index e99db5ae366969..325713117a333a 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/lib/profiler_lock.cc
+++ b/third_party/xla/third_party/tsl/tsl/profiler/lib/profiler_lock.cc
@@ -16,10 +16,10 @@ limitations under the License.
 
 #include <atomic>
 
+#include "xla/tsl/util/env_var.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/macros.h"
 #include "tsl/platform/statusor.h"
-#include "tsl/util/env_var.h"
 
 namespace tsl {
 namespace profiler {
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/lib/scoped_annotation.h b/third_party/xla/third_party/tsl/tsl/profiler/lib/scoped_annotation.h
index b779d600959466..c41a2a39a8dc3a 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/lib/scoped_annotation.h
+++ b/third_party/xla/third_party/tsl/tsl/profiler/lib/scoped_annotation.h
@@ -23,25 +23,23 @@ limitations under the License.
 #include <utility>
 
 #include "tsl/platform/macros.h"
+#include "tsl/profiler/lib/nvtx_utils.h"
 
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tsl/profiler/backends/cpu/annotation_stack.h"
 #endif
 
-#if GOOGLE_CUDA
-#include "tsl/profiler/lib/nvtx_utils.h"
-#endif
-
-namespace tsl {
-namespace profiler {
+namespace tsl::profiler {
 
 // Adds an annotation to all activities through the currently registered
 // TraceCollector until PopAnnotation() is called.
 template <typename T>
-inline void PushAnnotation(const T& generator) {
+void PushAnnotation(const T& generator) {
 #if GOOGLE_CUDA
-  if (auto domain = GetNVTXDomain(); TF_PREDICT_FALSE(domain.has_value())) {
-    return RangePush(*domain, generator());
+  if (auto domain = DefaultProfilerDomain();
+      TF_PREDICT_FALSE(domain != nullptr)) {
+    RangePush(domain, generator());
+    return;
   }
 #endif
 
@@ -65,8 +63,9 @@ inline void PopAnnotation() {
   std::atomic_thread_fence(std::memory_order_acquire);
 
 #if GOOGLE_CUDA
-  if (auto domain = GetNVTXDomain(); TF_PREDICT_FALSE(domain.has_value())) {
-    ::nvtxDomainRangePop(*domain);
+  if (auto domain = DefaultProfilerDomain();
+      TF_PREDICT_FALSE(domain != nullptr)) {
+    RangePop(domain);
     return;
   }
 #endif
@@ -110,7 +109,6 @@ class ScopedAnnotation {
   ScopedAnnotation& operator=(const ScopedAnnotation&) = delete;
 };
 
-}  // namespace profiler
-}  // namespace tsl
+}  // namespace tsl::profiler
 
 #endif  // TENSORFLOW_TSL_PROFILER_LIB_SCOPED_ANNOTATION_H_
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/protobuf/BUILD b/third_party/xla/third_party/tsl/tsl/profiler/protobuf/BUILD
index f43bc94f52f585..428a963d2c31e3 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/protobuf/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/profiler/protobuf/BUILD
@@ -1,5 +1,5 @@
-# Placeholder: load py_proto_library
 # copybara:uncomment(oss-unused) load("//net/grpc/go/build_defs:go_grpc_library.bzl", "go_grpc_library")
+# Placeholder: load py_proto_library
 load("//tsl:tsl.bzl", "internal_visibility")
 load("//tsl/platform:build_config.bzl", "tf_proto_library")
 
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/rpc/BUILD b/third_party/xla/third_party/tsl/tsl/profiler/rpc/BUILD
index debedf53094de4..d87ad885f05aab 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/rpc/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/profiler/rpc/BUILD
@@ -1,6 +1,6 @@
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("//tsl:tsl.bzl", "internal_visibility")
 load("//tsl:tsl.default.bzl", "tsl_grpc_cc_dependencies")
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load(
     "//tsl/profiler/builds:build_config.bzl",
     "tf_profiler_copts",
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/rpc/client/BUILD b/third_party/xla/third_party/tsl/tsl/profiler/rpc/client/BUILD
index deb9383157a594..f9210aeaab557e 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/rpc/client/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/profiler/rpc/client/BUILD
@@ -1,6 +1,6 @@
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("//tsl:tsl.bzl", "internal_visibility")
 load("//tsl:tsl.default.bzl", "tsl_grpc_cc_dependencies")
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load(
     "//tsl/platform:build_config.bzl",
     "tf_protos_profiler_service",
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/utils/BUILD b/third_party/xla/third_party/tsl/tsl/profiler/utils/BUILD
index 527723044ae302..41d669aedbd103 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/utils/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/profiler/utils/BUILD
@@ -1,7 +1,7 @@
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("//tsl:tsl.bzl", "internal_visibility")
 load("//tsl/platform:build_config.bzl", "tsl_cc_test")
 load("//tsl/platform:build_config_root.bzl", "if_static")
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load("//tsl/profiler/builds:build_config.bzl", "tf_profiler_copts")
 
 package(
@@ -213,12 +213,12 @@ cc_library(
         "//tsl/platform:types",
         "//tsl/profiler/lib:context_types",
         "//tsl/profiler/protobuf:xplane_proto_cc",
-        "//tsl/util:stats_calculator_portable",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/strings",
+        "@local_xla//xla/tsl/util:stats_calculator_portable",
     ],
 )
 
diff --git a/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_utils.cc b/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_utils.cc
index 333b93743ae64f..88c7e30b76eee5 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_utils.cc
+++ b/third_party/xla/third_party/tsl/tsl/profiler/utils/xplane_utils.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "absl/log/log.h"
 #include "absl/strings/match.h"
 #include "absl/strings/string_view.h"
+#include "xla/tsl/util/stats_calculator.h"
 #include "tsl/platform/fingerprint.h"
 #include "tsl/platform/types.h"
 #include "tsl/profiler/lib/context_types.h"
@@ -38,7 +39,6 @@ limitations under the License.
 #include "tsl/profiler/utils/xplane_builder.h"
 #include "tsl/profiler/utils/xplane_schema.h"
 #include "tsl/profiler/utils/xplane_visitor.h"
-#include "tsl/util/stats_calculator.h"
 
 namespace tsl {
 namespace profiler {
diff --git a/third_party/xla/third_party/tsl/tsl/protobuf/BUILD b/third_party/xla/third_party/tsl/tsl/protobuf/BUILD
index db185588785a46..557d9fcf2208e6 100644
--- a/third_party/xla/third_party/tsl/tsl/protobuf/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/protobuf/BUILD
@@ -105,7 +105,7 @@ tf_proto_library(
     make_default_target_header_only = True,
     visibility = internal_visibility([
         "//tensorflow/core:__subpackages__",
-        "//tsl/util:__pkg__",
+        "@local_xla//xla/tsl/util:__pkg__",
     ]),
 )
 
diff --git a/third_party/xla/third_party/tsl/workspace0.bzl b/third_party/xla/third_party/tsl/workspace0.bzl
index 82419fece2d464..30a9426e5c6044 100644
--- a/third_party/xla/third_party/tsl/workspace0.bzl
+++ b/third_party/xla/third_party/tsl/workspace0.bzl
@@ -1,10 +1,10 @@
 """TensorFlow workspace initialization. Consult the WORKSPACE on how to use it."""
 
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 load("@bazel_toolchains//repositories:repositories.bzl", bazel_toolchains_repositories = "repositories")
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("@build_bazel_apple_support//lib:repositories.bzl", "apple_support_dependencies")
 load("@build_bazel_rules_apple//apple:repositories.bzl", "apple_rules_dependencies")
 load("@build_bazel_rules_swift//swift:repositories.bzl", "swift_rules_dependencies")
-load("@build_bazel_apple_support//lib:repositories.bzl", "apple_support_dependencies")
 load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps")
 
 def _tf_bind():
diff --git a/third_party/xla/third_party/tsl/workspace3.bzl b/third_party/xla/third_party/tsl/workspace3.bzl
index 2c13446fb4c1ab..9510b09374206c 100644
--- a/third_party/xla/third_party/tsl/workspace3.bzl
+++ b/third_party/xla/third_party/tsl/workspace3.bzl
@@ -1,8 +1,8 @@
 """TensorFlow workspace initialization. Consult the WORKSPACE on how to use it."""
 
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
-load("//third_party/tf_runtime:workspace.bzl", tf_runtime = "repo")
 load("//third_party/llvm:workspace.bzl", llvm = "repo")
+load("//third_party/tf_runtime:workspace.bzl", tf_runtime = "repo")
 
 def workspace():
     http_archive(
diff --git a/third_party/xla/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl b/third_party/xla/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl
index 5f2057a3c14360..4455aea60109fa 100644
--- a/third_party/xla/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl
+++ b/third_party/xla/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl
@@ -1,8 +1,8 @@
 """Configurations of AARCH64 builds used with Docker container."""
 
-load("//tools/toolchains:cpus/aarch64/aarch64.bzl", "remote_aarch64_configure")
-load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure")
 load("//third_party/py:python_configure.bzl", "remote_python_configure")
+load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure")
+load("//tools/toolchains:cpus/aarch64/aarch64.bzl", "remote_aarch64_configure")
 
 def ml2014_tf_aarch64_configs(name_container_map, env):
     for name, container in name_container_map.items():
diff --git a/third_party/xla/tools/toolchains/remote_config/rbe_config.bzl b/third_party/xla/tools/toolchains/remote_config/rbe_config.bzl
index b1488584566aa6..18a84d96c39f82 100644
--- a/third_party/xla/tools/toolchains/remote_config/rbe_config.bzl
+++ b/third_party/xla/tools/toolchains/remote_config/rbe_config.bzl
@@ -1,12 +1,12 @@
 """Macro that creates external repositories for remote config."""
 
-load("//third_party/py:python_configure.bzl", "local_python_configure", "remote_python_configure")
 load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure")
-load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure")
 load("//third_party/gpus:rocm_configure.bzl", "remote_rocm_configure")
+load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure")
+load("//third_party/py:python_configure.bzl", "local_python_configure", "remote_python_configure")
+load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure")
 load("//third_party/tensorrt:tensorrt_configure.bzl", "remote_tensorrt_configure")
 load("//tools/toolchains/remote_config:containers.bzl", "containers")
-load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure")
 
 def _container_image_uri(container_name):
     container = containers[container_name]
diff --git a/third_party/xla/tools/toolchains/win/bazel_211/BUILD b/third_party/xla/tools/toolchains/win/bazel_211/BUILD
index cc23c8ecb22680..c7484d2ae2efdf 100644
--- a/third_party/xla/tools/toolchains/win/bazel_211/BUILD
+++ b/third_party/xla/tools/toolchains/win/bazel_211/BUILD
@@ -15,8 +15,8 @@
 # This becomes the BUILD file for @local_config_cc// under Windows.
 
 load("@rules_cc//cc:defs.bzl", "cc_library", "cc_toolchain", "cc_toolchain_suite")
-load(":windows_cc_toolchain_config.bzl", "cc_toolchain_config")
 load(":armeabi_cc_toolchain_config.bzl", "armeabi_cc_toolchain_config")
+load(":windows_cc_toolchain_config.bzl", "cc_toolchain_config")
 
 package(default_visibility = ["//visibility:public"])
 
diff --git a/third_party/xla/tools/toolchains/win/bazel_211/windows_cc_toolchain_config.bzl b/third_party/xla/tools/toolchains/win/bazel_211/windows_cc_toolchain_config.bzl
index 30571b6a5ace85..9ccc1706e5eca5 100644
--- a/third_party/xla/tools/toolchains/win/bazel_211/windows_cc_toolchain_config.bzl
+++ b/third_party/xla/tools/toolchains/win/bazel_211/windows_cc_toolchain_config.bzl
@@ -14,6 +14,7 @@
 
 """A Starlark cc_toolchain configuration rule for Windows"""
 
+load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")
 load(
     "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl",
     "action_config",
@@ -29,7 +30,6 @@ load(
     "variable_with_value",
     "with_feature_set",
 )
-load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")
 
 all_compile_actions = [
     ACTION_NAMES.c_compile,
diff --git a/third_party/xla/tools/toolchains/win/tf_win_05022023/BUILD b/third_party/xla/tools/toolchains/win/tf_win_05022023/BUILD
index f245f6d0789c9d..8a2ae6fe4a9dd3 100644
--- a/third_party/xla/tools/toolchains/win/tf_win_05022023/BUILD
+++ b/third_party/xla/tools/toolchains/win/tf_win_05022023/BUILD
@@ -15,8 +15,8 @@
 # This becomes the BUILD file for @local_config_cc// under Windows.
 
 load("@rules_cc//cc:defs.bzl", "cc_library", "cc_toolchain", "cc_toolchain_suite")
-load(":windows_cc_toolchain_config.bzl", "cc_toolchain_config")
 load(":armeabi_cc_toolchain_config.bzl", "armeabi_cc_toolchain_config")
+load(":windows_cc_toolchain_config.bzl", "cc_toolchain_config")
 
 package(default_visibility = ["//visibility:public"])
 
diff --git a/third_party/xla/tools/toolchains/win/tf_win_05022023/windows_cc_toolchain_config.bzl b/third_party/xla/tools/toolchains/win/tf_win_05022023/windows_cc_toolchain_config.bzl
index ba3de607d10451..d6b966b32cecad 100644
--- a/third_party/xla/tools/toolchains/win/tf_win_05022023/windows_cc_toolchain_config.bzl
+++ b/third_party/xla/tools/toolchains/win/tf_win_05022023/windows_cc_toolchain_config.bzl
@@ -14,6 +14,7 @@
 
 """A Starlark cc_toolchain configuration rule for Windows"""
 
+load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")
 load(
     "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl",
     "action_config",
@@ -28,7 +29,6 @@ load(
     "variable_with_value",
     "with_feature_set",
 )
-load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")
 
 all_compile_actions = [
     ACTION_NAMES.c_compile,
diff --git a/third_party/xla/warnings.bazelrc b/third_party/xla/warnings.bazelrc
new file mode 100644
index 00000000000000..00e9d3f58028d2
--- /dev/null
+++ b/third_party/xla/warnings.bazelrc
@@ -0,0 +1,98 @@
+# This file is autogenerated! Do not edit!
+
+# Treat warnings as errors...
+build:warnings --copt=-Werror --host_copt=-Werror
+# ...and silence them outside of the workspace.
+build:warnings --per_file_copt=external/.*@-w
+# ...and silence them on host builds. There is no host_per_file_copt and
+# everything we build in the host configuration we either also build in the
+# target configuration or is external, so we can't control it.
+# If/when Bazel supports --host_per_file_copt, we could use that instead:
+# https://github.com/bazelbuild/bazel/issues/12406.
+# Would need to then make all the --copt below duplicated with --host_copt.
+build:warnings --host_copt=-w
+
+build:warnings --copt=-Wall
+build:warnings --copt=-Werror
+build:warnings --copt=-Wno-address-of-packed-member
+build:warnings --copt=-Wno-defaulted-function-deleted
+build:warnings --copt=-Wno-enum-compare-switch
+build:warnings --copt=-Wno-expansion-to-defined
+build:warnings --copt=-Wno-ignored-attributes
+build:warnings --copt=-Wno-ignored-qualifiers
+build:warnings --copt=-Wno-inconsistent-missing-override
+build:warnings --copt=-Wno-potentially-evaluated-expression
+build:warnings --copt=-Wno-range-loop-analysis
+build:warnings --copt=-Wno-strict-prototypes
+build:warnings --copt=-Wno-tautological-type-limit-compare
+build:warnings --copt=-Wno-tautological-undefined-compare
+build:warnings --copt=-Wno-tautological-unsigned-zero-compare
+build:warnings --copt=-Wno-tautological-unsigned-enum-zero-compare
+build:warnings --copt=-Wno-undefined-func-template
+build:warnings --copt=-Wno-unused-but-set-variable
+build:warnings --copt=-Wno-unused-lambda-capture
+build:warnings --copt=-Wno-unused-local-typedef
+build:warnings --copt=-Wno-deprecated-builtins
+build:warnings --copt=-Wno-deprecated-volatile
+build:warnings --copt=-Wno-deprecated-anon-enum-enum-conversion
+build:warnings --copt=-Wno-deprecated-enum-compare
+build:warnings --copt=-Wno-deprecated-enum-enum-conversion
+build:warnings --copt=-Wno-deprecated-enum-compare-conditional
+build:warnings --copt=-Wno-deprecated-enum-float-conversion
+build:warnings --copt=-Wno-deprecated-this-capture
+build:warnings --copt=-Wno-deprecated-array-compare
+build:warnings --copt=-Wno-deprecated-comma-subscript
+build:warnings --copt=-Wno-bitfield-constant-conversion
+build:warnings --copt=-Wno-bitwise-instead-of-logical
+build:warnings --copt=-Wno-comment
+build:warnings --copt=-Wno-compound-token-split
+build:warnings --copt=-Wno-deprecated-non-prototype
+build:warnings --copt=-Wno-enum-constexpr-conversion
+build:warnings --copt=-Wno-misleading-indentation
+build:warnings --copt=-Wno-psabi
+build:warnings --copt=-Wno-unqualified-std-cast-call
+build:warnings --copt=-Wno-ambiguous-member-template
+build:warnings --copt=-Wno-char-subscripts
+build:warnings --copt=-Wno-deprecated-declarations
+build:warnings --copt=-Wno-deprecated-pragma
+build:warnings --copt=-Wno-extern-c-compat
+build:warnings --copt=-Wno-gnu-alignof-expression
+build:warnings --copt=-Wno-gnu-variable-sized-type-not-at-end
+build:warnings --copt=-Wno-implicit-int-float-conversion
+build:warnings --copt=-Wno-invalid-source-encoding
+build:warnings --copt=-Wno-mismatched-tags
+build:warnings --copt=-Wno-pointer-sign
+build:warnings --copt=-Wno-private-header
+build:warnings --copt=-Wno-sign-compare
+build:warnings --copt=-Wno-strict-overflow
+build:warnings --copt=-Wno-unknown-pragmas
+build:warnings --copt=-Wno-unused-command-line-argument
+build:warnings --copt=-Wno-unused-const-variable
+build:warnings --copt=-Wno-unused-function
+build:warnings --copt=-Wno-unused-private-field
+build:warnings --copt=-Wno-user-defined-warnings
+build:warnings --copt=-Wfloat-overflow-conversion
+build:warnings --copt=-Wfloat-zero-conversion
+build:warnings --copt=-Wfor-loop-analysis
+build:warnings --copt=-Wgnu-redeclared-enum
+build:warnings --copt=-Winfinite-recursion
+build:warnings --copt=-Wself-assign
+build:warnings --copt=-Wstring-conversion
+build:warnings --copt=-Wtautological-overlap-compare
+build:warnings --copt=-Wunused-but-set-parameter
+build:warnings --copt=-Wunused-comparison
+build:warnings --copt=-Wvla
+build:warnings --copt=-Wno-return-type-c-linkage
+build:warnings --copt=-Wno-self-assign-overloaded
+build:warnings --copt=-Wctad-maybe-unsupported
+build:warnings --copt=-Wthread-safety-beta
+build:warnings --copt=-Wno-trigraphs
+build:warnings --copt=-Woverloaded-virtual
+build:warnings --copt=-Wno-invalid-offsetof
+build:warnings --copt=-Wno-final-dtor-non-final-class
+build:warnings --copt=-Wnon-virtual-dtor
+build:warnings --copt=-Wimplicit-fallthrough
+build:warnings --copt=-Wthread-safety-analysis
+build:warnings --copt=-Wno-tautological-type-limit-compare
+build:warnings --copt=-Wno-builtin-macro-redefined
+build:warnings --copt=-Wno-macro-redefined
diff --git a/third_party/xla/workspace0.bzl b/third_party/xla/workspace0.bzl
index 014ffbbebbbfed..76b8ed2bbae1f2 100644
--- a/third_party/xla/workspace0.bzl
+++ b/third_party/xla/workspace0.bzl
@@ -1,11 +1,11 @@
 """TensorFlow workspace initialization. Consult the WORKSPACE on how to use it."""
 
-load("@local_tsl//:workspace0.bzl", "tsl_workspace0")
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 load("@bazel_toolchains//repositories:repositories.bzl", bazel_toolchains_repositories = "repositories")
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("@build_bazel_apple_support//lib:repositories.bzl", "apple_support_dependencies")
 load("@build_bazel_rules_apple//apple:repositories.bzl", "apple_rules_dependencies")
 load("@build_bazel_rules_swift//swift:repositories.bzl", "swift_rules_dependencies")
-load("@build_bazel_apple_support//lib:repositories.bzl", "apple_support_dependencies")
+load("@local_tsl//:workspace0.bzl", "tsl_workspace0")
 
 def _tf_bind():
     """Bind targets for some external repositories"""
diff --git a/third_party/xla/workspace1.bzl b/third_party/xla/workspace1.bzl
index 961549f98a1001..90ab0e28809e5c 100644
--- a/third_party/xla/workspace1.bzl
+++ b/third_party/xla/workspace1.bzl
@@ -1,9 +1,9 @@
 """TensorFlow workspace initialization. Consult the WORKSPACE on how to use it."""
 
-load("@local_tsl//:workspace1.bzl", "tsl_workspace1")
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps")
 load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories")
+load("@local_tsl//:workspace1.bzl", "tsl_workspace1")
 load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies")
 
 # buildifier: disable=unnamed-macro
diff --git a/third_party/xla/workspace2.bzl b/third_party/xla/workspace2.bzl
index 45245072af415d..9b9fd5e9265ed9 100644
--- a/third_party/xla/workspace2.bzl
+++ b/third_party/xla/workspace2.bzl
@@ -10,6 +10,7 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 # Import third party repository rules. See go/tfbr-thirdparty.
 load("//third_party/dlpack:workspace.bzl", dlpack = "repo")
 load("//third_party/gloo:workspace.bzl", gloo = "repo")
+load("//third_party/mpitrampoline:workspace.bzl", mpitrampoline = "repo")
 load("//third_party/nanobind:workspace.bzl", nanobind = "repo")
 load("//third_party/robin_map:workspace.bzl", robin_map = "repo")
 load("//third_party/stablehlo:workspace.bzl", stablehlo = "repo")
@@ -19,6 +20,7 @@ def _initialize_third_party():
     """ Load third party repositories.  See above load() statements. """
     dlpack()
     gloo()
+    mpitrampoline()
     nanobind()
     robin_map()
     stablehlo()
@@ -38,9 +40,9 @@ def _tf_repositories():
         name = "cudnn_frontend_archive",
         build_file = "//third_party:cudnn_frontend.BUILD",
         patch_file = ["//third_party:cudnn_frontend_header_fix.patch"],
-        sha256 = "c2f5373ddf84e33d289dad5766667f52de652dfbbb1dccb2fada9cfcf2d774cf",
-        strip_prefix = "cudnn-frontend-1.1.0",
-        urls = tf_mirror_urls("https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.1.0.zip"),
+        sha256 = "1bb309af98fe9aad81b6a14fd52acbd6566aacfd322fc5803f9a1b77fc681a27",
+        strip_prefix = "cudnn-frontend-1.2.1",
+        urls = tf_mirror_urls("https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.2.1.zip"),
     )
 
     tf_http_archive(
diff --git a/third_party/xla/xla/BUILD b/third_party/xla/xla/BUILD
index 5d2be770ff3e4c..b9434882faf5c1 100644
--- a/third_party/xla/xla/BUILD
+++ b/third_party/xla/xla/BUILD
@@ -1,6 +1,3 @@
-# Placeholder: load py_proto_library
-load("//xla:xla.bzl", "xla_cc_test", "xla_py_proto_library")
-load("//third_party/compute_library:build_defs.bzl", "if_enable_acl")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load("@local_tsl//tsl:tsl.default.bzl", "filegroup", "get_compatible_with_portable")
 load(
@@ -8,6 +5,10 @@ load(
     "tf_proto_library",
 )
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//third_party/compute_library:build_defs.bzl", "if_enable_acl")
+
+# Placeholder: load py_proto_library
+load("//xla:xla.bzl", "xla_cc_test", "xla_py_proto_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -25,6 +26,7 @@ package_group(
         "//third_party/mira/...",
         "//third_party/mlcompass/...",
         "//third_party/mlir_edge/model_curriculum/...",
+        "//third_party/openxla/shardonnay/...",
         "//third_party/py/enzyme_ad/...",
         "//third_party/py/jax/...",
         "//third_party/py/t5x/...",
@@ -248,9 +250,7 @@ xla_cc_test(
     name = "types_test",
     size = "small",
     srcs = ["types_test.cc"],
-    visibility = [
-        "//visibility:private",  # Only private by automation, not intent. Owner may accept CLs adding visibility. See go/scheuklappen#explicit-private.
-    ],
+    visibility = ["//visibility:private"],
     deps = [
         ":test",
         ":types",
@@ -602,11 +602,13 @@ cc_library(
         ":types",
         ":util",
         ":xla_data_proto_cc",
+        "//xla/tsl/util:byte_swap_array",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/base:config",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/functional:function_ref",
+        "@com_google_absl//absl/hash",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
@@ -619,7 +621,6 @@ cc_library(
         "@local_tsl//tsl/platform:platform_port",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/util:byte_swap_array",
     ],
 )
 
@@ -869,11 +870,10 @@ cc_library(
     hdrs = ["test_helpers.h"],
     visibility = internal_visibility([":friends"]),
     deps = [
+        ":status",
         ":statusor",
         ":types",
         "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/platform:protobuf",
-        "@local_tsl//tsl/platform:regexp",
         "@local_tsl//tsl/platform:test",
     ],
 )
@@ -1071,12 +1071,12 @@ cc_library(
     deps =
         [
             ":types",
+            "//xla/tsl/util:command_line_flags",
             "@com_google_absl//absl/container:flat_hash_map",
             "@com_google_absl//absl/strings",
             "@com_google_absl//absl/strings:str_format",
             "@com_google_absl//absl/types:span",
             "@local_tsl//tsl/platform:logging",
-            "@local_tsl//tsl/util:command_line_flags",
         ],
 )
 
@@ -1086,12 +1086,12 @@ xla_cc_test(
     deps =
         [
             ":parse_flags_from_env",
+            "//xla/tsl/util:command_line_flags",
             "@com_google_absl//absl/strings:str_format",
             "@local_tsl//tsl/platform:env",
             "@local_tsl//tsl/platform:logging",
             "@local_tsl//tsl/platform:subprocess",
             "@local_tsl//tsl/platform:test",
-            "@local_tsl//tsl/util:command_line_flags",
         ],
 )
 
@@ -1109,6 +1109,7 @@ cc_library(
             ":parse_flags_from_env",
             ":xla_proto_cc",
             "//xla/stream_executor/cuda:ptx_compiler_support",
+            "//xla/tsl/util:command_line_flags",
             "@com_google_absl//absl/algorithm:container",
             "@com_google_absl//absl/base",
             "@com_google_absl//absl/container:flat_hash_map",
@@ -1117,7 +1118,6 @@ cc_library(
             "@com_google_absl//absl/strings:str_format",
             "@local_tsl//tsl/platform:logging",
             "@local_tsl//tsl/platform:protobuf",
-            "@local_tsl//tsl/util:command_line_flags",
         ],
 )
 
@@ -1253,4 +1253,5 @@ cc_library(
 alias(
     name = "bazel_issue_21519",
     actual = ":empty",
+    visibility = ["//visibility:public"],
 )
diff --git a/third_party/xla/xla/backends/profiler/cpu/BUILD b/third_party/xla/xla/backends/profiler/cpu/BUILD
index af534da5582e5f..6cfd1f1a51815b 100644
--- a/third_party/xla/xla/backends/profiler/cpu/BUILD
+++ b/third_party/xla/xla/backends/profiler/cpu/BUILD
@@ -1,10 +1,10 @@
+load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("@local_tsl//tsl/profiler/builds:build_config.bzl", "tf_profiler_copts")
 load(
     "//xla:xla.bzl",
     "xla_cc_test",
 )
-load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
-load("@local_tsl//tsl/profiler/builds:build_config.bzl", "tf_profiler_copts")
 
 # copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
 
@@ -126,7 +126,6 @@ xla_cc_test(
     srcs = ["host_tracer_test.cc"],
     deps = [
         ":host_tracer_impl",
-        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
         "@com_google_googletest//:gtest_main",
         "@local_tsl//tsl/lib/core:status_test_util",
diff --git a/third_party/xla/xla/backends/profiler/cpu/host_tracer_test.cc b/third_party/xla/xla/backends/profiler/cpu/host_tracer_test.cc
index 2d71ca5286ad42..881f46e50837ff 100644
--- a/third_party/xla/xla/backends/profiler/cpu/host_tracer_test.cc
+++ b/third_party/xla/xla/backends/profiler/cpu/host_tracer_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <ostream>
 #include <string>
 
-#include "absl/strings/string_view.h"
 #include "absl/types/optional.h"
 #include "tsl/lib/core/status_test_util.h"
 #include "tsl/platform/env.h"
diff --git a/third_party/xla/xla/backends/profiler/gpu/BUILD b/third_party/xla/xla/backends/profiler/gpu/BUILD
index 35094fd55c1839..5ea81b05c3276d 100644
--- a/third_party/xla/xla/backends/profiler/gpu/BUILD
+++ b/third_party/xla/xla/backends/profiler/gpu/BUILD
@@ -1,8 +1,4 @@
 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library", "if_cuda")
-load(
-    "//xla:xla.bzl",
-    "xla_cc_test",
-)
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load(
     "@local_tsl//tsl:tsl.bzl",
@@ -23,6 +19,10 @@ load(
     "if_cuda_is_configured",
 )
 load("@local_tsl//tsl/profiler/builds:build_config.bzl", "tf_profiler_copts")
+load(
+    "//xla:xla.bzl",
+    "xla_cc_test",
+)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -37,10 +37,12 @@ tsl_gpu_library(
         ":cupti_collector",
         ":cupti_tracer",
         ":cupti_wrapper",
+        ":rocm_collector",
         ":rocm_tracer",
     ],
     deps = [
         ":cupti_utils",
+        "//xla/tsl/util:env_var",
         "@com_google_absl//absl/container:fixed_array",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -51,7 +53,6 @@ tsl_gpu_library(
         "@local_tsl//tsl/profiler/lib:profiler_interface",
         "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc",
         "@local_tsl//tsl/profiler/utils:time_utils",
-        "@local_tsl//tsl/util:env_var",
     ],
     alwayslink = 1,
 )
@@ -220,6 +221,37 @@ tsl_gpu_library(
     ],
 )
 
+tsl_gpu_library(
+    name = "rocm_collector",
+    srcs = if_rocm(["rocm_collector.cc"]),
+    hdrs = if_rocm(["rocm_collector.h"]),
+    copts = tf_profiler_copts() + tsl_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        "//xla/stream_executor/rocm:roctracer_wrapper",
+        "@com_google_absl//absl/container:fixed_array",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:node_hash_map",
+        "@com_google_absl//absl/container:node_hash_set",
+        "@com_google_absl//absl/types:optional",
+        "@local_tsl//tsl/platform:env",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:logging",
+        "@local_tsl//tsl/platform:macros",
+        "@local_tsl//tsl/platform:platform_port",
+        "@local_tsl//tsl/profiler/backends/cpu:annotation_stack",
+        "@local_tsl//tsl/profiler/lib:profiler_factory",
+        "@local_tsl//tsl/profiler/lib:profiler_interface",
+        "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc",
+        "@local_tsl//tsl/profiler/utils:parse_annotation",
+        "@local_tsl//tsl/profiler/utils:time_utils",
+        "@local_tsl//tsl/profiler/utils:trace_utils",
+        "@local_tsl//tsl/profiler/utils:xplane_builder",
+        "@local_tsl//tsl/profiler/utils:xplane_schema",
+        "@local_tsl//tsl/profiler/utils:xplane_utils",
+    ],
+)
+
 tsl_gpu_library(
     name = "rocm_tracer",
     srcs = if_rocm(["rocm_tracer.cc"]),
@@ -227,6 +259,7 @@ tsl_gpu_library(
     copts = tf_profiler_copts() + tsl_copts(),
     visibility = ["//visibility:public"],
     deps = [
+        ":rocm_collector",
         "//xla/stream_executor/rocm:roctracer_wrapper",
         "@com_google_absl//absl/container:fixed_array",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -239,6 +272,7 @@ tsl_gpu_library(
         "@local_tsl//tsl/platform:macros",
         "@local_tsl//tsl/platform:platform_port",
         "@local_tsl//tsl/profiler/backends/cpu:annotation_stack",
+        "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc",
         "@local_tsl//tsl/profiler/utils:time_utils",
     ],
 )
@@ -295,7 +329,7 @@ tsl_gpu_library(
         "@com_google_absl//absl/memory",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:stringpiece",
-        "@local_tsl//tsl/util:env_var",
+        "//xla/tsl/util:env_var",
     ],
     visibility = ["//visibility:public"],
     alwayslink = 1,
diff --git a/third_party/xla/xla/backends/profiler/gpu/cupti_utils.cc b/third_party/xla/xla/backends/profiler/gpu/cupti_utils.cc
index ee9a542485a48c..a4198811286bed 100644
--- a/third_party/xla/xla/backends/profiler/gpu/cupti_utils.cc
+++ b/third_party/xla/xla/backends/profiler/gpu/cupti_utils.cc
@@ -17,9 +17,9 @@ limitations under the License.
 #include "xla/backends/profiler/gpu/cupti_error_manager.h"
 #include "xla/backends/profiler/gpu/cupti_interface.h"
 #include "xla/backends/profiler/gpu/cupti_wrapper.h"
+#include "xla/tsl/util/env_var.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/stringpiece.h"
-#include "tsl/util/env_var.h"
 
 namespace xla {
 namespace profiler {
diff --git a/third_party/xla/xla/backends/profiler/gpu/device_tracer_cuda.cc b/third_party/xla/xla/backends/profiler/gpu/device_tracer_cuda.cc
index 70530221b68123..d7bb2524b66762 100644
--- a/third_party/xla/xla/backends/profiler/gpu/device_tracer_cuda.cc
+++ b/third_party/xla/xla/backends/profiler/gpu/device_tracer_cuda.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "xla/backends/profiler/gpu/cupti_collector.h"
 #include "xla/backends/profiler/gpu/cupti_tracer.h"
 #include "xla/backends/profiler/gpu/cupti_wrapper.h"
+#include "xla/tsl/util/env_var.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/macros.h"
 #include "tsl/platform/thread_annotations.h"
@@ -34,7 +35,6 @@ limitations under the License.
 #include "tsl/profiler/lib/profiler_interface.h"
 #include "tsl/profiler/protobuf/xplane.pb.h"
 #include "tsl/profiler/utils/time_utils.h"
-#include "tsl/util/env_var.h"
 
 namespace xla {
 namespace profiler {
diff --git a/third_party/xla/xla/backends/profiler/gpu/device_tracer_rocm.cc b/third_party/xla/xla/backends/profiler/gpu/device_tracer_rocm.cc
index 6be03d3fa40d7f..81eb2d192ea09a 100644
--- a/third_party/xla/xla/backends/profiler/gpu/device_tracer_rocm.cc
+++ b/third_party/xla/xla/backends/profiler/gpu/device_tracer_rocm.cc
@@ -1,4 +1,4 @@
-/* Copyright 2021 The OpenXLA Authors.
+/* Copyright 2024 The OpenXLA Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -24,7 +24,9 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
+#include "xla/backends/profiler/gpu/rocm_collector.h"
 #include "xla/backends/profiler/gpu/rocm_tracer.h"
+#include "xla/tsl/util/env_var.h"
 #include "tsl/platform/abi.h"
 #include "tsl/platform/env_time.h"
 #include "tsl/platform/errors.h"
@@ -38,16 +40,15 @@ limitations under the License.
 #include "tsl/profiler/utils/xplane_builder.h"
 #include "tsl/profiler/utils/xplane_schema.h"
 #include "tsl/profiler/utils/xplane_utils.h"
-#include "tsl/util/env_var.h"
 
 namespace xla {
 namespace profiler {
 
-using absl::OkStatus;
-using absl::Status;
 using tensorflow::ProfileOptions;
 using tsl::mutex;
 using tsl::mutex_lock;
+using tsl::OkStatus;
+using tsl::Status;
 using tsl::profiler::Annotation;
 using tsl::profiler::AnnotationStack;
 using tsl::profiler::FindOrAddMutablePlaneWithName;
@@ -65,712 +66,6 @@ using tsl::profiler::XLineBuilder;
 using tsl::profiler::XPlaneBuilder;
 using tsl::profiler::XSpace;
 
-namespace {
-// Set the all XLines of specified XPlane to starting walltime.
-// Events time in both host and device planes are CUTPI timestamps.
-// We set initial RocmTracer timestamp as start time for all lines to reflect
-// this fact. Eventually we change line start time to corresponding
-// start_walltime_ns to normalize with CPU wall time.
-static void NormalizeTimeStamps(XPlaneBuilder* plane,
-                                uint64_t start_walltime_ns) {
-  plane->ForEachLine([&](tsl::profiler::XLineBuilder line) {
-    line.SetTimestampNs(start_walltime_ns);
-  });
-}
-
-std::string GetDeviceXLineName(
-    int64_t stream_id, absl::flat_hash_set<RocmTracerEventType>& event_types) {
-  std::string line_name = absl::StrCat("Stream #", stream_id);
-  event_types.erase(RocmTracerEventType::Unsupported);
-  if (event_types.empty()) return line_name;
-  std::vector<const char*> type_names;
-  for (const auto event_type : event_types) {
-    type_names.emplace_back(GetRocmTracerEventTypeName(event_type));
-  }
-  return absl::StrCat(line_name, "(", absl::StrJoin(type_names, ","), ")");
-}
-
-}  // namespace
-
-class RocmTraceCollectorImpl : public profiler::RocmTraceCollector {
- public:
-  RocmTraceCollectorImpl(const RocmTraceCollectorOptions& options,
-                         uint64_t start_walltime_ns, uint64_t start_gputime_ns)
-      : RocmTraceCollector(options),
-        num_callback_events_(0),
-        num_activity_events_(0),
-        start_walltime_ns_(start_walltime_ns),
-        start_gputime_ns_(start_gputime_ns),
-        num_gpus_(options.num_gpus),
-        per_device_collector_(options.num_gpus) {}
-
-  void AddEvent(RocmTracerEvent&& event, bool is_auxiliary) override {
-    mutex_lock lock(event_maps_mutex_);
-
-    if (event.source == RocmTracerEventSource::ApiCallback && !is_auxiliary) {
-      if (num_callback_events_ > options_.max_callback_api_events) {
-        OnEventsDropped("max callback event capacity reached",
-                        event.correlation_id);
-        DumpRocmTracerEvent(event, 0, 0, ". Dropped!");
-        return;
-      }
-      num_callback_events_++;
-    } else if (event.source == RocmTracerEventSource::Activity &&
-               event.domain == RocmTracerEventDomain::HIP_API) {
-      // we do not count HIP_OPS activities.
-      if (num_activity_events_ > options_.max_activity_api_events) {
-        OnEventsDropped("max activity event capacity reached",
-                        event.correlation_id);
-        DumpRocmTracerEvent(event, 0, 0, ". Dropped!");
-        return;
-      }
-      num_activity_events_++;
-    }
-
-    bool emplace_result = false;
-    if (event.source == RocmTracerEventSource::ApiCallback) {
-      auto& target_api_event_map =
-          (is_auxiliary) ? auxiliary_api_events_map_ : api_events_map_;
-      std::tie(std::ignore, emplace_result) =
-          target_api_event_map.emplace(event.correlation_id, std::move(event));
-    } else if (event.source == RocmTracerEventSource::Activity) {
-      auto result = activity_ops_events_map_.emplace(
-          event.correlation_id, std::vector<RocmTracerEvent>{});
-      result.first->second.push_back(std::move(event));
-      emplace_result = true;  // we always accept Hip-Ops events
-    }
-    if (!emplace_result) {
-      OnEventsDropped("event with duplicate correlation_id was received.",
-                      event.correlation_id);
-      DumpRocmTracerEvent(event, 0, 0, ". Dropped!");
-    }
-  }
-
-  void OnEventsDropped(const std::string& reason,
-                       uint32_t correlation_id) override {
-    LOG(INFO) << "RocmTracerEvent dropped (correlation_id=" << correlation_id
-              << ",) : " << reason << ".";
-  }
-
-  void Flush() override {
-    mutex_lock lock(event_maps_mutex_);
-    auto& aggregated_events_ = ApiActivityInfoExchange();
-
-    VLOG(3) << "RocmTraceCollector collected " << num_callback_events_
-            << " callback events, " << num_activity_events_
-            << " activity events, and aggregated them into "
-            << aggregated_events_.size() << " events.";
-
-    // device ids for GPUs filled in by roctracer are not zero indexed.
-    // They are offset by number of CPUs on the machine
-    tsl::uint32 min_device_id = INT32_MAX;
-    for (auto& event : aggregated_events_) {
-      if (event.device_id < min_device_id) {
-        min_device_id = event.device_id;
-      }
-    }
-
-    for (auto event : aggregated_events_) {
-      event.device_id = event.device_id - min_device_id;
-      if (event.device_id < num_gpus_) {
-        per_device_collector_[event.device_id].AddEvent(event);
-      } else {
-        OnEventsDropped("Invalid device id for an event.",
-                        event.correlation_id);
-        DumpRocmTracerEvent(event, 0, 0, ". Dropped!");
-      }
-    }
-
-    activity_ops_events_map_.clear();
-    api_events_map_.clear();
-    auxiliary_api_events_map_.clear();
-  }
-
-  void Export(XSpace* space) {
-    uint64_t end_gputime_ns = RocmTracer::GetTimestamp();
-    XPlaneBuilder host_plane(FindOrAddMutablePlaneWithName(
-        space, tsl::profiler::kRoctracerApiPlaneName));
-    for (int device_ordinal = 0; device_ordinal < num_gpus_; ++device_ordinal) {
-      std::string name = GpuPlaneName(device_ordinal);
-      XPlaneBuilder device_plane(FindOrAddMutablePlaneWithName(space, name));
-      device_plane.SetId(device_ordinal);
-      // Calculate device capabilities before flushing, so that device
-      // properties are available to the occupancy calculator in export().
-      per_device_collector_[device_ordinal].GetDeviceCapabilities(
-          device_ordinal, &device_plane);
-      per_device_collector_[device_ordinal].Export(
-          start_walltime_ns_, start_gputime_ns_, end_gputime_ns, &device_plane,
-          &host_plane);
-      NormalizeTimeStamps(&device_plane, start_walltime_ns_);
-    }
-    NormalizeTimeStamps(&host_plane, start_walltime_ns_);
-  }
-
- private:
-  std::atomic<int> num_callback_events_;
-  std::atomic<int> num_activity_events_;
-  uint64_t start_walltime_ns_;
-  uint64_t start_gputime_ns_;
-  int num_gpus_;
-
-  mutex event_maps_mutex_;
-  absl::flat_hash_map<uint32_t, RocmTracerEvent> api_events_map_
-      TF_GUARDED_BY(event_maps_mutex_);
-
-  /* Some apis such as MEMSETD32 (based on an observation with ResNet50),
-   trigger multiple HIP ops domain activities. We keep them in a vector and
-   merge them with api activities at flush time.
- */
-  absl::flat_hash_map<uint32_t, std::vector<RocmTracerEvent>>
-      activity_ops_events_map_ TF_GUARDED_BY(event_maps_mutex_);
-  // This is for the APIs that we track because we need some information from
-  // them to populate the corresponding activity that we actually track.
-  absl::flat_hash_map<uint32_t, RocmTracerEvent> auxiliary_api_events_map_
-      TF_GUARDED_BY(event_maps_mutex_);
-
-  const std::vector<RocmTracerEvent> ApiActivityInfoExchange() {
-    /* Different from CUDA, roctracer activity records are not enough to fill a
-      TF event. For most of the activities, we need to enable the corresponding
-      API callsbacks (we call them auxiliary API callbacks) to capture the
-      necessary fields from them using the correlation id. The purpose of this
-      function is to let APIs and activities exchange information to reach a
-      state very similar to TF CUDA and getting ready to dump the event.
-    */
-
-    std::vector<RocmTracerEvent> aggregated_events;
-
-    // Copy info from activity events to API callback events
-    for (auto& api_iter : api_events_map_) {
-      RocmTracerEvent& api_event = api_iter.second;
-      auto activity_event =
-          activity_ops_events_map_.find(api_event.correlation_id);
-
-      if (activity_event == activity_ops_events_map_.end()) {
-        OnEventsDropped(
-            "An event from HIP API discarded."
-            "Could not find the counterpart activity.",
-            api_event.correlation_id);
-        DumpRocmTracerEvent(api_event, 0, 0, ". Dropped!");
-      } else {
-        api_event.device_id = activity_event->second.front().device_id;
-        api_event.stream_id = activity_event->second.front().stream_id;
-        switch (api_event.type) {
-          case RocmTracerEventType::Kernel:
-          case RocmTracerEventType::Memset:
-          case RocmTracerEventType::MemoryAlloc:
-          case RocmTracerEventType::MemoryFree:
-          case RocmTracerEventType::Synchronization: {
-            aggregated_events.push_back(api_event);
-            break;
-          }
-          case RocmTracerEventType::MemcpyD2H:
-          case RocmTracerEventType::MemcpyH2D:
-          case RocmTracerEventType::MemcpyD2D:
-          case RocmTracerEventType::MemcpyOther: {
-            api_event.memcpy_info.destination =
-                activity_event->second.front().device_id;
-            aggregated_events.push_back(api_event);
-            break;
-          }
-          default:
-            OnEventsDropped(
-                "Missing API-Activity information exchange. Dropped!",
-                api_event.correlation_id);
-            DumpRocmTracerEvent(api_event, 0, 0, ". Dropped!");
-            LOG(WARNING) << "A ROCm API event type with unimplemented activity "
-                            "merge dropped! "
-                            "Type="
-                         << GetRocmTracerEventTypeName(api_event.type);
-        }
-      }
-    }
-
-    // Make sure for all activity events we have API callback events
-    for (auto& activity_iter : activity_ops_events_map_) {
-      RocmTracerEvent& activity_event = activity_iter.second.front();
-      auto api_event = api_events_map_.find(activity_event.correlation_id);
-
-      if (api_event == api_events_map_.end()) {
-        api_event =
-            auxiliary_api_events_map_.find(activity_event.correlation_id);
-      }
-
-      if (api_event == auxiliary_api_events_map_.end()) {
-        OnEventsDropped(
-            "An event from activity was discarded."
-            "Could not find the counterpart HIP API.",
-            activity_event.correlation_id);
-        DumpRocmTracerEvent(activity_event, 0, 0, ". Dropped!");
-      } else {
-        switch (activity_event.type) {
-          // KERNEL ACTIVITY
-          case RocmTracerEventType::Kernel: {
-            activity_event.name = api_event->second.name;
-            activity_event.kernel_info = api_event->second.kernel_info;
-            aggregated_events.push_back(activity_event);
-            break;
-          }
-          // MEMCPY ACTIVITY
-          case RocmTracerEventType::MemcpyD2H:
-          case RocmTracerEventType::MemcpyH2D:
-          case RocmTracerEventType::MemcpyD2D:
-          case RocmTracerEventType::MemcpyOther: {
-            activity_event.memcpy_info = api_event->second.memcpy_info;
-            aggregated_events.push_back(activity_event);
-            break;
-          }
-          // MEMSET ACTIVITY
-          case RocmTracerEventType::Memset: {
-            activity_event.memset_info = api_event->second.memset_info;
-            aggregated_events.push_back(activity_event);
-            break;
-          }
-          // MALLOC ACTIVITY, FREE ACTIVITY
-          case RocmTracerEventType::MemoryAlloc:
-          case RocmTracerEventType::MemoryFree: {
-            activity_event.device_id = api_event->second.device_id;
-            aggregated_events.push_back(activity_event);
-            break;
-          }
-          // SYNCHRONIZATION ACTIVITY
-          case RocmTracerEventType::Synchronization: {
-            activity_event.device_id = api_event->second.device_id;
-            aggregated_events.push_back(activity_event);
-            break;
-          }
-          default:
-            OnEventsDropped(
-                "Missing API-Activity information exchange. Dropped!",
-                activity_event.correlation_id);
-            DumpRocmTracerEvent(activity_event, 0, 0, ". Dropped!");
-            LOG(WARNING) << "A ROCm activity event with unimplemented API "
-                            "callback merge dropped! "
-                            "Type="
-                         << GetRocmTracerEventTypeName(activity_event.type);
-            break;
-        }
-      }
-    }
-
-    return aggregated_events;
-  }
-  struct RocmDeviceOccupancyParams {
-    hipFuncAttributes attributes = {};
-    int block_size = 0;
-    size_t dynamic_smem_size = 0;
-    void* func_ptr;
-
-    friend bool operator==(const RocmDeviceOccupancyParams& lhs,
-                           const RocmDeviceOccupancyParams& rhs) {
-      return 0 == memcmp(&lhs, &rhs, sizeof(lhs));
-    }
-
-    template <typename H>
-    friend H AbslHashValue(H hash_state,
-                           const RocmDeviceOccupancyParams& params) {
-      return H::combine(
-          std::move(hash_state), params.attributes.maxThreadsPerBlock,
-          params.attributes.numRegs, params.attributes.sharedSizeBytes,
-          params.attributes.maxDynamicSharedSizeBytes, params.block_size,
-          params.dynamic_smem_size, params.func_ptr);
-    }
-  };
-
-  struct OccupancyStats {
-    double occupancy_pct = 0.0;
-    int min_grid_size = 0;
-    int suggested_block_size = 0;
-  };
-  struct CorrelationInfo {
-    CorrelationInfo(uint32_t t, uint32_t e)
-        : thread_id(t), enqueue_time_ns(e) {}
-    uint32_t thread_id;
-    uint64_t enqueue_time_ns;
-  };
-
-  struct PerDeviceCollector {
-    void GetDeviceCapabilities(int32_t device_ordinal,
-                               XPlaneBuilder* device_plane) {
-      device_plane->AddStatValue(*device_plane->GetOrCreateStatMetadata(
-                                     GetStatTypeStr(StatType::kDevVendor)),
-                                 kDeviceVendorAMD);
-
-      if (hipGetDeviceProperties(&device_properties_, device_ordinal) !=
-          hipSuccess)
-        return;
-
-      auto clock_rate_in_khz =
-          device_properties_.clockRate;  // this is also in Khz
-      if (clock_rate_in_khz) {
-        device_plane->AddStatValue(
-            *device_plane->GetOrCreateStatMetadata(
-                GetStatTypeStr(StatType::kDevCapClockRateKHz)),
-            clock_rate_in_khz);
-      }
-
-      auto core_count = device_properties_.multiProcessorCount;
-      if (core_count) {
-        device_plane->AddStatValue(
-            *device_plane->GetOrCreateStatMetadata(
-                GetStatTypeStr(StatType::kDevCapCoreCount)),
-            core_count);
-      }
-
-      auto mem_clock_khz = device_properties_.memoryClockRate;
-      auto mem_bus_width_bits = device_properties_.memoryBusWidth;
-
-      if (mem_clock_khz && mem_bus_width_bits) {
-        // Times 2 because HBM is DDR memory; it gets two data bits per each
-        // data lane.
-        auto memory_bandwidth =
-            uint64_t{2} * (mem_clock_khz) * 1000 * (mem_bus_width_bits) / 8;
-        device_plane->AddStatValue(
-            *device_plane->GetOrCreateStatMetadata(
-                GetStatTypeStr(StatType::kDevCapMemoryBandwidth)),
-            memory_bandwidth);
-      }
-
-      size_t total_memory = device_properties_.totalGlobalMem;
-      if (total_memory) {
-        device_plane->AddStatValue(
-            *device_plane->GetOrCreateStatMetadata(
-                GetStatTypeStr(StatType::kDevCapMemorySize)),
-            static_cast<uint64_t>(total_memory));
-      }
-
-      auto compute_capability_major = device_properties_.major;
-      if (compute_capability_major) {
-        device_plane->AddStatValue(
-            *device_plane->GetOrCreateStatMetadata(
-                GetStatTypeStr(StatType::kDevCapComputeCapMajor)),
-            compute_capability_major);
-      }
-      auto compute_capability_minor = device_properties_.minor;
-      if (compute_capability_minor) {
-        device_plane->AddStatValue(
-            *device_plane->GetOrCreateStatMetadata(
-                GetStatTypeStr(StatType::kDevCapComputeCapMinor)),
-            compute_capability_minor);
-      }
-    }
-
-    inline std::string ToXStat(const KernelDetails& kernel_info,
-                               double occupancy_pct) {
-      return absl::StrCat(
-          "regs:", kernel_info.registers_per_thread,
-          " static_shared:", kernel_info.static_shared_memory_usage,
-          " dynamic_shared:", kernel_info.dynamic_shared_memory_usage,
-          " grid:", kernel_info.grid_x, ",", kernel_info.grid_y, ",",
-          kernel_info.grid_z, " block:", kernel_info.block_x, ",",
-          kernel_info.block_y, ",", kernel_info.block_z,
-          " occ_pct:", occupancy_pct);
-    }
-    OccupancyStats GetOccupancy(const RocmDeviceOccupancyParams& params) const {
-      // TODO(rocm-profiler): hipOccupancyMaxActiveBlocksPerMultiprocessor only
-      // return hipSuccess for HIP_API_ID_hipLaunchKernel
-
-      OccupancyStats stats;
-      int number_of_active_blocks;
-      hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
-          &number_of_active_blocks, params.func_ptr, params.block_size,
-          params.dynamic_smem_size);
-
-      if (err != hipError_t::hipSuccess) {
-        return {};
-      }
-
-      stats.occupancy_pct = number_of_active_blocks * params.block_size * 100;
-      stats.occupancy_pct /= device_properties_.maxThreadsPerMultiProcessor;
-
-      err = hipOccupancyMaxPotentialBlockSize(
-          &stats.min_grid_size, &stats.suggested_block_size,
-          (const void*)params.func_ptr,
-          params.dynamic_smem_size, 0);
-
-      if (err != hipError_t::hipSuccess) {
-        return {};
-      }
-
-      return stats;
-    }
-    void AddEvent(const RocmTracerEvent& event) {
-      mutex_lock l(events_mutex);
-      if (event.source == RocmTracerEventSource::ApiCallback) {
-        // Cupti api callback events were used to populate launch times etc.
-        if (event.correlation_id != RocmTracerEvent::kInvalidCorrelationId) {
-          correlation_info_.insert(
-              {event.correlation_id,
-               CorrelationInfo(event.thread_id, event.start_time_ns)});
-        }
-        events.emplace_back(std::move(event));
-      } else {
-        // Cupti activity events measure device times etc.
-        events.emplace_back(std::move(event));
-      }
-    }
-
-    void SortByStartTime() {
-      mutex_lock lock(events_mutex);
-      std::sort(
-          events.begin(), events.end(),
-          [](const RocmTracerEvent& event1, const RocmTracerEvent& event2) {
-            return event1.start_time_ns < event2.start_time_ns;
-          });
-    }
-
-    void CreateXEvent(const RocmTracerEvent& event, XPlaneBuilder* plane,
-                      uint64_t start_gpu_ns, uint64_t end_gpu_ns,
-                      XLineBuilder* line) {
-      if (event.start_time_ns < start_gpu_ns ||
-          event.end_time_ns > end_gpu_ns ||
-          event.start_time_ns > event.end_time_ns) {
-        VLOG(2) << "events have abnormal timestamps:" << event.name
-                << " start time(ns): " << event.start_time_ns
-                << " end time(ns): " << event.end_time_ns
-                << " start gpu(ns):" << start_gpu_ns
-                << " end gpu(ns):" << end_gpu_ns
-                << " corr. id:" << event.correlation_id;
-        return;
-      }
-      std::string kernel_name = tsl::port::MaybeAbiDemangle(event.name.c_str());
-      if (kernel_name.empty()) {
-        kernel_name = GetRocmTracerEventTypeName(event.type);
-      }
-      XEventMetadata* event_metadata =
-          plane->GetOrCreateEventMetadata(std::move(kernel_name));
-      XEventBuilder xevent = line->AddEvent(*event_metadata);
-      VLOG(7) << "Adding event to line=" << line->Id();
-      xevent.SetTimestampNs(event.start_time_ns);
-      xevent.SetEndTimestampNs(event.end_time_ns);
-      if (event.source == RocmTracerEventSource::ApiCallback) {
-        xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
-                                GetStatTypeStr(StatType::kDeviceId)),
-                            event.device_id);
-      }
-      if (event.correlation_id != RocmTracerEvent::kInvalidCorrelationId) {
-        xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
-                                GetStatTypeStr(StatType::kCorrelationId)),
-                            event.correlation_id);
-      }
-      if (!event.roctx_range.empty()) {
-        xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
-                                GetStatTypeStr(StatType::kNVTXRange)),
-                            *plane->GetOrCreateStatMetadata(event.roctx_range));
-      }
-      // if (event.context_id != CuptiTracerEvent::kInvalidContextId) {
-      //   xevent.AddStatValue(
-      //       *plane->GetOrCreateStatMetadata(
-      //           GetStatTypeStr(StatType::kContextId)),
-      //       absl::StrCat("$$", static_cast<uint64_t>(event.context_id)));
-      // }
-
-      if (event.type == RocmTracerEventType::Kernel &&
-          event.source == RocmTracerEventSource::Activity) {
-        RocmDeviceOccupancyParams params{};
-        params.attributes.maxThreadsPerBlock = INT_MAX;
-        params.attributes.numRegs =
-            static_cast<int>(event.kernel_info.registers_per_thread);
-        params.attributes.sharedSizeBytes =
-            event.kernel_info.static_shared_memory_usage;
-        // params.attributes.partitionedGCConfig = PARTITIONED_GC_OFF;
-        // params.attributes.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT;
-        params.attributes.maxDynamicSharedSizeBytes = 0;
-        params.block_size = static_cast<int>(event.kernel_info.block_x *
-                                             event.kernel_info.block_y *
-                                             event.kernel_info.block_z);
-
-        params.dynamic_smem_size =
-            event.kernel_info.dynamic_shared_memory_usage;
-        params.func_ptr = event.kernel_info.func_ptr;
-
-        OccupancyStats& occ_stats = occupancy_cache_[params];
-        if (occ_stats.occupancy_pct == 0.0) {
-          occ_stats = GetOccupancy(params);
-        }
-        xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
-                                StatType::kTheoreticalOccupancyPct)),
-                            occ_stats.occupancy_pct);
-        xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
-                                StatType::kOccupancyMinGridSize)),
-                            static_cast<int32_t>(occ_stats.min_grid_size));
-        xevent.AddStatValue(
-            *plane->GetOrCreateStatMetadata(
-                GetStatTypeStr(StatType::kOccupancySuggestedBlockSize)),
-            static_cast<int32_t>(occ_stats.suggested_block_size));
-        xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
-                                GetStatTypeStr(StatType::kKernelDetails)),
-                            *plane->GetOrCreateStatMetadata(ToXStat(
-                                event.kernel_info, occ_stats.occupancy_pct)));
-      } else if (event.type == RocmTracerEventType::MemcpyH2D ||
-                 event.type == RocmTracerEventType::MemcpyD2H ||
-                 event.type == RocmTracerEventType::MemcpyD2D ||
-                 event.type == RocmTracerEventType::MemcpyP2P ||
-                 event.type == RocmTracerEventType::MemcpyOther) {
-        VLOG(7) << "Add Memcpy stat";
-        const auto& memcpy_info = event.memcpy_info;
-        std::string memcpy_details = absl::StrCat(
-            // TODO(rocm-profiler): we need to discover the memory kind similar
-            // to CUDA
-            "kind:", "Unknown", " size:", memcpy_info.num_bytes,
-            " dest:", memcpy_info.destination, " async:", memcpy_info.async);
-        xevent.AddStatValue(
-            *plane->GetOrCreateStatMetadata(
-                GetStatTypeStr(StatType::kMemcpyDetails)),
-            *plane->GetOrCreateStatMetadata(std::move(memcpy_details)));
-      } else if (event.type == RocmTracerEventType::MemoryAlloc) {
-        VLOG(7) << "Add MemAlloc stat";
-        std::string value =
-            // TODO(rocm-profiler): we need to discover the memory kind similar
-            // to CUDA
-            absl::StrCat("kind:", "Unknown",
-                         " num_bytes:", event.memalloc_info.num_bytes);
-        xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
-                                GetStatTypeStr(StatType::kMemallocDetails)),
-                            *plane->GetOrCreateStatMetadata(std::move(value)));
-      } else if (event.type == RocmTracerEventType::MemoryFree) {
-        VLOG(7) << "Add MemFree stat";
-        std::string value =
-            // TODO(rocm-profiler): we need to discover the memory kind similar
-            // to CUDA
-            absl::StrCat("kind:", "Unknown",
-                         " num_bytes:", event.memalloc_info.num_bytes);
-        xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
-                                GetStatTypeStr(StatType::kMemFreeDetails)),
-                            *plane->GetOrCreateStatMetadata(std::move(value)));
-      } else if (event.type == RocmTracerEventType::Memset) {
-        VLOG(7) << "Add Memset stat";
-        auto value =
-            // TODO(rocm-profiler): we need to discover the memory kind similar
-            // to CUDA
-            absl::StrCat("kind:", "Unknown",
-                         " num_bytes:", event.memset_info.num_bytes,
-                         " async:", event.memset_info.async);
-        xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
-                                GetStatTypeStr(StatType::kMemsetDetails)),
-                            *plane->GetOrCreateStatMetadata(std::move(value)));
-      }
-      // TODO(rocm-profiler): we need to support the following event type
-      /* else if (event.type == CuptiTracerEventType::MemoryResidency) {
-        VLOG(7) << "Add MemoryResidency stat";
-        std::string value = absl::StrCat(
-            "kind:", GetMemoryKindName(event.memory_residency_info.kind),
-            " num_bytes:", event.memory_residency_info.num_bytes,
-            " addr:", event.memory_residency_info.address);
-        xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
-                                StatType::kMemoryResidencyDetails)),
-                            *plane->GetOrCreateStatMetadata(std::move(value)));
-      } */
-
-      std::vector<Annotation> annotation_stack =
-          ParseAnnotationStack(event.annotation);
-      if (!annotation_stack.empty()) {
-        xevent.AddStatValue(
-            *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kTfOp)),
-            *plane->GetOrCreateStatMetadata(annotation_stack.begin()->name));
-      }
-      // If multiple metadata have the same key name, show the values from the
-      // top of the stack (innermost annotation). Concatenate the values from
-      // "hlo_op".
-      absl::flat_hash_set<absl::string_view> key_set;
-
-      for (auto annotation = annotation_stack.rbegin();
-           annotation != annotation_stack.rend(); ++annotation) {
-        for (const Annotation::Metadata& metadata : annotation->metadata) {
-          if (key_set.insert(metadata.key).second) {
-            xevent.ParseAndAddStatValue(
-                *plane->GetOrCreateStatMetadata(metadata.key), metadata.value);
-          }
-        }
-      }
-    }
-    bool IsHostEvent(const RocmTracerEvent& event, tsl::int64* line_id) {
-      // DriverCallback(i.e. kernel launching) events are host events.
-      if (event.source == RocmTracerEventSource::ApiCallback) {
-        *line_id = event.thread_id;
-        return true;
-      } else {  // activities
-        *line_id = event.stream_id;
-        return false;
-      }
-
-      // TODO(rocm-profiler): do we have such a report in rocm?
-      // Non-overhead activity events are device events.
-      /* if (event.type != CuptiTracerEventType::Overhead) {
-        *line_id = event.stream_id;
-        return false;
-      } */
-      // Overhead events can be associated with a thread or a stream, etc.
-      // If a valid thread id is specified, we consider it as a host event.
-      //
-
-      if (event.stream_id != RocmTracerEvent::kInvalidStreamId) {
-        *line_id = event.stream_id;
-        return false;
-      } else if (event.thread_id != RocmTracerEvent::kInvalidThreadId &&
-                 event.thread_id != 0) {
-        *line_id = event.thread_id;
-        return true;
-      } else {
-        *line_id = tsl::profiler::kThreadIdOverhead;
-        return false;
-      }
-    }
-    void Export(uint64_t start_walltime_ns, uint64_t start_gputime_ns,
-                uint64_t end_gputime_ns, XPlaneBuilder* device_plane,
-                XPlaneBuilder* host_plane) {
-      int host_ev_cnt = 0, dev_ev_cnt = 0;
-      mutex_lock l(events_mutex);
-      // Tracking event types per line.
-      absl::flat_hash_map<tsl::int64, absl::flat_hash_set<RocmTracerEventType>>
-          events_types_per_line;
-      for (const RocmTracerEvent& event : events) {
-        int64_t line_id = RocmTracerEvent::kInvalidThreadId;
-        bool is_host_event = IsHostEvent(event, &line_id);
-
-        if (is_host_event) {
-          host_ev_cnt++;
-        } else {
-          dev_ev_cnt++;
-        }
-
-        if (line_id == RocmTracerEvent::kInvalidThreadId ||
-            line_id == RocmTracerEvent::kInvalidStreamId) {
-          VLOG(3) << "Ignoring event, type=" << static_cast<int>(event.type);
-          continue;
-        }
-        auto* plane = is_host_event ? host_plane : device_plane;
-        VLOG(9) << "Event" << " type=" << static_cast<int>(event.type)
-                << " line_id=" << line_id
-                << (is_host_event ? " host plane=" : " device plane=")
-                << plane->Name();
-        XLineBuilder line = plane->GetOrCreateLine(line_id);
-        line.SetTimestampNs(start_gputime_ns);
-        CreateXEvent(event, plane, start_gputime_ns, end_gputime_ns, &line);
-        events_types_per_line[line_id].emplace(event.type);
-      }
-      device_plane->ForEachLine([&](XLineBuilder line) {
-        line.SetName(
-            GetDeviceXLineName(line.Id(), events_types_per_line[line.Id()]));
-      });
-      host_plane->ForEachLine([&](XLineBuilder line) {
-        line.SetName(absl::StrCat("Host Threads/", line.Id()));
-      });
-      events.clear();
-    }
-
-    mutex events_mutex;
-    std::vector<RocmTracerEvent> events TF_GUARDED_BY(events_mutex);
-    absl::flat_hash_map<uint32_t, CorrelationInfo> correlation_info_
-        TF_GUARDED_BY(events_mutex);
-    absl::flat_hash_map<RocmDeviceOccupancyParams, OccupancyStats>
-        occupancy_cache_;
-    hipDeviceProp_t device_properties_;
-  };
-
-  absl::flat_hash_map<const uint32_t, PerDeviceCollector>
-      per_device_collector_;
-};
-
 // GpuTracer for ROCm GPU.
 class GpuTracer : public profiler::ProfilerInterface {
  public:
@@ -802,7 +97,7 @@ class GpuTracer : public profiler::ProfilerInterface {
   State profiling_state_ = State::kNotStarted;
 
   RocmTracer* rocm_tracer_;
-  std::unique_ptr<RocmTraceCollectorImpl> rocm_trace_collector_;
+  std::unique_ptr<RocmTraceCollector> rocm_trace_collector_;
 };
 
 RocmTracerOptions GpuTracer::GetRocmTracerOptions() {
@@ -900,8 +195,12 @@ Status GpuTracer::DoStart() {
       GetRocmTraceCollectorOptions(rocm_tracer_->NumGpus());
   uint64_t start_gputime_ns = RocmTracer::GetTimestamp();
   uint64_t start_walltime_ns = tsl::EnvTime::NowNanos();
-  rocm_trace_collector_ = std::make_unique<RocmTraceCollectorImpl>(
+  rocm_trace_collector_ = CreateRocmCollector(
       trace_collector_options, start_walltime_ns, start_gputime_ns);
+  // rocm_trace_collector_ =
+  // std::make_unique<RocmTraceCollectorImpl>(trace_collector_options,
+  // start_walltime_ns,
+  //                                                  start_gputime_ns);
 
   RocmTracerOptions tracer_options = GetRocmTracerOptions();
   rocm_tracer_->Enable(tracer_options, rocm_trace_collector_.get());
diff --git a/third_party/xla/xla/backends/profiler/gpu/rocm_collector.cc b/third_party/xla/xla/backends/profiler/gpu/rocm_collector.cc
new file mode 100644
index 00000000000000..66f8000f6d0d52
--- /dev/null
+++ b/third_party/xla/xla/backends/profiler/gpu/rocm_collector.cc
@@ -0,0 +1,853 @@
+
+/* Copyright 2024 The OpenXLA Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/backends/profiler/gpu/rocm_collector.h"
+
+#include "absl/container/fixed_array.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_join.h"
+#include "absl/types/optional.h"
+#include "xla/stream_executor/rocm/roctracer_wrapper.h"
+#include "xla/tsl/util/env_var.h"
+#include "tsl/platform/abi.h"
+#include "tsl/platform/env_time.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/macros.h"
+#include "tsl/platform/mutex.h"
+#include "tsl/platform/status.h"
+#include "tsl/platform/thread_annotations.h"
+#include "tsl/platform/types.h"
+#include "tsl/profiler/backends/cpu/annotation_stack.h"
+#include "tsl/profiler/lib/profiler_factory.h"
+#include "tsl/profiler/lib/profiler_interface.h"
+#include "tsl/profiler/utils/parse_annotation.h"
+#include "tsl/profiler/utils/xplane_builder.h"
+#include "tsl/profiler/utils/xplane_schema.h"
+#include "tsl/profiler/utils/xplane_utils.h"
+
+namespace xla {
+namespace profiler {
+
+namespace se = ::stream_executor;
+using tensorflow::ProfileOptions;
+using tsl::mutex;
+using tsl::mutex_lock;
+// using tsl::OkStatus;
+using tsl::Status;
+using tsl::profiler::Annotation;
+using tsl::profiler::AnnotationStack;
+using tsl::profiler::FindOrAddMutablePlaneWithName;
+using tsl::profiler::GetStatTypeStr;
+using tsl::profiler::GpuPlaneName;
+using tsl::profiler::kDeviceVendorAMD;
+using tsl::profiler::kThreadIdOverhead;
+using tsl::profiler::ParseAnnotationStack;
+using tsl::profiler::ProfilerInterface;
+// using tsl::profiler::RegisterProfilerFactory;
+using tsl::profiler::StatType;
+using tsl::profiler::XEventBuilder;
+using tsl::profiler::XEventMetadata;
+using tsl::profiler::XLineBuilder;
+using tsl::profiler::XPlaneBuilder;
+using tsl::profiler::XSpace;
+
+void AnnotationMap::Add(uint32_t correlation_id,
+                        const std::string& annotation) {
+  if (annotation.empty()) return;
+  VLOG(3) << "Add annotation: " << " correlation_id=" << correlation_id
+          << ", annotation: " << annotation;
+  absl::MutexLock lock(&map_.mutex);
+  if (map_.annotations.size() < max_size_) {
+    absl::string_view annotation_str =
+        *map_.annotations.insert(annotation).first;
+    map_.correlation_map.emplace(correlation_id, annotation_str);
+  }
+}
+
+absl::string_view AnnotationMap::LookUp(uint32_t correlation_id) {
+  absl::MutexLock lock(&map_.mutex);
+  auto it = map_.correlation_map.find(correlation_id);
+  return it != map_.correlation_map.end() ? it->second : absl::string_view();
+}
+
+//==========
+namespace {
+// Set the all XLines of specified XPlane to starting walltime.
+// Events time in both host and device planes are CUTPI timestamps.
+// We set initial RocmTracer timestamp as start time for all lines to reflect
+// this fact. Eventually we change line start time to corresponding
+// start_walltime_ns to normalize with CPU wall time.
+static void NormalizeTimeStamps(XPlaneBuilder* plane,
+                                uint64_t start_walltime_ns) {
+  plane->ForEachLine([&](tsl::profiler::XLineBuilder line) {
+    line.SetTimestampNs(start_walltime_ns);
+  });
+}
+
+std::string GetDeviceXLineName(
+    int64_t stream_id, absl::flat_hash_set<RocmTracerEventType>& event_types) {
+  std::string line_name = absl::StrCat("Stream #", stream_id);
+  event_types.erase(RocmTracerEventType::Unsupported);
+  if (event_types.empty()) return line_name;
+  std::vector<const char*> type_names;
+  for (const auto event_type : event_types) {
+    type_names.emplace_back(GetRocmTracerEventTypeName(event_type));
+  }
+  return absl::StrCat(line_name, "(", absl::StrJoin(type_names, ","), ")");
+}
+
+}  // namespace
+
+static void DumpRocmTracerEvent(const RocmTracerEvent& event,
+                                uint64_t start_walltime_ns,
+                                uint64_t start_gputime_ns,
+                                const std::string& message) {
+  std::ostringstream oss;
+  oss << "correlation_id=" << event.correlation_id;
+  oss << ",type=" << GetRocmTracerEventTypeName(event.type);
+  oss << ",source=" << GetRocmTracerEventSourceName(event.source);
+  oss << ",domain=" << GetRocmTracerEventDomainName(event.domain);
+  oss << ",name=" << event.name;
+  oss << ",annotation=" << event.annotation;
+  oss << ",start_time_us="
+      << (start_walltime_ns + (start_gputime_ns - event.start_time_ns)) / 1000;
+  oss << ",duration=" << (event.end_time_ns - event.start_time_ns) / 1000;
+  oss << ",device_id=" << event.device_id;
+  oss << ",thread_id=" << event.thread_id;
+  oss << ",stream_id=" << event.stream_id;
+
+  switch (event.type) {
+    case RocmTracerEventType::Kernel:
+      break;
+    case RocmTracerEventType::MemcpyD2H:
+    case RocmTracerEventType::MemcpyH2D:
+    case RocmTracerEventType::MemcpyD2D:
+    case RocmTracerEventType::MemcpyP2P:
+      oss << ",num_bytes=" << event.memcpy_info.num_bytes;
+      oss << ",destination=" << event.memcpy_info.destination;
+      oss << ",async=" << event.memcpy_info.async;
+      break;
+    case RocmTracerEventType::MemoryAlloc:
+      oss << ",num_bytes=" << event.memalloc_info.num_bytes;
+      break;
+    case RocmTracerEventType::Synchronization:
+      break;
+    case RocmTracerEventType::Generic:
+      break;
+    default:
+      DCHECK(false);
+      break;
+  }
+  oss << message;
+  VLOG(3) << oss.str();
+}
+
+static uint64_t get_timestamp() {
+  uint64_t ts;
+  if (se::wrap::roctracer_get_timestamp(&ts) != ROCTRACER_STATUS_SUCCESS) {
+    const char* errstr = se::wrap::roctracer_error_string();
+    LOG(ERROR) << "function roctracer_get_timestamp failed with error "
+               << errstr;
+    // Return 0 on error.
+    return 0;
+  }
+  return ts;
+}
+
+struct RocmDeviceOccupancyParams {
+  hipFuncAttributes attributes = {};
+  int block_size = 0;
+  size_t dynamic_smem_size = 0;
+  void* func_ptr;
+
+  friend bool operator==(const RocmDeviceOccupancyParams& lhs,
+                         const RocmDeviceOccupancyParams& rhs) {
+    return 0 == memcmp(&lhs, &rhs, sizeof(lhs));
+  }
+
+  template <typename H>
+  friend H AbslHashValue(H hash_state,
+                         const RocmDeviceOccupancyParams& params) {
+    return H::combine(
+        std::move(hash_state), params.attributes.maxThreadsPerBlock,
+        params.attributes.numRegs, params.attributes.sharedSizeBytes,
+        params.attributes.maxDynamicSharedSizeBytes, params.block_size,
+        params.dynamic_smem_size, params.func_ptr);
+  }
+};
+
+struct OccupancyStats {
+  double occupancy_pct = 0.0;
+  int min_grid_size = 0;
+  int suggested_block_size = 0;
+};
+
+struct CorrelationInfo {
+  CorrelationInfo(uint32_t t, uint32_t e) : thread_id(t), enqueue_time_ns(e) {}
+  uint32_t thread_id;
+  uint64_t enqueue_time_ns;
+};
+
+class PerDeviceCollector {
+ private:
+  OccupancyStats GetOccupancy(const RocmDeviceOccupancyParams& params) const {
+    // TODO(rocm-profiler): hipOccupancyMaxActiveBlocksPerMultiprocessor only
+    // return hipSuccess for HIP_API_ID_hipLaunchKernel
+
+    OccupancyStats stats;
+    int number_of_active_blocks;
+    hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor(
+        &number_of_active_blocks, params.func_ptr, params.block_size,
+        params.dynamic_smem_size);
+
+    if (err != hipError_t::hipSuccess) {
+      return {};
+    }
+
+    stats.occupancy_pct = number_of_active_blocks * params.block_size * 100;
+    stats.occupancy_pct /= device_properties_.maxThreadsPerMultiProcessor;
+
+    err = hipOccupancyMaxPotentialBlockSize(
+        &stats.min_grid_size, &stats.suggested_block_size, static_cast<const void*>(params.func_ptr),
+        params.dynamic_smem_size, 0);
+
+    if (err != hipError_t::hipSuccess) {
+      return {};
+    }
+
+    return stats;
+  }
+
+  void CreateXEvent(const RocmTracerEvent& event, XPlaneBuilder* plane,
+                    uint64_t start_gpu_ns, uint64_t end_gpu_ns,
+                    XLineBuilder* line) {
+    if (event.start_time_ns < start_gpu_ns || event.end_time_ns > end_gpu_ns ||
+        event.start_time_ns > event.end_time_ns) {
+      VLOG(2) << "events have abnormal timestamps:" << event.name
+              << " start time(ns): " << event.start_time_ns
+              << " end time(ns): " << event.end_time_ns
+              << " start gpu(ns):" << start_gpu_ns
+              << " end gpu(ns):" << end_gpu_ns
+              << " corr. id:" << event.correlation_id;
+      return;
+    }
+    std::string kernel_name = tsl::port::MaybeAbiDemangle(event.name.c_str());
+    if (kernel_name.empty()) {
+      kernel_name = GetRocmTracerEventTypeName(event.type);
+    }
+    XEventMetadata* event_metadata =
+        plane->GetOrCreateEventMetadata(std::move(kernel_name));
+    XEventBuilder xevent = line->AddEvent(*event_metadata);
+    VLOG(7) << "Adding event to line=" << line->Id();
+    xevent.SetTimestampNs(event.start_time_ns);
+    xevent.SetEndTimestampNs(event.end_time_ns);
+    if (event.source == RocmTracerEventSource::ApiCallback) {
+      xevent.AddStatValue(
+          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kDeviceId)),
+          event.device_id);
+    }
+    if (event.correlation_id != RocmTracerEvent::kInvalidCorrelationId) {
+      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
+                              GetStatTypeStr(StatType::kCorrelationId)),
+                          event.correlation_id);
+    }
+    if (!event.roctx_range.empty()) {
+      xevent.AddStatValue(
+          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kNVTXRange)),
+          *plane->GetOrCreateStatMetadata(event.roctx_range));
+    }
+
+    if (event.type == RocmTracerEventType::Kernel &&
+        event.source == RocmTracerEventSource::Activity) {
+      RocmDeviceOccupancyParams params{};
+      params.attributes.maxThreadsPerBlock = INT_MAX;
+      params.attributes.numRegs =
+          static_cast<int>(event.kernel_info.registers_per_thread);
+      params.attributes.sharedSizeBytes =
+          event.kernel_info.static_shared_memory_usage;
+      // params.attributes.partitionedGCConfig = PARTITIONED_GC_OFF;
+      // params.attributes.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT;
+      params.attributes.maxDynamicSharedSizeBytes = 0;
+      params.block_size = static_cast<int>(event.kernel_info.block_x *
+                                           event.kernel_info.block_y *
+                                           event.kernel_info.block_z);
+
+      params.dynamic_smem_size = event.kernel_info.dynamic_shared_memory_usage;
+      params.func_ptr = event.kernel_info.func_ptr;
+
+      OccupancyStats& occ_stats = occupancy_cache_[params];
+      if (occ_stats.occupancy_pct == 0.0) {
+        occ_stats = GetOccupancy(params);
+      }
+      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
+                              StatType::kTheoreticalOccupancyPct)),
+                          occ_stats.occupancy_pct);
+      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
+                              GetStatTypeStr(StatType::kOccupancyMinGridSize)),
+                          static_cast<tsl::int32>(occ_stats.min_grid_size));
+      xevent.AddStatValue(
+          *plane->GetOrCreateStatMetadata(
+              GetStatTypeStr(StatType::kOccupancySuggestedBlockSize)),
+          static_cast<tsl::int32>(occ_stats.suggested_block_size));
+      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
+                              GetStatTypeStr(StatType::kKernelDetails)),
+                          *plane->GetOrCreateStatMetadata(ToXStat(
+                              event.kernel_info, occ_stats.occupancy_pct)));
+    } else if (event.type == RocmTracerEventType::MemcpyH2D ||
+               event.type == RocmTracerEventType::MemcpyD2H ||
+               event.type == RocmTracerEventType::MemcpyD2D ||
+               event.type == RocmTracerEventType::MemcpyP2P ||
+               event.type == RocmTracerEventType::MemcpyOther) {
+      VLOG(7) << "Add Memcpy stat";
+      const auto& memcpy_info = event.memcpy_info;
+      std::string memcpy_details = absl::StrCat(
+          // TODO(rocm-profiler): we need to discover the memory kind similar
+          // to CUDA
+          "kind:", "Unknown", " size:", memcpy_info.num_bytes,
+          " dest:", memcpy_info.destination, " async:", memcpy_info.async);
+      xevent.AddStatValue(
+          *plane->GetOrCreateStatMetadata(
+              GetStatTypeStr(StatType::kMemcpyDetails)),
+          *plane->GetOrCreateStatMetadata(std::move(memcpy_details)));
+    } else if (event.type == RocmTracerEventType::MemoryAlloc) {
+      VLOG(7) << "Add MemAlloc stat";
+      std::string value =
+          // TODO(rocm-profiler): we need to discover the memory kind similar
+          // to CUDA
+          absl::StrCat("kind:", "Unknown",
+                       " num_bytes:", event.memalloc_info.num_bytes);
+      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
+                              GetStatTypeStr(StatType::kMemallocDetails)),
+                          *plane->GetOrCreateStatMetadata(std::move(value)));
+    } else if (event.type == RocmTracerEventType::MemoryFree) {
+      VLOG(7) << "Add MemFree stat";
+      std::string value =
+          // TODO(rocm-profiler): we need to discover the memory kind similar
+          // to CUDA
+          absl::StrCat("kind:", "Unknown",
+                       " num_bytes:", event.memalloc_info.num_bytes);
+      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
+                              GetStatTypeStr(StatType::kMemFreeDetails)),
+                          *plane->GetOrCreateStatMetadata(std::move(value)));
+    } else if (event.type == RocmTracerEventType::Memset) {
+      VLOG(7) << "Add Memset stat";
+      auto value =
+          // TODO(rocm-profiler): we need to discover the memory kind similar
+          // to CUDA
+          absl::StrCat("kind:", "Unknown",
+                       " num_bytes:", event.memset_info.num_bytes,
+                       " async:", event.memset_info.async);
+      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
+                              GetStatTypeStr(StatType::kMemsetDetails)),
+                          *plane->GetOrCreateStatMetadata(std::move(value)));
+    }
+    // TODO(rocm-profiler): we need to support the following event type
+    /* else if (event.type == CuptiTracerEventType::MemoryResidency) {
+      VLOG(7) << "Add MemoryResidency stat";
+      std::string value = absl::StrCat(
+          "kind:", GetMemoryKindName(event.memory_residency_info.kind),
+          " num_bytes:", event.memory_residency_info.num_bytes,
+          " addr:", event.memory_residency_info.address);
+      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
+                              StatType::kMemoryResidencyDetails)),
+                          *plane->GetOrCreateStatMetadata(std::move(value)));
+    } */
+
+    std::vector<Annotation> annotation_stack =
+        ParseAnnotationStack(event.annotation);
+    if (!annotation_stack.empty()) {
+      xevent.AddStatValue(
+          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kTfOp)),
+          *plane->GetOrCreateStatMetadata(annotation_stack.begin()->name));
+    }
+    // If multiple metadata have the same key name, show the values from the
+    // top of the stack (innermost annotation). Concatenate the values from
+    // "hlo_op".
+    absl::flat_hash_set<absl::string_view> key_set;
+
+    for (auto annotation = annotation_stack.rbegin();
+         annotation != annotation_stack.rend(); ++annotation) {
+      for (const Annotation::Metadata& metadata : annotation->metadata) {
+        if (key_set.insert(metadata.key).second) {
+          xevent.ParseAndAddStatValue(
+              *plane->GetOrCreateStatMetadata(metadata.key), metadata.value);
+        }
+      }
+    }
+  }
+
+  void SortByStartTime() {
+    mutex_lock lock(events_mutex);
+    std::sort(events.begin(), events.end(),
+              [](const RocmTracerEvent& event1, const RocmTracerEvent& event2) {
+                return event1.start_time_ns < event2.start_time_ns;
+              });
+  }
+
+  bool IsHostEvent(const RocmTracerEvent& event, tsl::int64* line_id) {
+    // DriverCallback(i.e. kernel launching) events are host events.
+    if (event.source == RocmTracerEventSource::ApiCallback) {
+      *line_id = event.thread_id;
+      return true;
+    } else {  // activities
+      *line_id = event.stream_id;
+      return false;
+    }
+
+    // TODO(rocm-profiler): do we have such a report in rocm?
+    // Non-overhead activity events are device events.
+    /* if (event.type != CuptiTracerEventType::Overhead) {
+      *line_id = event.stream_id;
+      return false;
+    } */
+    // Overhead events can be associated with a thread or a stream, etc.
+    // If a valid thread id is specified, we consider it as a host event.
+    //
+
+    if (event.stream_id != RocmTracerEvent::kInvalidStreamId) {
+      *line_id = event.stream_id;
+      return false;
+    } else if (event.thread_id != RocmTracerEvent::kInvalidThreadId &&
+               event.thread_id != 0) {
+      *line_id = event.thread_id;
+      return true;
+    } else {
+      *line_id = tsl::profiler::kThreadIdOverhead;
+      return false;
+    }
+  }
+
+ public:
+  void Export(uint64_t start_walltime_ns, uint64_t start_gputime_ns,
+              uint64_t end_gputime_ns, XPlaneBuilder* device_plane,
+              XPlaneBuilder* host_plane) {
+    int host_ev_cnt = 0, dev_ev_cnt = 0;
+    mutex_lock l(events_mutex);
+    // Tracking event types per line.
+    absl::flat_hash_map<tsl::int64, absl::flat_hash_set<RocmTracerEventType>>
+        events_types_per_line;
+    for (const RocmTracerEvent& event : events) {
+      int64_t line_id = RocmTracerEvent::kInvalidThreadId;
+      bool is_host_event = IsHostEvent(event, &line_id);
+
+      if (is_host_event) {
+        host_ev_cnt++;
+      } else {
+        dev_ev_cnt++;
+      }
+
+      if (line_id == RocmTracerEvent::kInvalidThreadId ||
+          line_id == RocmTracerEvent::kInvalidStreamId) {
+        VLOG(3) << "Ignoring event, type=" << static_cast<int>(event.type);
+        continue;
+      }
+      auto* plane = is_host_event ? host_plane : device_plane;
+      VLOG(9) << "Event" << " type=" << static_cast<int>(event.type)
+              << " line_id=" << line_id
+              << (is_host_event ? " host plane=" : " device plane=")
+              << plane->Name();
+      XLineBuilder line = plane->GetOrCreateLine(line_id);
+      line.SetTimestampNs(start_gputime_ns);
+      CreateXEvent(event, plane, start_gputime_ns, end_gputime_ns, &line);
+      events_types_per_line[line_id].emplace(event.type);
+    }
+    device_plane->ForEachLine([&](XLineBuilder line) {
+      line.SetName(
+          GetDeviceXLineName(line.Id(), events_types_per_line[line.Id()]));
+    });
+    host_plane->ForEachLine([&](XLineBuilder line) {
+      line.SetName(absl::StrCat("Host Threads/", line.Id()));
+    });
+    events.clear();
+  }
+
+  PerDeviceCollector() = default;
+
+  void AddEvent(const RocmTracerEvent& event) {
+    mutex_lock l(events_mutex);
+    if (event.source == RocmTracerEventSource::ApiCallback) {
+      // Cupti api callback events were used to populate launch times etc.
+      if (event.correlation_id != RocmTracerEvent::kInvalidCorrelationId) {
+        correlation_info_.insert(
+            {event.correlation_id,
+             CorrelationInfo(event.thread_id, event.start_time_ns)});
+      }
+      events.emplace_back(std::move(event));
+    } else {
+      // Cupti activity events measure device times etc.
+      events.emplace_back(std::move(event));
+    }
+  }
+
+  void GetDeviceCapabilities(int32_t device_ordinal,
+                             XPlaneBuilder* device_plane) {
+    device_plane->AddStatValue(*device_plane->GetOrCreateStatMetadata(
+                                   GetStatTypeStr(StatType::kDevVendor)),
+                               kDeviceVendorAMD);
+
+    if (hipGetDeviceProperties(&device_properties_, device_ordinal) !=
+        hipSuccess)
+      return;
+
+    auto clock_rate_in_khz =
+        device_properties_.clockRate;  // this is also in Khz
+    if (clock_rate_in_khz) {
+      device_plane->AddStatValue(
+          *device_plane->GetOrCreateStatMetadata(
+              GetStatTypeStr(StatType::kDevCapClockRateKHz)),
+          clock_rate_in_khz);
+    }
+
+    auto core_count = device_properties_.multiProcessorCount;
+    if (core_count) {
+      device_plane->AddStatValue(
+          *device_plane->GetOrCreateStatMetadata(
+              GetStatTypeStr(StatType::kDevCapCoreCount)),
+          core_count);
+    }
+
+    auto mem_clock_khz = device_properties_.memoryClockRate;
+    auto mem_bus_width_bits = device_properties_.memoryBusWidth;
+
+    if (mem_clock_khz && mem_bus_width_bits) {
+      // Times 2 because HBM is DDR memory; it gets two data bits per each
+      // data lane.
+      auto memory_bandwidth =
+          uint64_t{2} * (mem_clock_khz) * 1000 * (mem_bus_width_bits) / 8;
+      device_plane->AddStatValue(
+          *device_plane->GetOrCreateStatMetadata(
+              GetStatTypeStr(StatType::kDevCapMemoryBandwidth)),
+          memory_bandwidth);
+    }
+
+    size_t total_memory = device_properties_.totalGlobalMem;
+    if (total_memory) {
+      device_plane->AddStatValue(
+          *device_plane->GetOrCreateStatMetadata(
+              GetStatTypeStr(StatType::kDevCapMemorySize)),
+          static_cast<uint64_t>(total_memory));
+    }
+
+    auto compute_capability_major = device_properties_.major;
+    if (compute_capability_major) {
+      device_plane->AddStatValue(
+          *device_plane->GetOrCreateStatMetadata(
+              GetStatTypeStr(StatType::kDevCapComputeCapMajor)),
+          compute_capability_major);
+    }
+    auto compute_capability_minor = device_properties_.minor;
+    if (compute_capability_minor) {
+      device_plane->AddStatValue(
+          *device_plane->GetOrCreateStatMetadata(
+              GetStatTypeStr(StatType::kDevCapComputeCapMinor)),
+          compute_capability_minor);
+    }
+  }
+
+ private:
+  mutex events_mutex;
+  std::vector<RocmTracerEvent> events TF_GUARDED_BY(events_mutex);
+  absl::flat_hash_map<uint32_t, CorrelationInfo> correlation_info_
+      TF_GUARDED_BY(events_mutex);
+  absl::flat_hash_map<RocmDeviceOccupancyParams, OccupancyStats>
+      occupancy_cache_;
+  hipDeviceProp_t device_properties_;
+};
+
+class RocmTraceCollectorImpl : public profiler::RocmTraceCollector {
+ public:
+  RocmTraceCollectorImpl(const RocmTraceCollectorOptions& options,
+                         uint64_t start_walltime_ns, uint64_t start_gputime_ns)
+      : RocmTraceCollector(options),
+        num_callback_events_(0),
+        num_activity_events_(0),
+        start_walltime_ns_(start_walltime_ns),
+        start_gputime_ns_(start_gputime_ns),
+        num_gpus_(options.num_gpus),
+        per_device_collector_(options.num_gpus) {}
+
+  void AddEvent(RocmTracerEvent&& event, bool is_auxiliary) override;
+  void Flush() override;
+  void Export(XSpace* space) override;
+
+  void OnEventsDropped(const std::string& reason,
+                       uint32_t correlation_id) override {
+    LOG(INFO) << "RocmTracerEvent dropped (correlation_id=" << correlation_id
+              << ",) : " << reason << ".";
+  }
+
+ private:
+  std::atomic<int> num_callback_events_;
+  std::atomic<int> num_activity_events_;
+  uint64_t start_walltime_ns_;
+  uint64_t start_gputime_ns_;
+  int num_gpus_;
+
+  mutex event_maps_mutex_;
+  absl::flat_hash_map<uint32_t, RocmTracerEvent> api_events_map_
+      TF_GUARDED_BY(event_maps_mutex_);
+
+  /* Some apis such as MEMSETD32 (based on an observation with ResNet50),
+   trigger multiple HIP ops domain activities. We keep them in a vector and
+   merge them with api activities at flush time.
+ */
+  absl::flat_hash_map<uint32_t, std::vector<RocmTracerEvent>>
+      activity_ops_events_map_ TF_GUARDED_BY(event_maps_mutex_);
+  // This is for the APIs that we track because we need some information from
+  // them to populate the corresponding activity that we actually track.
+  absl::flat_hash_map<uint32_t, RocmTracerEvent> auxiliary_api_events_map_
+      TF_GUARDED_BY(event_maps_mutex_);
+
+  const std::vector<RocmTracerEvent> ApiActivityInfoExchange();
+
+  absl::flat_hash_map<const uint32_t, PerDeviceCollector> per_device_collector_;
+};
+//==========
+
+void RocmTraceCollectorImpl::AddEvent(RocmTracerEvent&& event,
+                                      bool is_auxiliary) {
+  mutex_lock lock(event_maps_mutex_);
+
+  if (event.source == RocmTracerEventSource::ApiCallback && !is_auxiliary) {
+    if (num_callback_events_ > options_.max_callback_api_events) {
+      OnEventsDropped("max callback event capacity reached",
+                      event.correlation_id);
+      DumpRocmTracerEvent(event, 0, 0, ". Dropped!");
+      return;
+    }
+    num_callback_events_++;
+  } else if (event.source == RocmTracerEventSource::Activity &&
+             event.domain == RocmTracerEventDomain::HIP_API) {
+    // we do not count HIP_OPS activities.
+    if (num_activity_events_ > options_.max_activity_api_events) {
+      OnEventsDropped("max activity event capacity reached",
+                      event.correlation_id);
+      DumpRocmTracerEvent(event, 0, 0, ". Dropped!");
+      return;
+    }
+    num_activity_events_++;
+  }
+
+  bool emplace_result = false;
+  if (event.source == RocmTracerEventSource::ApiCallback) {
+    auto& target_api_event_map =
+        (is_auxiliary) ? auxiliary_api_events_map_ : api_events_map_;
+    std::tie(std::ignore, emplace_result) =
+        target_api_event_map.emplace(event.correlation_id, std::move(event));
+  } else if (event.source == RocmTracerEventSource::Activity) {
+    auto result = activity_ops_events_map_.emplace(
+        event.correlation_id, std::vector<RocmTracerEvent>{});
+    result.first->second.push_back(std::move(event));
+    emplace_result = true;  // we always accept Hip-Ops events
+  }
+  if (!emplace_result) {
+    OnEventsDropped("event with duplicate correlation_id was received.",
+                    event.correlation_id);
+    DumpRocmTracerEvent(event, 0, 0, ". Dropped!");
+  }
+}
+
+void RocmTraceCollectorImpl::Flush() {
+  mutex_lock lock(event_maps_mutex_);
+  auto& aggregated_events_ = ApiActivityInfoExchange();
+
+  VLOG(3) << "RocmTraceCollector collected " << num_callback_events_
+          << " callback events, " << num_activity_events_
+          << " activity events, and aggregated them into "
+          << aggregated_events_.size() << " events.";
+
+  // device ids for GPUs filled in by roctracer are not zero indexed.
+  // They are offset by number of CPUs on the machine
+  tsl::uint32 min_device_id = INT32_MAX;
+  ;
+  for (auto& event : aggregated_events_) {
+    if (event.device_id < min_device_id) {
+      min_device_id = event.device_id;
+    }
+  }
+
+  for (auto event : aggregated_events_) {
+    event.device_id = event.device_id - min_device_id;
+    if (event.device_id < num_gpus_) {
+      per_device_collector_[event.device_id].AddEvent(event);
+    } else {
+      OnEventsDropped("Invalid device id for an event.", event.correlation_id);
+      DumpRocmTracerEvent(event, 0, 0, ". Dropped!");
+    }
+  }
+
+  activity_ops_events_map_.clear();
+  api_events_map_.clear();
+  auxiliary_api_events_map_.clear();
+}
+
+void RocmTraceCollectorImpl::Export(XSpace* space) {
+  uint64_t end_gputime_ns = get_timestamp();
+  XPlaneBuilder host_plane(FindOrAddMutablePlaneWithName(
+      space, tsl::profiler::kRoctracerApiPlaneName));
+
+  for (int device_ordinal = 0; device_ordinal < num_gpus_; ++device_ordinal) {
+    std::string name = GpuPlaneName(device_ordinal);
+    XPlaneBuilder device_plane(FindOrAddMutablePlaneWithName(space, name));
+    device_plane.SetId(device_ordinal);
+    // Calculate device capabilities before flushing, so that device
+    // properties are available to the occupancy calculator in export().
+    per_device_collector_[device_ordinal].GetDeviceCapabilities(device_ordinal,
+                                                                &device_plane);
+    per_device_collector_[device_ordinal].Export(
+        start_walltime_ns_, start_gputime_ns_, end_gputime_ns, &device_plane,
+        &host_plane);
+    NormalizeTimeStamps(&device_plane, start_walltime_ns_);
+  }
+  NormalizeTimeStamps(&host_plane, start_walltime_ns_);
+}
+
+const std::vector<RocmTracerEvent>
+RocmTraceCollectorImpl::ApiActivityInfoExchange() {
+  /* Different from CUDA, roctracer activity records are not enough to fill a
+    TF event. For most of the activities, we need to enable the corresponding
+    API callsbacks (we call them auxiliary API callbacks) to capture the
+    necessary fields from them using the correlation id. The purpose of this
+    function is to let APIs and activities exchange information to reach a
+    state very similar to TF CUDA and getting ready to dump the event.
+  */
+
+  std::vector<RocmTracerEvent> aggregated_events;
+
+  // Copy info from activity events to API callback events
+  for (auto& api_iter : api_events_map_) {
+    RocmTracerEvent& api_event = api_iter.second;
+    auto activity_event =
+        activity_ops_events_map_.find(api_event.correlation_id);
+
+    if (activity_event == activity_ops_events_map_.end()) {
+      OnEventsDropped(
+          "An event from HIP API discarded."
+          "Could not find the counterpart activity.",
+          api_event.correlation_id);
+      DumpRocmTracerEvent(api_event, 0, 0, ". Dropped!");
+    } else {
+      api_event.device_id = activity_event->second.front().device_id;
+      api_event.stream_id = activity_event->second.front().stream_id;
+      switch (api_event.type) {
+        case RocmTracerEventType::Kernel:
+        case RocmTracerEventType::Memset:
+        case RocmTracerEventType::MemoryAlloc:
+        case RocmTracerEventType::MemoryFree:
+        case RocmTracerEventType::Synchronization: {
+          aggregated_events.push_back(api_event);
+          break;
+        }
+        case RocmTracerEventType::MemcpyD2H:
+        case RocmTracerEventType::MemcpyH2D:
+        case RocmTracerEventType::MemcpyD2D:
+        case RocmTracerEventType::MemcpyOther: {
+          api_event.memcpy_info.destination =
+              activity_event->second.front().device_id;
+          aggregated_events.push_back(api_event);
+          break;
+        }
+        default:
+          OnEventsDropped("Missing API-Activity information exchange. Dropped!",
+                          api_event.correlation_id);
+          DumpRocmTracerEvent(api_event, 0, 0, ". Dropped!");
+          LOG(WARNING) << "A ROCm API event type with unimplemented activity "
+                          "merge dropped! "
+                          "Type="
+                       << GetRocmTracerEventTypeName(api_event.type);
+      }
+    }
+  }
+
+  // Make sure for all activity events we have API callback events
+  for (auto& activity_iter : activity_ops_events_map_) {
+    RocmTracerEvent& activity_event = activity_iter.second.front();
+    auto api_event = api_events_map_.find(activity_event.correlation_id);
+
+    if (api_event == api_events_map_.end()) {
+      api_event = auxiliary_api_events_map_.find(activity_event.correlation_id);
+    }
+
+    if (api_event == auxiliary_api_events_map_.end()) {
+      OnEventsDropped(
+          "An event from activity was discarded."
+          "Could not find the counterpart HIP API.",
+          activity_event.correlation_id);
+      DumpRocmTracerEvent(activity_event, 0, 0, ". Dropped!");
+    } else {
+      switch (activity_event.type) {
+        // KERNEL ACTIVITY
+        case RocmTracerEventType::Kernel: {
+          activity_event.name = api_event->second.name;
+          activity_event.kernel_info = api_event->second.kernel_info;
+          aggregated_events.push_back(activity_event);
+          break;
+        }
+        // MEMCPY ACTIVITY
+        case RocmTracerEventType::MemcpyD2H:
+        case RocmTracerEventType::MemcpyH2D:
+        case RocmTracerEventType::MemcpyD2D:
+        case RocmTracerEventType::MemcpyOther: {
+          activity_event.memcpy_info = api_event->second.memcpy_info;
+          aggregated_events.push_back(activity_event);
+          break;
+        }
+        // MEMSET ACTIVITY
+        case RocmTracerEventType::Memset: {
+          activity_event.memset_info = api_event->second.memset_info;
+          aggregated_events.push_back(activity_event);
+          break;
+        }
+        // MALLOC ACTIVITY, FREE ACTIVITY
+        case RocmTracerEventType::MemoryAlloc:
+        case RocmTracerEventType::MemoryFree: {
+          activity_event.device_id = api_event->second.device_id;
+          aggregated_events.push_back(activity_event);
+          break;
+        }
+        // SYNCHRONIZATION ACTIVITY
+        case RocmTracerEventType::Synchronization: {
+          activity_event.device_id = api_event->second.device_id;
+          aggregated_events.push_back(activity_event);
+          break;
+        }
+        default:
+          OnEventsDropped("Missing API-Activity information exchange. Dropped!",
+                          activity_event.correlation_id);
+          DumpRocmTracerEvent(activity_event, 0, 0, ". Dropped!");
+          LOG(WARNING) << "A ROCm activity event with unimplemented API "
+                          "callback merge dropped! "
+                          "Type="
+                       << GetRocmTracerEventTypeName(activity_event.type);
+          break;
+      }
+    }
+  }
+
+  return aggregated_events;
+}
+
+std::unique_ptr<RocmTraceCollector> CreateRocmCollector(
+    const RocmTraceCollectorOptions& options, const uint64_t start_walltime_ns,
+    const uint64_t start_gputime_ns) {
+  return std::make_unique<RocmTraceCollectorImpl>(options, start_walltime_ns,
+                                                  start_gputime_ns);
+}
+
+}  // namespace profiler
+}  // namespace xla
diff --git a/third_party/xla/xla/backends/profiler/gpu/rocm_collector.h b/third_party/xla/xla/backends/profiler/gpu/rocm_collector.h
new file mode 100644
index 00000000000000..af8bc26f97aa24
--- /dev/null
+++ b/third_party/xla/xla/backends/profiler/gpu/rocm_collector.h
@@ -0,0 +1,227 @@
+/* Copyright 2024 The OpenXLA Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_BACKENDS_PROFILER_GPU_ROCM_COLLECTOR_H_
+#define XLA_BACKENDS_PROFILER_GPU_ROCM_COLLECTOR_H_
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/node_hash_set.h"
+#include "tsl/profiler/utils/xplane_builder.h"
+
+namespace xla {
+namespace profiler {
+
+using tsl::profiler::XSpace;
+
+struct MemcpyDetails {
+  // The amount of data copied for memcpy events.
+  size_t num_bytes;
+  // The destination device for peer-2-peer communication (memcpy). The source
+  // device is implicit: it's the current device.
+  uint32_t destination;
+  // Whether or not the memcpy is asynchronous.
+  bool async;
+};
+
+struct MemAllocDetails {
+  // The amount of data requested for cudaMalloc events.
+  uint64_t num_bytes;
+};
+
+struct MemsetDetails {
+  // The number of memory elements getting set
+  size_t num_bytes;
+  // Whether or not the memset is asynchronous.
+  bool async;
+};
+
+struct KernelDetails {
+  // The number of registers used in this kernel.
+  uint32_t registers_per_thread;
+  // The amount of shared memory space used by a thread block.
+  uint32_t static_shared_memory_usage;
+  // The amount of dynamic memory space used by a thread block.
+  uint32_t dynamic_shared_memory_usage;
+  // X-dimension of a thread block.
+  uint32_t block_x;
+  // Y-dimension of a thread block.
+  uint32_t block_y;
+  // Z-dimension of a thread block.
+  uint32_t block_z;
+  // X-dimension of a grid.
+  uint32_t grid_x;
+  // Y-dimension of a grid.
+  uint32_t grid_y;
+  // Z-dimension of a grid.
+  uint32_t grid_z;
+
+  // kernel address. Used for calculating core occupancy
+  void* func_ptr;
+};
+
+inline std::string ToXStat(const KernelDetails& kernel_info,
+                           double occupancy_pct) {
+  return absl::StrCat(
+      "regs:", kernel_info.registers_per_thread,
+      " static_shared:", kernel_info.static_shared_memory_usage,
+      " dynamic_shared:", kernel_info.dynamic_shared_memory_usage,
+      " grid:", kernel_info.grid_x, ",", kernel_info.grid_y, ",",
+      kernel_info.grid_z, " block:", kernel_info.block_x, ",",
+      kernel_info.block_y, ",", kernel_info.block_z,
+      " occ_pct:", occupancy_pct);
+}
+
+enum class RocmTracerEventType {
+  Unsupported = 0,
+  Kernel,
+  MemcpyH2D,
+  MemcpyD2H,
+  MemcpyD2D,
+  MemcpyP2P,
+  MemcpyOther,
+  MemoryAlloc,
+  MemoryFree,
+  Memset,
+  Synchronization,
+  Generic,
+};
+
+const char* GetRocmTracerEventTypeName(const RocmTracerEventType& type);
+
+enum class RocmTracerEventSource {
+  Invalid = 0,
+  ApiCallback,
+  Activity,
+};
+
+const char* GetRocmTracerEventSourceName(const RocmTracerEventSource& source);
+
+enum class RocmTracerEventDomain {
+  InvalidDomain = 0,
+  HIP_API,
+  HCC_OPS,  // TODO(rocm-profiler): renme this to HIP_OPS
+};
+const char* GetRocmTracerEventDomainName(const RocmTracerEventDomain& domain);
+// RocmTracerSyncTypes forward decleration
+enum class RocmTracerSyncTypes;
+
+struct SynchronizationDetails {
+  RocmTracerSyncTypes sync_type;
+};
+
+struct RocmTracerEvent {
+  static constexpr uint32_t kInvalidDeviceId =
+      std::numeric_limits<uint32_t>::max();
+  static constexpr uint32_t kInvalidThreadId =
+      std::numeric_limits<uint32_t>::max();
+  static constexpr uint32_t kInvalidCorrelationId =
+      std::numeric_limits<uint32_t>::max();
+  static constexpr uint64_t kInvalidStreamId =
+      std::numeric_limits<uint64_t>::max();
+  RocmTracerEventType type;
+  RocmTracerEventSource source = RocmTracerEventSource::Invalid;
+  RocmTracerEventDomain domain;
+  std::string name;
+  // This points to strings in AnnotationMap, which should outlive the point
+  // where serialization happens.
+  absl::string_view annotation;
+  absl::string_view roctx_range;
+  uint64_t start_time_ns = 0;
+  uint64_t end_time_ns = 0;
+  uint32_t device_id = kInvalidDeviceId;
+  uint32_t correlation_id = kInvalidCorrelationId;
+  uint32_t thread_id = kInvalidThreadId;
+  int64_t stream_id = kInvalidStreamId;
+  union {
+    MemcpyDetails memcpy_info;                    // If type == Memcpy*
+    MemsetDetails memset_info;                    // If type == Memset*
+    MemAllocDetails memalloc_info;                // If type == MemoryAlloc
+    KernelDetails kernel_info;                    // If type == Kernel
+    SynchronizationDetails synchronization_info;  // If type == Synchronization
+  };
+};
+
+struct RocmTraceCollectorOptions {
+  // Maximum number of events to collect from callback API; if -1, no limit.
+  // if 0, the callback API is enabled to build a correlation map, but no
+  // events are collected.
+  uint64_t max_callback_api_events;
+  // Maximum number of events to collect from activity API; if -1, no limit.
+  uint64_t max_activity_api_events;
+  // Maximum number of annotation strings that we can accommodate.
+  uint64_t max_annotation_strings;
+  // Number of GPUs involved.
+  uint32_t num_gpus;
+};
+
+class AnnotationMap {
+ public:
+  explicit AnnotationMap(uint64_t max_size) : max_size_(max_size) {}
+  void Add(uint32_t correlation_id, const std::string& annotation);
+  absl::string_view LookUp(uint32_t correlation_id);
+
+ private:
+  struct AnnotationMapImpl {
+    // The population/consumption of annotations might happen from multiple
+    // callback/activity api related threads.
+    absl::Mutex mutex;
+    // Annotation tends to be repetitive, use a hash_set to store the strings,
+    // an use the reference to the string in the map.
+    absl::node_hash_set<std::string> annotations;
+    absl::flat_hash_map<uint32_t, absl::string_view> correlation_map;
+  };
+  const uint64_t max_size_;
+  AnnotationMapImpl map_;
+
+ public:
+  // Disable copy and move.
+  AnnotationMap(const AnnotationMap&) = delete;
+  AnnotationMap& operator=(const AnnotationMap&) = delete;
+};
+
+class RocmTraceCollector {
+ public:
+  explicit RocmTraceCollector(const RocmTraceCollectorOptions& options)
+      : options_(options), annotation_map_(options.max_annotation_strings) {}
+  virtual ~RocmTraceCollector() {}
+
+  virtual void AddEvent(RocmTracerEvent&& event, bool is_auxiliary) = 0;
+  virtual void OnEventsDropped(const std::string& reason,
+                               uint32_t num_events) = 0;
+  virtual void Flush() = 0;
+  virtual void Export(XSpace* space) = 0;
+
+  AnnotationMap* annotation_map() { return &annotation_map_; }
+
+ protected:
+  RocmTraceCollectorOptions options_;
+
+ private:
+  AnnotationMap annotation_map_;
+
+ public:
+  // Disable copy and move.
+  RocmTraceCollector(const RocmTraceCollector&) = delete;
+  RocmTraceCollector& operator=(const RocmTraceCollector&) = delete;
+};
+
+std::unique_ptr<RocmTraceCollector> CreateRocmCollector(
+    const RocmTraceCollectorOptions& options, const uint64_t start_walltime_ns,
+    const uint64_t start_gputime_ns);
+
+}  // namespace profiler
+}  // namespace xla
+
+#endif  // XLA_BACKENDS_PROFILER_GPU_ROCM_COLLECTOR_H_
diff --git a/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.cc b/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.cc
index 2ca9d5a581eb3a..91795b5cf5ec5a 100644
--- a/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.cc
+++ b/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.cc
@@ -1,4 +1,4 @@
-/* Copyright 2021 The OpenXLA Authors.
+/* Copyright 2024 The OpenXLA Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -169,7 +169,15 @@ inline void DumpApiCallbackData(uint32_t domain, uint32_t cbid,
         break;
       case HIP_API_ID_hipStreamSynchronize:
         break;
+      case HIP_API_ID_hipStreamWaitEvent:  // ignore all aux HIP API Events
+      case HIP_API_ID_hipHostFree:
+      case HIP_API_ID_hipHostMalloc:
+      case HIP_API_ID_hipSetDevice:
+        break;
       default:
+        VLOG(3) << "Warning: HIP_API_ID_x is not handled in "
+                   "DumpApiCallbackData, HIP_API_ID="
+                << cbid;
         DCHECK(false);
         break;
     }
@@ -269,51 +277,8 @@ const char* GetRocmTracerEventDomainName(const RocmTracerEventDomain& domain) {
   return "";
 }
 
-void DumpRocmTracerEvent(const RocmTracerEvent& event,
-                         uint64_t start_walltime_ns, uint64_t start_gputime_ns,
-                         const std::string& message) {
-  std::ostringstream oss;
-  oss << "correlation_id=" << event.correlation_id;
-  oss << ",type=" << GetRocmTracerEventTypeName(event.type);
-  oss << ",source=" << GetRocmTracerEventSourceName(event.source);
-  oss << ",domain=" << GetRocmTracerEventDomainName(event.domain);
-  oss << ",name=" << event.name;
-  oss << ",annotation=" << event.annotation;
-  oss << ",start_time_us="
-      << (start_walltime_ns + (start_gputime_ns - event.start_time_ns)) / 1000;
-  oss << ",duration=" << (event.end_time_ns - event.start_time_ns) / 1000;
-  oss << ",device_id=" << event.device_id;
-  oss << ",thread_id=" << event.thread_id;
-  oss << ",stream_id=" << event.stream_id;
-
-  switch (event.type) {
-    case RocmTracerEventType::Kernel:
-      break;
-    case RocmTracerEventType::MemcpyD2H:
-    case RocmTracerEventType::MemcpyH2D:
-    case RocmTracerEventType::MemcpyD2D:
-    case RocmTracerEventType::MemcpyP2P:
-      oss << ",num_bytes=" << event.memcpy_info.num_bytes;
-      oss << ",destination=" << event.memcpy_info.destination;
-      oss << ",async=" << event.memcpy_info.async;
-      break;
-    case RocmTracerEventType::MemoryAlloc:
-      oss << ",num_bytes=" << event.memalloc_info.num_bytes;
-      break;
-    case RocmTracerEventType::Synchronization:
-      break;
-    case RocmTracerEventType::Generic:
-      break;
-    default:
-      DCHECK(false);
-      break;
-  }
-  oss << message;
-  VLOG(3) << oss.str();
-}
-
-absl::Status RocmApiCallbackImpl::operator()(uint32_t domain, uint32_t cbid,
-                                             const void* cbdata) {
+tsl::Status RocmApiCallbackImpl::operator()(uint32_t domain, uint32_t cbid,
+                                            const void* cbdata) {
   /* Some APIs such as hipMalloc, implicitly work on th devices set by the
     user using APIs such as hipSetDevice. API callbacks and activity records
     for functions like hipMalloc does not return the device id (CUDA does). To
@@ -325,7 +290,7 @@ absl::Status RocmApiCallbackImpl::operator()(uint32_t domain, uint32_t cbid,
 
   // DumpApiCallbackData(domain, cbid, cbdata);
 
-  if (domain != ACTIVITY_DOMAIN_HIP_API) return absl::OkStatus();
+  if (domain != ACTIVITY_DOMAIN_HIP_API) return tsl::OkStatus();
 
   const hip_api_data_t* data = reinterpret_cast<const hip_api_data_t*>(cbdata);
 
@@ -353,7 +318,7 @@ absl::Status RocmApiCallbackImpl::operator()(uint32_t domain, uint32_t cbid,
       } else {
         LOG(WARNING) << "An API exit callback received without API enter "
                         "with same correlation id. Event droped!";
-        return absl::OkStatus();  // This API does not belong to us.
+        return tsl::OkStatus();  // This API does not belong to us.
       }
       exit_time = RocmTracer::GetTimestamp();
     }
@@ -434,7 +399,7 @@ absl::Status RocmApiCallbackImpl::operator()(uint32_t domain, uint32_t cbid,
         break;
     }
   }
-  return absl::OkStatus();
+  return tsl::OkStatus();
 }
 
 void RocmApiCallbackImpl::AddKernelEventUponApiExit(uint32_t cbid,
@@ -873,8 +838,8 @@ void RocmApiCallbackImpl::AddSynchronizeEventUponApiExit(
   collector_->AddEvent(std::move(event), is_auxiliary);
 }
 
-absl::Status RocmActivityCallbackImpl::operator()(const char* begin,
-                                                  const char* end) {
+tsl::Status RocmActivityCallbackImpl::operator()(const char* begin,
+                                                 const char* end) {
   // we do not dump activities in this set in logger
 
   static std::set<activity_op_t> dump_excluded_activities = {
@@ -1021,7 +986,7 @@ absl::Status RocmActivityCallbackImpl::operator()(const char* begin,
             ));
   }
 
-  return absl::OkStatus();
+  return tsl::OkStatus();
 }
 
 void RocmActivityCallbackImpl::AddHipKernelActivityEvent(
@@ -1332,25 +1297,6 @@ void RocmActivityCallbackImpl::AddHipOpsMemsetActivityEvent(
   collector_->AddEvent(std::move(event), false);
 }
 
-void AnnotationMap::Add(uint32_t correlation_id,
-                        const std::string& annotation) {
-  if (annotation.empty()) return;
-  VLOG(3) << "Add annotation: " << " correlation_id=" << correlation_id
-          << ", annotation: " << annotation;
-  absl::MutexLock lock(&map_.mutex);
-  if (map_.annotations.size() < max_size_) {
-    absl::string_view annotation_str =
-        *map_.annotations.insert(annotation).first;
-    map_.correlation_map.emplace(correlation_id, annotation_str);
-  }
-}
-
-absl::string_view AnnotationMap::LookUp(uint32_t correlation_id) {
-  absl::MutexLock lock(&map_.mutex);
-  auto it = map_.correlation_map.find(correlation_id);
-  return it != map_.correlation_map.end() ? it->second : absl::string_view();
-}
-
 /* static */ RocmTracer* RocmTracer::GetRocmTracerSingleton() {
   static auto* singleton = new RocmTracer();
   return singleton;
@@ -1413,15 +1359,15 @@ void ApiCallback(uint32_t domain, uint32_t cbid, const void* cbdata,
   tracer->ApiCallbackHandler(domain, cbid, cbdata).IgnoreError();
 }
 
-absl::Status RocmTracer::ApiCallbackHandler(uint32_t domain, uint32_t cbid,
-                                            const void* cbdata) {
+tsl::Status RocmTracer::ApiCallbackHandler(uint32_t domain, uint32_t cbid,
+                                           const void* cbdata) {
   if (api_tracing_enabled_)
     TF_RETURN_IF_ERROR((*api_cb_impl_)(domain, cbid, cbdata));
-  return absl::OkStatus();
+  return tsl::OkStatus();
 }
 
-absl::Status RocmTracer::EnableApiTracing() {
-  if (api_tracing_enabled_) return absl::OkStatus();
+tsl::Status RocmTracer::EnableApiTracing() {
+  if (api_tracing_enabled_) return tsl::OkStatus();
   api_tracing_enabled_ = true;
 
   for (auto& iter : options_->api_callbacks) {
@@ -1443,11 +1389,11 @@ absl::Status RocmTracer::EnableApiTracing() {
       }
     }
   }
-  return absl::OkStatus();
+  return tsl::OkStatus();
 }
 
-absl::Status RocmTracer::DisableApiTracing() {
-  if (!api_tracing_enabled_) return absl::OkStatus();
+tsl::Status RocmTracer::DisableApiTracing() {
+  if (!api_tracing_enabled_) return tsl::OkStatus();
   api_tracing_enabled_ = false;
 
   for (auto& iter : options_->api_callbacks) {
@@ -1469,7 +1415,7 @@ absl::Status RocmTracer::DisableApiTracing() {
       }
     }
   }
-  return absl::OkStatus();
+  return tsl::OkStatus();
 }
 
 void ActivityCallback(const char* begin, const char* end, void* user_data) {
@@ -1477,8 +1423,8 @@ void ActivityCallback(const char* begin, const char* end, void* user_data) {
   tracer->ActivityCallbackHandler(begin, end).IgnoreError();
 }
 
-absl::Status RocmTracer::ActivityCallbackHandler(const char* begin,
-                                                 const char* end) {
+tsl::Status RocmTracer::ActivityCallbackHandler(const char* begin,
+                                                const char* end) {
   if (activity_tracing_enabled_) {
     TF_RETURN_IF_ERROR((*activity_cb_impl_)(begin, end));
   } else {
@@ -1503,11 +1449,11 @@ absl::Status RocmTracer::ActivityCallbackHandler(const char* begin,
     }
     VLOG(3) << "Dropped Activity Records End";
   }
-  return absl::OkStatus();
+  return tsl::OkStatus();
 }
 
-absl::Status RocmTracer::EnableActivityTracing() {
-  if (activity_tracing_enabled_) return absl::OkStatus();
+tsl::Status RocmTracer::EnableActivityTracing() {
+  if (activity_tracing_enabled_) return tsl::OkStatus();
   activity_tracing_enabled_ = true;
 
   if (!options_->activity_tracing.empty()) {
@@ -1544,11 +1490,11 @@ absl::Status RocmTracer::EnableActivityTracing() {
     }
   }
 
-  return absl::OkStatus();
+  return tsl::OkStatus();
 }
 
-absl::Status RocmTracer::DisableActivityTracing() {
-  if (!activity_tracing_enabled_) return absl::OkStatus();
+tsl::Status RocmTracer::DisableActivityTracing() {
+  if (!activity_tracing_enabled_) return tsl::OkStatus();
 
   for (auto& iter : options_->activity_tracing) {
     activity_domain_t domain = iter.first;
@@ -1599,7 +1545,7 @@ absl::Status RocmTracer::DisableActivityTracing() {
 
   activity_tracing_enabled_ = false;
 
-  return absl::OkStatus();
+  return tsl::OkStatus();
 }
 
 /*static*/ uint64_t RocmTracer::GetTimestamp() {
diff --git a/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.h b/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.h
index cf04442feaab3f..fd566653be02a2 100644
--- a/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.h
+++ b/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.h
@@ -1,4 +1,4 @@
-/* Copyright 2021 The OpenXLA Authors.
+/* Copyright 2024 The OpenXLA Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,107 +16,21 @@ limitations under the License.
 #ifndef XLA_BACKENDS_PROFILER_GPU_ROCM_TRACER_H_
 #define XLA_BACKENDS_PROFILER_GPU_ROCM_TRACER_H_
 
-#include <cstdint>
-#include <limits>
-#include <map>
-#include <set>
-#include <string>
-#include <vector>
-
 #include "absl/container/fixed_array.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/node_hash_set.h"
-#include "absl/status/status.h"
+#include "absl/types/optional.h"
+#include "xla/backends/profiler/gpu/rocm_collector.h"
 #include "xla/stream_executor/rocm/roctracer_wrapper.h"
+#include "tsl/platform/errors.h"
 #include "tsl/platform/macros.h"
+#include "tsl/platform/status.h"
 #include "tsl/platform/types.h"
 
 namespace xla {
 namespace profiler {
 
-struct MemcpyDetails {
-  // The amount of data copied for memcpy events.
-  size_t num_bytes;
-  // The destination device for peer-2-peer communication (memcpy). The source
-  // device is implicit: it's the current device.
-  uint32_t destination;
-  // Whether or not the memcpy is asynchronous.
-  bool async;
-};
-
-struct MemsetDetails {
-  // The number of memory elements getting set
-  size_t num_bytes;
-  // Whether or not the memset is asynchronous.
-  bool async;
-};
-
-struct MemAllocDetails {
-  // The amount of data requested for cudaMalloc events.
-  uint64_t num_bytes;
-};
-
-struct KernelDetails {
-  // The number of registers used in this kernel.
-  uint32_t registers_per_thread;
-  // The amount of shared memory space used by a thread block.
-  uint32_t static_shared_memory_usage;
-  // The amount of dynamic memory space used by a thread block.
-  uint32_t dynamic_shared_memory_usage;
-  // X-dimension of a thread block.
-  uint32_t block_x;
-  // Y-dimension of a thread block.
-  uint32_t block_y;
-  // Z-dimension of a thread block.
-  uint32_t block_z;
-  // X-dimension of a grid.
-  uint32_t grid_x;
-  // Y-dimension of a grid.
-  uint32_t grid_y;
-  // Z-dimension of a grid.
-  uint32_t grid_z;
-
-  // kernel address. Used for calculating core occupancy
-  void* func_ptr;
-};
-
-// RocmTracerSyncTypes forward decleration
-enum class RocmTracerSyncTypes;
-struct SynchronizationDetails {
-  RocmTracerSyncTypes sync_type;
-};
-
-enum class RocmTracerEventType {
-  Unsupported = 0,
-  Kernel,
-  MemcpyH2D,
-  MemcpyD2H,
-  MemcpyD2D,
-  MemcpyP2P,
-  MemcpyOther,
-  MemoryAlloc,
-  MemoryFree,
-  Memset,
-  Synchronization,
-  Generic,
-};
-
-const char* GetRocmTracerEventTypeName(const RocmTracerEventType& type);
-
-enum class RocmTracerEventSource {
-  Invalid = 0,
-  ApiCallback,
-  Activity,
-};
-
-const char* GetRocmTracerEventSourceName(const RocmTracerEventSource& source);
-
-enum class RocmTracerEventDomain {
-  InvalidDomain = 0,
-  HIP_API,
-  HCC_OPS,  // TODO(rocm-profiler): renme this to HIP_OPS
-};
 enum class RocmTracerSyncTypes {
   InvalidSync = 0,
   StreamSynchronize,  // caller thread wait stream to become empty
@@ -124,44 +38,6 @@ enum class RocmTracerSyncTypes {
   StreamWait          // compute stream will wait for event to happen
 };
 
-const char* GetRocmTracerEventDomainName(const RocmTracerEventDomain& domain);
-
-struct RocmTracerEvent {
-  static constexpr uint32_t kInvalidDeviceId =
-      std::numeric_limits<uint32_t>::max();
-  static constexpr uint32_t kInvalidThreadId =
-      std::numeric_limits<uint32_t>::max();
-  static constexpr uint32_t kInvalidCorrelationId =
-      std::numeric_limits<uint32_t>::max();
-  static constexpr uint64_t kInvalidStreamId =
-      std::numeric_limits<uint64_t>::max();
-  RocmTracerEventType type;
-  RocmTracerEventSource source = RocmTracerEventSource::Invalid;
-  RocmTracerEventDomain domain;
-  std::string name;
-  // This points to strings in AnnotationMap, which should outlive the point
-  // where serialization happens.
-  absl::string_view annotation;
-  absl::string_view roctx_range;
-  uint64_t start_time_ns = 0;
-  uint64_t end_time_ns = 0;
-  uint32_t device_id = kInvalidDeviceId;
-  uint32_t correlation_id = kInvalidCorrelationId;
-  uint32_t thread_id = kInvalidThreadId;
-  int64_t stream_id = kInvalidStreamId;
-  union {
-    MemcpyDetails memcpy_info;                    // If type == Memcpy*
-    MemsetDetails memset_info;                    // If type == Memset*
-    MemAllocDetails memalloc_info;                // If type == MemoryAlloc
-    KernelDetails kernel_info;                    // If type == Kernel
-    SynchronizationDetails synchronization_info;  // If type == Synchronization
-  };
-};
-
-void DumpRocmTracerEvent(const RocmTracerEvent& event,
-                         uint64_t start_walltime_ns, uint64_t start_gputime_ns,
-                         const std::string& message);
-
 struct RocmTracerOptions {
   std::set<uint32_t> api_tracking_set;  // actual api set we want to profile
 
@@ -175,69 +51,6 @@ struct RocmTracerOptions {
       activity_tracing;
 };
 
-struct RocmTraceCollectorOptions {
-  // Maximum number of events to collect from callback API; if -1, no limit.
-  // if 0, the callback API is enabled to build a correlation map, but no
-  // events are collected.
-  uint64_t max_callback_api_events;
-  // Maximum number of events to collect from activity API; if -1, no limit.
-  uint64_t max_activity_api_events;
-  // Maximum number of annotation strings that we can accommodate.
-  uint64_t max_annotation_strings;
-  // Number of GPUs involved.
-  uint32_t num_gpus;
-};
-
-class AnnotationMap {
- public:
-  explicit AnnotationMap(uint64_t max_size) : max_size_(max_size) {}
-  void Add(uint32_t correlation_id, const std::string& annotation);
-  absl::string_view LookUp(uint32_t correlation_id);
-
- private:
-  struct AnnotationMapImpl {
-    // The population/consumption of annotations might happen from multiple
-    // callback/activity api related threads.
-    absl::Mutex mutex;
-    // Annotation tends to be repetitive, use a hash_set to store the strings,
-    // an use the reference to the string in the map.
-    absl::node_hash_set<std::string> annotations;
-    absl::flat_hash_map<uint32_t, absl::string_view> correlation_map;
-  };
-  const uint64_t max_size_;
-  AnnotationMapImpl map_;
-
- public:
-  // Disable copy and move.
-  AnnotationMap(const AnnotationMap&) = delete;
-  AnnotationMap& operator=(const AnnotationMap&) = delete;
-};
-
-class RocmTraceCollector {
- public:
-  explicit RocmTraceCollector(const RocmTraceCollectorOptions& options)
-      : options_(options), annotation_map_(options.max_annotation_strings) {}
-  virtual ~RocmTraceCollector() {}
-
-  virtual void AddEvent(RocmTracerEvent&& event, bool is_auxiliary) = 0;
-  virtual void OnEventsDropped(const std::string& reason,
-                               uint32_t num_events) = 0;
-  virtual void Flush() = 0;
-
-  AnnotationMap* annotation_map() { return &annotation_map_; }
-
- protected:
-  RocmTraceCollectorOptions options_;
-
- private:
-  AnnotationMap annotation_map_;
-
- public:
-  // Disable copy and move.
-  RocmTraceCollector(const RocmTraceCollector&) = delete;
-  RocmTraceCollector& operator=(const RocmTraceCollector&) = delete;
-};
-
 class RocmTracer;
 
 class RocmApiCallbackImpl {
@@ -246,7 +59,7 @@ class RocmApiCallbackImpl {
                       RocmTraceCollector* collector)
       : options_(options), tracer_(tracer), collector_(collector) {}
 
-  absl::Status operator()(uint32_t domain, uint32_t cbid, const void* cbdata);
+  tsl::Status operator()(uint32_t domain, uint32_t cbid, const void* cbdata);
 
  private:
   void AddKernelEventUponApiExit(uint32_t cbid, const hip_api_data_t* data,
@@ -284,7 +97,7 @@ class RocmActivityCallbackImpl {
                            RocmTraceCollector* collector)
       : options_(options), tracer_(tracer), collector_(collector) {}
 
-  absl::Status operator()(const char* begin, const char* end);
+  tsl::Status operator()(const char* begin, const char* end);
 
  private:
   void AddHipKernelActivityEvent(const roctracer_record_t* record);
@@ -314,9 +127,9 @@ class RocmTracer {
   void Enable(const RocmTracerOptions& options, RocmTraceCollector* collector);
   void Disable();
 
-  absl::Status ApiCallbackHandler(uint32_t domain, uint32_t cbid,
-                                  const void* cbdata);
-  absl::Status ActivityCallbackHandler(const char* begin, const char* end);
+  tsl::Status ApiCallbackHandler(uint32_t domain, uint32_t cbid,
+                                 const void* cbdata);
+  tsl::Status ActivityCallbackHandler(const char* begin, const char* end);
 
   static uint64_t GetTimestamp();
   static int NumGpus();
@@ -340,11 +153,11 @@ class RocmTracer {
   explicit RocmTracer() : num_gpus_(NumGpus()) {}
 
  private:
-  absl::Status EnableApiTracing();
-  absl::Status DisableApiTracing();
+  tsl::Status EnableApiTracing();
+  tsl::Status DisableApiTracing();
 
-  absl::Status EnableActivityTracing();
-  absl::Status DisableActivityTracing();
+  tsl::Status EnableActivityTracing();
+  tsl::Status DisableActivityTracing();
 
   int num_gpus_;
   std::optional<RocmTracerOptions> options_;
diff --git a/third_party/xla/xla/backends/profiler/plugin/BUILD b/third_party/xla/xla/backends/profiler/plugin/BUILD
index 8c7768cf3dbb6c..990df18380b508 100644
--- a/third_party/xla/xla/backends/profiler/plugin/BUILD
+++ b/third_party/xla/xla/backends/profiler/plugin/BUILD
@@ -1,9 +1,9 @@
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("@local_tsl//tsl/profiler/builds:build_config.bzl", "tf_profiler_copts")
 load(
     "//xla:xla.bzl",
     "xla_cc_test",
 )
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
-load("@local_tsl//tsl/profiler/builds:build_config.bzl", "tf_profiler_copts")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/client/BUILD b/third_party/xla/xla/client/BUILD
index 6c3b7790d88f21..d55782e865802e 100644
--- a/third_party/xla/xla/client/BUILD
+++ b/third_party/xla/xla/client/BUILD
@@ -1,9 +1,9 @@
 # Description:
 #   XLA client libraries.
 
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl:tsl.default.bzl", "filegroup")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -293,8 +293,10 @@ xla_cc_test(
         ":value_inference",
         ":xla_builder",
         ":xla_computation",
+        "//xla:comparison_util",
         "//xla:debug_options_flags",
         "//xla:shape_util",
+        "//xla:status",
         "//xla:statusor",
         "//xla:test",
         "//xla:test_helpers",
@@ -305,6 +307,8 @@ xla_cc_test(
         "//xla/service:pattern_matcher",
         "//xla/service:pattern_matcher_gmock",
         "//xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/types:span",
diff --git a/third_party/xla/xla/client/executable_build_options.cc b/third_party/xla/xla/client/executable_build_options.cc
index 3a583e78984eec..64be6188908e5b 100644
--- a/third_party/xla/xla/client/executable_build_options.cc
+++ b/third_party/xla/xla/client/executable_build_options.cc
@@ -110,14 +110,14 @@ ExecutableBuildOptions& ExecutableBuildOptions::set_use_auto_spmd_partitioning(
 ExecutableBuildOptions&
 ExecutableBuildOptions::set_auto_spmd_partitioning_mesh_shape(
     std::vector<int64_t> mesh_shape) {
-  auto_spmd_partitioning_mesh_shape_ = mesh_shape;
+  auto_spmd_partitioning_mesh_shape_ = std::move(mesh_shape);
   return *this;
 }
 
 ExecutableBuildOptions&
 ExecutableBuildOptions::set_auto_spmd_partitioning_mesh_ids(
     std::vector<int64_t> mesh_ids) {
-  auto_spmd_partitioning_mesh_ids_ = mesh_ids;
+  auto_spmd_partitioning_mesh_ids_ = std::move(mesh_ids);
   return *this;
 }
 
diff --git a/third_party/xla/xla/client/lib/BUILD b/third_party/xla/xla/client/lib/BUILD
index 65b29dcbda7cd4..4d1f90698de0c1 100644
--- a/third_party/xla/xla/client/lib/BUILD
+++ b/third_party/xla/xla/client/lib/BUILD
@@ -1,9 +1,14 @@
 # Common computation builders for XLA.
 
-load("//xla/tests:build_defs.bzl", "generate_backend_suites", "xla_test")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load("@local_tsl//tsl:tsl.default.bzl", "filegroup")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla/tests:build_defs.bzl", "generate_backend_suites", "xla_test")
+
+load(
+    "@local_config_rocm//rocm:build_defs.bzl",
+    "if_rocm_is_configured",
+)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -554,6 +559,7 @@ xla_test(
     real_hardware_only = True,
     shard_count = 5,
     tags = ["optonly"],
+    local_defines = if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1",]),
     deps = [
         ":arithmetic",
         ":constants",
diff --git a/third_party/xla/xla/client/lib/approx_topk_shape.cc b/third_party/xla/xla/client/lib/approx_topk_shape.cc
index 64bd2d35e5b4b3..aec7b7105e9d3a 100644
--- a/third_party/xla/xla/client/lib/approx_topk_shape.cc
+++ b/third_party/xla/xla/client/lib/approx_topk_shape.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <cmath>
+#include <cstdint>
 #include <utility>
 
 #include "xla/util.h"
@@ -97,7 +98,7 @@ absl::StatusOr<std::pair<int64_t, int64_t>> ApproxTopKReductionOutputSize(
           static_cast<uint64_t>((1.0 - top_k) /
                                 std::log(static_cast<double>(recall_target))),
           tpu_tiling),
-      input_size);
+      logical_input_size);
   uint32_t log2_reduction = log2_floor(logical_input_size / m);
   if (log2_reduction == 0) {
     return std::pair<int64_t, int64_t>(input_size, 0);
diff --git a/third_party/xla/xla/client/lib/qr_test.cc b/third_party/xla/xla/client/lib/qr_test.cc
index 1da9cab60a02d0..a21932b3e797e3 100644
--- a/third_party/xla/xla/client/lib/qr_test.cc
+++ b/third_party/xla/xla/client/lib/qr_test.cc
@@ -145,4 +145,39 @@ XLA_TEST_F(QrTest, SubnormalComplex) {
                                     xla::ErrorSpec(1e-4, 1e-4));
 }
 
+XLA_TEST_F(QrTest, DuplicateHouseholderExpansion) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::Array2D<float> a0_vals({
+      {0, 1, 1},
+      {1, 0, 1},
+      {1, 1, 0},
+  });
+  xla::Array2D<float> a1_vals({
+      {1, 0},
+      {0, 1},
+      {1, 0},
+  });
+
+  // Verifies that different computations are created to generate HouseHolder
+  // transformations with identical QR shapes, but different tau shapes.
+  // The first QR decomposition should generate a ([3,3], [3]) computation,
+  // the second should generate a ([3,3], [2]) computation. Mismatch will result
+  // in compilation failure.
+
+  xla::XlaOp a0, q0, r0;
+  auto a0_data = CreateR2Parameter<float>(a0_vals, 0, "a0", &builder, &a0);
+  xla::QrExplicit(a0, /*full_matrices=*/true, q0, r0);
+
+  xla::XlaOp a1, q1, r1;
+  auto a1_data = CreateR2Parameter<float>(a1_vals, 1, "a1", &builder, &a1);
+  xla::QrExplicit(a1, /*full_matrices=*/true, q1, r1);
+
+  // Verifies that the decomposition composes back to the original matrix.
+  xla::BatchDot(q1, r1, xla::PrecisionConfig::HIGHEST);
+
+  ComputeAndCompareR2<float>(&builder, a1_vals, {a0_data.get(), a1_data.get()},
+                             xla::ErrorSpec(1e-4, 1e-4));
+}
+
 }  // namespace
diff --git a/third_party/xla/xla/client/lib/self_adjoint_eig_test.cc b/third_party/xla/xla/client/lib/self_adjoint_eig_test.cc
index 7be635f2a9a796..f47f4830d2a1a7 100644
--- a/third_party/xla/xla/client/lib/self_adjoint_eig_test.cc
+++ b/third_party/xla/xla/client/lib/self_adjoint_eig_test.cc
@@ -277,6 +277,10 @@ class RandomEighTest : public ClientLibraryTestBase,
                        public ::testing::WithParamInterface<EighTestCase> {};
 
 XLA_TEST_P(RandomEighTest, Random) {
+#if TENSORFLOW_USE_ROCM
+  GTEST_SKIP() << "RandomEighTest.Random is currently not supported on ROCm.";
+#endif  // TENSORFLOW_USE_ROCM
+
   XlaBuilder builder(TestName());
   int64_t size = GetParam();
   Array2D<float> a_val = GenerateRandomSymmetricMatrix(size);
diff --git a/third_party/xla/xla/client/local_client.cc b/third_party/xla/xla/client/local_client.cc
index 623d654ca7824b..033361e897b676 100644
--- a/third_party/xla/xla/client/local_client.cc
+++ b/third_party/xla/xla/client/local_client.cc
@@ -32,8 +32,8 @@ using xla::source_map_util::InvalidParameterArgument;
 namespace xla {
 
 namespace {
-StatusOr<StreamPool::Ptr> BorrowStreamForDevice(int device_ordinal,
-                                                Backend* backend) {
+absl::StatusOr<StreamPool::Ptr> BorrowStreamForDevice(int device_ordinal,
+                                                      Backend* backend) {
   if (device_ordinal < 0) {
     device_ordinal = backend->default_device_ordinal();
   }
@@ -115,7 +115,7 @@ Status LocalExecutable::ValidateExecutionOptions(
   return OkStatus();
 }
 
-StatusOr<std::pair<ServiceExecutableRunOptions, StreamPool::Ptr>>
+absl::StatusOr<std::pair<ServiceExecutableRunOptions, StreamPool::Ptr>>
 LocalExecutable::RunHelper(const absl::Span<const Shape* const> argument_shapes,
                            ExecutableRunOptions run_options) {
   const ComputationLayout& computation_layout =
@@ -171,7 +171,7 @@ LocalExecutable::RunHelper(const absl::Span<const Shape* const> argument_shapes,
   return std::make_pair(service_options, std::move(stream));
 }
 
-StatusOr<ScopedShapedBuffer> LocalExecutable::Run(
+absl::StatusOr<ScopedShapedBuffer> LocalExecutable::Run(
     const absl::Span<const ShapedBuffer* const> arguments,
     ExecutableRunOptions run_options) {
   std::vector<const Shape*> argument_shapes;
@@ -185,7 +185,7 @@ StatusOr<ScopedShapedBuffer> LocalExecutable::Run(
       });
 }
 
-StatusOr<ExecutionOutput> LocalExecutable::Run(
+absl::StatusOr<ExecutionOutput> LocalExecutable::Run(
     std::vector<ExecutionInput> arguments, ExecutableRunOptions run_options) {
   std::vector<const Shape*> argument_shapes;
   argument_shapes.reserve(arguments.size());
@@ -239,7 +239,7 @@ static void DumpOutputsAndSaveSnapshot(const Backend* backend,
       });
 }
 
-StatusOr<ScopedShapedBuffer> LocalExecutable::RunAsync(
+absl::StatusOr<ScopedShapedBuffer> LocalExecutable::RunAsync(
     const absl::Span<const ShapedBuffer* const> arguments,
     ExecutableRunOptions run_options) {
   std::vector<const Shape*> argument_shapes;
@@ -279,7 +279,7 @@ static ShapedBuffer MaybeOwningShapeTreeToShapedBuffer(
   return result;
 }
 
-StatusOr<ExecutionOutput> LocalExecutable::RunAsync(
+absl::StatusOr<ExecutionOutput> LocalExecutable::RunAsync(
     absl::Span<Shape const* const> argument_host_shapes,
     std::vector<ExecutionInput> arguments, ExecutableRunOptions run_options) {
   if (argument_host_shapes.size() != arguments.size()) {
@@ -321,7 +321,7 @@ StatusOr<ExecutionOutput> LocalExecutable::RunAsync(
   return std::move(outputs);
 }
 
-StatusOr<ExecutionOutput> LocalExecutable::RunAsync(
+absl::StatusOr<ExecutionOutput> LocalExecutable::RunAsync(
     std::vector<ExecutionInput> arguments, ExecutableRunOptions run_options) {
   std::vector<const Shape*> argument_shapes;
   argument_shapes.reserve(arguments.size());
@@ -355,7 +355,7 @@ Backend* LocalClient::mutable_backend() {
   return local_service_->mutable_backend();
 }
 
-static StatusOr<ExecutableBuildOptions> UpdateBuildOptions(
+static absl::StatusOr<ExecutableBuildOptions> UpdateBuildOptions(
     const ExecutableBuildOptions& options, int default_device_ordinal) {
   ExecutableBuildOptions updated_options = options;
   if (options.device_ordinal() == -1) {
@@ -383,10 +383,10 @@ static StatusOr<ExecutableBuildOptions> UpdateBuildOptions(
   return updated_options;
 }
 
-StatusOr<std::vector<std::unique_ptr<LocalExecutable>>> LocalClient::Compile(
-    const XlaComputation& computation,
-    const absl::Span<const Shape* const> argument_layouts,
-    const ExecutableBuildOptions& options) {
+absl::StatusOr<std::vector<std::unique_ptr<LocalExecutable>>>
+LocalClient::Compile(const XlaComputation& computation,
+                     const absl::Span<const Shape* const> argument_layouts,
+                     const ExecutableBuildOptions& options) {
   TF_ASSIGN_OR_RETURN(ExecutableBuildOptions updated_options,
                       UpdateBuildOptions(options, default_device_ordinal()));
   TF_ASSIGN_OR_RETURN(std::vector<std::unique_ptr<Executable>> executables,
@@ -405,7 +405,7 @@ StatusOr<std::vector<std::unique_ptr<LocalExecutable>>> LocalClient::Compile(
   return std::move(local_executables);
 }
 
-StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+absl::StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
 LocalClient::CompileAheadOfTime(
     const XlaComputation& computation,
     const absl::Span<const Shape* const> argument_layouts,
@@ -420,7 +420,7 @@ LocalClient::CompileAheadOfTime(
   return std::move(aot_results);
 }
 
-StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Load(
+absl::StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Load(
     const std::string& serialized_aot_result,
     const ExecutableBuildOptions& options) {
   TF_ASSIGN_OR_RETURN(ExecutableBuildOptions updated_options,
@@ -442,7 +442,7 @@ StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Load(
                                            updated_options);
 }
 
-StatusOr<ScopedShapedBuffer> LocalClient::LiteralToShapedBuffer(
+absl::StatusOr<ScopedShapedBuffer> LocalClient::LiteralToShapedBuffer(
     const LiteralSlice& literal, int device_ordinal,
     se::DeviceMemoryAllocator* allocator) {
   if (allocator == nullptr) {
@@ -458,7 +458,7 @@ StatusOr<ScopedShapedBuffer> LocalClient::LiteralToShapedBuffer(
   return std::move(scoped_buffer);
 }
 
-StatusOr<Literal> LocalClient::ShapedBufferToLiteral(
+absl::StatusOr<Literal> LocalClient::ShapedBufferToLiteral(
     const ShapedBuffer& shaped_buffer) {
   TF_ASSIGN_OR_RETURN(auto stream, mutable_backend()->BorrowStream(
                                        shaped_buffer.device_ordinal()));
@@ -466,7 +466,7 @@ StatusOr<Literal> LocalClient::ShapedBufferToLiteral(
                                                                  shaped_buffer);
 }
 
-StatusOr<const ShapedBuffer*> LocalClient::GlobalDataToShapedBuffer(
+absl::StatusOr<const ShapedBuffer*> LocalClient::GlobalDataToShapedBuffer(
     const GlobalDataHandle& data, int replica_number) {
   return local_service_->GlobalDataToShapedBuffer(data, replica_number);
 }
@@ -487,11 +487,12 @@ Status LocalClient::TransferFromOutfeedLocal(int device_ordinal,
                                                                   literal);
 }
 
-StatusOr<int> LocalClient::ReplicaNumberToDeviceOrdinal(int replica_number) {
+absl::StatusOr<int> LocalClient::ReplicaNumberToDeviceOrdinal(
+    int replica_number) {
   return local_service_->ReplicaNumberToDeviceOrdinal(replica_number);
 }
 
-StatusOr<TransferToServerResponse> LocalClient::TransferToLocalServer(
+absl::StatusOr<TransferToServerResponse> LocalClient::TransferToLocalServer(
     const ::xla::BorrowingLiteral& literal, int device_ordinal) {
   const ::xla::Shape& shape = literal.shape();
 
diff --git a/third_party/xla/xla/client/local_client.h b/third_party/xla/xla/client/local_client.h
index d3b0af63d254cd..d6e382192cab86 100644
--- a/third_party/xla/xla/client/local_client.h
+++ b/third_party/xla/xla/client/local_client.h
@@ -49,25 +49,25 @@ class LocalExecutable {
 
   // Run the compiled computation with the given arguments and options and
   // return the result.
-  StatusOr<ScopedShapedBuffer> Run(
+  absl::StatusOr<ScopedShapedBuffer> Run(
       absl::Span<const ShapedBuffer* const> arguments,
       ExecutableRunOptions run_options);
 
   // Similar to Run(), but allows for donating argument buffers to the
   // executable.
-  StatusOr<ExecutionOutput> Run(std::vector<ExecutionInput> arguments,
-                                ExecutableRunOptions run_options);
+  absl::StatusOr<ExecutionOutput> Run(std::vector<ExecutionInput> arguments,
+                                      ExecutableRunOptions run_options);
 
   // Similar to Run(), but need not block the host waiting for the computation
   // to complete before returning.
-  StatusOr<ScopedShapedBuffer> RunAsync(
+  absl::StatusOr<ScopedShapedBuffer> RunAsync(
       absl::Span<const ShapedBuffer* const> arguments,
       ExecutableRunOptions run_options);
 
   // Similar to RunAsync(), but allows for donating argument buffers to the
   // executable.
-  StatusOr<ExecutionOutput> RunAsync(std::vector<ExecutionInput> arguments,
-                                     ExecutableRunOptions run_options);
+  absl::StatusOr<ExecutionOutput> RunAsync(
+      std::vector<ExecutionInput> arguments, ExecutableRunOptions run_options);
 
   // Return the options used to build the executable.
   const ExecutableBuildOptions& build_options() const { return build_options_; }
@@ -76,7 +76,7 @@ class LocalExecutable {
   Executable* executable() const { return executable_.get(); }
 
  private:
-  StatusOr<ExecutionOutput> RunAsync(
+  absl::StatusOr<ExecutionOutput> RunAsync(
       absl::Span<Shape const* const> argument_host_shapes,
       std::vector<ExecutionInput> arguments, ExecutableRunOptions run_options);
 
@@ -89,11 +89,12 @@ class LocalExecutable {
                                   const Backend& backend);
 
   // Returns a literal containing the contents of the given ShapedBuffer.
-  StatusOr<Literal> LiteralFromShapedBuffer(const ShapedBuffer& shaped_buffer);
+  absl::StatusOr<Literal> LiteralFromShapedBuffer(
+      const ShapedBuffer& shaped_buffer);
 
-  StatusOr<std::pair<ServiceExecutableRunOptions, StreamPool::Ptr>> RunHelper(
-      absl::Span<const Shape* const> argument_shapes,
-      ExecutableRunOptions run_options);
+  absl::StatusOr<std::pair<ServiceExecutableRunOptions, StreamPool::Ptr>>
+  RunHelper(absl::Span<const Shape* const> argument_shapes,
+            ExecutableRunOptions run_options);
 
   // The ordinal of the device which this executable was compiled for. The
   // executable can run on all equivalent devices (as determined by
@@ -142,7 +143,7 @@ class LocalClient : public Client {
   //
   // The given ExecutableBuildOptions overrides any values from XLA_FLAGS
   // environment variable.
-  StatusOr<std::vector<std::unique_ptr<LocalExecutable>>> Compile(
+  absl::StatusOr<std::vector<std::unique_ptr<LocalExecutable>>> Compile(
       const XlaComputation& computation,
       absl::Span<const Shape* const> argument_layouts,
       const ExecutableBuildOptions& options);
@@ -150,14 +151,14 @@ class LocalClient : public Client {
   // Same as Compile() above, but return AotCompilationResult objects (instead
   // of LocalExecutable objects), which can be persisted to later load
   // LocalExecutable(s) using the Load() method below.
-  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  absl::StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(const XlaComputation& computation,
                      absl::Span<const Shape* const> argument_layouts,
                      const ExecutableBuildOptions& options);
 
   // Return a LocalExecutable object loaded from a serialized
   // AotCompilationResult.
-  StatusOr<std::unique_ptr<LocalExecutable>> Load(
+  absl::StatusOr<std::unique_ptr<LocalExecutable>> Load(
       const std::string& serialized_aot_result,
       const ExecutableBuildOptions& options);
 
@@ -165,21 +166,22 @@ class LocalClient : public Client {
   // ScopedShapedBuffer. If non-null the given memory allocator is used for
   // device memory allocation. If null, the default memory allocator for the
   // device is used.
-  StatusOr<ScopedShapedBuffer> LiteralToShapedBuffer(
+  absl::StatusOr<ScopedShapedBuffer> LiteralToShapedBuffer(
       const LiteralSlice& literal, int device_ordinal,
       se::DeviceMemoryAllocator* allocator = nullptr);
 
   // Transfer the BorrowingLiteral to the device with the given ordinal.
-  StatusOr<TransferToServerResponse> TransferToLocalServer(
+  absl::StatusOr<TransferToServerResponse> TransferToLocalServer(
       const ::xla::BorrowingLiteral& literal, int device_ordinal);
 
   // Copy the data from the device contained in the given ShapedBuffer and
   // return as a Literal.
-  StatusOr<Literal> ShapedBufferToLiteral(const ShapedBuffer& shaped_buffer);
+  absl::StatusOr<Literal> ShapedBufferToLiteral(
+      const ShapedBuffer& shaped_buffer);
 
   // Converts a GlobalDataHandle into a pointer to a ShapedBuffer that's valid
   // as long as the handle is valid.
-  StatusOr<const ShapedBuffer*> GlobalDataToShapedBuffer(
+  absl::StatusOr<const ShapedBuffer*> GlobalDataToShapedBuffer(
       const GlobalDataHandle& data, int replica_number);
 
   // Transfer the given literal to the infeed queue of the given device.
@@ -201,7 +203,7 @@ class LocalClient : public Client {
   // This returns an error if there is not a one-to-one correspondence of
   // replicas to device ordinals, but is useful as a short term mechanism for
   // the "easy" case where a single replica is a single device.
-  StatusOr<int> ReplicaNumberToDeviceOrdinal(int replica_number);
+  absl::StatusOr<int> ReplicaNumberToDeviceOrdinal(int replica_number);
 
   // Returns the platform that the underlying service targets.
   se::Platform* platform() const;
diff --git a/third_party/xla/xla/client/value_inference.cc b/third_party/xla/xla/client/value_inference.cc
index 8d09bc57b5759b..b4bb8af37c4ea8 100644
--- a/third_party/xla/xla/client/value_inference.cc
+++ b/third_party/xla/xla/client/value_inference.cc
@@ -148,7 +148,7 @@ struct HloProtoEvaluator {
     return *this;
   }
 
-  StatusOr<Literal> Evaluate() {
+  absl::StatusOr<Literal> Evaluate() {
     // Evaluate the instruction by swapping it's operands with constant
     // instructions with given literals.
     HloComputation::Builder builder("EmptyComputation");
@@ -286,11 +286,11 @@ struct PostorderDFSDep {
 
 // This function represents the logic to visit a node once its dependencies
 // (operands) are all resolved.
-using Visit = std::function<StatusOr<Literal>(absl::Span<Literal>)>;
+using Visit = std::function<absl::StatusOr<Literal>(absl::Span<Literal>)>;
 // Convenient specializations of Visit function for different operands.
-using Visit0D = std::function<StatusOr<Literal>()>;
-using Visit1D = std::function<StatusOr<Literal>(Literal)>;
-using Visit2D = std::function<StatusOr<Literal>(Literal, Literal)>;
+using Visit0D = std::function<absl::StatusOr<Literal>()>;
+using Visit1D = std::function<absl::StatusOr<Literal>(Literal)>;
+using Visit2D = std::function<absl::StatusOr<Literal>(Literal, Literal)>;
 
 // A postorder dfs node can be visited once its dependency requests are all
 // fulfilled.
@@ -332,7 +332,7 @@ struct [[nodiscard]] PostorderDFSNode {
 
 // Convert an interger handle to HloInstructionProto.
 using HandleToInstruction =
-    std::function<StatusOr<const HloInstructionProto*>(int64_t)>;
+    std::function<absl::StatusOr<const HloInstructionProto*>(int64_t)>;
 using HandleToComputation = std::function<const HloComputationProto*(int64_t)>;
 
 struct PostorderDFSVisitor {
@@ -343,20 +343,20 @@ struct PostorderDFSVisitor {
         handle_to_instruction(handle_to_instruction),
         handle_to_computation(handle_to_computation) {}
 
-  StatusOr<PostorderDFSNode> AnalyzeUpperBound(int64_t handle,
-                                               InferenceContext context);
-  StatusOr<PostorderDFSNode> AnalyzeLowerBound(int64_t handle,
-                                               InferenceContext context);
-  StatusOr<PostorderDFSNode> AnalyzeIsDynamic(int64_t handle,
-                                              PostorderDFSNodeType type,
-                                              InferenceContext context);
-  StatusOr<PostorderDFSNode> AnalyzeConstant(int64_t handle,
-                                             InferenceContext context);
-  StatusOr<PostorderDFSNode> AnalyzeConstantValueFallback(
+  absl::StatusOr<PostorderDFSNode> AnalyzeUpperBound(int64_t handle,
+                                                     InferenceContext context);
+  absl::StatusOr<PostorderDFSNode> AnalyzeLowerBound(int64_t handle,
+                                                     InferenceContext context);
+  absl::StatusOr<PostorderDFSNode> AnalyzeIsDynamic(int64_t handle,
+                                                    PostorderDFSNodeType type,
+                                                    InferenceContext context);
+  absl::StatusOr<PostorderDFSNode> AnalyzeConstant(int64_t handle,
+                                                   InferenceContext context);
+  absl::StatusOr<PostorderDFSNode> AnalyzeConstantValueFallback(
       int64_t handle, PostorderDFSNodeType type, InferenceContext context);
 
-  StatusOr<Literal> PostOrderDFSVisit(int64_t handle,
-                                      PostorderDFSNodeType type);
+  absl::StatusOr<Literal> PostOrderDFSVisit(int64_t handle,
+                                            PostorderDFSNodeType type);
 
   // Returns true if a value represented by `handle` is an integeral type or
   // a floating pointer type that just got converted from an integral type.
@@ -469,8 +469,10 @@ PostorderDFSNode CreateAllDynamicResult(const Shape& shape,
 }  // namespace
 
 // Analyze a tensor's constant value, upper-bound value or lower-bound value.
-StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeConstantValueFallback(
-    int64_t handle, PostorderDFSNodeType type, InferenceContext context) {
+absl::StatusOr<PostorderDFSNode>
+PostorderDFSVisitor::AnalyzeConstantValueFallback(int64_t handle,
+                                                  PostorderDFSNodeType type,
+                                                  InferenceContext context) {
   TF_ASSIGN_OR_RETURN(const HloInstructionProto* root,
                       handle_to_instruction(handle));
   TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(root->opcode()));
@@ -534,7 +536,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeConstantValueFallback(
       call_context.caller_operand_handles.push_back(call_proto->operand_ids(0));
       node.AddDependency(called_root, PostorderDFSNodeType::kConstantValue,
                          call_context, "callee's root instruction");
-      return node.AddVisit([](Literal operand) -> StatusOr<Literal> {
+      return node.AddVisit([](Literal operand) -> absl::StatusOr<Literal> {
         // Forward result of callee's root to caller.
         return std::move(operand);
       });
@@ -565,7 +567,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeConstantValueFallback(
                            branch_context);
       }
       return node.AddVisit(
-          [](absl::Span<Literal> operands) -> StatusOr<Literal> {
+          [](absl::Span<Literal> operands) -> absl::StatusOr<Literal> {
             int64_t pred_is_dynamic = operands[1].Get<bool>({});
             if (pred_is_dynamic) {
               // If predicate is dynamic, return the value of the first branch
@@ -606,7 +608,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeConstantValueFallback(
           handle_to_computation(root->called_computation_ids(0));
       return result.AddVisit(
           [root, computation_proto, context,
-           this](absl::Span<Literal> operands) -> StatusOr<Literal> {
+           this](absl::Span<Literal> operands) -> absl::StatusOr<Literal> {
             TF_ASSIGN_OR_RETURN(
                 auto computation,
                 HloComputation::CreateFromProto(*computation_proto, {}));
@@ -638,7 +640,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeConstantValueFallback(
   }
 }
 
-StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeUpperBound(
+absl::StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeUpperBound(
     int64_t handle, InferenceContext context) {
   TF_ASSIGN_OR_RETURN(const HloInstructionProto* root,
                       handle_to_instruction(handle));
@@ -657,7 +659,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeUpperBound(
       const HloInstructionProto* operand_proto =
           handle_to_instruction(operand_handle).value();
       return PostorderDFSNode().AddVisit(
-          [operand_proto, dimension]() -> StatusOr<Literal> {
+          [operand_proto, dimension]() -> absl::StatusOr<Literal> {
             return LiteralUtil::CreateR0<int32_t>(
                 operand_proto->shape().dimensions(dimension));
           });
@@ -671,7 +673,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeUpperBound(
           .AddDependency(root->operand_ids(0),
                          PostorderDFSNodeType::kConstantUpperBound, context)
           .AddVisit([this](Literal lower_bound,
-                           Literal upper_bound) -> StatusOr<Literal> {
+                           Literal upper_bound) -> absl::StatusOr<Literal> {
             TF_ASSIGN_OR_RETURN(auto lower_bound_abs,
                                 evaluator.EvaluateElementwiseUnaryOp(
                                     HloOpcode::kAbs, lower_bound));
@@ -701,7 +703,8 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeUpperBound(
       }
 
       return dfs.AddVisit(
-          [root, context](absl::Span<Literal> operands) -> StatusOr<Literal> {
+          [root,
+           context](absl::Span<Literal> operands) -> absl::StatusOr<Literal> {
             std::vector<Literal> results;
             results.reserve(operands.size());
             // Conservatively set each element of the tensor to the max value.
@@ -724,7 +727,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeUpperBound(
       return PostorderDFSNode()
           .AddDependency(root->operand_ids(0),
                          PostorderDFSNodeType::kConstantLowerBound, context)
-          .AddVisit([this](Literal lower_bound) -> StatusOr<Literal> {
+          .AddVisit([this](Literal lower_bound) -> absl::StatusOr<Literal> {
             return evaluator.EvaluateElementwiseUnaryOp(HloOpcode::kNegate,
                                                         lower_bound);
           });
@@ -739,7 +742,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeUpperBound(
                          PostorderDFSNodeType::kConstantLowerBound, context)
           .AddVisit([root, opcode, this](
                         Literal upper_bound,
-                        Literal lower_bound) -> StatusOr<Literal> {
+                        Literal lower_bound) -> absl::StatusOr<Literal> {
             if (opcode == HloOpcode::kDivide &&
                 this->IsValueEffectiveInteger(root->operand_ids(1))) {
               // Because in many cases the lower bound of a value is
@@ -771,7 +774,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeUpperBound(
     }
     case HloOpcode::kCustomCall: {
       if (root->custom_call_target() == "SetBound") {
-        return PostorderDFSNode().AddVisit([root]() -> StatusOr<Literal> {
+        return PostorderDFSNode().AddVisit([root]() -> absl::StatusOr<Literal> {
           if (root->literal().shape().element_type() == TUPLE) {
             // First literal of SetBound contains bounds, second literal
             // contains dynamism indicators.
@@ -808,7 +811,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeUpperBound(
   }
 }
 
-StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeLowerBound(
+absl::StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeLowerBound(
     int64_t handle, InferenceContext context) {
   TF_ASSIGN_OR_RETURN(const HloInstructionProto* root,
                       handle_to_instruction(handle));
@@ -826,7 +829,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeLowerBound(
       TF_ASSIGN_OR_RETURN(const HloInstructionProto* operand_proto,
                           handle_to_instruction(operand_handle));
       return PostorderDFSNode().AddVisit(
-          [dimension, operand_proto]() -> StatusOr<Literal> {
+          [dimension, operand_proto]() -> absl::StatusOr<Literal> {
             if (operand_proto->shape().is_dynamic_dimension(dimension)) {
               return LiteralUtil::CreateR0<int32_t>(0);
             } else {
@@ -844,7 +847,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeLowerBound(
           .AddDependency(root->operand_ids(0),
                          PostorderDFSNodeType::kConstantUpperBound, context)
           .AddVisit([this](Literal lower_bound,
-                           Literal upper_bound) -> StatusOr<Literal> {
+                           Literal upper_bound) -> absl::StatusOr<Literal> {
             TF_ASSIGN_OR_RETURN(auto lower_bound_abs,
                                 evaluator.EvaluateElementwiseUnaryOp(
                                     HloOpcode::kAbs, lower_bound));
@@ -860,7 +863,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeLowerBound(
       return PostorderDFSNode()
           .AddDependency(root->operand_ids(0),
                          PostorderDFSNodeType::kConstantUpperBound, context)
-          .AddVisit([this](Literal upper_bound) -> StatusOr<Literal> {
+          .AddVisit([this](Literal upper_bound) -> absl::StatusOr<Literal> {
             return evaluator.EvaluateElementwiseUnaryOp(HloOpcode::kNegate,
                                                         upper_bound);
           });
@@ -874,7 +877,8 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeLowerBound(
           .AddDependency(root->operand_ids(1),
                          PostorderDFSNodeType::kConstantUpperBound, context)
           .AddVisit(
-              [root, this](absl::Span<Literal> operands) -> StatusOr<Literal> {
+              [root,
+               this](absl::Span<Literal> operands) -> absl::StatusOr<Literal> {
                 return std::make_unique<HloProtoEvaluator>(evaluator, *root)
                     ->WithOperands(operands)
                     .Evaluate();
@@ -898,7 +902,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeLowerBound(
   }
 }
 
-StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeConstant(
+absl::StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeConstant(
     int64_t handle, InferenceContext context) {
   TF_ASSIGN_OR_RETURN(const HloInstructionProto* root,
                       handle_to_instruction(handle));
@@ -916,7 +920,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeConstant(
       TF_ASSIGN_OR_RETURN(const HloInstructionProto* operand_proto,
                           handle_to_instruction(operand_handle));
       return PostorderDFSNode().AddVisit(
-          [operand_proto, dimension, root]() -> StatusOr<Literal> {
+          [operand_proto, dimension, root]() -> absl::StatusOr<Literal> {
             if (operand_proto->shape().is_dynamic_dimension(dimension)) {
               // The value is dynamic, we return garbage data here and mask them
               // out later.
@@ -939,7 +943,8 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeConstant(
                              context);
       }
       return result.AddVisit(
-          [root, this](absl::Span<Literal> operands) -> StatusOr<Literal> {
+          [root,
+           this](absl::Span<Literal> operands) -> absl::StatusOr<Literal> {
             return std::make_unique<HloProtoEvaluator>(evaluator, *root)
                 ->WithOperands(operands)
                 .Evaluate();
@@ -952,8 +957,9 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeConstant(
         return PostorderDFSNode()
             .AddDependency(root->operand_ids(0),
                            PostorderDFSNodeType::kConstantValue, context)
-            .AddVisit(
-                [](Literal operand) -> StatusOr<Literal> { return operand; });
+            .AddVisit([](Literal operand) -> absl::StatusOr<Literal> {
+              return operand;
+            });
       } else if (root->custom_call_target() == "Sharding") {
         return PostorderDFSNode()
             .AddDependency(root->operand_ids(0),
@@ -981,7 +987,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeConstant(
           handle_to_computation(root->called_computation_ids(0));
       return result.AddVisit(
           [root, context, computation_proto,
-           this](absl::Span<Literal> operands) -> StatusOr<Literal> {
+           this](absl::Span<Literal> operands) -> absl::StatusOr<Literal> {
             TF_ASSIGN_OR_RETURN(
                 auto computation,
                 HloComputation::CreateFromProto(*computation_proto, {}));
@@ -998,7 +1004,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeConstant(
   }
 }
 
-StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeIsDynamic(
+absl::StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeIsDynamic(
     int64_t handle, PostorderDFSNodeType type, InferenceContext context) {
   TF_RETURN_IF_ERROR(handle_to_instruction(handle).status());
   // Invariant check.
@@ -1028,7 +1034,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeIsDynamic(
       TF_ASSIGN_OR_RETURN(const HloInstructionProto* operand_proto,
                           handle_to_instruction(operand_handle));
       return PostorderDFSNode().AddVisit(
-          [operand_proto, dimension, type]() -> StatusOr<Literal> {
+          [operand_proto, dimension, type]() -> absl::StatusOr<Literal> {
             if (type == PostorderDFSNodeType::kBoundIsDynamic) {
               // The bound of dynamic dimension is not dynamic.
               return LiteralUtil::CreateR0<bool>(false);
@@ -1048,7 +1054,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeIsDynamic(
       }
 
       return dfs.AddVisit([root, context, type](absl::Span<Literal> operands)
-                              -> StatusOr<Literal> {
+                              -> absl::StatusOr<Literal> {
         bool all_operands_values_static = true;
         for (int64_t i = 0; i < operands.size(); ++i) {
           all_operands_values_static &= operands[i].IsAll(0);
@@ -1205,10 +1211,11 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeIsDynamic(
           call_proto->operand_ids(0));
       node.AddDependency(call_root, PostorderDFSNodeType::kValueIsDynamic,
                          branch_context, "callee's root instruction");
-      return node.AddVisit([context](Literal operand) -> StatusOr<Literal> {
-        // Forward result of callee's root to caller.
-        return operand;
-      });
+      return node.AddVisit(
+          [context](Literal operand) -> absl::StatusOr<Literal> {
+            // Forward result of callee's root to caller.
+            return operand;
+          });
     }
     case HloOpcode::kConditional: {
       auto node = PostorderDFSNode();
@@ -1246,7 +1253,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeIsDynamic(
       // 2*i + 1: Branch value is dynamic.
       return node.AddVisit([root, branch_size,
                             context](absl::Span<Literal> operands)
-                               -> StatusOr<Literal> {
+                               -> absl::StatusOr<Literal> {
         int64_t pred_is_dynamic = operands[1].Get<bool>({});
         auto result = CreatePredLiteral(
             true,
@@ -1386,7 +1393,8 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeIsDynamic(
           .AddDependency(root->operand_ids(1), type, context)
           // rhs dependency.
           .AddDependency(root->operand_ids(2), type, context)
-          .AddVisit([root](absl::Span<Literal> operands) -> StatusOr<Literal> {
+          .AddVisit([root](absl::Span<Literal> operands)
+                        -> absl::StatusOr<Literal> {
             OptionalLiteral optional_selector_literal(std::move(operands[0]),
                                                       std::move(operands[1]));
             Literal lhs = std::move(operands[2]);
@@ -1423,7 +1431,8 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeIsDynamic(
           .AddDependency(root->operand_ids(1),
                          PostorderDFSNodeType::kValueIsDynamic, context)
           .AddVisit(
-              [root, this](absl::Span<Literal> operands) -> StatusOr<Literal> {
+              [root,
+               this](absl::Span<Literal> operands) -> absl::StatusOr<Literal> {
                 OptionalLiteral optional_selector_literal(
                     std::move(operands[1]), std::move(operands[2]));
 
@@ -1444,7 +1453,8 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeIsDynamic(
     }
     case HloOpcode::kCustomCall: {
       if (root->custom_call_target() == "SetBound") {
-        return PostorderDFSNode().AddVisit([type, root]() -> StatusOr<Literal> {
+        return PostorderDFSNode().AddVisit([type,
+                                            root]() -> absl::StatusOr<Literal> {
           if (type == PostorderDFSNodeType::kBoundIsDynamic) {
             return CreatePredLiteral(false, Shape(root->shape()));
           } else {
@@ -1476,8 +1486,8 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeIsDynamic(
     case HloOpcode::kSend:
     case HloOpcode::kSendDone:
     case HloOpcode::kWhile: {
-      return PostorderDFSNode().AddVisit([root,
-                                          context]() -> StatusOr<Literal> {
+      return PostorderDFSNode().AddVisit([root, context]()
+                                             -> absl::StatusOr<Literal> {
         return CreatePredLiteral(
             true,
             ShapeUtil::GetSubshape(Shape(root->shape()), context.shape_index));
@@ -1485,8 +1495,8 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeIsDynamic(
       break;
     }
     default:
-      return PostorderDFSNode().AddVisit([root,
-                                          context]() -> StatusOr<Literal> {
+      return PostorderDFSNode().AddVisit([root, context]()
+                                             -> absl::StatusOr<Literal> {
         return CreatePredLiteral(
             true,
             ShapeUtil::GetSubshape(Shape(root->shape()), context.shape_index));
@@ -1494,7 +1504,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeIsDynamic(
   }
 }
 
-StatusOr<Literal> PostorderDFSVisitor::PostOrderDFSVisit(
+absl::StatusOr<Literal> PostorderDFSVisitor::PostOrderDFSVisit(
     int64_t handle, PostorderDFSNodeType type) {
   enum VisitState {
     kUnvisited = 0,
@@ -1609,7 +1619,7 @@ StatusOr<Literal> PostorderDFSVisitor::PostOrderDFSVisit(
   return evaluated[root.GetCacheKey()].Clone();
 }
 
-StatusOr<Literal> ValueInference::AnalyzeIsDynamic(XlaOp op) {
+absl::StatusOr<Literal> ValueInference::AnalyzeIsDynamic(XlaOp op) {
   PostorderDFSVisitor visitor(
       evaluator_,
       [&](int64_t handle) {
@@ -1622,7 +1632,8 @@ StatusOr<Literal> ValueInference::AnalyzeIsDynamic(XlaOp op) {
   return result;
 }
 
-StatusOr<std::optional<int64_t>> ValueInference::CseOpHandle(int64_t handle) {
+absl::StatusOr<std::optional<int64_t>> ValueInference::CseOpHandle(
+    int64_t handle) {
   TF_ASSIGN_OR_RETURN(auto inst, builder_->LookUpInstructionByHandle(handle));
   TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(inst->opcode()));
   // For now, only handle kGetDimensionSize as that's the most duplicated one.
@@ -1653,7 +1664,7 @@ StatusOr<std::optional<int64_t>> ValueInference::CseOpHandle(int64_t handle) {
   return {std::nullopt};
 }
 
-StatusOr<Literal> ValueInference::SimplifyOp(int64_t handle) {
+absl::StatusOr<Literal> ValueInference::SimplifyOp(int64_t handle) {
   TF_ASSIGN_OR_RETURN(auto cse_handle, CseOpHandle(handle));
   if (cse_handle) {
     // Use the CSE'd handle instead.
@@ -1769,7 +1780,7 @@ StatusOr<Literal> ValueInference::SimplifyOp(int64_t handle) {
   }
 }
 
-StatusOr<OptionalLiteral> ValueInference::AnalyzeConstant(
+absl::StatusOr<OptionalLiteral> ValueInference::AnalyzeConstant(
     XlaOp op, ValueInferenceMode mode) {
   TF_RETURN_IF_ERROR(builder_->LookUpInstructionByHandle(op.handle()).status());
   PostorderDFSVisitor visitor(
diff --git a/third_party/xla/xla/client/value_inference.h b/third_party/xla/xla/client/value_inference.h
index 9108077204561c..6f1685f1a42e0a 100644
--- a/third_party/xla/xla/client/value_inference.h
+++ b/third_party/xla/xla/client/value_inference.h
@@ -85,10 +85,11 @@ class ValueInference {
   explicit ValueInference(XlaBuilder* builder) : builder_(builder) {
     CHECK(builder_);
   }
-  StatusOr<Literal> AnalyzeIsDynamic(XlaOp op);
+  absl::StatusOr<Literal> AnalyzeIsDynamic(XlaOp op);
   // Returns an OptionalLiteral. Each individual value of the literal is
   // the concrete constant value if it can be inferred, otherwise a nullopt.
-  StatusOr<OptionalLiteral> AnalyzeConstant(XlaOp op, ValueInferenceMode mode);
+  absl::StatusOr<OptionalLiteral> AnalyzeConstant(XlaOp op,
+                                                  ValueInferenceMode mode);
 
   // Returns underlying xla builder.
   XlaBuilder* builder() { return builder_; }
@@ -97,11 +98,11 @@ class ValueInference {
   // Given an op handle, returns a simplified version of the handle inside a
   // int64_t Literal. If the a -1 value for the handle means invalid
   // simplification and the result shouldn't be used.
-  StatusOr<Literal> SimplifyOp(int64_t handle);
+  absl::StatusOr<Literal> SimplifyOp(int64_t handle);
 
   // Perform CSE on a given handle, and return an equivalent handle if seen
   // before. Otherwise, returns nullopt.
-  StatusOr<std::optional<int64_t>> CseOpHandle(int64_t handle);
+  absl::StatusOr<std::optional<int64_t>> CseOpHandle(int64_t handle);
   XlaBuilder* builder_;
   HloEvaluator evaluator_;
   // A map from instruction_hash to handle that helps perform CSE.
diff --git a/third_party/xla/xla/client/xla_builder.cc b/third_party/xla/xla/client/xla_builder.cc
index 57f0529b5877a1..f8d42040b8019b 100644
--- a/third_party/xla/xla/client/xla_builder.cc
+++ b/third_party/xla/xla/client/xla_builder.cc
@@ -118,7 +118,7 @@ namespace internal {
 
 XlaOp XlaBuilderFriend::BuildAddDependency(XlaBuilder* builder, XlaOp operand,
                                            XlaOp token, const Shape& shape) {
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     *instr.mutable_shape() = shape.ToProto();
     return builder->AddInstruction(std::move(instr), HloOpcode::kAddDependency,
@@ -131,7 +131,7 @@ XlaOp XlaBuilderFriend::BuildFusion(
     absl::string_view fusion_kind, const XlaComputation& fused_computation,
     absl::Span<const std::pair<ShapeIndex, std::pair<int64_t, ShapeIndex>>>
         output_operand_aliasing) {
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     instr.set_fusion_kind(std::string(fusion_kind));
     if (!output_operand_aliasing.empty()) {
@@ -161,7 +161,7 @@ std::pair<XlaOp, int64_t> XlaBuilderFriend::BuildAsyncStart(
     std::string execution_thread, const XlaComputation& called_computation,
     const Shape& shape) {
   int64_t called_computation_id;
-  auto start_op = builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  auto start_op = builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     *instr.mutable_shape() = shape.ToProto();
     instr.set_async_execution_thread(execution_thread);
@@ -178,7 +178,7 @@ XlaOp XlaBuilderFriend::BuildAsyncUpdate(XlaBuilder* builder,
                                          std::string execution_thread,
                                          int64_t called_computation,
                                          const Shape& shape) {
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     *instr.mutable_shape() = shape.ToProto();
     instr.set_async_execution_thread(execution_thread);
@@ -192,7 +192,7 @@ XlaOp XlaBuilderFriend::BuildAsyncDone(XlaBuilder* builder, const XlaOp operand,
                                        std::string execution_thread,
                                        int64_t called_computation,
                                        const Shape& shape) {
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     *instr.mutable_shape() = shape.ToProto();
     instr.set_async_execution_thread(execution_thread);
@@ -216,7 +216,7 @@ XlaOp XlaBuilderFriend::BuildAllGatherStart(
 XlaOp XlaBuilderFriend::BuildAllGatherDone(XlaBuilder* builder,
                                            const XlaOp operand,
                                            const Shape& shape) {
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     *instr.mutable_shape() = shape.ToProto();
     return builder->AddInstruction(std::move(instr), HloOpcode::kAllGatherDone,
@@ -238,7 +238,7 @@ XlaOp XlaBuilderFriend::BuildAllReduceStart(
 XlaOp XlaBuilderFriend::BuildAllReduceDone(XlaBuilder* builder,
                                            const XlaOp operand,
                                            const Shape& shape) {
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     *instr.mutable_shape() = shape.ToProto();
     return builder->AddInstruction(std::move(instr), HloOpcode::kAllReduceDone,
@@ -249,7 +249,7 @@ XlaOp XlaBuilderFriend::BuildAllReduceDone(XlaBuilder* builder,
 XlaOp XlaBuilderFriend::BuildCopyStart(
     XlaBuilder* builder, const XlaOp operand,
     std::optional<int> cross_program_prefetch_index) {
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     if (cross_program_prefetch_index) {
       instr.set_cross_program_prefetch_index(*cross_program_prefetch_index);
@@ -269,7 +269,7 @@ XlaOp XlaBuilderFriend::BuildCopyStart(
 
 XlaOp XlaBuilderFriend::BuildCopyDone(XlaBuilder* builder, const XlaOp operand,
                                       const Shape& shape) {
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     *instr.mutable_shape() = shape.ToProto();
     return builder->AddInstruction(std::move(instr), HloOpcode::kCopyDone,
@@ -288,7 +288,7 @@ XlaOp XlaBuilderFriend::BuildCollectivePermuteStart(
 XlaOp XlaBuilderFriend::BuildCollectivePermuteDone(XlaBuilder* builder,
                                                    const XlaOp operand,
                                                    const Shape& shape) {
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     *instr.mutable_shape() = shape.ToProto();
     return builder->AddInstruction(
@@ -298,7 +298,7 @@ XlaOp XlaBuilderFriend::BuildCollectivePermuteDone(XlaBuilder* builder,
 
 XlaOp XlaBuilderFriend::BuildBitcast(XlaBuilder* builder, XlaOp operand,
                                      const Shape& shape) {
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     *instr.mutable_shape() = shape.ToProto();
     return builder->AddInstruction(std::move(instr), HloOpcode::kBitcast,
@@ -309,7 +309,7 @@ XlaOp XlaBuilderFriend::BuildBitcast(XlaBuilder* builder, XlaOp operand,
 XlaOp XlaBuilderFriend::BuildDomain(XlaBuilder* builder, XlaOp operand,
                                     const OpSharding entry,
                                     const OpSharding exit, const Shape& shape) {
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     *instr.mutable_domain_entry_sharding() = entry;
     *instr.mutable_domain_exit_sharding() = exit;
@@ -321,7 +321,7 @@ XlaOp XlaBuilderFriend::BuildDomain(XlaBuilder* builder, XlaOp operand,
 
 XlaOp XlaBuilderFriend::BuildPartitionId(XlaBuilder* builder,
                                          const Shape& shape) {
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     *instr.mutable_shape() = shape.ToProto();
     return builder->AddInstruction(std::move(instr), HloOpcode::kPartitionId);
@@ -331,7 +331,7 @@ XlaOp XlaBuilderFriend::BuildPartitionId(XlaBuilder* builder,
 XlaOp XlaBuilderFriend::BuildSend(XlaBuilder* builder, XlaOp operand,
                                   XlaOp token, const ChannelHandle& handle,
                                   bool is_host_transfer) {
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto send_instr;
     TF_ASSIGN_OR_RETURN(const Shape* shape, builder->GetShapePtr(operand));
     // Send instruction produces a tuple of {aliased operand, U32 context,
@@ -350,7 +350,7 @@ XlaOp XlaBuilderFriend::BuildSend(XlaBuilder* builder, XlaOp operand,
 XlaOp XlaBuilderFriend::BuildSendDone(XlaBuilder* builder, XlaOp operand,
                                       const ChannelHandle& handle,
                                       bool is_host_transfer) {
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto send_done_instr;
     *send_done_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
     send_done_instr.set_channel_id(handle.handle());
@@ -364,7 +364,7 @@ XlaOp XlaBuilderFriend::BuildRecv(XlaBuilder* builder, XlaOp token,
                                   const Shape& shape,
                                   const ChannelHandle& handle,
                                   bool is_host_transfer) {
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     // Recv instruction produces a tuple of {receive buffer, U32 context,
     // token}.
     HloInstructionProto recv_instr;
@@ -383,7 +383,7 @@ XlaOp XlaBuilderFriend::BuildRecvDone(XlaBuilder* builder, XlaOp token,
                                       const Shape& shape,
                                       const ChannelHandle& handle,
                                       bool is_host_transfer) {
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto recv_done_instr;
     *recv_done_instr.mutable_shape() =
         ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()})
@@ -399,7 +399,7 @@ XlaOp XlaBuilderFriend::BuildRngGetAndUpdateState(XlaBuilder* builder,
 
                                                   int64_t delta,
                                                   const Shape& shape) {
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     instr.set_delta(delta);
     *instr.mutable_shape() = shape.ToProto();
@@ -435,7 +435,7 @@ XlaOp operator<<(XlaOp x, XlaOp y) { return ShiftLeft(x, y); }
 
 XlaOp operator>>(XlaOp x, XlaOp y) {
   XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* shape, builder->GetShapePtr(x));
     if (!ShapeUtil::ElementIsIntegral(*shape)) {
       return InvalidArgument(
@@ -450,7 +450,7 @@ XlaOp operator>>(XlaOp x, XlaOp y) {
   });
 }
 
-StatusOr<const Shape*> XlaBuilder::GetShapePtr(XlaOp op) const {
+absl::StatusOr<const Shape*> XlaBuilder::GetShapePtr(XlaOp op) const {
   TF_RETURN_IF_ERROR(first_error_);
   TF_RETURN_IF_ERROR(CheckOpBuilder(op));
   auto it = handle_to_index_.find(op.handle());
@@ -460,12 +460,12 @@ StatusOr<const Shape*> XlaBuilder::GetShapePtr(XlaOp op) const {
   return instruction_shapes_.at(it->second).get();
 }
 
-StatusOr<Shape> XlaBuilder::GetShape(XlaOp op) const {
+absl::StatusOr<Shape> XlaBuilder::GetShape(XlaOp op) const {
   TF_ASSIGN_OR_RETURN(const Shape* shape, GetShapePtr(op));
   return *shape;
 }
 
-StatusOr<std::vector<Shape>> XlaBuilder::GetOperandShapes(
+absl::StatusOr<std::vector<Shape>> XlaBuilder::GetOperandShapes(
     absl::Span<const XlaOp> operands) const {
   std::vector<Shape> operand_shapes;
   operand_shapes.reserve(operands.size());
@@ -532,7 +532,7 @@ XlaOp XlaBuilder::ReportError(const Status& error) {
   return XlaOp(this);
 }
 
-XlaOp XlaBuilder::ReportErrorOrReturn(const StatusOr<XlaOp>& op) {
+XlaOp XlaBuilder::ReportErrorOrReturn(const absl::StatusOr<XlaOp>& op) {
   if (!first_error_.ok()) {
     return XlaOp(this);
   }
@@ -543,11 +543,12 @@ XlaOp XlaBuilder::ReportErrorOrReturn(const StatusOr<XlaOp>& op) {
 }
 
 XlaOp XlaBuilder::ReportErrorOrReturn(
-    absl::FunctionRef<StatusOr<XlaOp>()> op_creator) {
+    absl::FunctionRef<absl::StatusOr<XlaOp>()> op_creator) {
   return ReportErrorOrReturn(op_creator());
 }
 
-StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64_t root_id) const {
+absl::StatusOr<ProgramShape> XlaBuilder::GetProgramShape(
+    int64_t root_id) const {
   TF_RETURN_IF_ERROR(first_error_);
   TF_ASSIGN_OR_RETURN(const HloInstructionProto* root_proto,
                       LookUpInstructionByHandle(root_id));
@@ -578,12 +579,12 @@ StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64_t root_id) const {
   return program_shape;
 }
 
-StatusOr<ProgramShape> XlaBuilder::GetProgramShape() const {
+absl::StatusOr<ProgramShape> XlaBuilder::GetProgramShape() const {
   TF_RET_CHECK(!instructions_.empty());
   return GetProgramShape(instructions_.back().id());
 }
 
-StatusOr<ProgramShape> XlaBuilder::GetProgramShape(XlaOp root) const {
+absl::StatusOr<ProgramShape> XlaBuilder::GetProgramShape(XlaOp root) const {
   if (root.builder_ != this) {
     return InvalidArgument("Given root operation is not in this computation.");
   }
@@ -702,21 +703,22 @@ Status XlaBuilder::GetCurrentStatus() const {
   return OkStatus();
 }
 
-StatusOr<XlaComputation> XlaBuilder::Build(bool remove_dynamic_dimensions) {
+absl::StatusOr<XlaComputation> XlaBuilder::Build(
+    bool remove_dynamic_dimensions) {
   TF_RETURN_IF_ERROR(GetCurrentStatus());
   return Build(instructions_.back().id(), remove_dynamic_dimensions);
 }
 
-StatusOr<XlaComputation> XlaBuilder::Build(XlaOp root,
-                                           bool remove_dynamic_dimensions) {
+absl::StatusOr<XlaComputation> XlaBuilder::Build(
+    XlaOp root, bool remove_dynamic_dimensions) {
   if (root.builder_ != this) {
     return InvalidArgument("Given root operation is not in this computation.");
   }
   return Build(root.handle(), remove_dynamic_dimensions);
 }
 
-StatusOr<XlaComputation> XlaBuilder::Build(int64_t root_id,
-                                           bool remove_dynamic_dimensions) {
+absl::StatusOr<XlaComputation> XlaBuilder::Build(
+    int64_t root_id, bool remove_dynamic_dimensions) {
   TF_RETURN_IF_ERROR(GetCurrentStatus());
 
   // TODO(b/121223198): XLA backend cannot handle dynamic dimensions yet, remove
@@ -900,7 +902,7 @@ XlaOp XlaBuilder::DynamicBroadcastInDim(
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::InDimBroadcast(
+absl::StatusOr<XlaOp> XlaBuilder::InDimBroadcast(
     const Shape& shape, XlaOp operand,
     absl::Span<const int64_t> broadcast_dimensions) {
   TF_RETURN_IF_ERROR(first_error_);
@@ -932,8 +934,8 @@ StatusOr<XlaOp> XlaBuilder::InDimBroadcast(
   return AddInstruction(std::move(instr), HloOpcode::kBroadcast, {operand});
 }
 
-StatusOr<XlaOp> XlaBuilder::AddBroadcastSequence(const Shape& output_shape,
-                                                 XlaOp operand) {
+absl::StatusOr<XlaOp> XlaBuilder::AddBroadcastSequence(
+    const Shape& output_shape, XlaOp operand) {
   TF_RETURN_IF_ERROR(first_error_);
 
   TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
@@ -990,7 +992,7 @@ StatusOr<XlaOp> XlaBuilder::AddBroadcastSequence(const Shape& output_shape,
 }
 
 XlaOp XlaBuilder::UnaryOp(HloOpcode unop, XlaOp operand) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     TF_ASSIGN_OR_RETURN(
         Shape shape, ShapeInference::InferUnaryOpShape(unop, *operand_shape));
@@ -1003,7 +1005,7 @@ namespace {
 // Broadcasts an origin XLA op to the rank of target_shape.
 // Does not broadcast rank dimensions to match, only expands rank.
 // Is identity function if origin rank matches target rank.
-StatusOr<XlaOp> BroadcastToTargetRank(
+absl::StatusOr<XlaOp> BroadcastToTargetRank(
     XlaOp origin, const Shape& origin_shape, const Shape& target_shape,
     absl::Span<const int64_t> broadcast_dimensions) {
   if (ShapeUtil::IsScalar(origin_shape)) {
@@ -1136,7 +1138,7 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, XlaOp lhs, XlaOp rhs,
                            absl::Span<const int64_t> broadcast_dimensions,
                            std::optional<ComparisonDirection> direction,
                            std::optional<Comparison::Type> type) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* lhs_shape, GetShapePtr(lhs));
     TF_ASSIGN_OR_RETURN(const Shape* rhs_shape, GetShapePtr(rhs));
     TF_ASSIGN_OR_RETURN(
@@ -1182,12 +1184,16 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, XlaOp lhs, XlaOp rhs,
                                   this, rhs, lhs, *lhs_shape));
         }
       } else {
-        TF_ASSIGN_OR_RETURN(UnboundedBroadcastResult broadcast_result,
-                            BroadcastToOutputShapeWithUnbounded(
-                                this, lhs, *lhs_shape, rhs, *rhs_shape, shape,
-                                broadcast_dimensions));
-        updated_lhs = broadcast_result.lhs;
-        updated_rhs = broadcast_result.rhs;
+        if (!ShapeUtil::SameDimensions(*lhs_shape, *rhs_shape)) {
+          Shape output_shape = shape;
+          output_shape.set_element_type(lhs_shape->element_type());
+          TF_ASSIGN_OR_RETURN(UnboundedBroadcastResult broadcast_result,
+                              BroadcastToOutputShapeWithUnbounded(
+                                  this, lhs, *lhs_shape, rhs, *rhs_shape,
+                                  output_shape, broadcast_dimensions));
+          updated_lhs = broadcast_result.lhs;
+          updated_rhs = broadcast_result.rhs;
+        }
       }
     }
 
@@ -1214,24 +1220,26 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, XlaOp lhs, XlaOp rhs,
 
 XlaOp XlaBuilder::BinaryOpNoBroadcast(HloOpcode binop, const Shape& shape,
                                       XlaOp lhs, XlaOp rhs) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     *instr.mutable_shape() = shape.ToProto();
     return AddInstruction(std::move(instr), binop, {lhs, rhs});
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::Compare(const Shape& shape, XlaOp lhs, XlaOp rhs,
-                                    ComparisonDirection direction) {
+absl::StatusOr<XlaOp> XlaBuilder::Compare(const Shape& shape, XlaOp lhs,
+                                          XlaOp rhs,
+                                          ComparisonDirection direction) {
   TF_ASSIGN_OR_RETURN(auto operand_shape, GetShape(lhs));
   return Compare(
       shape, lhs, rhs, direction,
       Comparison::DefaultComparisonType(operand_shape.element_type()));
 }
 
-StatusOr<XlaOp> XlaBuilder::Compare(const Shape& shape, XlaOp lhs, XlaOp rhs,
-                                    ComparisonDirection direction,
-                                    Comparison::Type type) {
+absl::StatusOr<XlaOp> XlaBuilder::Compare(const Shape& shape, XlaOp lhs,
+                                          XlaOp rhs,
+                                          ComparisonDirection direction,
+                                          Comparison::Type type) {
   HloInstructionProto instr;
   instr.set_comparison_direction(ComparisonDirectionToString(direction));
   instr.set_comparison_type(ComparisonTypeToString(type));
@@ -1260,7 +1268,7 @@ absl::StatusOr<XlaOp> XlaBuilder::BroadcastScalarToOutputShape(XlaOp scalar,
 }
 
 XlaOp XlaBuilder::TernaryOp(HloOpcode triop, XlaOp lhs, XlaOp rhs, XlaOp ehs) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     XlaOp updated_lhs = lhs;
     XlaOp updated_rhs = rhs;
     XlaOp updated_ehs = ehs;
@@ -1319,7 +1327,7 @@ XlaOp XlaBuilder::TernaryOp(HloOpcode triop, XlaOp lhs, XlaOp rhs, XlaOp ehs) {
 }
 
 XlaOp XlaBuilder::ConstantLiteral(const LiteralSlice& literal) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     if (literal.shape().IsArray() && literal.element_count() > 1 &&
         literal.IsAllFirst()) {
       Literal scalar = LiteralUtil::GetFirstScalarLiteral(literal);
@@ -1340,7 +1348,7 @@ XlaOp XlaBuilder::ConstantLiteral(const LiteralSlice& literal) {
 }
 
 XlaOp XlaBuilder::Iota(const Shape& shape, int64_t iota_dimension) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     if (!shape.is_static()) {
       return InvalidArgument(
           "The output of iota must not have dynamic dimensions: %s",
@@ -1359,7 +1367,7 @@ XlaOp XlaBuilder::Iota(PrimitiveType type, int64_t size) {
 
 XlaOp XlaBuilder::Call(const XlaComputation& computation,
                        absl::Span<const XlaOp> operands) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     std::vector<const Shape*> operand_shape_ptrs;
     TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
@@ -1381,7 +1389,7 @@ XlaOp XlaBuilder::Call(const XlaComputation& computation,
 XlaOp XlaBuilder::Parameter(
     int64_t parameter_number, const Shape& shape, const std::string& name,
     const std::vector<bool>& replicated_at_leaf_buffers) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     if (!parameter_numbers_.insert(parameter_number).second) {
       return InvalidArgument("parameter %d already registered",
@@ -1402,7 +1410,7 @@ XlaOp XlaBuilder::Parameter(
 
 XlaOp XlaBuilder::Broadcast(XlaOp operand,
                             absl::Span<const int64_t> broadcast_sizes) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     TF_ASSIGN_OR_RETURN(
         const Shape& shape,
@@ -1425,9 +1433,9 @@ XlaOp XlaBuilder::Broadcast(XlaOp operand,
 }
 
 XlaOp XlaBuilder::BroadcastInDim(
-    XlaOp operand, const absl::Span<const int64_t> out_dim_size,
-    const absl::Span<const int64_t> broadcast_dimensions) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    XlaOp operand, absl::Span<const int64_t> out_dim_size,
+    absl::Span<const int64_t> broadcast_dimensions) {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     // Output shape, in the case of degenerate broadcast, the out_dim_size is
     // not necessarily the same as the dimension sizes of the output shape.
@@ -1484,8 +1492,9 @@ XlaOp XlaBuilder::BroadcastInDim(
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::ReshapeInternal(const Shape& shape, XlaOp operand,
-                                            int64_t inferred_dimension) {
+absl::StatusOr<XlaOp> XlaBuilder::ReshapeInternal(const Shape& shape,
+                                                  XlaOp operand,
+                                                  int64_t inferred_dimension) {
   TF_RETURN_IF_ERROR(first_error_);
   if (shape.is_unbounded_dynamic()) {
     return InvalidArgument(
@@ -1503,7 +1512,7 @@ StatusOr<XlaOp> XlaBuilder::ReshapeInternal(const Shape& shape, XlaOp operand,
 XlaOp XlaBuilder::Slice(XlaOp operand, absl::Span<const int64_t> start_indices,
                         absl::Span<const int64_t> limit_indices,
                         absl::Span<const int64_t> strides) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferSliceShape(
                                          *operand_shape, start_indices,
@@ -1512,7 +1521,7 @@ XlaOp XlaBuilder::Slice(XlaOp operand, absl::Span<const int64_t> start_indices,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::SliceInternal(
+absl::StatusOr<XlaOp> XlaBuilder::SliceInternal(
     const Shape& shape, XlaOp operand, absl::Span<const int64_t> start_indices,
     absl::Span<const int64_t> limit_indices,
     absl::Span<const int64_t> strides) {
@@ -1530,7 +1539,7 @@ StatusOr<XlaOp> XlaBuilder::SliceInternal(
 XlaOp XlaBuilder::SliceInDim(XlaOp operand, int64_t start_index,
                              int64_t limit_index, int64_t stride,
                              int64_t dimno) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* shape, GetShapePtr(operand));
     std::vector<int64_t> starts(shape->rank(), 0);
     std::vector<int64_t> limits(shape->dimensions().begin(),
@@ -1546,7 +1555,7 @@ XlaOp XlaBuilder::SliceInDim(XlaOp operand, int64_t start_index,
 XlaOp XlaBuilder::DynamicSlice(XlaOp operand,
                                absl::Span<const XlaOp> start_indices,
                                absl::Span<const int64_t> slice_sizes) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     std::vector<const Shape*> start_indices_shape_ptrs;
     TF_ASSIGN_OR_RETURN(const auto& start_indices_shapes,
@@ -1561,7 +1570,7 @@ XlaOp XlaBuilder::DynamicSlice(XlaOp operand,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::DynamicSliceInternal(
+absl::StatusOr<XlaOp> XlaBuilder::DynamicSliceInternal(
     const Shape& shape, XlaOp operand, absl::Span<const XlaOp> start_indices,
     absl::Span<const int64_t> slice_sizes) {
   HloInstructionProto instr;
@@ -1578,7 +1587,7 @@ StatusOr<XlaOp> XlaBuilder::DynamicSliceInternal(
 
 XlaOp XlaBuilder::DynamicUpdateSlice(XlaOp operand, XlaOp update,
                                      absl::Span<const XlaOp> start_indices) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     TF_ASSIGN_OR_RETURN(const Shape* update_shape, GetShapePtr(update));
     std::vector<const Shape*> start_indices_shape_ptrs;
@@ -1594,7 +1603,7 @@ XlaOp XlaBuilder::DynamicUpdateSlice(XlaOp operand, XlaOp update,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::DynamicUpdateSliceInternal(
+absl::StatusOr<XlaOp> XlaBuilder::DynamicUpdateSliceInternal(
     const Shape& shape, XlaOp operand, XlaOp update,
     absl::Span<const XlaOp> start_indices) {
   HloInstructionProto instr;
@@ -1608,7 +1617,7 @@ StatusOr<XlaOp> XlaBuilder::DynamicUpdateSliceInternal(
 
 XlaOp XlaBuilder::ConcatInDim(absl::Span<const XlaOp> operands,
                               int64_t dimension) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     std::vector<const Shape*> operand_shape_ptrs;
     TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
     absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
@@ -1619,7 +1628,7 @@ XlaOp XlaBuilder::ConcatInDim(absl::Span<const XlaOp> operands,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::ConcatInDimInternal(
+absl::StatusOr<XlaOp> XlaBuilder::ConcatInDimInternal(
     const Shape& shape, absl::Span<const XlaOp> operands, int64_t dimension) {
   HloInstructionProto instr;
   *instr.mutable_shape() = shape.ToProto();
@@ -1631,7 +1640,7 @@ StatusOr<XlaOp> XlaBuilder::ConcatInDimInternal(
 
 XlaOp XlaBuilder::Pad(XlaOp operand, XlaOp padding_value,
                       const PaddingConfig& padding_config) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     TF_ASSIGN_OR_RETURN(const Shape* padding_value_shape,
                         GetShapePtr(padding_value));
@@ -1644,7 +1653,7 @@ XlaOp XlaBuilder::Pad(XlaOp operand, XlaOp padding_value,
 
 XlaOp XlaBuilder::PadInDim(XlaOp operand, XlaOp padding_value, int64_t dimno,
                            int64_t pad_lo, int64_t pad_hi) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* shape, GetShapePtr(operand));
     PaddingConfig padding_config = MakeNoPaddingConfig(shape->rank());
     auto* dims = padding_config.mutable_dimensions(dimno);
@@ -1654,9 +1663,9 @@ XlaOp XlaBuilder::PadInDim(XlaOp operand, XlaOp padding_value, int64_t dimno,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::PadInternal(const Shape& shape, XlaOp operand,
-                                        XlaOp padding_value,
-                                        const PaddingConfig& padding_config) {
+absl::StatusOr<XlaOp> XlaBuilder::PadInternal(
+    const Shape& shape, XlaOp operand, XlaOp padding_value,
+    const PaddingConfig& padding_config) {
   HloInstructionProto instr;
   *instr.mutable_shape() = shape.ToProto();
   *instr.mutable_padding_config() = padding_config;
@@ -1667,7 +1676,7 @@ StatusOr<XlaOp> XlaBuilder::PadInternal(const Shape& shape, XlaOp operand,
 XlaOp XlaBuilder::Reshape(XlaOp operand, absl::Span<const int64_t> dimensions,
                           absl::Span<const int64_t> new_sizes,
                           int64_t inferred_dimension) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     TF_ASSIGN_OR_RETURN(const Shape shape, ShapeInference::InferReshapeShape(
                                                *operand_shape, dimensions,
@@ -1681,7 +1690,7 @@ XlaOp XlaBuilder::Reshape(XlaOp operand, absl::Span<const int64_t> dimensions,
 
 XlaOp XlaBuilder::Reshape(XlaOp operand, absl::Span<const int64_t> new_sizes,
                           int64_t inferred_dimension) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* shape, GetShapePtr(operand));
     std::vector<int64_t> dimensions(shape->dimensions_size());
     std::iota(dimensions.begin(), dimensions.end(), 0);
@@ -1691,7 +1700,7 @@ XlaOp XlaBuilder::Reshape(XlaOp operand, absl::Span<const int64_t> new_sizes,
 
 XlaOp XlaBuilder::Reshape(const Shape& shape, XlaOp operand,
                           int64_t inferred_dimension) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     return ReshapeInternal(shape, operand, inferred_dimension);
   });
 }
@@ -1700,7 +1709,7 @@ XlaOp XlaBuilder::DynamicReshape(XlaOp operand,
                                  absl::Span<const XlaOp> dim_sizes,
                                  absl::Span<const int64_t> new_size_bounds,
                                  const std::vector<bool>& dims_are_dynamic) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     std::vector<const Shape*> dim_size_shape_ptrs;
     TF_ASSIGN_OR_RETURN(const auto& dim_size_shapes,
@@ -1728,7 +1737,7 @@ XlaOp XlaBuilder::DynamicReshape(XlaOp operand,
 
 XlaOp XlaBuilder::Collapse(XlaOp operand,
                            absl::Span<const int64_t> dimensions) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     if (dimensions.size() <= 1) {
       // Not collapsing anything, trivially we can return the operand versus
       // enqueueing a trivial reshape.
@@ -1768,14 +1777,15 @@ XlaOp XlaBuilder::Collapse(XlaOp operand,
 }
 
 // Dummy pass-through computation returning it's parameter of shape `shape`.
-static StatusOr<XlaComputation> PassthroughComputation(const Shape& shape) {
+static absl::StatusOr<XlaComputation> PassthroughComputation(
+    const Shape& shape) {
   XlaBuilder builder("dummy");
   XlaOp out = Parameter(&builder, 0, shape, "p");
   return builder.Build(out);
 }
 
 XlaOp XlaBuilder::Select(XlaOp pred, XlaOp on_true, XlaOp on_false) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* true_shape, GetShapePtr(on_true));
     TF_ASSIGN_OR_RETURN(const Shape* false_shape, GetShapePtr(on_false));
     TF_RET_CHECK(true_shape->IsTuple() == false_shape->IsTuple());
@@ -1792,7 +1802,7 @@ XlaOp XlaBuilder::Select(XlaOp pred, XlaOp on_true, XlaOp on_false) {
 }
 
 XlaOp XlaBuilder::Tuple(absl::Span<const XlaOp> elements) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     std::vector<const Shape*> operand_shape_ptrs;
     TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(elements));
     absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
@@ -1804,15 +1814,15 @@ XlaOp XlaBuilder::Tuple(absl::Span<const XlaOp> elements) {
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::TupleInternal(const Shape& shape,
-                                          absl::Span<const XlaOp> elements) {
+absl::StatusOr<XlaOp> XlaBuilder::TupleInternal(
+    const Shape& shape, absl::Span<const XlaOp> elements) {
   HloInstructionProto instr;
   *instr.mutable_shape() = shape.ToProto();
   return AddInstruction(std::move(instr), HloOpcode::kTuple, elements);
 }
 
 XlaOp XlaBuilder::GetTupleElement(XlaOp tuple_data, int64_t index) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* tuple_shape, GetShapePtr(tuple_data));
     if (!tuple_shape->IsTuple()) {
       return InvalidArgument(
@@ -1830,9 +1840,9 @@ XlaOp XlaBuilder::GetTupleElement(XlaOp tuple_data, int64_t index) {
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::GetTupleElementInternal(const Shape& shape,
-                                                    XlaOp tuple_data,
-                                                    int64_t index) {
+absl::StatusOr<XlaOp> XlaBuilder::GetTupleElementInternal(const Shape& shape,
+                                                          XlaOp tuple_data,
+                                                          int64_t index) {
   HloInstructionProto instr;
   *instr.mutable_shape() = shape.ToProto();
   instr.set_tuple_index(index);
@@ -1843,7 +1853,7 @@ StatusOr<XlaOp> XlaBuilder::GetTupleElementInternal(const Shape& shape,
 XlaOp XlaBuilder::Dot(XlaOp lhs, XlaOp rhs,
                       const PrecisionConfig* precision_config,
                       std::optional<PrimitiveType> preferred_element_type) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* lhs_shape, GetShapePtr(lhs));
 
     DotDimensionNumbers dimension_numbers;
@@ -1859,7 +1869,7 @@ XlaOp XlaBuilder::DotGeneral(
     XlaOp lhs, XlaOp rhs, const DotDimensionNumbers& dimension_numbers,
     const PrecisionConfig* precision_config,
     std::optional<PrimitiveType> preferred_element_type) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* lhs_shape, GetShapePtr(lhs));
     TF_ASSIGN_OR_RETURN(const Shape* rhs_shape, GetShapePtr(rhs));
     TF_ASSIGN_OR_RETURN(
@@ -1871,7 +1881,7 @@ XlaOp XlaBuilder::DotGeneral(
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::DotGeneralInternal(
+absl::StatusOr<XlaOp> XlaBuilder::DotGeneralInternal(
     const Shape& shape, XlaOp lhs, XlaOp rhs,
     const DotDimensionNumbers& dimension_numbers,
     const PrecisionConfig* precision_config) {
@@ -1890,7 +1900,7 @@ XlaOp XlaBuilder::SparseDot(
     const DotDimensionNumbers& dimension_numbers,
     const PrecisionConfig* precision_config,
     std::optional<PrimitiveType> preferred_element_type) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* lhs_shape, GetShapePtr(lhs));
     TF_ASSIGN_OR_RETURN(const Shape* rhs_shape, GetShapePtr(rhs));
     TF_ASSIGN_OR_RETURN(Shape shape,
@@ -1987,7 +1997,7 @@ XlaOp XlaBuilder::ConvWithGeneralDimensions(
     int64_t feature_group_count, int64_t batch_group_count,
     const PrecisionConfig* precision_config,
     std::optional<PrimitiveType> preferred_element_type) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* lhs_shape, GetShapePtr(lhs));
     TF_ASSIGN_OR_RETURN(const Shape* rhs_shape, GetShapePtr(rhs));
 
@@ -2045,7 +2055,7 @@ XlaOp XlaBuilder::ConvGeneralDilated(
     const PrecisionConfig* precision_config,
     std::optional<PrimitiveType> preferred_element_type,
     std::optional<std::vector<bool>> window_reversal) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* lhs_shape, GetShapePtr(lhs));
     TF_ASSIGN_OR_RETURN(const Shape* rhs_shape, GetShapePtr(rhs));
     TF_RETURN_IF_ERROR(
@@ -2075,7 +2085,7 @@ XlaOp XlaBuilder::ConvGeneralDilated(
   });
 }
 
-StatusOr<HloInstructionProto> XlaBuilder::DynamicConvInstruction(
+absl::StatusOr<HloInstructionProto> XlaBuilder::DynamicConvInstruction(
     XlaOp lhs, XlaOp rhs, absl::Span<const int64_t> window_strides,
     absl::Span<const std::pair<int64_t, int64_t>> padding,
     absl::Span<const int64_t> lhs_dilation,
@@ -2128,7 +2138,7 @@ XlaOp XlaBuilder::DynamicConvInputGrad(
     int64_t feature_group_count, int64_t batch_group_count,
     const PrecisionConfig* precision_config, PaddingType padding_type,
     std::optional<PrimitiveType> preferred_element_type) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(
         HloInstructionProto instr,
         DynamicConvInstruction(
@@ -2153,7 +2163,7 @@ XlaOp XlaBuilder::DynamicConvKernelGrad(
     int64_t feature_group_count, int64_t batch_group_count,
     const PrecisionConfig* precision_config, PaddingType padding_type,
     std::optional<PrimitiveType> preferred_element_type) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(
         HloInstructionProto instr,
         DynamicConvInstruction(activations, gradients, window_strides, padding,
@@ -2180,7 +2190,7 @@ XlaOp XlaBuilder::DynamicConvForward(
     int64_t feature_group_count, int64_t batch_group_count,
     const PrecisionConfig* precision_config, PaddingType padding_type,
     std::optional<PrimitiveType> preferred_element_type) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(
         HloInstructionProto instr,
         DynamicConvInstruction(
@@ -2193,7 +2203,7 @@ XlaOp XlaBuilder::DynamicConvForward(
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::ConvGeneralDilatedInternal(
+absl::StatusOr<XlaOp> XlaBuilder::ConvGeneralDilatedInternal(
     const Shape& shape, XlaOp lhs, XlaOp rhs, const Window& window,
     absl::Span<const int64_t> window_strides,
     absl::Span<const std::pair<int64_t, int64_t>> padding,
@@ -2219,7 +2229,7 @@ StatusOr<XlaOp> XlaBuilder::ConvGeneralDilatedInternal(
 
 XlaOp XlaBuilder::Fft(XlaOp operand, const FftType fft_type,
                       const absl::Span<const int64_t> fft_length) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferFftShape(
                                          *operand_shape, fft_type, fft_length));
@@ -2227,7 +2237,7 @@ XlaOp XlaBuilder::Fft(XlaOp operand, const FftType fft_type,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::FftInternal(
+absl::StatusOr<XlaOp> XlaBuilder::FftInternal(
     const Shape& shape, XlaOp operand, const FftType fft_type,
     const absl::Span<const int64_t> fft_length) {
   HloInstructionProto instr;
@@ -2240,7 +2250,7 @@ StatusOr<XlaOp> XlaBuilder::FftInternal(
   return AddInstruction(std::move(instr), HloOpcode::kFft, {operand});
 }
 
-StatusOr<XlaOp> XlaBuilder::TriangularSolveInternal(
+absl::StatusOr<XlaOp> XlaBuilder::TriangularSolveInternal(
     const Shape& shape, XlaOp a, XlaOp b, TriangularSolveOptions options) {
   HloInstructionProto instr;
   *instr.mutable_triangular_solve_options() = std::move(options);
@@ -2249,8 +2259,8 @@ StatusOr<XlaOp> XlaBuilder::TriangularSolveInternal(
   return AddInstruction(std::move(instr), HloOpcode::kTriangularSolve, {a, b});
 }
 
-StatusOr<XlaOp> XlaBuilder::CholeskyInternal(const Shape& shape, XlaOp a,
-                                             bool lower) {
+absl::StatusOr<XlaOp> XlaBuilder::CholeskyInternal(const Shape& shape, XlaOp a,
+                                                   bool lower) {
   HloInstructionProto instr;
   CholeskyOptions& options = *instr.mutable_cholesky_options();
   options.set_lower(lower);
@@ -2260,7 +2270,7 @@ StatusOr<XlaOp> XlaBuilder::CholeskyInternal(const Shape& shape, XlaOp a,
 }
 
 XlaOp XlaBuilder::Infeed(const Shape& shape, const std::string& config) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     if (!LayoutUtil::HasLayout(shape)) {
       return InvalidArgument("Given shape to Infeed must have a layout");
@@ -2334,7 +2344,7 @@ XlaOp XlaBuilder::Infeed(const Shape& shape, const std::string& config) {
 
 XlaOp XlaBuilder::InfeedWithToken(XlaOp token, const Shape& shape,
                                   const std::string& config) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     if (!LayoutUtil::HasLayout(shape)) {
       return InvalidArgument("Given shape to Infeed must have a layout");
     }
@@ -2356,7 +2366,7 @@ XlaOp XlaBuilder::InfeedWithToken(XlaOp token, const Shape& shape,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::InfeedWithTokenInternal(
+absl::StatusOr<XlaOp> XlaBuilder::InfeedWithTokenInternal(
     const Shape& infeed_instruction_shape, XlaOp token,
     const std::string& config) {
   HloInstructionProto instr;
@@ -2367,7 +2377,7 @@ StatusOr<XlaOp> XlaBuilder::InfeedWithTokenInternal(
 
 void XlaBuilder::Outfeed(XlaOp operand, const Shape& shape_with_layout,
                          const std::string& outfeed_config) {
-  ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
 
     *instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
@@ -2440,7 +2450,7 @@ void XlaBuilder::Outfeed(XlaOp operand, const Shape& shape_with_layout,
 XlaOp XlaBuilder::OutfeedWithToken(XlaOp operand, XlaOp token,
                                    const Shape& shape_with_layout,
                                    const std::string& outfeed_config) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     // Check and set outfeed shape.
     if (!LayoutUtil::HasLayout(shape_with_layout)) {
       return InvalidArgument("Given shape to Outfeed must have a layout");
@@ -2457,7 +2467,7 @@ XlaOp XlaBuilder::OutfeedWithToken(XlaOp operand, XlaOp token,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::OutfeedWithTokenInternal(
+absl::StatusOr<XlaOp> XlaBuilder::OutfeedWithTokenInternal(
     XlaOp operand, XlaOp token, const Shape& shape_with_layout,
     const std::string& outfeed_config) {
   HloInstructionProto instr;
@@ -2469,7 +2479,7 @@ StatusOr<XlaOp> XlaBuilder::OutfeedWithTokenInternal(
 }
 
 XlaOp XlaBuilder::CreateToken() {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     *instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kAfterAll);
@@ -2477,7 +2487,7 @@ XlaOp XlaBuilder::CreateToken() {
 }
 
 XlaOp XlaBuilder::AfterAll(absl::Span<const XlaOp> tokens) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     if (tokens.empty()) {
       return InvalidArgument("AfterAll requires at least one operand");
     }
@@ -2506,7 +2516,7 @@ XlaOp XlaBuilder::CustomCall(
     const Literal* literal, std::optional<Window> window,
     std::optional<ConvolutionDimensionNumbers> dnums,
     CustomCallSchedule schedule, CustomCallApiVersion api_version) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     if (absl::StartsWith(call_target_name, "$")) {
       return InvalidArgument(
           "Invalid custom_call_target \"%s\": Call targets that start with '$' "
@@ -2543,7 +2553,7 @@ XlaOp XlaBuilder::CustomCall(
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::CustomCallInternal(
+absl::StatusOr<XlaOp> XlaBuilder::CustomCallInternal(
     const std::string& call_target_name, absl::Span<const XlaOp> operands,
     const XlaComputation* computation, const Shape& shape,
     const std::string& opaque,
@@ -2614,7 +2624,7 @@ XlaOp XlaBuilder::CustomCall(
         output_operand_aliasing,
     const Literal* literal, CustomCallSchedule schedule,
     CustomCallApiVersion api_version) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     if (absl::StartsWith(call_target_name, "$")) {
       return InvalidArgument(
           "Invalid custom_call_target \"%s\": Call targets that start with '$' "
@@ -2652,7 +2662,7 @@ XlaOp XlaBuilder::CustomCall(
 }
 
 XlaOp XlaBuilder::OptimizationBarrier(XlaOp operand) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     Shape shape = *operand_shape;
     HloInstructionProto instr;
@@ -2664,7 +2674,7 @@ XlaOp XlaBuilder::OptimizationBarrier(XlaOp operand) {
 
 XlaOp XlaBuilder::Transpose(XlaOp operand,
                             absl::Span<const int64_t> permutation) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferTransposeShape(
                                          *operand_shape, permutation));
@@ -2672,7 +2682,7 @@ XlaOp XlaBuilder::Transpose(XlaOp operand,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::TransposeInternal(
+absl::StatusOr<XlaOp> XlaBuilder::TransposeInternal(
     const Shape& shape, XlaOp operand, absl::Span<const int64_t> permutation) {
   HloInstructionProto instr;
   *instr.mutable_shape() = shape.ToProto();
@@ -2683,7 +2693,7 @@ StatusOr<XlaOp> XlaBuilder::TransposeInternal(
 }
 
 XlaOp XlaBuilder::Rev(XlaOp operand, absl::Span<const int64_t> dimensions) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferReverseShape(
                                          *operand_shape, dimensions));
@@ -2691,8 +2701,8 @@ XlaOp XlaBuilder::Rev(XlaOp operand, absl::Span<const int64_t> dimensions) {
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::RevInternal(const Shape& shape, XlaOp operand,
-                                        absl::Span<const int64_t> dimensions) {
+absl::StatusOr<XlaOp> XlaBuilder::RevInternal(
+    const Shape& shape, XlaOp operand, absl::Span<const int64_t> dimensions) {
   HloInstructionProto instr;
   *instr.mutable_shape() = shape.ToProto();
   for (int64_t dim : dimensions) {
@@ -2704,7 +2714,7 @@ StatusOr<XlaOp> XlaBuilder::RevInternal(const Shape& shape, XlaOp operand,
 XlaOp XlaBuilder::Sort(absl::Span<const XlaOp> operands,
                        const XlaComputation& comparator, int64_t dimension,
                        bool is_stable) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     std::vector<const Shape*> operand_shape_ptrs;
     TF_ASSIGN_OR_RETURN(std::vector<Shape> operand_shapes,
                         GetOperandShapes(operands));
@@ -2716,10 +2726,11 @@ XlaOp XlaBuilder::Sort(absl::Span<const XlaOp> operands,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::SortInternal(const Shape& shape,
-                                         absl::Span<const XlaOp> operands,
-                                         const XlaComputation& comparator,
-                                         int64_t dimension, bool is_stable) {
+absl::StatusOr<XlaOp> XlaBuilder::SortInternal(const Shape& shape,
+                                               absl::Span<const XlaOp> operands,
+                                               const XlaComputation& comparator,
+                                               int64_t dimension,
+                                               bool is_stable) {
   HloInstructionProto instr;
   *instr.mutable_shape() = shape.ToProto();
   instr.set_is_stable(is_stable);
@@ -2733,7 +2744,7 @@ StatusOr<XlaOp> XlaBuilder::SortInternal(const Shape& shape,
 }
 
 XlaOp XlaBuilder::TopK(XlaOp operand, int64_t k, bool largest) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     std::vector<const Shape*> operand_shape_ptrs;
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     TF_ASSIGN_OR_RETURN(Shape shape,
@@ -2742,8 +2753,9 @@ XlaOp XlaBuilder::TopK(XlaOp operand, int64_t k, bool largest) {
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::TopKInternal(const Shape& shape, XlaOp operand,
-                                         int64_t k, bool largest) {
+absl::StatusOr<XlaOp> XlaBuilder::TopKInternal(const Shape& shape,
+                                               XlaOp operand, int64_t k,
+                                               bool largest) {
   HloInstructionProto instr;
   *instr.mutable_shape() = shape.ToProto();
   instr.set_k(k);
@@ -2753,7 +2765,7 @@ StatusOr<XlaOp> XlaBuilder::TopKInternal(const Shape& shape, XlaOp operand,
 
 XlaOp XlaBuilder::ConvertElementType(XlaOp operand,
                                      PrimitiveType new_element_type) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConvertShape(
                                          *operand_shape, new_element_type));
@@ -2767,7 +2779,7 @@ XlaOp XlaBuilder::ConvertElementType(XlaOp operand,
 
 XlaOp XlaBuilder::BitcastConvertType(XlaOp operand,
                                      PrimitiveType new_element_type) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferBitcastConvertShape(
                                          *operand_shape, new_element_type));
@@ -2775,8 +2787,8 @@ XlaOp XlaBuilder::BitcastConvertType(XlaOp operand,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::BitcastConvertTypeInternal(const Shape& shape,
-                                                       XlaOp operand) {
+absl::StatusOr<XlaOp> XlaBuilder::BitcastConvertTypeInternal(const Shape& shape,
+                                                             XlaOp operand) {
   HloInstructionProto instr;
   *instr.mutable_shape() = shape.ToProto();
   return AddInstruction(std::move(instr), HloOpcode::kBitcastConvert,
@@ -2785,7 +2797,7 @@ StatusOr<XlaOp> XlaBuilder::BitcastConvertTypeInternal(const Shape& shape,
 
 XlaOp XlaBuilder::StochasticConvertType(XlaOp operand, XlaOp random,
                                         PrimitiveType new_element_type) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     TF_ASSIGN_OR_RETURN(const Shape* random_shape, GetShapePtr(random));
     TF_ASSIGN_OR_RETURN(Shape shape,
@@ -2804,7 +2816,7 @@ XlaOp XlaBuilder::Map(absl::Span<const XlaOp> operands,
                       const XlaComputation& computation,
                       absl::Span<const int64_t> dimensions,
                       absl::Span<const XlaOp> static_operands) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     if (!static_operands.empty()) {
       return Unimplemented("static_operands is not supported in Map");
     }
@@ -2846,7 +2858,7 @@ XlaOp XlaBuilder::Map(absl::Span<const XlaOp> operands,
 XlaOp XlaBuilder::RngOp(RandomDistribution distribution,
                         absl::Span<const XlaOp> parameters,
                         const Shape& shape) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     // Check the number of parameters per RNG distribution.
     switch (distribution) {
       case RandomDistribution::RNG_NORMAL:
@@ -2866,9 +2878,9 @@ XlaOp XlaBuilder::RngOp(RandomDistribution distribution,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::RngOpInternal(RandomDistribution distribution,
-                                          absl::Span<const XlaOp> parameters,
-                                          const Shape& shape) {
+absl::StatusOr<XlaOp> XlaBuilder::RngOpInternal(
+    RandomDistribution distribution, absl::Span<const XlaOp> parameters,
+    const Shape& shape) {
   HloInstructionProto instr;
   *instr.mutable_shape() = shape.ToProto();
   instr.set_distribution(distribution);
@@ -2886,7 +2898,7 @@ XlaOp XlaBuilder::RngUniform(XlaOp a, XlaOp b, const Shape& shape) {
 
 XlaOp XlaBuilder::RngBitGenerator(RandomAlgorithm algorithm,
                                   XlaOp initial_state, const Shape& shape) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape));
     TF_ASSIGN_OR_RETURN(Shape state_shape, GetShape(initial_state));
     Shape output_shape = shape;
@@ -2906,7 +2918,7 @@ XlaOp XlaBuilder::RngBitGenerator(RandomAlgorithm algorithm,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::RngBitGeneratorInternal(
+absl::StatusOr<XlaOp> XlaBuilder::RngBitGeneratorInternal(
     const Shape& full_result_shape, RandomAlgorithm algorithm,
     XlaOp initial_state) {
   HloInstructionProto instr;
@@ -2918,7 +2930,7 @@ StatusOr<XlaOp> XlaBuilder::RngBitGeneratorInternal(
 
 XlaOp XlaBuilder::While(const XlaComputation& condition,
                         const XlaComputation& body, XlaOp init) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     // Infer shape.
     TF_ASSIGN_OR_RETURN(const auto& body_program_shape, body.GetProgramShape());
     TF_ASSIGN_OR_RETURN(const auto& condition_program_shape,
@@ -2931,10 +2943,10 @@ XlaOp XlaBuilder::While(const XlaComputation& condition,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::WhileInternal(const Shape& shape,
-                                          const XlaComputation& condition,
-                                          const XlaComputation& body,
-                                          XlaOp init) {
+absl::StatusOr<XlaOp> XlaBuilder::WhileInternal(const Shape& shape,
+                                                const XlaComputation& condition,
+                                                const XlaComputation& body,
+                                                XlaOp init) {
   HloInstructionProto instr;
   *instr.mutable_shape() = shape.ToProto();
   // Body comes before condition computation in the vector.
@@ -2947,7 +2959,7 @@ XlaOp XlaBuilder::Gather(XlaOp input, XlaOp start_indices,
                          const GatherDimensionNumbers& dimension_numbers,
                          absl::Span<const int64_t> slice_sizes,
                          bool indices_are_sorted) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* input_shape, GetShapePtr(input));
     TF_ASSIGN_OR_RETURN(const Shape* start_indices_shape,
                         GetShapePtr(start_indices));
@@ -2959,7 +2971,7 @@ XlaOp XlaBuilder::Gather(XlaOp input, XlaOp start_indices,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::GatherInternal(
+absl::StatusOr<XlaOp> XlaBuilder::GatherInternal(
     const Shape& shape, XlaOp input, XlaOp start_indices,
     const GatherDimensionNumbers& dimension_numbers,
     absl::Span<const int64_t> slice_sizes, bool indices_are_sorted) {
@@ -2989,7 +3001,7 @@ XlaOp XlaBuilder::Scatter(absl::Span<const XlaOp> inputs, XlaOp scatter_indices,
                           const XlaComputation& update_computation,
                           const ScatterDimensionNumbers& dimension_numbers,
                           bool indices_are_sorted, bool unique_indices) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     if (inputs.empty()) {
       return InvalidArgument("Scatter inputs cannot be empty.");
     }
@@ -3022,12 +3034,12 @@ XlaOp XlaBuilder::Scatter(absl::Span<const XlaOp> inputs, XlaOp scatter_indices,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::ScatterInternal(
+absl::StatusOr<XlaOp> XlaBuilder::ScatterInternal(
     const Shape& shape, absl::Span<const XlaOp> inputs, XlaOp scatter_indices,
     absl::Span<const XlaOp> updates, const XlaComputation& update_computation,
     const ScatterDimensionNumbers& dimension_numbers, bool indices_are_sorted,
     bool unique_indices) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     instr.set_indices_are_sorted(indices_are_sorted);
     instr.set_unique_indices(unique_indices);
@@ -3048,7 +3060,7 @@ XlaOp XlaBuilder::Conditional(XlaOp predicate, XlaOp true_operand,
                               const XlaComputation& true_computation,
                               XlaOp false_operand,
                               const XlaComputation& false_computation) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* shape, GetShapePtr(predicate));
 
     if (!ShapeUtil::IsScalar(*shape) || shape->element_type() != PRED) {
@@ -3068,7 +3080,7 @@ XlaOp XlaBuilder::Conditional(
     XlaOp branch_index,
     absl::Span<const XlaComputation* const> branch_computations,
     absl::Span<const XlaOp> branch_operands) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* shape, GetShapePtr(branch_index));
 
     if (!ShapeUtil::IsScalar(*shape) || shape->element_type() != S32) {
@@ -3087,7 +3099,7 @@ XlaOp XlaBuilder::AllReduceImpl(XlaOp operand,
                                 const std::optional<Shape>& layout,
                                 const std::optional<bool> use_global_device_ids,
                                 bool async) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     std::vector<const Shape*> operand_shapes;
@@ -3173,7 +3185,7 @@ XlaOp XlaBuilder::AllGatherImpl(const XlaOp operand,
                                 const std::optional<Layout>& layout,
                                 const std::optional<bool> use_global_device_ids,
                                 bool async) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
 
@@ -3225,7 +3237,7 @@ XlaOp XlaBuilder::ConditionalImpl(
     XlaOp branch_index,
     absl::Span<const XlaComputation* const> branch_computations,
     absl::Span<const XlaOp> branch_operands) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
 
     TF_ASSIGN_OR_RETURN(const Shape* branch_index_shape,
@@ -3280,7 +3292,7 @@ XlaOp XlaBuilder::Reduce(absl::Span<const XlaOp> operands,
                          absl::Span<const XlaOp> init_values,
                          const XlaComputation& computation,
                          absl::Span<const int64_t> dimensions_to_reduce) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
                         computation.GetProgramShape());
 
@@ -3304,11 +3316,11 @@ XlaOp XlaBuilder::Reduce(absl::Span<const XlaOp> operands,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::ReduceInternal(
+absl::StatusOr<XlaOp> XlaBuilder::ReduceInternal(
     const Shape& shape, absl::Span<const XlaOp> all_operands,
     const XlaComputation& computation,
     absl::Span<const int64_t> dimensions_to_reduce) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     *instr.mutable_shape() = shape.ToProto();
 
@@ -3323,7 +3335,7 @@ StatusOr<XlaOp> XlaBuilder::ReduceInternal(
 
 XlaOp XlaBuilder::ReduceAll(XlaOp operand, XlaOp init_value,
                             const XlaComputation& computation) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     std::vector<int64_t> all_dimnos(operand_shape->rank());
     std::iota(all_dimnos.begin(), all_dimnos.end(), 0);
@@ -3347,7 +3359,7 @@ XlaOp XlaBuilder::ReduceWindow(absl::Span<const XlaOp> operands,
                                absl::Span<const int64_t> window_dimensions,
                                absl::Span<const int64_t> window_strides,
                                Padding padding) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     const Shape* operand_shape = nullptr;
     for (const auto& operand : operands) {
       TF_ASSIGN_OR_RETURN(operand_shape, GetShapePtr(operand));
@@ -3401,7 +3413,7 @@ XlaOp XlaBuilder::ReduceWindowWithGeneralPadding(
     absl::Span<const int64_t> window_dilations,
     absl::Span<const std::pair<int64_t, int64_t>> padding) {
   std::vector<const Shape*> operand_shapes, init_shapes;
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     if (operands.size() == 1) {
       const auto& operand = operands[0];
       const auto& init_value = init_values[0];
@@ -3437,7 +3449,7 @@ XlaOp XlaBuilder::ReduceWindowWithGeneralPadding(
   });
 }
 
-StatusOr<HloInstructionProto> XlaBuilder::ReduceWindowInternal(
+absl::StatusOr<HloInstructionProto> XlaBuilder::ReduceWindowInternal(
     absl::Span<const XlaOp> operands, absl::Span<const XlaOp> init_values,
     const XlaComputation& computation,
     absl::Span<const int64_t> window_dimensions,
@@ -3472,7 +3484,7 @@ StatusOr<HloInstructionProto> XlaBuilder::ReduceWindowInternal(
   return instr;
 }
 
-StatusOr<XlaOp> XlaBuilder::ReduceWindowInternal(
+absl::StatusOr<XlaOp> XlaBuilder::ReduceWindowInternal(
     const Shape& shape, XlaOp operand, XlaOp init_value,
     const XlaComputation& computation, Window window) {
   HloInstructionProto instr;
@@ -3486,7 +3498,7 @@ StatusOr<XlaOp> XlaBuilder::ReduceWindowInternal(
 
 XlaOp XlaBuilder::BatchNormTraining(XlaOp operand, XlaOp scale, XlaOp offset,
                                     float epsilon, int64_t feature_index) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
 
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
@@ -3509,7 +3521,7 @@ XlaOp XlaBuilder::BatchNormTraining(XlaOp operand, XlaOp scale, XlaOp offset,
 XlaOp XlaBuilder::BatchNormInference(XlaOp operand, XlaOp scale, XlaOp offset,
                                      XlaOp mean, XlaOp variance, float epsilon,
                                      int64_t feature_index) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
 
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
@@ -3534,7 +3546,7 @@ XlaOp XlaBuilder::BatchNormInference(XlaOp operand, XlaOp scale, XlaOp offset,
 XlaOp XlaBuilder::BatchNormGrad(XlaOp operand, XlaOp scale, XlaOp batch_mean,
                                 XlaOp batch_var, XlaOp grad_output,
                                 float epsilon, int64_t feature_index) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
 
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
@@ -3570,7 +3582,7 @@ XlaOp XlaBuilder::AllGather(XlaOp operand, int64_t all_gather_dimension,
 
 XlaOp XlaBuilder::CrossReplicaSum(
     XlaOp operand, absl::Span<const ReplicaGroup> replica_groups) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* shape, GetShapePtr(operand));
     const Shape* element_shape;
     if (shape->IsTuple()) {
@@ -3614,7 +3626,7 @@ XlaOp XlaBuilder::ReduceScatter(
     const std::optional<ChannelHandle>& channel_id,
     const std::optional<Layout>& layout,
     const std::optional<bool> use_global_device_ids) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     std::vector<const Shape*> operand_shapes;
@@ -3686,7 +3698,7 @@ XlaOp XlaBuilder::AllToAllArray(
     XlaOp operand, int64_t split_dimension, int64_t concat_dimension,
     int64_t split_count, absl::Span<const ReplicaGroup> replica_groups,
     const std::optional<ChannelHandle>& channel_id) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     TF_ASSIGN_OR_RETURN(
         const Shape all_to_all_shape,
@@ -3745,7 +3757,7 @@ XlaOp XlaBuilder::AllToAllTuple(
     absl::Span<const ReplicaGroup> replica_groups,
     const std::optional<Layout>& layout,
     const std::optional<ChannelHandle>& channel_id) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(auto operand_shapes, this->GetOperandShapes(operands));
     std::vector<const Shape*> operand_shape_ptrs;
@@ -3788,10 +3800,10 @@ XlaOp XlaBuilder::AllToAllTuple(
     int64_t split_count, absl::Span<const ReplicaGroup> replica_groups,
     const std::optional<Layout>& layout,
     const std::optional<ChannelHandle>& channel_id) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
 
-    // The HloInstruction for Alltoall currently only handles the data
+    // The HloInstruction for AllToAll currently only handles the data
     // communication: it accepts N already split parts and scatters them to N
     // cores, and each core gathers the N received parts into a tuple as the
     // output. So here we explicitly split the operand before the hlo alltoall,
@@ -3868,7 +3880,7 @@ XlaOp XlaBuilder::CollectivePermuteImpl(
     XlaOp operand,
     const std::vector<std::pair<int64_t, int64_t>>& source_target_pairs,
     const std::optional<ChannelHandle>& channel_id, bool async) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(
@@ -3893,7 +3905,7 @@ XlaOp XlaBuilder::CollectivePermuteImpl(
 }
 
 XlaOp XlaBuilder::ReplicaId() {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     *instr.mutable_shape() = ShapeUtil::MakeShape(U32, {}).ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kReplicaId, {});
@@ -3906,7 +3918,7 @@ XlaOp XlaBuilder::SelectAndScatter(XlaOp operand, const XlaComputation& select,
                                    Padding padding, XlaOp source,
                                    XlaOp init_value,
                                    const XlaComputation& scatter) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
 
     std::vector<std::pair<int64_t, int64_t>> padding_values =
@@ -3945,7 +3957,7 @@ XlaOp XlaBuilder::SelectAndScatter(XlaOp operand, const XlaComputation& select,
   });
 }
 
-StatusOr<HloInstructionProto> XlaBuilder::SelectAndScatterInternal(
+absl::StatusOr<HloInstructionProto> XlaBuilder::SelectAndScatterInternal(
     XlaOp operand, const XlaComputation& select,
     absl::Span<const int64_t> window_dimensions,
     absl::Span<const int64_t> window_strides,
@@ -3981,7 +3993,7 @@ XlaOp XlaBuilder::SelectAndScatterWithGeneralPadding(
     absl::Span<const int64_t> window_strides,
     absl::Span<const std::pair<int64_t, int64_t>> padding, XlaOp source,
     XlaOp init_value, const XlaComputation& scatter) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(HloInstructionProto instr,
                         SelectAndScatterInternal(
                             operand, select, window_dimensions, window_strides,
@@ -3994,7 +4006,7 @@ XlaOp XlaBuilder::SelectAndScatterWithGeneralPadding(
 
 XlaOp XlaBuilder::ReducePrecision(XlaOp operand, const int exponent_bits,
                                   const int mantissa_bits) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferReducePrecisionShape(
@@ -4004,10 +4016,9 @@ XlaOp XlaBuilder::ReducePrecision(XlaOp operand, const int exponent_bits,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::ReducePrecisionInternal(const Shape& shape,
-                                                    XlaOp operand,
-                                                    const int exponent_bits,
-                                                    const int mantissa_bits) {
+absl::StatusOr<XlaOp> XlaBuilder::ReducePrecisionInternal(
+    const Shape& shape, XlaOp operand, const int exponent_bits,
+    const int mantissa_bits) {
   HloInstructionProto instr;
   *instr.mutable_shape() = shape.ToProto();
   instr.set_exponent_bits(exponent_bits);
@@ -4017,7 +4028,7 @@ StatusOr<XlaOp> XlaBuilder::ReducePrecisionInternal(const Shape& shape,
 }
 
 void XlaBuilder::Send(XlaOp operand, const ChannelHandle& handle) {
-  ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     // Send HLO takes two operands: a data operand and a token. Generate the
     // token to pass into the send.
     // TODO(b/80000000): Remove this when clients have been updated to handle
@@ -4033,7 +4044,7 @@ void XlaBuilder::Send(XlaOp operand, const ChannelHandle& handle) {
 
 XlaOp XlaBuilder::SendWithToken(XlaOp operand, XlaOp token,
                                 const ChannelHandle& handle) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     if (handle.type() != ChannelHandle::DEVICE_TO_DEVICE) {
       return InvalidArgument("Send must use a device-to-device channel");
     }
@@ -4046,7 +4057,7 @@ XlaOp XlaBuilder::SendWithToken(XlaOp operand, XlaOp token,
 }
 
 XlaOp XlaBuilder::Recv(const Shape& shape, const ChannelHandle& handle) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     // Recv HLO takes a single token operand. Generate the token to pass into
     // the Recv and RecvDone instructions.
     // TODO(b/80000000): Remove this when clients have been updated to handle
@@ -4072,7 +4083,7 @@ XlaOp XlaBuilder::Recv(const Shape& shape, const ChannelHandle& handle) {
 
 XlaOp XlaBuilder::RecvWithToken(XlaOp token, const Shape& shape,
                                 const ChannelHandle& handle) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     if (handle.type() != ChannelHandle::DEVICE_TO_DEVICE) {
       return InvalidArgument("Recv must use a device-to-device channel");
     }
@@ -4087,7 +4098,7 @@ XlaOp XlaBuilder::RecvWithToken(XlaOp token, const Shape& shape,
 XlaOp XlaBuilder::SendToHost(XlaOp operand, XlaOp token,
                              const Shape& shape_with_layout,
                              const ChannelHandle& handle) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     if (!LayoutUtil::HasLayout(shape_with_layout)) {
       return InvalidArgument("Shape passed to SendToHost must have a layout");
     }
@@ -4135,7 +4146,7 @@ XlaOp XlaBuilder::SendToHost(XlaOp operand, XlaOp token,
 
 XlaOp XlaBuilder::RecvFromHost(XlaOp token, const Shape& shape,
                                const ChannelHandle& handle) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     if (!LayoutUtil::HasLayout(shape)) {
       return InvalidArgument("Shape passed to RecvFromHost must have a layout");
     }
@@ -4175,7 +4186,7 @@ XlaOp XlaBuilder::RecvFromHost(XlaOp token, const Shape& shape,
 }
 
 XlaOp XlaBuilder::GetDimensionSize(XlaOp operand, int64_t dimension) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferGetDimensionSizeShape(
@@ -4193,7 +4204,7 @@ XlaOp XlaBuilder::GetDimensionSize(XlaOp operand, int64_t dimension) {
 }
 
 XlaOp XlaBuilder::RemoveDynamicDimension(XlaOp operand, int64_t dimension) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
 
     Shape shape = *operand_shape;
@@ -4208,7 +4219,7 @@ XlaOp XlaBuilder::RemoveDynamicDimension(XlaOp operand, int64_t dimension) {
 
 XlaOp XlaBuilder::SetDimensionSize(XlaOp operand, XlaOp val,
                                    int64_t dimension) {
-  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand));
     TF_ASSIGN_OR_RETURN(const Shape* val_shape, GetShapePtr(val));
 
@@ -4219,9 +4230,10 @@ XlaOp XlaBuilder::SetDimensionSize(XlaOp operand, XlaOp val,
   });
 }
 
-StatusOr<XlaOp> XlaBuilder::SetDimensionSizeInternal(const Shape& shape,
-                                                     XlaOp operand, XlaOp val,
-                                                     int64_t dimension) {
+absl::StatusOr<XlaOp> XlaBuilder::SetDimensionSizeInternal(const Shape& shape,
+                                                           XlaOp operand,
+                                                           XlaOp val,
+                                                           int64_t dimension) {
   // Note that both SetDimensionSize and RemoveDynamicDimension use
   // HloOpcode::kSetDimensionSize internally. However, The SetDimensionSize
   // builder always produces an output with a dynamic bound on the given
@@ -4238,7 +4250,7 @@ StatusOr<XlaOp> XlaBuilder::SetDimensionSizeInternal(const Shape& shape,
                         {operand, val});
 }
 
-StatusOr<bool> XlaBuilder::IsConstant(XlaOp operand) const {
+absl::StatusOr<bool> XlaBuilder::IsConstant(XlaOp operand) const {
   TF_RETURN_IF_ERROR(first_error_);
 
   // Verify that the handle is valid.
@@ -4250,7 +4262,7 @@ StatusOr<bool> XlaBuilder::IsConstant(XlaOp operand) const {
   return is_constant;
 }
 
-StatusOr<XlaComputation> XlaBuilder::BuildConstantSubGraph(
+absl::StatusOr<XlaComputation> XlaBuilder::BuildConstantSubGraph(
     XlaOp root_op, bool dynamic_dimension_is_minus_one) {
   TF_ASSIGN_OR_RETURN(bool is_constant, IsConstant(root_op));
   if (!is_constant) {
@@ -4518,9 +4530,9 @@ XlaBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) {
   return OkStatus();
 }
 
-StatusOr<XlaOp> XlaBuilder::AddInstruction(HloInstructionProto&& instr,
-                                           HloOpcode opcode,
-                                           absl::Span<const XlaOp> operands) {
+absl::StatusOr<XlaOp> XlaBuilder::AddInstruction(
+    HloInstructionProto&& instr, HloOpcode opcode,
+    absl::Span<const XlaOp> operands) {
   TF_RETURN_IF_ERROR(first_error_);
 
   const int64_t handle = GetNextId();
@@ -4566,8 +4578,8 @@ StatusOr<XlaOp> XlaBuilder::AddInstruction(HloInstructionProto&& instr,
   return op;
 }
 
-StatusOr<XlaOp> XlaBuilder::AddOpWithShape(HloOpcode opcode, const Shape& shape,
-                                           absl::Span<const XlaOp> operands) {
+absl::StatusOr<XlaOp> XlaBuilder::AddOpWithShape(
+    HloOpcode opcode, const Shape& shape, absl::Span<const XlaOp> operands) {
   HloInstructionProto instr;
   *instr.mutable_shape() = shape.ToProto();
   return AddInstruction(std::move(instr), opcode, operands);
@@ -4629,25 +4641,25 @@ void XlaBuilder::AddCalledComputation(const XlaComputation& computation,
   }
 }
 
-StatusOr<const HloInstructionProto*> XlaBuilder::LookUpInstruction(
+absl::StatusOr<const HloInstructionProto*> XlaBuilder::LookUpInstruction(
     const XlaOp op) const {
   TF_RETURN_IF_ERROR(first_error_);
   return LookUpInstructionInternal<const HloInstructionProto*>(op);
 }
 
-StatusOr<const HloInstructionProto*> XlaBuilder::LookUpInstructionByHandle(
-    int64_t handle) const {
+absl::StatusOr<const HloInstructionProto*>
+XlaBuilder::LookUpInstructionByHandle(int64_t handle) const {
   return LookUpInstructionByHandleInternal<const HloInstructionProto*>(handle);
 }
 
-StatusOr<HloInstructionProto*> XlaBuilder::LookUpMutableInstruction(
+absl::StatusOr<HloInstructionProto*> XlaBuilder::LookUpMutableInstruction(
     const XlaOp op) {
   TF_RETURN_IF_ERROR(first_error_);
   return LookUpInstructionInternal<HloInstructionProto*>(op);
 }
 
-StatusOr<HloInstructionProto*> XlaBuilder::LookUpMutableInstructionByHandle(
-    int64_t handle) {
+absl::StatusOr<HloInstructionProto*>
+XlaBuilder::LookUpMutableInstructionByHandle(int64_t handle) {
   return LookUpInstructionByHandleInternal<HloInstructionProto*>(handle);
 }
 
@@ -4678,8 +4690,8 @@ XlaOp Broadcast(const XlaOp operand,
 }
 
 XlaOp BroadcastInDim(const XlaOp operand,
-                     const absl::Span<const int64_t> out_dim_size,
-                     const absl::Span<const int64_t> broadcast_dimensions) {
+                     absl::Span<const int64_t> out_dim_size,
+                     absl::Span<const int64_t> broadcast_dimensions) {
   return operand.builder()->BroadcastInDim(operand, out_dim_size,
                                            broadcast_dimensions);
 }
@@ -4785,7 +4797,7 @@ static XlaOp CompareTotalOrder(const XlaOp lhs, const XlaOp rhs,
                                absl::Span<const int64_t> broadcast_dimensions,
                                ComparisonDirection comparison_direction) {
   auto b = lhs.builder();
-  return b->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return b->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(auto operand_shape, b->GetShape(lhs));
     auto operand_element_type = operand_shape.element_type();
     auto compare_type =
@@ -5018,7 +5030,7 @@ XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
                       bool unit_diagonal,
                       TriangularSolveOptions::Transpose transpose_a) {
   XlaBuilder* builder = a.builder();
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* a_shape, builder->GetShapePtr(a));
     TF_ASSIGN_OR_RETURN(const Shape* b_shape, builder->GetShapePtr(b));
     TriangularSolveOptions options;
@@ -5034,7 +5046,7 @@ XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
 
 XlaOp Cholesky(XlaOp a, bool lower) {
   XlaBuilder* builder = a.builder();
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+  return builder->ReportErrorOrReturn([&]() -> absl::StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape* a_shape, builder->GetShapePtr(a));
     TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferCholeskyShape(*a_shape));
@@ -5301,7 +5313,7 @@ XlaOp AllGather(const XlaOp operand, int64_t all_gather_dimension,
                                       layout, use_global_device_ids);
 }
 
-XlaOp AllGatherTuple(const absl::Span<const XlaOp> operands,
+XlaOp AllGatherTuple(absl::Span<const XlaOp> operands,
                      int64_t all_gather_dimension, int64_t shard_count,
                      absl::Span<const ReplicaGroup> replica_groups,
                      const std::optional<ChannelHandle>& channel_id,
@@ -5328,7 +5340,7 @@ XlaOp AllReduce(const XlaOp operand, const XlaComputation& computation,
                                       use_global_device_ids);
 }
 
-XlaOp AllReduceTuple(const absl::Span<const XlaOp> operands,
+XlaOp AllReduceTuple(absl::Span<const XlaOp> operands,
                      const XlaComputation& computation,
                      absl::Span<const ReplicaGroup> replica_groups,
                      const std::optional<ChannelHandle>& channel_id,
@@ -5750,7 +5762,7 @@ OpSharding GetManualSharding(const OpSharding& original, int64_t single_dim) {
   return manual;
 }
 
-StatusOr<XlaOp> ConvertSpmdFullToShardShape(
+absl::StatusOr<XlaOp> ConvertSpmdFullToShardShape(
     XlaBuilder* builder, XlaOp input, int single_dim,
     const OpSharding& manual_sharding,
     absl::Span<const int64_t> unspecified_dims) {
@@ -5795,7 +5807,7 @@ StatusOr<XlaOp> ConvertSpmdFullToShardShape(
   }
 }
 
-StatusOr<XlaOp> ConvertSpmdShardToFullShape(
+absl::StatusOr<XlaOp> ConvertSpmdShardToFullShape(
     XlaBuilder* builder, XlaOp input, const Shape& output_shape, int single_dim,
     const OpSharding& manual_sharding,
     absl::Span<const int64_t> unspecified_dims) {
diff --git a/third_party/xla/xla/client/xla_builder.h b/third_party/xla/xla/client/xla_builder.h
index f9c562376de72c..b744c679bd7a66 100644
--- a/third_party/xla/xla/client/xla_builder.h
+++ b/third_party/xla/xla/client/xla_builder.h
@@ -358,12 +358,12 @@ class XlaBuilder {
   // TODO(b/121223198): Delete `remove_dynamic_dimensions` and keeps the
   // dynamic dimensions information when XLA backend can handle dynamic
   // dimensions.
-  StatusOr<XlaComputation> Build(bool remove_dynamic_dimensions = false);
+  absl::StatusOr<XlaComputation> Build(bool remove_dynamic_dimensions = false);
 
   // Overload of Build which specifies a particular root instruction for the
   // computation.
-  StatusOr<XlaComputation> Build(XlaOp root,
-                                 bool remove_dynamic_dimensions = false);
+  absl::StatusOr<XlaComputation> Build(XlaOp root,
+                                       bool remove_dynamic_dimensions = false);
 
   // Builds the computation with the requested operations, or notes an error in
   // the parent XlaBuilder and returns an empty computation if building failed.
@@ -379,7 +379,7 @@ class XlaBuilder {
   // compile-time constant (see `IsConstant`), returns an error.
   //
   // This will copy the needed ops/computations to the subgraph.
-  StatusOr<XlaComputation> BuildConstantSubGraph(
+  absl::StatusOr<XlaComputation> BuildConstantSubGraph(
       XlaOp root_op, bool dynamic_dimension_is_minus_one = false);
 
   // Returns the first error that was encountered while building the
@@ -395,18 +395,18 @@ class XlaBuilder {
   Status GetCurrentStatus() const;
 
   // Returns the shape of the given op.
-  StatusOr<Shape> GetShape(XlaOp op) const;
+  absl::StatusOr<Shape> GetShape(XlaOp op) const;
 
   // Returns the shape of the given op.
-  virtual StatusOr<const Shape*> GetShapePtr(XlaOp op) const;
+  virtual absl::StatusOr<const Shape*> GetShapePtr(XlaOp op) const;
 
   // Returns the (inferred) result for the current computation's shape. This
   // assumes the root instruction is the last added instruction.
-  StatusOr<ProgramShape> GetProgramShape() const;
+  absl::StatusOr<ProgramShape> GetProgramShape() const;
 
   // Returns the (inferred) result for the current computation's shape using the
   // given operation as the root.
-  StatusOr<ProgramShape> GetProgramShape(XlaOp root) const;
+  absl::StatusOr<ProgramShape> GetProgramShape(XlaOp root) const;
 
   // Reports an error to the builder, by
   // * storing it internally and capturing a backtrace if it's the first error
@@ -420,11 +420,12 @@ class XlaBuilder {
   // A helper function that converts a StatusOr<XlaOp> into an XlaOp.
   // If the Status was an error, reports the error to builder and returns an
   // invalid XlaOp handle.
-  XlaOp ReportErrorOrReturn(const StatusOr<XlaOp>& op);
+  XlaOp ReportErrorOrReturn(const absl::StatusOr<XlaOp>& op);
 
   // A helper function that runs a function that returns a StatusOr<XlaOp> and
   // returns an XlaOp.
-  XlaOp ReportErrorOrReturn(absl::FunctionRef<StatusOr<XlaOp>()> op_creator);
+  XlaOp ReportErrorOrReturn(
+      absl::FunctionRef<absl::StatusOr<XlaOp>()> op_creator);
 
   // Returns true if 'operand' is a compile-time constant. A compile-time
   // constant does not depend on any parameters, or on stateful operators such
@@ -432,7 +433,7 @@ class XlaBuilder {
   //
   // This tests whether a computation is a compile-time constant without
   // evaluating the computation.
-  StatusOr<bool> IsConstant(XlaOp operand) const;
+  absl::StatusOr<bool> IsConstant(XlaOp operand) const;
 
   // Adds a new input/output alias. Since the input/output shape information are
   // not available until the computation is built, any eventual error in the
@@ -480,7 +481,7 @@ class XlaBuilder {
                                          std::string value);
 
   // Returns shapes for the operands.
-  StatusOr<std::vector<Shape>> GetOperandShapes(
+  absl::StatusOr<std::vector<Shape>> GetOperandShapes(
       absl::Span<const XlaOp> operands) const;
 
   // Converts the op to string for the ease of debugging.
@@ -490,8 +491,8 @@ class XlaBuilder {
   void ToStringHelper(std::string* out, int ident, int64_t op_handle) const;
 
   // Build helper which takes the id of the root operation..
-  StatusOr<XlaComputation> Build(int64_t root_id,
-                                 bool remove_dynamic_dimensions);
+  absl::StatusOr<XlaComputation> Build(int64_t root_id,
+                                       bool remove_dynamic_dimensions);
 
   // Description for the methods below can be found in the corresponding public
   // functions section in this file.
@@ -525,9 +526,9 @@ class XlaBuilder {
   XlaOp PadInDim(XlaOp operand, XlaOp padding_value, int64_t dimno,
                  int64_t pad_lo, int64_t pad_hi);
 
-  virtual StatusOr<XlaOp> PadInternal(const Shape& shape, XlaOp operand,
-                                      XlaOp padding_value,
-                                      const PaddingConfig& padding_config);
+  virtual absl::StatusOr<XlaOp> PadInternal(
+      const Shape& shape, XlaOp operand, XlaOp padding_value,
+      const PaddingConfig& padding_config);
 
   XlaOp Reshape(XlaOp operand, absl::Span<const int64_t> dimensions,
                 absl::Span<const int64_t> new_sizes,
@@ -548,40 +549,40 @@ class XlaBuilder {
   XlaOp Slice(XlaOp operand, absl::Span<const int64_t> start_indices,
               absl::Span<const int64_t> limit_indices,
               absl::Span<const int64_t> strides);
-  virtual StatusOr<XlaOp> SliceInternal(const Shape& shape, XlaOp operand,
-                                        absl::Span<const int64_t> start_indices,
-                                        absl::Span<const int64_t> limit_indices,
-                                        absl::Span<const int64_t> strides);
+  virtual absl::StatusOr<XlaOp> SliceInternal(
+      const Shape& shape, XlaOp operand,
+      absl::Span<const int64_t> start_indices,
+      absl::Span<const int64_t> limit_indices,
+      absl::Span<const int64_t> strides);
   virtual XlaOp SliceInDim(XlaOp operand, int64_t start_index,
                            int64_t limit_index, int64_t stride, int64_t dimno);
 
   XlaOp DynamicSlice(XlaOp operand, absl::Span<const XlaOp> start_indices,
                      absl::Span<const int64_t> slice_sizes);
-  virtual StatusOr<XlaOp> DynamicSliceInternal(
+  virtual absl::StatusOr<XlaOp> DynamicSliceInternal(
       const Shape& shape, XlaOp operand, absl::Span<const XlaOp> start_indices,
       absl::Span<const int64_t> slice_sizes);
 
   XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update,
                            absl::Span<const XlaOp> start_indices);
-  virtual StatusOr<XlaOp> DynamicUpdateSliceInternal(
+  virtual absl::StatusOr<XlaOp> DynamicUpdateSliceInternal(
       const Shape& shape, XlaOp operand, XlaOp update,
       absl::Span<const XlaOp> start_indices);
 
   XlaOp ConcatInDim(absl::Span<const XlaOp> operands, int64_t dimension);
-  virtual StatusOr<XlaOp> ConcatInDimInternal(const Shape& shape,
-                                              absl::Span<const XlaOp> operands,
-                                              int64_t dimension);
+  virtual absl::StatusOr<XlaOp> ConcatInDimInternal(
+      const Shape& shape, absl::Span<const XlaOp> operands, int64_t dimension);
 
   XlaOp Select(XlaOp pred, XlaOp on_true, XlaOp on_false);
 
   XlaOp Tuple(absl::Span<const XlaOp> elements);
-  virtual StatusOr<XlaOp> TupleInternal(const Shape& shape,
-                                        absl::Span<const XlaOp> elements);
+  virtual absl::StatusOr<XlaOp> TupleInternal(const Shape& shape,
+                                              absl::Span<const XlaOp> elements);
 
   XlaOp GetTupleElement(XlaOp tuple_data, int64_t index);
-  virtual StatusOr<XlaOp> GetTupleElementInternal(const Shape& shape,
-                                                  XlaOp tuple_data,
-                                                  int64_t index);
+  virtual absl::StatusOr<XlaOp> GetTupleElementInternal(const Shape& shape,
+                                                        XlaOp tuple_data,
+                                                        int64_t index);
 
   XlaOp Dot(XlaOp lhs, XlaOp rhs,
             const PrecisionConfig* precision_config = nullptr,
@@ -671,7 +672,7 @@ class XlaBuilder {
       const PrecisionConfig* precision_config, PaddingType padding_type,
       std::optional<PrimitiveType> preferred_element_type = std::nullopt);
 
-  StatusOr<HloInstructionProto> DynamicConvInstruction(
+  absl::StatusOr<HloInstructionProto> DynamicConvInstruction(
       XlaOp lhs, XlaOp rhs, absl::Span<const int64_t> window_strides,
       absl::Span<const std::pair<int64_t, int64_t>> padding,
       absl::Span<const int64_t> lhs_dilation,
@@ -681,7 +682,7 @@ class XlaBuilder {
       const PrecisionConfig* precision_config, PaddingType padding_type,
       std::optional<PrimitiveType> preferred_element_type = std::nullopt);
 
-  virtual StatusOr<XlaOp> ConvGeneralDilatedInternal(
+  virtual absl::StatusOr<XlaOp> ConvGeneralDilatedInternal(
       const Shape& shape, XlaOp lhs, XlaOp rhs, const Window& window,
       absl::Span<const int64_t> window_strides,
       absl::Span<const std::pair<int64_t, int64_t>> padding,
@@ -693,20 +694,20 @@ class XlaBuilder {
 
   XlaOp Fft(XlaOp operand, FftType fft_type,
             absl::Span<const int64_t> fft_length);
-  virtual StatusOr<XlaOp> FftInternal(const Shape& shape, XlaOp operand,
-                                      FftType fft_type,
-                                      absl::Span<const int64_t> fft_length);
+  virtual absl::StatusOr<XlaOp> FftInternal(
+      const Shape& shape, XlaOp operand, FftType fft_type,
+      absl::Span<const int64_t> fft_length);
 
-  virtual StatusOr<XlaOp> TriangularSolveInternal(
+  virtual absl::StatusOr<XlaOp> TriangularSolveInternal(
       const Shape& shape, XlaOp a, XlaOp b, TriangularSolveOptions options);
 
-  virtual StatusOr<XlaOp> CholeskyInternal(const Shape& shape, XlaOp a,
-                                           bool lower);
+  virtual absl::StatusOr<XlaOp> CholeskyInternal(const Shape& shape, XlaOp a,
+                                                 bool lower);
 
   XlaOp Infeed(const Shape& shape, const std::string& config = "");
   XlaOp InfeedWithToken(XlaOp token, const Shape& shape,
                         const std::string& config);
-  virtual StatusOr<XlaOp> InfeedWithTokenInternal(
+  virtual absl::StatusOr<XlaOp> InfeedWithTokenInternal(
       const Shape& infeed_instruction_shape, XlaOp token,
       const std::string& config);
 
@@ -715,7 +716,7 @@ class XlaBuilder {
   XlaOp OutfeedWithToken(XlaOp operand, XlaOp token,
                          const Shape& shape_with_layout,
                          const std::string& outfeed_config);
-  virtual StatusOr<XlaOp> OutfeedWithTokenInternal(
+  virtual absl::StatusOr<XlaOp> OutfeedWithTokenInternal(
       XlaOp operand, XlaOp token, const Shape& shape_with_layout,
       const std::string& outfeed_config);
   XlaOp Call(const XlaComputation& computation,
@@ -735,7 +736,7 @@ class XlaBuilder {
   // Internal version of CustomCall without computation that doesn't do op
   // specific error handling and expects arguments to be legal. CustomCall
   // method above calls this method after error handling.
-  virtual StatusOr<XlaOp> CustomCallInternal(
+  virtual absl::StatusOr<XlaOp> CustomCallInternal(
       const std::string& call_target_name, absl::Span<const XlaOp> operands,
       const XlaComputation* computation, const Shape& shape_with_layout,
       const std::string& opaque,
@@ -771,7 +772,7 @@ class XlaBuilder {
                const XlaComputation& computation,
                absl::Span<const int64_t> dimensions_to_reduce);
 
-  virtual StatusOr<XlaOp> ReduceInternal(
+  virtual absl::StatusOr<XlaOp> ReduceInternal(
       const Shape& shape, absl::Span<const XlaOp> all_operands,
       const XlaComputation& computation,
       absl::Span<const int64_t> dimensions_to_reduce);
@@ -798,7 +799,7 @@ class XlaBuilder {
       absl::Span<const int64_t> base_dilations,
       absl::Span<const int64_t> window_dilations,
       absl::Span<const std::pair<int64_t, int64_t>> padding);
-  StatusOr<HloInstructionProto> ReduceWindowInternal(
+  absl::StatusOr<HloInstructionProto> ReduceWindowInternal(
       absl::Span<const XlaOp> operands, absl::Span<const XlaOp> init_values,
       const XlaComputation& computation,
       absl::Span<const int64_t> window_dimensions,
@@ -806,7 +807,7 @@ class XlaBuilder {
       absl::Span<const int64_t> base_dilations,
       absl::Span<const int64_t> window_dilations,
       absl::Span<const std::pair<int64_t, int64_t>> padding);
-  virtual StatusOr<XlaOp> ReduceWindowInternal(
+  virtual absl::StatusOr<XlaOp> ReduceWindowInternal(
       const Shape& shape, XlaOp operand, XlaOp init_value,
       const XlaComputation& computation, Window window);
   XlaOp CrossReplicaSum(XlaOp operand,
@@ -875,7 +876,7 @@ class XlaBuilder {
       absl::Span<const std::pair<int64_t, int64_t>> padding, XlaOp source,
       XlaOp init_value, const XlaComputation& scatter);
 
-  StatusOr<HloInstructionProto> SelectAndScatterInternal(
+  absl::StatusOr<HloInstructionProto> SelectAndScatterInternal(
       XlaOp operand, const XlaComputation& select,
       absl::Span<const int64_t> window_dimensions,
       absl::Span<const int64_t> window_strides,
@@ -889,30 +890,30 @@ class XlaBuilder {
   XlaOp ConvertElementType(XlaOp operand, PrimitiveType new_element_type);
 
   XlaOp BitcastConvertType(XlaOp operand, PrimitiveType new_element_type);
-  virtual StatusOr<XlaOp> BitcastConvertTypeInternal(const Shape& shape,
-                                                     XlaOp operand);
+  virtual absl::StatusOr<XlaOp> BitcastConvertTypeInternal(const Shape& shape,
+                                                           XlaOp operand);
 
   XlaOp StochasticConvertType(XlaOp operand, XlaOp random,
                               PrimitiveType new_element_type);
 
   XlaOp Transpose(XlaOp operand, absl::Span<const int64_t> permutation);
-  virtual StatusOr<XlaOp> TransposeInternal(
+  virtual absl::StatusOr<XlaOp> TransposeInternal(
       const Shape& shape, XlaOp operand, absl::Span<const int64_t> permutation);
 
   XlaOp Rev(XlaOp operand, absl::Span<const int64_t> dimensions);
-  virtual StatusOr<XlaOp> RevInternal(const Shape& shape, XlaOp operand,
-                                      absl::Span<const int64_t> dimensions);
+  virtual absl::StatusOr<XlaOp> RevInternal(
+      const Shape& shape, XlaOp operand, absl::Span<const int64_t> dimensions);
 
   XlaOp Sort(absl::Span<const XlaOp> operands, const XlaComputation& comparator,
              int64_t dimension = -1, bool is_stable = false);
-  virtual StatusOr<XlaOp> SortInternal(const Shape& shape,
-                                       absl::Span<const XlaOp> operands,
-                                       const XlaComputation& comparator,
-                                       int64_t dimension, bool is_stable);
+  virtual absl::StatusOr<XlaOp> SortInternal(const Shape& shape,
+                                             absl::Span<const XlaOp> operands,
+                                             const XlaComputation& comparator,
+                                             int64_t dimension, bool is_stable);
 
   XlaOp TopK(XlaOp operand, int64_t k, bool largest);
-  virtual StatusOr<XlaOp> TopKInternal(const Shape& shape, XlaOp operand,
-                                       int64_t k, bool largest);
+  virtual absl::StatusOr<XlaOp> TopKInternal(const Shape& shape, XlaOp operand,
+                                             int64_t k, bool largest);
 
   XlaOp Clamp(XlaOp min, XlaOp operand, XlaOp max);
 
@@ -928,15 +929,16 @@ class XlaBuilder {
                         const Shape& shape);
   // Internal variant for the op with the full result shape containing both data
   // and state shape as a tuple.
-  virtual StatusOr<XlaOp> RngBitGeneratorInternal(
+  virtual absl::StatusOr<XlaOp> RngBitGeneratorInternal(
       const Shape& full_result_shape, RandomAlgorithm algorithm,
       XlaOp initial_state);
 
   XlaOp While(const XlaComputation& condition, const XlaComputation& body,
               XlaOp init);
-  virtual StatusOr<XlaOp> WhileInternal(const Shape& shape,
-                                        const XlaComputation& condition,
-                                        const XlaComputation& body, XlaOp init);
+  virtual absl::StatusOr<XlaOp> WhileInternal(const Shape& shape,
+                                              const XlaComputation& condition,
+                                              const XlaComputation& body,
+                                              XlaOp init);
 
   XlaOp Conditional(XlaOp predicate, XlaOp true_operand,
                     const XlaComputation& true_computation, XlaOp false_operand,
@@ -947,17 +949,17 @@ class XlaBuilder {
                     absl::Span<const XlaOp> branch_operands);
 
   XlaOp ReducePrecision(XlaOp operand, int exponent_bits, int mantissa_bits);
-  virtual StatusOr<XlaOp> ReducePrecisionInternal(const Shape& shape,
-                                                  XlaOp operand,
-                                                  int exponent_bits,
-                                                  int mantissa_bits);
+  virtual absl::StatusOr<XlaOp> ReducePrecisionInternal(const Shape& shape,
+                                                        XlaOp operand,
+                                                        int exponent_bits,
+                                                        int mantissa_bits);
 
   XlaOp Gather(XlaOp input, XlaOp start_indices,
                const GatherDimensionNumbers& dimension_numbers,
                absl::Span<const int64_t> slice_sizes,
                bool indices_are_sorted = false);
 
-  virtual StatusOr<XlaOp> GatherInternal(
+  virtual absl::StatusOr<XlaOp> GatherInternal(
       const Shape& shape, XlaOp input, XlaOp start_indices,
       const GatherDimensionNumbers& dimension_numbers,
       absl::Span<const int64_t> slice_sizes, bool indices_are_sorted);
@@ -972,7 +974,7 @@ class XlaBuilder {
                 const ScatterDimensionNumbers& dimension_numbers,
                 bool indices_are_sorted = false, bool unique_indices = false);
 
-  virtual StatusOr<XlaOp> ScatterInternal(
+  virtual absl::StatusOr<XlaOp> ScatterInternal(
       const Shape& shape, absl::Span<const XlaOp> inputs, XlaOp scatter_indices,
       absl::Span<const XlaOp> updates, const XlaComputation& update_computation,
       const ScatterDimensionNumbers& dimension_numbers, bool indices_are_sorted,
@@ -1010,28 +1012,29 @@ class XlaBuilder {
 
   XlaOp SetDimensionSize(XlaOp operand, XlaOp val, int64_t dimension);
 
-  virtual StatusOr<XlaOp> SetDimensionSizeInternal(const Shape& shape,
-                                                   XlaOp operand, XlaOp val,
-                                                   int64_t dimension);
+  virtual absl::StatusOr<XlaOp> SetDimensionSizeInternal(const Shape& shape,
+                                                         XlaOp operand,
+                                                         XlaOp val,
+                                                         int64_t dimension);
 
   XlaOp RemoveDynamicDimension(XlaOp operand, int64_t dimension);
 
-  virtual StatusOr<XlaOp> AddInstruction(HloInstructionProto&& instr,
-                                         HloOpcode opcode,
-                                         absl::Span<const XlaOp> operands);
-  StatusOr<XlaOp> AddInstruction(HloInstructionProto&& instr,
-                                 HloOpcode opcode) {
+  virtual absl::StatusOr<XlaOp> AddInstruction(
+      HloInstructionProto&& instr, HloOpcode opcode,
+      absl::Span<const XlaOp> operands);
+  absl::StatusOr<XlaOp> AddInstruction(HloInstructionProto&& instr,
+                                       HloOpcode opcode) {
     return AddInstruction(std::move(instr), opcode, /*operands=*/{});
   }
 
   void AddCalledComputation(const XlaComputation& computation,
                             HloInstructionProto* instr);
 
-  StatusOr<const HloInstructionProto*> LookUpInstruction(XlaOp op) const;
-  StatusOr<const HloInstructionProto*> LookUpInstructionByHandle(
+  absl::StatusOr<const HloInstructionProto*> LookUpInstruction(XlaOp op) const;
+  absl::StatusOr<const HloInstructionProto*> LookUpInstructionByHandle(
       int64_t handle) const;
-  StatusOr<HloInstructionProto*> LookUpMutableInstruction(XlaOp op);
-  StatusOr<HloInstructionProto*> LookUpMutableInstructionByHandle(
+  absl::StatusOr<HloInstructionProto*> LookUpMutableInstruction(XlaOp op);
+  absl::StatusOr<HloInstructionProto*> LookUpMutableInstructionByHandle(
       int64_t handle);
 
   // Internal helper method that does the building for an arbitrary unary op.
@@ -1046,13 +1049,14 @@ class XlaBuilder {
                  std::optional<ComparisonDirection> direction = std::nullopt,
                  std::optional<Comparison::Type> type = std::nullopt);
 
-  StatusOr<XlaOp> Compare(const Shape& shape, XlaOp lhs, XlaOp rhs,
-                          ComparisonDirection direction);
+  absl::StatusOr<XlaOp> Compare(const Shape& shape, XlaOp lhs, XlaOp rhs,
+                                ComparisonDirection direction);
 
   // Internal helper method for binary op compare without broadcast dimensions.
-  virtual StatusOr<XlaOp> Compare(const Shape& shape, XlaOp lhs, XlaOp rhs,
-                                  ComparisonDirection direction,
-                                  Comparison::Type type);
+  virtual absl::StatusOr<XlaOp> Compare(const Shape& shape, XlaOp lhs,
+                                        XlaOp rhs,
+                                        ComparisonDirection direction,
+                                        Comparison::Type type);
 
   // Internal helper method that does the building for an arbitrary binary op
   // with same ranked operands that doesn't broadcast.
@@ -1065,11 +1069,11 @@ class XlaBuilder {
   XlaOp RngOp(RandomDistribution distribution,
               absl::Span<const XlaOp> parameters, const Shape& shape);
 
-  virtual StatusOr<XlaOp> RngOpInternal(RandomDistribution distribution,
-                                        absl::Span<const XlaOp> parameters,
-                                        const Shape& shape);
+  virtual absl::StatusOr<XlaOp> RngOpInternal(
+      RandomDistribution distribution, absl::Span<const XlaOp> parameters,
+      const Shape& shape);
 
-  virtual StatusOr<XlaOp> InDimBroadcast(
+  virtual absl::StatusOr<XlaOp> InDimBroadcast(
       const Shape& shape, XlaOp operand,
       absl::Span<const int64_t> broadcast_dimensions);
 
@@ -1078,8 +1082,8 @@ class XlaBuilder {
   // All dimensions of the operand must either be equal to the corresponding
   // output shape dimension, or be exactly 1.  (Such dimensions are the
   // degenerate dimensions.)
-  StatusOr<XlaOp> AddBroadcastSequence(const Shape& output_shape,
-                                       XlaOp operand);
+  absl::StatusOr<XlaOp> AddBroadcastSequence(const Shape& output_shape,
+                                             XlaOp operand);
 
   // Internal helper method that broadcasts a scalar to the shape of the output.
   absl::StatusOr<XlaOp> BroadcastScalarToOutputShape(XlaOp scalar,
@@ -1087,11 +1091,12 @@ class XlaBuilder {
 
   // Internal helper method for creating a Reshape op with the already inferred
   // shape.
-  virtual StatusOr<XlaOp> ReshapeInternal(const Shape& shape, XlaOp operand,
-                                          int64_t inferred_dimension);
+  virtual absl::StatusOr<XlaOp> ReshapeInternal(const Shape& shape,
+                                                XlaOp operand,
+                                                int64_t inferred_dimension);
 
   // Returns the (inferred) result for the program shape using the given root.
-  StatusOr<ProgramShape> GetProgramShape(int64_t root_id) const;
+  absl::StatusOr<ProgramShape> GetProgramShape(int64_t root_id) const;
 
   // A visitor which checks whether an operation is a compile-time constant,
   // meaning that it doesn't depend on any parameters, or on any stateful
@@ -1264,7 +1269,7 @@ class XlaBuilder {
                           const DotDimensionNumbers& dimension_number,
                           const PrecisionConfig* precision_config,
                           std::optional<PrimitiveType> preferred_element_type);
-  virtual StatusOr<XlaOp> DotGeneralInternal(
+  virtual absl::StatusOr<XlaOp> DotGeneralInternal(
       const Shape& shape, XlaOp lhs, XlaOp rhs,
       const DotDimensionNumbers& dimension_number,
       const PrecisionConfig* precision_config);
@@ -1680,8 +1685,8 @@ class XlaBuilder {
       const std::optional<ChannelHandle>& channel_id = std::nullopt);
 
   // Creates an op with the given opcode and the output shape.
-  virtual StatusOr<XlaOp> AddOpWithShape(HloOpcode opcode, const Shape& shape,
-                                         absl::Span<const XlaOp> operands);
+  virtual absl::StatusOr<XlaOp> AddOpWithShape(
+      HloOpcode opcode, const Shape& shape, absl::Span<const XlaOp> operands);
 
   // Here, InstructionType is either const HloInstructionProto* or non-const
   // HloInstructionProto*.
@@ -3023,7 +3028,7 @@ XlaOp ConstantR4FromArray4D(XlaBuilder* builder,
 // Switches from automatic SPMD partitioning to manual partitioning. Converts a
 // full-shaped tensor (to be automatically partitioned by SPMD partitioner) to a
 // shard-shaped tensor to be consumed by manually partitioned ops.
-StatusOr<xla::XlaOp> ConvertSpmdFullToShardShape(
+absl::StatusOr<xla::XlaOp> ConvertSpmdFullToShardShape(
     xla::XlaBuilder* builder, xla::XlaOp input, int single_dim,
     const xla::OpSharding& manual_sharding,
     absl::Span<const int64_t> unspecified_dims);
@@ -3031,7 +3036,7 @@ StatusOr<xla::XlaOp> ConvertSpmdFullToShardShape(
 // Switches from manual partitioning to automatic SPMD partitioning. Converts a
 // shard-shaped tensor (manually partitioned in SPMD-style) to a full-shaped
 // tensor to be partitioned automatically by the SPMD partitioner.
-StatusOr<xla::XlaOp> ConvertSpmdShardToFullShape(
+absl::StatusOr<xla::XlaOp> ConvertSpmdShardToFullShape(
     xla::XlaBuilder* builder, xla::XlaOp input, const xla::Shape& output_shape,
     int single_dim, const xla::OpSharding& manual_sharding,
     absl::Span<const int64_t> unspecified_dims);
diff --git a/third_party/xla/xla/client/xla_builder_test.cc b/third_party/xla/xla/client/xla_builder_test.cc
index ee9d100a3a1bf2..665f932b78e537 100644
--- a/third_party/xla/xla/client/xla_builder_test.cc
+++ b/third_party/xla/xla/client/xla_builder_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "xla/client/xla_builder.h"
 
 #include <algorithm>
+#include <array>
 #include <complex>
 #include <cstdint>
 #include <functional>
@@ -27,24 +28,31 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/algorithm/container.h"
 #include "absl/log/check.h"
+#include "absl/log/log.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "xla/client/padding.h"
 #include "xla/client/sharding_builder.h"
 #include "xla/client/value_inference.h"
 #include "xla/client/xla_computation.h"
+#include "xla/comparison_util.h"
 #include "xla/debug_options_flags.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_input_output_alias_config.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/hlo/ir/hlo_sharding.h"
+#include "xla/layout_util.h"
 #include "xla/service/hlo_parser.h"
 #include "xla/service/pattern_matcher.h"
 #include "xla/service/pattern_matcher_gmock.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
+#include "xla/status.h"
 #include "xla/statusor.h"
 #include "xla/test.h"
 #include "xla/test_helpers.h"
@@ -69,7 +77,7 @@ HloInstruction* GetRoot(HloModule& module) {
 }
 
 // TODO(b/74197823): Move the tests to service/.
-StatusOr<std::unique_ptr<HloModule>> BuildHloModule(XlaBuilder& b) {
+absl::StatusOr<std::unique_ptr<HloModule>> BuildHloModule(XlaBuilder& b) {
   TF_ASSIGN_OR_RETURN(XlaComputation computation,
                       b.Build(/*remove_dynamic_dimensions=*/false));
   const HloModuleProto& proto = computation.proto();
@@ -80,7 +88,8 @@ StatusOr<std::unique_ptr<HloModule>> BuildHloModule(XlaBuilder& b) {
 }
 
 // Overload which explicitly specifies the root instruction.
-StatusOr<std::unique_ptr<HloModule>> BuildHloModule(XlaBuilder& b, XlaOp root) {
+absl::StatusOr<std::unique_ptr<HloModule>> BuildHloModule(XlaBuilder& b,
+                                                          XlaOp root) {
   TF_ASSIGN_OR_RETURN(XlaComputation computation,
                       b.Build(root, /*remove_dynamic_dimensions=*/false));
   const HloModuleProto& proto = computation.proto();
@@ -98,7 +107,7 @@ std::string TestName() {
 TEST(XlaBuilderTest, OnePlusTwo) {
   XlaBuilder b(TestName());
   Add(ConstantR0<float>(&b, 1.0), ConstantR0<float>(&b, 2.0));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root, GmockMatch(m::Add(m::Constant(), m::Constant())));
 }
@@ -108,7 +117,7 @@ TEST(XlaBuilderTest, UnaryOperatorsBuildExpectedHLO) {
                                  auto matches_pattern) {
     XlaBuilder b(TestName());
     op(ConstantR0<int32_t>(&b, 1));
-    TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+    TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
     auto root = module->entry_computation()->root_instruction();
     EXPECT_THAT(root, matches_pattern);
   };
@@ -123,7 +132,7 @@ TEST(XlaBuilderTest, BinaryOperatorsBuildExpectedHLO) {
                                   auto matches_pattern) {
     XlaBuilder b(TestName());
     op(ConstantR0<int32_t>(&b, 1), ConstantR0<int32_t>(&b, 2));
-    TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+    TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
     auto root = module->entry_computation()->root_instruction();
     EXPECT_THAT(root, matches_pattern);
   };
@@ -153,7 +162,7 @@ TEST(XlaBuilderTest, BinaryOperatorsBuildExpectedHLO) {
       [&](std::function<XlaOp(XlaOp, XlaOp)> op, auto matches_pattern) {
         XlaBuilder b(TestName());
         op(ConstantR0<uint32_t>(&b, 1), ConstantR0<uint32_t>(&b, 2));
-        TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+        TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
         auto root = module->entry_computation()->root_instruction();
         EXPECT_THAT(root, matches_pattern);
       };
@@ -164,10 +173,10 @@ TEST(XlaBuilderTest, BinaryOperatorsBuildExpectedHLO) {
 
 TEST(XlaBuilderTest, VariadicAnd) {
   XlaBuilder b(TestName());
-  Shape s = ShapeUtil::MakeShape(PRED, {});
+  const Shape s = ShapeUtil::MakeShape(PRED, {});
   And(Parameter(&b, 0, s, "p0"), Parameter(&b, 1, s, "p1"),
       Parameter(&b, 2, s, "p2"));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   // Don't specify in the test whether And(x, y, z) is right- or
   // left-associative; accept either one.
   EXPECT_THAT(module->entry_computation()->root_instruction(),
@@ -180,10 +189,10 @@ TEST(XlaBuilderTest, VariadicAnd) {
 
 TEST(XlaBuilderTest, VariadicOr) {
   XlaBuilder b(TestName());
-  Shape s = ShapeUtil::MakeShape(PRED, {});
+  const Shape s = ShapeUtil::MakeShape(PRED, {});
   Or(Parameter(&b, 0, s, "p0"), Parameter(&b, 1, s, "p1"),
      Parameter(&b, 2, s, "p2"));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   // Don't specify in the test whether Or(x, y, z) is right- or
   // left-associative; accept either one.
   EXPECT_THAT(module->entry_computation()->root_instruction(),
@@ -208,7 +217,7 @@ TEST(XlaBuilderTest, ParamPlusConstantHasScalarBroadcast) {
   XlaBuilder b(TestName());
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {3, 5}), "x");
   Add(x, ConstantR0<float>(&b, 1.0));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root,
               GmockMatch(m::Add(m::Parameter(), m::Broadcast(m::Constant()))));
@@ -216,9 +225,9 @@ TEST(XlaBuilderTest, ParamPlusConstantHasScalarBroadcast) {
 
 TEST(XlaBuilderTest, ParamPlusConstantHasScalarBroadcastReversed) {
   XlaBuilder b(TestName());
-  XlaOp x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {3, 5}), "x");
+  const XlaOp x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {3, 5}), "x");
   Add(ConstantR0<float>(&b, 1.0), x);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   HloInstruction* root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root,
               GmockMatch(m::Add(m::Broadcast(m::Constant()), m::Parameter())));
@@ -232,10 +241,10 @@ TEST(XlaBuilderTest, ParamPlusParamHasBroadcast) {
   auto y = Parameter(&b, 1, y_shape, "y");
   auto add = Add(x, y, /*broadcast_dimensions=*/{0, 1});
 
-  TF_ASSERT_OK_AND_ASSIGN(auto add_shape, b.GetShape(add));
+  TF_ASSERT_OK_AND_ASSIGN(const auto add_shape, b.GetShape(add));
   EXPECT_TRUE(ShapeUtil::Equal(add_shape, x_shape));
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
   EXPECT_THAT(
       root, GmockMatch(m::Add(m::Parameter(0), m::Broadcast(m::Parameter(1)))));
@@ -245,31 +254,31 @@ TEST(XlaBuilderTest, XPlusX) {
   XlaBuilder b(TestName());
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(S32, {1, 3, 5, 7}), "x");
   Add(x, x);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(0), m::Parameter(0))));
 }
 
 TEST(XlaBuilderTest, TestBinaryOpImplicitBroadcast) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[1]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[2, 2]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[2,2]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("f32[1]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[2, 2]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[2,2]"));
   Add(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
       /*broadcast_dimensions=*/{1});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, TestBinaryOpImplicitBroadcastBounded) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[1]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[<=2, <=2]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[<=2, <=2]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("f32[1]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[<=2, <=2]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[<=2, <=2]"));
   Add(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
       /*broadcast_dimensions=*/{1});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
@@ -314,14 +323,14 @@ TEST(XlaBuilderTest, Call) {
   auto p0 = Parameter(&b_call, 0, ShapeUtil::MakeShape(F32, {}), "p0");
   auto p1 = Parameter(&b_call, 1, ShapeUtil::MakeShape(F32, {}), "p1");
   Add(p0, p1);
-  TF_ASSERT_OK_AND_ASSIGN(auto call, b_call.Build());
+  TF_ASSERT_OK_AND_ASSIGN(const auto call, b_call.Build());
   XlaBuilder b(TestName());
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {}), "x");
   auto y = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {}), "y");
   auto one = ConstantR0<float>(&b, 1);
   auto two = ConstantR0<float>(&b, 2);
   Add(Call(&b, call, {x, y}), Call(&b, call, {one, two}));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root, GmockMatch(m::Add(m::Call(m::Parameter(), m::Parameter()),
                                       m::Call(m::Constant(), m::Constant()))));
@@ -332,7 +341,7 @@ TEST(XlaBuilderTest, BinopHasDegenerateBroadcast) {
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {1, 2, 3}), "x");
   auto y = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {1, 2, 1}), "y");
   Add(x, y);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
 
   // Expected:
   //
@@ -354,7 +363,7 @@ TEST(XlaBuilderTest, BinopHasInDimAndDegenerateBroadcast) {
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {2, 3}), "x");
   auto y = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {2, 1, 4}), "y");
   Add(x, y, /*broadcast_dimensions=*/{0, 1});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
 
   // The binary operation has in-dim broadcast and degenerate broadcast, should
   // first do the in-dim broadcast then convert the degenerate broadcast into a
@@ -380,7 +389,7 @@ TEST(XlaBuilderTest, BroadcastInDim) {
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {2, 3}), "x");
   BroadcastInDim(x, {2, 4, 3},
                  /*broadcast_dimensions=*/{0, 2});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root, GmockMatch(m::Broadcast()));
 }
@@ -390,7 +399,7 @@ TEST(XlaBuilderTest, BroadcastInDimWithDegeneratedDim) {
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {2, 1, 4}), "x");
   BroadcastInDim(x, {2, 3, 4},
                  /*broadcast_dimensions=*/{0, 1, 2});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(module->entry_computation()->root_instruction(),
               GmockMatch(m::Broadcast(m::Reshape(m::Broadcast()))));
 }
@@ -423,7 +432,7 @@ TEST(XlaBuilderTest, ReshapeDefaultOrder) {
   XlaBuilder b(TestName());
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {2, 3, 5, 7}), "x");
   Reshape(x, /*new_sizes=*/{6, 35});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root, GmockMatch(m::Reshape(m::Parameter())));
 }
@@ -432,7 +441,7 @@ TEST(XlaBuilderTest, ReshapeHasTranspose) {
   XlaBuilder b(TestName());
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {2, 3, 5, 7}), "x");
   Reshape(x, /*dimensions=*/{3, 2, 1, 0}, /*new_sizes=*/{6, 35});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root, GmockMatch(m::Reshape(m::Transpose(m::Parameter()))));
 }
@@ -441,7 +450,7 @@ TEST(XlaBuilderTest, Transpose) {
   XlaBuilder b(TestName());
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}), "x");
   Transpose(x, /*permutation=*/{1, 0});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root, GmockMatch(m::Transpose(m::Parameter())));
 }
@@ -450,7 +459,7 @@ TEST(XlaBuilderTest, AllGatherR1) {
   XlaBuilder b(TestName());
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4}), "x");
   AllGather(x, /*all_gather_dimension=*/0, /*shard_count=*/4);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
 
   EXPECT_EQ(root->opcode(), HloOpcode::kAllGather);
@@ -461,7 +470,7 @@ TEST(XlaBuilderTest, AllGatherR2) {
   XlaBuilder b(TestName());
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4, 16}), "x");
   AllGather(x, /*all_gather_dimension=*/1, /*shard_count=*/4);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
 
   EXPECT_EQ(root->opcode(), HloOpcode::kAllGather);
@@ -475,7 +484,7 @@ TEST(XlaBuilderTest, AllGatherWithTuple) {
   auto x2 = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {16, 4}), "x2");
   AllGather(Tuple(&b, {x, x2}), /*all_gather_dimension=*/0,
             /*shard_count=*/4);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
 
   EXPECT_EQ(root->opcode(), HloOpcode::kAllGather);
@@ -490,7 +499,7 @@ TEST(XlaBuilderTest, AllGatherTuple) {
   auto p0 = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {128, 4}), "p0");
   auto p1 = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {128, 8}), "p1");
   AllGatherTuple({p0, p1}, /*all_gather_dimension=*/1, /*shard_count=*/4);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
   auto tuple_shape =
       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {128, 16}),
@@ -518,7 +527,7 @@ TEST(XlaBuilderTest, ReduceScatter) {
   group.add_replica_ids(1);
   ReduceScatter(x, to_apply, /*scatter_dimension=*/1, /*shard_count=*/2,
                 /*replica_groups=*/{group});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
 
   EXPECT_EQ(root->opcode(), HloOpcode::kReduceScatter);
@@ -546,7 +555,7 @@ TEST(XlaBuilderTest, ReduceScatterWithTuple) {
   ReduceScatter(Tuple(&b, {x, x2}), to_apply, /*scatter_dimension=*/1,
                 /*shard_count=*/2,
                 /*replica_groups=*/{group});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
 
   EXPECT_EQ(root->opcode(), HloOpcode::kReduceScatter);
@@ -561,7 +570,7 @@ TEST(XlaBuilderTest, AllToAll) {
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4, 16}), "x");
   AllToAll(x, /*split_dimension=*/1, /*concat_dimension=*/0,
            /*split_count=*/2);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
 
   // AllToAll is decomposed into slices -> all-to-all -> gte -> concat.
@@ -578,7 +587,7 @@ TEST(XlaBuilderTest, AllToAllSpecial) {
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4, 16, 8}), "x");
   AllToAll(x, /*split_dimension=*/0, /*concat_dimension=*/0,
            /*split_count=*/2);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
 
   // AllToAll is converted into a single all-to-all HloInstruction.
@@ -596,7 +605,7 @@ TEST(XlaBuilderTest, AllToAllTuple) {
   replica_group.add_replica_ids(1);
 
   AllToAllTuple({p0, p1}, {replica_group}, LayoutUtil::MakeAscendingLayout(2));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
 
   // Check shape and replica groups.
@@ -628,10 +637,10 @@ TEST(XlaBuilderTest, AllReduceTuple) {
   XlaBuilder bsum(TestName());
   auto f32Scalar = ShapeUtil::MakeShape(F32, {});
   Add(Parameter(&bsum, 0, f32Scalar, "x"), Parameter(&bsum, 1, f32Scalar, "y"));
-  TF_ASSERT_OK_AND_ASSIGN(auto sum, bsum.Build());
+  TF_ASSERT_OK_AND_ASSIGN(const auto sum, bsum.Build());
 
   AllReduceTuple({p0, p1}, sum);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
 
   // Check shape and replica groups.
@@ -650,7 +659,7 @@ TEST(XlaBuilderTest, CollectiveBroadcast) {
   replica_group.add_replica_ids(0);
   replica_group.add_replica_ids(1);
   CollectiveBroadcast(x, {replica_group});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kCollectiveBroadcast);
 }
@@ -659,7 +668,7 @@ TEST(XlaBuilderTest, CollectivePermute) {
   XlaBuilder b(TestName());
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}), "x");
   CollectivePermute(x, {{0, 1}, {1, 2}, {2, 3}});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kCollectivePermute);
 }
@@ -669,7 +678,7 @@ TEST(XlaBuilderTest, GetDimensionSize) {
   auto x =
       Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}, {false, true}), "x");
   GetDimensionSize(x, 1);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kGetDimensionSize);
 }
@@ -680,7 +689,7 @@ TEST(XlaBuilderTest, GetDimensionSizeConstant) {
       Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}, {false, true}), "x");
   // Get dimension size from a constant dimension gives us a constant.
   GetDimensionSize(x, 0);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kConstant);
 }
@@ -696,16 +705,16 @@ TEST(XlaBuilderTest, ReportError) {
 
 TEST(XlaBuilderTest, ReportErrorOrReturnHandlesNonErrors) {
   XlaBuilder b(TestName());
-  StatusOr<XlaOp> op(ConstantR0<float>(&b, 1.0));
+  absl::StatusOr<XlaOp> op(ConstantR0<float>(&b, 1.0));
   Add(b.ReportErrorOrReturn(op), ConstantR0<float>(&b, 2.0));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root, GmockMatch(m::Add(m::Constant(), m::Constant())));
 }
 
 TEST(XlaBuilderTest, ReportErrorOrReturnHandlesErrors) {
   XlaBuilder b(TestName());
-  StatusOr<XlaOp> op(InvalidArgument("a test error"));
+  absl::StatusOr<XlaOp> op(InvalidArgument("a test error"));
   Add(b.ReportErrorOrReturn(op), ConstantR0<float>(&b, 2.0));
   auto statusor = b.Build();
   ASSERT_FALSE(statusor.ok());
@@ -714,9 +723,10 @@ TEST(XlaBuilderTest, ReportErrorOrReturnHandlesErrors) {
 
 TEST(XlaBuilderTest, BuildWithSpecificRoot) {
   XlaBuilder b(TestName());
-  XlaOp constant = ConstantR0<float>(&b, 1.0);
+  const XlaOp constant = ConstantR0<float>(&b, 1.0);
   Add(constant, ConstantR0<float>(&b, 2.0));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b, /*root=*/constant));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module,
+                          BuildHloModule(b, /*root=*/constant));
   auto root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root, GmockMatch(m::Constant()));
 }
@@ -726,11 +736,11 @@ TEST(XlaBuilderTest, BuildWithSpecificRootAndMultipleParameters) {
   // parameters.
   XlaBuilder b(TestName());
   const Shape shape = ShapeUtil::MakeShape(F32, {42, 123});
-  XlaOp x = Parameter(&b, 0, shape, "x");
-  XlaOp y = Parameter(&b, 1, shape, "y");
-  XlaOp z = Parameter(&b, 2, shape, "z");
+  const XlaOp x = Parameter(&b, 0, shape, "x");
+  const XlaOp y = Parameter(&b, 1, shape, "y");
+  const XlaOp z = Parameter(&b, 2, shape, "z");
   Add(x, Sub(y, z));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b, /*root=*/x));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b, /*root=*/x));
   auto root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root, GmockMatch(m::Parameter()));
   EXPECT_EQ(module->entry_computation()->num_parameters(), 3);
@@ -743,7 +753,7 @@ TEST(XlaBuilderTest, BuildWithSpecificRootWithWrongBuilder) {
   const Shape shape = ShapeUtil::MakeShape(F32, {42, 123});
 
   Parameter(&b, 0, shape, "param");
-  XlaOp other_param = Parameter(&other_b, 0, shape, "other_param");
+  const XlaOp other_param = Parameter(&other_b, 0, shape, "other_param");
 
   Status status = b.Build(other_param).status();
   ASSERT_IS_NOT_OK(status);
@@ -761,7 +771,7 @@ TEST(XlaBuilderTest, ProtoMatches) {
     auto p0 = Parameter(&b_call, 0, ShapeUtil::MakeShape(F32, {}), "p0");
     auto p1 = Parameter(&b_call, 1, ShapeUtil::MakeShape(F32, {}), "p1");
     Add(p0, Add(p1, p0));
-    TF_ASSERT_OK_AND_ASSIGN(auto call, b_call.Build());
+    TF_ASSERT_OK_AND_ASSIGN(const auto call, b_call.Build());
     XlaBuilder b(TestName());
     auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {}), "y");
@@ -777,11 +787,11 @@ TEST(XlaBuilderTest, ProtoMatches) {
 
 TEST(XlaBuilderTest, DynamicParameter) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {5}), ShapeUtil::MakeShape(F32, {6}, {true})});
   auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
   Parameter(&b, 1, ShapeUtil::MakeShape(U32, {}), "p1");
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b, /*root=*/p0));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b, /*root=*/p0));
   const Shape& param_shape = module->entry_computation()
                                  ->parameter_instruction(0)
                                  ->shape()
@@ -794,7 +804,7 @@ TEST(XlaBuilderTest, SetDimensionSize) {
   auto p0 = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {10}), "p0");
   auto p1 = Parameter(&b, 1, ShapeUtil::MakeShape(S32, {}), "p1");
   auto set_dim_size = SetDimensionSize(p0, p1, 0);
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
+  TF_ASSERT_OK_AND_ASSIGN(const auto module,
                           BuildHloModule(b, /*root=*/set_dim_size));
   const Shape& root_shape =
       module->entry_computation()->root_instruction()->shape();
@@ -807,7 +817,7 @@ TEST(XlaBuilderTest, RemoveDynamicDimension) {
   auto p1 = Parameter(&b, 1, ShapeUtil::MakeShape(S32, {}), "p1");
   auto set_dim_size = SetDimensionSize(p0, p1, 0);
   auto remove_dim_size = RemoveDynamicDimension(set_dim_size, 0);
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
+  TF_ASSERT_OK_AND_ASSIGN(const auto module,
                           BuildHloModule(b, /*root=*/remove_dim_size));
   const Shape& root_shape =
       module->entry_computation()->root_instruction()->shape();
@@ -823,7 +833,7 @@ TEST(XlaBuilderTest, RemoveDynamicDimensionMultiDims) {
   set_dim_size = SetDimensionSize(set_dim_size, p1, 1);
   auto remove_dim_size = RemoveDynamicDimension(set_dim_size, 0);
   remove_dim_size = RemoveDynamicDimension(remove_dim_size, 1);
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
+  TF_ASSERT_OK_AND_ASSIGN(const auto module,
                           BuildHloModule(b, /*root=*/remove_dim_size));
   const Shape& root_shape =
       module->entry_computation()->root_instruction()->shape();
@@ -834,12 +844,12 @@ TEST(XlaBuilderTest, RemoveDynamicDimensionMultiDims) {
 
 TEST(XlaBuilderTest, DynamicUnary) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {5}, {true}), ShapeUtil::MakeShape(U32, {})});
   auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
   auto gte = GetTupleElement(p0, 0);
   Neg(gte);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const Shape& result_shape =
       module->entry_computation()->root_instruction()->shape();
   EXPECT_TRUE(result_shape.is_dynamic_dimension(0));
@@ -847,14 +857,14 @@ TEST(XlaBuilderTest, DynamicUnary) {
 
 TEST(XlaBuilderTest, DynamicBinary) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {5}, {true}),
        ShapeUtil::MakeShape(F32, {5}, {true}), ShapeUtil::MakeShape(U32, {})});
   auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
   auto gte0 = GetTupleElement(p0, 0);
   auto gte1 = GetTupleElement(p0, 1);
   Add(gte0, gte1);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const Shape& result_shape =
       module->entry_computation()->root_instruction()->shape();
   EXPECT_TRUE(result_shape.is_dynamic_dimension(0));
@@ -862,14 +872,14 @@ TEST(XlaBuilderTest, DynamicBinary) {
 
 TEST(XlaBuilderTest, DynamicBinaryHasBroadcast) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {5, 4}, {true, false}),
        ShapeUtil::MakeShape(F32, {5}, {true}), ShapeUtil::MakeShape(U32, {})});
   auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
   auto gte0 = GetTupleElement(p0, 0);
   auto gte1 = GetTupleElement(p0, 1);
   Add(gte0, gte1, {0});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const Shape& result_shape =
       module->entry_computation()->root_instruction()->shape();
   EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true, false}))
@@ -878,14 +888,14 @@ TEST(XlaBuilderTest, DynamicBinaryHasBroadcast) {
 
 TEST(XlaBuilderTest, DynamicBroadcast) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {5, 4}, {true, false}),
        ShapeUtil::MakeShape(U32, {})});
   auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
   auto gte = GetTupleElement(p0, 0);
   BroadcastInDim(gte, /*out_dim_size=*/{3, 5, 4},
                  /*broadcast_dimensions=*/{1, 2});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const Shape& result_shape =
       module->entry_computation()->root_instruction()->shape();
   EXPECT_TRUE(
@@ -895,14 +905,14 @@ TEST(XlaBuilderTest, DynamicBroadcast) {
 
 TEST(XlaBuilderTest, DynamicBinaryHasDegenerateBroadcast) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {10}, {true}),
        ShapeUtil::MakeShape(F32, {1, 15}), ShapeUtil::MakeShape(U32, {})});
   auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
   auto gte0 = GetTupleElement(p0, 0);
   auto gte1 = GetTupleElement(p0, 1);
   Add(gte0, gte1, /*broadcast_dimensions=*/{0});  // f32[<=10, 15]
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const Shape& result_shape =
       module->entry_computation()->root_instruction()->shape();
   EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true, false}))
@@ -911,7 +921,7 @@ TEST(XlaBuilderTest, DynamicBinaryHasDegenerateBroadcast) {
 
 TEST(XlaBuilderTest, DynamicSelectOnlyPredDynamic) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(PRED, {10}, {true}),
        ShapeUtil::MakeShape(F32, {10}), ShapeUtil::MakeShape(U32, {})});
   auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
@@ -920,7 +930,7 @@ TEST(XlaBuilderTest, DynamicSelectOnlyPredDynamic) {
 
   Select(gte0, gte1, gte1);
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const Shape& result_shape =
       module->entry_computation()->root_instruction()->shape();
   EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true}))
@@ -929,16 +939,17 @@ TEST(XlaBuilderTest, DynamicSelectOnlyPredDynamic) {
 
 TEST(XlaBuilderTest, SelectIntoConditional) {
   XlaBuilder b(TestName());
-  Shape selector_shape = ShapeUtil::MakeShape(PRED, {});
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape selector_shape = ShapeUtil::MakeShape(PRED, {});
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(F32, {})});
-  XlaOp p0 = Parameter(&b, 0, selector_shape, "p0");
-  XlaOp p1 = Parameter(&b, 1, tuple_param_shape, "p1");
-  XlaOp p2 = Parameter(&b, 2, tuple_param_shape, "p2");
+  const XlaOp p0 = Parameter(&b, 0, selector_shape, "p0");
+  const XlaOp p1 = Parameter(&b, 1, tuple_param_shape, "p1");
+  const XlaOp p2 = Parameter(&b, 2, tuple_param_shape, "p2");
 
   Select(p0, p1, p2);
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const std::unique_ptr<HloModule> module,
+                          BuildHloModule(b));
   EXPECT_THAT(module->entry_computation()->root_instruction(),
               GmockMatch(m::Conditional(m::Parameter(0), m::Parameter(1),
                                         m::Parameter(2))));
@@ -956,7 +967,7 @@ TEST(XlaBuilderTest, SelectIntoConditional) {
 
 TEST(XlaBuilderTest, DynamicPad) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {5, 4}, {true, false}),
        ShapeUtil::MakeShape(U32, {})});
   auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
@@ -970,7 +981,7 @@ TEST(XlaBuilderTest, DynamicPad) {
     dimension->set_interior_padding(0);
   }
   Pad(gte, pad_val, padding_config);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const Shape& result_shape =
       module->entry_computation()->root_instruction()->shape();
   EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true, false}))
@@ -979,7 +990,7 @@ TEST(XlaBuilderTest, DynamicPad) {
 
 TEST(XlaBuilderTest, DynamicConvolution) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {1, 2, 2, 128}, {true, false, false, false}),
        ShapeUtil::MakeShape(F32, {2, 2, 128, 8}, {false, false, true, false}),
        ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeShape(U32, {})});
@@ -1001,7 +1012,7 @@ TEST(XlaBuilderTest, DynamicConvolution) {
   dnums.set_kernel_output_feature_dimension(3);
   ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
                             /*feature_group_count=*/1);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const Shape& result_shape =
       module->entry_computation()->root_instruction()->shape();
   EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(),
@@ -1011,7 +1022,7 @@ TEST(XlaBuilderTest, DynamicConvolution) {
 
 TEST(XlaBuilderTest, DynamicDot) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {2, 3, 4}, {true, true, false}),
        ShapeUtil::MakeShape(F32, {2, 4, 5}, {true, false, false}),
        ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeShape(U32, {})});
@@ -1025,7 +1036,7 @@ TEST(XlaBuilderTest, DynamicDot) {
   dnums.add_lhs_batch_dimensions(0);
   dnums.add_rhs_batch_dimensions(0);
   DotGeneral(lhs, rhs, dnums);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const Shape& result_shape =
       module->entry_computation()->root_instruction()->shape();
   EXPECT_TRUE(
@@ -1035,7 +1046,7 @@ TEST(XlaBuilderTest, DynamicDot) {
 
 TEST(XlaBuilderTest, DynamicReduce) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {5, 4, 3}, {false, true, false}),
        ShapeUtil::MakeShape(U32, {})});
   auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
@@ -1044,9 +1055,9 @@ TEST(XlaBuilderTest, DynamicReduce) {
   XlaBuilder bsum(TestName());
   Add(Parameter(&bsum, 0, ShapeUtil::MakeShape(F32, {}), "x"),
       Parameter(&bsum, 1, ShapeUtil::MakeShape(F32, {}), "y"));
-  TF_ASSERT_OK_AND_ASSIGN(auto sum, bsum.Build());
+  TF_ASSERT_OK_AND_ASSIGN(const auto sum, bsum.Build());
   Reduce(gte, init, sum, {0});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const Shape& result_shape =
       module->entry_computation()->root_instruction()->shape();
   EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true, false}))
@@ -1055,7 +1066,7 @@ TEST(XlaBuilderTest, DynamicReduce) {
 
 TEST(XlaBuilderTest, DynamicReduceWindow) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {2, 4, 8}, {true, false, false}),
        ShapeUtil::MakeShape(U32, {})});
   auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
@@ -1064,10 +1075,10 @@ TEST(XlaBuilderTest, DynamicReduceWindow) {
   XlaBuilder bsum(TestName());
   Add(Parameter(&bsum, 0, ShapeUtil::MakeShape(F32, {}), "x"),
       Parameter(&bsum, 1, ShapeUtil::MakeShape(F32, {}), "y"));
-  TF_ASSERT_OK_AND_ASSIGN(auto sum, bsum.Build());
+  TF_ASSERT_OK_AND_ASSIGN(const auto sum, bsum.Build());
   ReduceWindow(gte, init, sum, /*window_dimensions=*/{1, 2, 4},
                /*window_strides=*/{1, 1, 1}, Padding::kValid);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   VLOG(2) << module->entry_computation()->root_instruction()->ToString()
           << "\n";
   const Shape& result_shape =
@@ -1079,7 +1090,7 @@ TEST(XlaBuilderTest, DynamicReduceWindow) {
 
 TEST(XlaBuilderTest, VariadicDynamicReduceWindow) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {2, 4, 8}, {true, false, false}),
        ShapeUtil::MakeShape(U32, {})});
   auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
@@ -1094,12 +1105,12 @@ TEST(XlaBuilderTest, VariadicDynamicReduceWindow) {
   auto p5 = Parameter(&bsum, 3, ShapeUtil::MakeShape(F32, {}), "y1");
   std::vector<XlaOp> output_operands = {Add(p2, p4), Add(p3, p5)};
   Tuple(&bsum, absl::MakeSpan(output_operands));
-  TF_ASSERT_OK_AND_ASSIGN(auto sum, bsum.Build());
+  TF_ASSERT_OK_AND_ASSIGN(const auto sum, bsum.Build());
   auto init = ConstantR0<float>(&b, 0.f);
   ReduceWindow(input_operands, {init, init}, sum,
                /*window_dimensions=*/{1, 2, 4},
                /*window_strides=*/{1, 1, 1}, Padding::kValid);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   VLOG(2) << module->entry_computation()->root_instruction()->ToString()
           << "\n";
   const Shape& result_shape =
@@ -1114,7 +1125,7 @@ TEST(XlaBuilderTest, VariadicDynamicReduceWindow) {
 
 TEST(XlaBuilderTest, DynamicSelectAndScatter) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {2, 4, 8}, {true, false, false}),
        ShapeUtil::MakeShape(F32, {2, 2, 2}, {true, false, false}),
        ShapeUtil::MakeShape(U32, {})});
@@ -1123,17 +1134,17 @@ TEST(XlaBuilderTest, DynamicSelectAndScatter) {
   XlaBuilder bsum(TestName());
   Add(Parameter(&bsum, 0, ShapeUtil::MakeShape(F32, {}), "x"),
       Parameter(&bsum, 1, ShapeUtil::MakeShape(F32, {}), "y"));
-  TF_ASSERT_OK_AND_ASSIGN(auto sum, bsum.Build());
+  TF_ASSERT_OK_AND_ASSIGN(const auto sum, bsum.Build());
   XlaBuilder bge(TestName());
   Ge(Parameter(&bge, 0, ShapeUtil::MakeShape(F32, {}), "x"),
      Parameter(&bge, 1, ShapeUtil::MakeShape(F32, {}), "y"));
-  TF_ASSERT_OK_AND_ASSIGN(auto ge, bge.Build());
+  TF_ASSERT_OK_AND_ASSIGN(const auto ge, bge.Build());
 
   auto gte0 = GetTupleElement(p0, 0);
   auto source = GetTupleElement(p0, 1);
   SelectAndScatter(gte0, ge, {1, 2, 4}, {1, 2, 4}, Padding::kValid, source,
                    init, sum);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const Shape& result_shape =
       module->entry_computation()->root_instruction()->shape();
   EXPECT_TRUE(
@@ -1143,14 +1154,14 @@ TEST(XlaBuilderTest, DynamicSelectAndScatter) {
 
 TEST(XlaBuilderTest, DynamicReshape) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {2, 3, 4, 5, 6},
                             {false, false, true, true, false}),
        ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeShape(U32, {})});
   auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
   auto gte = GetTupleElement(p0, 0);  // f32[2, 3, <=4, <=5, 6]
   Reshape(gte, /*new_sizes=*/{6, 4, 5, 2, 3});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const Shape& result_shape =
       module->entry_computation()->root_instruction()->shape();
   EXPECT_TRUE(result_shape.is_dynamic_dimension(1));
@@ -1162,7 +1173,7 @@ TEST(XlaBuilderTest, DynamicReshape) {
 
 TEST(XlaBuilderTest, DynamicSelect) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {4, 5, 6}, {false, true, false}),
        ShapeUtil::MakeShape(F32, {4, 5, 6}, {false, true, false}),
        ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeShape(U32, {})});
@@ -1171,7 +1182,7 @@ TEST(XlaBuilderTest, DynamicSelect) {
   auto gte0 = GetTupleElement(p0, 0);
   auto gte1 = GetTupleElement(p0, 1);
   Select(pred, gte0, gte1);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const Shape& result_shape =
       module->entry_computation()->root_instruction()->shape();
   EXPECT_TRUE(result_shape.is_dynamic_dimension(1));
@@ -1183,7 +1194,7 @@ TEST(XlaBuilderTest, DynamicSelect) {
 
 TEST(XlaBuilderTest, DynamicSelectNotCompatible) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {4, 5, 6}, {false, true, false}),
        ShapeUtil::MakeShape(F32, {4, 5, 6}, {false, false, true}),
        ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeShape(U32, {})});
@@ -1198,13 +1209,13 @@ TEST(XlaBuilderTest, DynamicSelectNotCompatible) {
 
 TEST(XlaBuilderTest, DynamicTranspose) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {3, 5}, {true, false}),
        ShapeUtil::MakeShape(U32, {})});
   auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
   auto gte = GetTupleElement(p0, 0);
   Transpose(gte, /*permutation=*/{1, 0});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const Shape& result_shape =
       module->entry_computation()->root_instruction()->shape();
   EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {false, true}))
@@ -1213,8 +1224,8 @@ TEST(XlaBuilderTest, DynamicTranspose) {
 
 TEST(XlaBuilderTest, DotWithPreferredElementType) {
   XlaBuilder b(TestName());
-  Shape p0_shape = ShapeUtil::MakeShape(U8, {2, 3});
-  Shape p1_shape = ShapeUtil::MakeShape(U16, {3, 2});
+  const Shape p0_shape = ShapeUtil::MakeShape(U8, {2, 3});
+  const Shape p1_shape = ShapeUtil::MakeShape(U16, {3, 2});
   auto p0 = Parameter(&b, 0, p0_shape, "p0");
   auto p1 = Parameter(&b, 1, p1_shape, "p1");
 
@@ -1223,7 +1234,7 @@ TEST(XlaBuilderTest, DotWithPreferredElementType) {
   dnums.add_rhs_contracting_dimensions(0);
   DotGeneral(p0, p1, dnums, /*precision_config=*/nullptr,
              /*preferred_element_type=*/U32);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const Shape& result_shape =
       module->entry_computation()->root_instruction()->shape();
   ASSERT_TRUE(
@@ -1249,16 +1260,16 @@ TEST(XlaBuilderTest, SparseDot) {
   std::vector<XlaOp> sparse_meta = {meta};
 
   SparseDot(lhs, rhs, sparse_meta, sparsity, dnums);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[10, 20]"));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[10, 20]"));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, ConvolutionWithPreferredElementType) {
   XlaBuilder b(TestName());
-  Shape p0_shape = ShapeUtil::MakeShape(S16, {1, 2, 2, 128});
-  Shape p1_shape = ShapeUtil::MakeShape(S8, {2, 2, 128, 8});
+  const Shape p0_shape = ShapeUtil::MakeShape(S16, {1, 2, 2, 128});
+  const Shape p1_shape = ShapeUtil::MakeShape(S8, {2, 2, 128, 8});
   auto p0 = Parameter(&b, 0, p0_shape, "p0");
   auto p1 = Parameter(&b, 1, p1_shape, "p1");
 
@@ -1279,7 +1290,7 @@ TEST(XlaBuilderTest, ConvolutionWithPreferredElementType) {
                             /*feature_group_count=*/1, /*batch_group_count=*/1,
                             /*precision_config=*/nullptr,
                             /*preferred_element_type=*/S32);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const Shape& result_shape =
       module->entry_computation()->root_instruction()->shape();
   ASSERT_TRUE(
@@ -1306,7 +1317,7 @@ TEST(XlaBuilderTest, CheckInputOutputAlias) {
   b.SetUpAlias({1}, 0, {});
   b.SetUpAlias({0}, 1, {});
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b, root));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b, root));
 
   const HloInputOutputAliasConfig& config = module->input_output_alias_config();
   EXPECT_TRUE(config.ParameterHasAlias(0, {}));
@@ -1331,7 +1342,7 @@ TEST(XlaBuilderTest, CheckBufferDonor) {
 
   b.AddBufferDonor(0, {});
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b, root));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b, root));
 
   const HloBufferDonorConfig& config = module->buffer_donor_config();
   EXPECT_TRUE(config.ParameterIsBufferDonor(0, {}));
@@ -1368,7 +1379,7 @@ TEST(XlaBuilderTest, ValidInputOutputAliasBufferDonor) {
 
   b.SetUpAlias({1}, 0, {});
   b.AddBufferDonor(1, {});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b, root));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b, root));
 
   const HloInputOutputAliasConfig& io_alias_config =
       module->input_output_alias_config();
@@ -1420,7 +1431,7 @@ TEST(XlaBuilderTest, SimpleSetFrontendAttributes) {
   b.ClearFrontendAttributes();
   ConstantR0(&b, 0);  // No attribute set
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
 
   std::vector<FrontendAttributes> expected{FrontendAttributes(), attributes,
                                            FrontendAttributes()};
@@ -1462,7 +1473,7 @@ TEST(XlaBuilderTest, ComplexSetFrontendAttributes) {
   ConstantR0(&b, 0);  // No attribute set
   expected.push_back(FrontendAttributes());
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   ExpectInstructionsAttributesMatch(*module, expected);
 }
 
@@ -1528,14 +1539,14 @@ TEST(XlaBuilderTest, AddFrontendAttribute) {
   ConstantR0(&b, 0);  // No attribute set
   expected.push_back(FrontendAttributes());
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   ExpectInstructionsAttributesMatch(*module, expected);
 }
 
 TEST(XlaBuilderTest, ComparisonType) {
   XlaBuilder b(TestName());
   (void)Le(ConstantR0<int32_t>(&b, 1), ConstantR0<int32_t>(&b, 2));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto root = module->entry_computation()->root_instruction();
   ASSERT_THAT(root, GmockMatch(m::Compare(m::Constant(), m::Constant())));
   EXPECT_EQ(Comparison::Type::kSigned,
@@ -1545,7 +1556,7 @@ TEST(XlaBuilderTest, ComparisonType) {
 TEST(XlaBuilderTest, StableLookUpInstructionByHandle) {
   XlaBuilder b(TestName());
   internal::XlaBuilderFriend builder_friend;
-  XlaOp le = Le(ConstantR0<int32_t>(&b, 1), ConstantR0<int32_t>(&b, 2));
+  const XlaOp le = Le(ConstantR0<int32_t>(&b, 1), ConstantR0<int32_t>(&b, 2));
   HloInstructionProto* first_op = builder_friend.GetInstruction(le);
   // Create some more instructions.
   for (int i = 0; i < 100; ++i) {
@@ -1558,10 +1569,10 @@ TEST(XlaBuilderTest, StableLookUpInstructionByHandle) {
 
 TEST(XlaBuilderTest, ComplexAbsConstant) {
   XlaBuilder b(TestName());
-  XlaOp out =
+  const XlaOp out =
       Abs(ConstantR0<std::complex<float>>(&b, std::complex<float>{-1, -1}));
   ValueInference value_inference(&b);
-  StatusOr<OptionalLiteral> analyzed =
+  absl::StatusOr<OptionalLiteral> analyzed =
       value_inference.AnalyzeConstant(out, kUpperBound);
   EXPECT_IS_OK(analyzed.status());
   EXPECT_EQ(analyzed->GetValue().value().shape().element_type(),
@@ -1570,22 +1581,24 @@ TEST(XlaBuilderTest, ComplexAbsConstant) {
 
 TEST(XlaBuilderTest, OutfeedDummyTupleSharding) {
   XlaBuilder b(TestName());
-  XlaOp value = ConstantR1<int32_t>(&b, {0});
-  Shape shape = ShapeUtil::MakeShapeWithDenseLayout(S32, /* dimensions= */ {1},
-                                                    /* minor_to_major= */ {0});
+  const XlaOp value = ConstantR1<int32_t>(&b, {0});
+  const Shape shape =
+      ShapeUtil::MakeShapeWithDenseLayout(S32, /* dimensions= */ {1},
+                                          /* minor_to_major= */ {0});
   Outfeed(value, shape, "");
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_FALSE(module->entry_computation()->root_instruction()->has_sharding());
 }
 
 TEST(XlaBuilderTest, OutfeedTokenSharding) {
   XlaBuilder b(TestName());
-  XlaOp value = ConstantR1<int32_t>(&b, {0});
-  Shape shape = ShapeUtil::MakeShapeWithDenseLayout(S32, /* dimensions= */ {1},
-                                                    /* minor_to_major= */ {0});
+  const XlaOp value = ConstantR1<int32_t>(&b, {0});
+  const Shape shape =
+      ShapeUtil::MakeShapeWithDenseLayout(S32, /* dimensions= */ {1},
+                                          /* minor_to_major= */ {0});
   b.SetSharding(sharding_builder::Replicate());
   Outfeed(value, shape, "");
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   auto it = std::find_if(module->entry_computation()->instructions().begin(),
                          module->entry_computation()->instructions().end(),
                          HloPredicateIsOp<HloOpcode::kOutfeed>);
@@ -1603,11 +1616,11 @@ TEST(XlaBuilderTest, OutfeedTokenSharding) {
 
 TEST(XlaBuilderTest, NormalizeTupleSharding) {
   XlaBuilder b(TestName());
-  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {5}), ShapeUtil::MakeShape(F32, {6})});
   b.SetSharding(sharding_builder::Replicate());
   Parameter(&b, 0, tuple_param_shape, "p0");
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const HloInstruction* root = module->entry_computation()->root_instruction();
   EXPECT_TRUE(root->has_sharding());
   EXPECT_TRUE(root->sharding().IsTuple());
@@ -1616,8 +1629,8 @@ TEST(XlaBuilderTest, NormalizeTupleSharding) {
 
 TEST(XlaBuilderTest, InvalidSharding) {
   XlaBuilder b(TestName());
-  Shape shape2d = ShapeUtil::MakeShape(F32, {6, 8});
-  Shape shape1d = ShapeUtil::MakeShape(F32, {5});
+  const Shape shape2d = ShapeUtil::MakeShape(F32, {6, 8});
+  const Shape shape1d = ShapeUtil::MakeShape(F32, {5});
   b.SetSharding(sharding_builder::Tile1D(shape1d, 4));
   Parameter(&b, 0, shape2d, "p0");
   auto statusor = b.Build();
@@ -1633,7 +1646,7 @@ TEST(XlaBuilderTest, TopKDimensions) {
   int64_t largest = true;
   TopK(Parameter(&b, 0, ShapeUtil::MakeShape(F32, {6, 8}), "p0"), k, largest);
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   const HloInstruction* root = module->entry_computation()->root_instruction();
   EXPECT_TRUE(root->opcode() == HloOpcode::kTopK);
   EXPECT_TRUE(root->shape().IsTuple());
@@ -1652,15 +1665,15 @@ TEST(XlaBuilderTest, TopKDimensions) {
 
 TEST(XlaBuilderTest, DynamicBroadcastInDimExportSuccess) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[1, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape output_dimensions, ParseShape("s32[3]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape output_shape, ParseShape("f32[1, 2, 3]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[1, 2, 3]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[1, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape output_dimensions, ParseShape("s32[3]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape output_shape, ParseShape("f32[1, 2, 3]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[1, 2, 3]"));
   DynamicBroadcastInDim(
       Parameter(&b, 0, operand, "operand"),
       Parameter(&b, 1, output_dimensions, "output_dimensions"),
       /*broadcast_dimensions=*/{1, 2}, output_shape);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(module->ToString(), HasSubstr("mhlo.dynamic_broadcast_in_dim"));
   EXPECT_THAT(module->ToString(), HasSubstr("broadcast_dimensions=[1,2]"));
   EXPECT_THAT(GetRoot(*module),
@@ -1669,15 +1682,15 @@ TEST(XlaBuilderTest, DynamicBroadcastInDimExportSuccess) {
 
 TEST(XlaBuilderTest, DynamicBroadcastInDimNonBroadcastDimSizeGreaterThanOne) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape output_dimensions, ParseShape("s32[3]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape output_shape, ParseShape("f32[2, 2, 3]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[2, 2, 3]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape output_dimensions, ParseShape("s32[3]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape output_shape, ParseShape("f32[2, 2, 3]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[2, 2, 3]"));
   DynamicBroadcastInDim(
       Parameter(&b, 0, operand, "operand"),
       Parameter(&b, 1, output_dimensions, "output_dimensions"),
       /*broadcast_dimensions=*/{1, 2}, output_shape);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(module->ToString(), HasSubstr("mhlo.dynamic_broadcast_in_dim"));
   EXPECT_THAT(module->ToString(), HasSubstr("broadcast_dimensions=[1,2]"));
   EXPECT_THAT(GetRoot(*module),
@@ -1686,15 +1699,15 @@ TEST(XlaBuilderTest, DynamicBroadcastInDimNonBroadcastDimSizeGreaterThanOne) {
 
 TEST(XlaBuilderTest, DynamicBroadcastInDimDynamicResultSize) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[1, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape output_dimensions, ParseShape("s32[3]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape output_shape, ParseShape("f32[1, 2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[1, 2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[1, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape output_dimensions, ParseShape("s32[3]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape output_shape, ParseShape("f32[1, 2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[1, 2, ?]"));
   DynamicBroadcastInDim(
       Parameter(&b, 0, operand, "operand"),
       Parameter(&b, 1, output_dimensions, "output_dimensions"),
       /*broadcast_dimensions=*/{1, 2}, output_shape);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(module->ToString(), HasSubstr("mhlo.dynamic_broadcast_in_dim"));
   EXPECT_THAT(module->ToString(), HasSubstr("broadcast_dimensions=[1,2]"));
   EXPECT_THAT(GetRoot(*module),
@@ -1703,9 +1716,9 @@ TEST(XlaBuilderTest, DynamicBroadcastInDimDynamicResultSize) {
 
 TEST(XlaBuilderTest, DynamicBroadcastInDimInvalidOutputDimensionsElementType) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape output_dimensions, ParseShape("f32[3]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape output_shape, ParseShape("f32[2, 3, 3]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape output_dimensions, ParseShape("f32[3]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape output_shape, ParseShape("f32[2, 3, 3]"));
   DynamicBroadcastInDim(
       Parameter(&b, 0, operand, "operand"),
       Parameter(&b, 1, output_dimensions, "output_dimensions"),
@@ -1718,9 +1731,10 @@ TEST(XlaBuilderTest, DynamicBroadcastInDimInvalidOutputDimensionsElementType) {
 
 TEST(XlaBuilderTest, DynamicBroadcastInDimInvalidOutputDimensionsRank) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape output_dimensions, ParseShape("s32[2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape output_shape, ParseShape("f32[2, 3, 3]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape output_dimensions,
+                          ParseShape("s32[2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape output_shape, ParseShape("f32[2, 3, 3]"));
   DynamicBroadcastInDim(
       Parameter(&b, 0, operand, "operand"),
       Parameter(&b, 1, output_dimensions, "output_dimensions"),
@@ -1733,9 +1747,9 @@ TEST(XlaBuilderTest, DynamicBroadcastInDimInvalidOutputDimensionsRank) {
 
 TEST(XlaBuilderTest, DynamicBroadcastInDimIncompatibleBroadcastSize) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape output_dimensions, ParseShape("s32[3]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape output_shape, ParseShape("f32[2, 3, 3]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape output_dimensions, ParseShape("s32[3]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape output_shape, ParseShape("f32[2, 3, 3]"));
   DynamicBroadcastInDim(
       Parameter(&b, 0, operand, "operand"),
       Parameter(&b, 1, output_dimensions, "output_dimensions"),
@@ -1768,6 +1782,7 @@ struct BinaryOpTestCase {
 constexpr absl::string_view kBroadcastDimensionMismatch =
     "Broadcast dimension 0 mismatch: 2 != -9223372036854775808; f32[2] and "
     "f32[?,10].";
+std::array<const int64_t, 0> empty_array = {};
 std::array<const int64_t, 1> zero_array = {0};
 
 class XlaBuilderUnboundedUnaryOpTest
@@ -1778,24 +1793,26 @@ class XlaBuilderUnboundedBinaryOpTest
 
 TEST_P(XlaBuilderUnboundedUnaryOpTest, UnboundedUnaryOpTest) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape(GetParam().operand));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape(GetParam().expected));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape(GetParam().operand));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
+                          ParseShape(GetParam().expected));
   GetParam().unary_op(Parameter(&b, 0, operand, "operand"));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST_P(XlaBuilderUnboundedBinaryOpTest, UnboundedBinaryOpTest) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape(GetParam().lhs));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape(GetParam().rhs));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape(GetParam().expected));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape(GetParam().lhs));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape(GetParam().rhs));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
+                          ParseShape(GetParam().expected));
   GetParam().binary_op(Parameter(&b, 0, lhs, "lhs"),
                        Parameter(&b, 1, rhs, "rhs"),
                        GetParam().broadcast_dimensions);
   if (auto result = BuildHloModule(b); result.ok()) {
-    std::unique_ptr<HloModule> module = std::move(*result);
+    const std::unique_ptr<HloModule> module = std::move(*result);
     EXPECT_THAT(GetRoot(*module),
                 GmockMatch(m::Op().WithShapeEqualTo(&expected)));
   } else {
@@ -1806,110 +1823,123 @@ TEST_P(XlaBuilderUnboundedBinaryOpTest, UnboundedBinaryOpTest) {
 
 TEST(XlaBuilderTest, UnboundedAddScalarBroadcast) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 10]"));
   Add(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
-      /*broadcast_dimensions=*/{});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+      /*broadcast_dimensions=*/empty_array);
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedAddDegenerateBroadcast) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[1, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[1, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 10]"));
   Add(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
       /*broadcast_dimensions=*/{0, 1});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedAddUnsupportedImplicitBroadcast) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[2]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[2]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 10]"));
   Add(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
-      /*broadcast_dimensions=*/{0});
+      /*broadcast_dimensions=*/zero_array);
   EXPECT_THAT(BuildHloModule(b),
               StatusIs(_, HasSubstr(kBroadcastDimensionMismatch)));
 }
 
 TEST(XlaBuilderTest, UnboundedAnd) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("s32[1, ?, 2, ?, <=2, ?, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("s32[?, 1, ?, 2, ?, <=2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs,
+                          ParseShape("s32[1, ?, 2, ?, <=2, ?, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs,
+                          ParseShape("s32[?, 1, ?, 2, ?, <=2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
                           ParseShape("s32[?, ?, 2, 2, <=2, <=2, ?]"));
   And(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
       /*broadcast_dimensions=*/absl::Span<const int64_t>{});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedBatchNormGrad) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[?, ?, 7]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape grad_operand, ParseShape("f32[?, ?, 7]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape scale, ParseShape("f32[5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape mean, ParseShape("f32[?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape variance, ParseShape("f32[?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape grad_scale, ParseShape("f32[?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape grad_offset, ParseShape("f32[?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape grad_output, ParseShape("f32[5, ?, 7]"));
-  Shape expected =
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[?, ?, 7]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape grad_operand, ParseShape("f32[?, ?, 7]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape scale, ParseShape("f32[5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape mean, ParseShape("f32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape variance, ParseShape("f32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape grad_scale, ParseShape("f32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape grad_offset, ParseShape("f32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape grad_output, ParseShape("f32[5, ?, 7]"));
+  const Shape expected =
       ShapeUtil::MakeTupleShape({grad_operand, grad_scale, grad_offset});
   BatchNormGrad(
       Parameter(&b, 0, operand, "operand"), Parameter(&b, 1, scale, "scale"),
       Parameter(&b, 2, mean, "mean"), Parameter(&b, 3, variance, "variance"),
       Parameter(&b, 4, grad_output, "grad_output"), 1.0, 1);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedBatchNormInference) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[?, ?, 7]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, ?, 7]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape scale, ParseShape("f32[5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape offset, ParseShape("f32[5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape mean, ParseShape("f32[5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape variance, ParseShape("f32[5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[?, ?, 7]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, ?, 7]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape scale, ParseShape("f32[5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape offset, ParseShape("f32[5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape mean, ParseShape("f32[5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape variance, ParseShape("f32[5]"));
   BatchNormInference(
       Parameter(&b, 0, operand, "operand"), Parameter(&b, 1, scale, "scale"),
       Parameter(&b, 2, offset, "offset"), Parameter(&b, 3, mean, "mean"),
       Parameter(&b, 4, variance, "variance"), 1.0, 1);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedBatchNormTraining) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[?, ?, 7]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape output, ParseShape("f32[?, ?, 7]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape scale, ParseShape("f32[5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape offset, ParseShape("f32[5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape batch_mean, ParseShape("f32[?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape batch_var, ParseShape("f32[?]"));
-  Shape expected = ShapeUtil::MakeTupleShape({output, batch_mean, batch_var});
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[?, ?, 7]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape output, ParseShape("f32[?, ?, 7]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape scale, ParseShape("f32[5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape offset, ParseShape("f32[5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape batch_mean, ParseShape("f32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape batch_var, ParseShape("f32[?]"));
+  const Shape expected =
+      ShapeUtil::MakeTupleShape({output, batch_mean, batch_var});
   BatchNormTraining(Parameter(&b, 0, operand, "operand"),
                     Parameter(&b, 1, scale, "scale"),
                     Parameter(&b, 2, offset, "offset"), 1.0, 1);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
+  EXPECT_THAT(GetRoot(*module),
+              GmockMatch(m::Op().WithShapeEqualTo(&expected)));
+}
+
+TEST(XlaBuilderTest, UnboundedBitcastConvert) {
+  XlaBuilder b(TestName());
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f16[?, 10, 2]"));
+  BitcastConvertType(Parameter(&b, 0, operand, "operand"), PrimitiveType::F16);
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedBroadcastUnsupportedOperand) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[<=3, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[<=3, ?]"));
   Broadcast(Parameter(&b, 0, operand, "operand"), /*broadcast_sizes=*/{1});
   EXPECT_THAT(BuildHloModule(b),
               StatusIs(_, HasSubstr("is_unbounded_dynamic")));
@@ -1917,7 +1947,7 @@ TEST(XlaBuilderTest, UnboundedBroadcastUnsupportedOperand) {
 
 TEST(XlaBuilderTest, UnboundedBroadcastUnsupportedBroadcastSize) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[1]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[1]"));
   Broadcast(Parameter(&b, 0, operand, "operand"),
             /*broadcast_sizes=*/{Shape::kUnboundedSize});
   EXPECT_THAT(
@@ -1927,19 +1957,19 @@ TEST(XlaBuilderTest, UnboundedBroadcastUnsupportedBroadcastSize) {
 
 TEST(XlaBuilderTest, UnboundedBroadcastInDim) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[<=2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[<=2, 3, 4]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[<=2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[<=2, 3, 4]"));
   BroadcastInDim(Parameter(&b, 0, operand, "operand"),
                  /*out_dim_size=*/{2, 3, 4},
                  /*broadcast_dimensions=*/{0, 2});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedBroadcastInDimUnsupported) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[<=3, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[<=3, ?]"));
   BroadcastInDim(Parameter(&b, 0, operand, "operand"),
                  /*out_dim_size=*/{2, 3, Shape::kUnboundedSize},
                  /*broadcast_dimensions=*/{0, 2});
@@ -1950,66 +1980,69 @@ TEST(XlaBuilderTest, UnboundedBroadcastInDimUnsupported) {
 
 TEST(XlaBuilderTest, UnboundedClamp) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[?, 1, ?, 2, ?, <=2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape ehs, ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs,
+                          ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs,
+                          ParseShape("f32[?, 1, ?, 2, ?, <=2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape ehs,
+                          ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
                           ParseShape("f32[?, 1, ?, 2, ?, <=2, ?]"));
   Clamp(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
         Parameter(&b, 2, ehs, "ehs"));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedClampScalarMinImplicitBroadcast) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape ehs, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("f32[]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape ehs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 10]"));
   Clamp(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
         Parameter(&b, 2, ehs, "ehs"));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedClampScalarMinMaxImplicitBroadcast) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape ehs, ParseShape("f32[]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("f32[]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape ehs, ParseShape("f32[]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 10]"));
   Clamp(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
         Parameter(&b, 2, ehs, "ehs"));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedClampScalarOperandMaxImplicitBroadcast) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape ehs, ParseShape("f32[]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape ehs, ParseShape("f32[]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 10]"));
   Clamp(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
         Parameter(&b, 2, ehs, "ehs"));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedClampScalarMinOperandImplicitBroadcast) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape ehs, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("f32[]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape ehs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 10]"));
   Clamp(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
         Parameter(&b, 2, ehs, "ehs"));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
@@ -2017,9 +2050,9 @@ TEST(XlaBuilderTest, UnboundedClampScalarMinOperandImplicitBroadcast) {
 TEST(XlaBuilderTest,
      UnboundedClampUnsupportedDegenerateOperandImplicitBroadcast) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[1]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape ehs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[1]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape ehs, ParseShape("f32[?, 10]"));
   Clamp(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
         Parameter(&b, 2, ehs, "ehs"));
   EXPECT_THAT(BuildHloModule(b),
@@ -2028,52 +2061,54 @@ TEST(XlaBuilderTest,
 
 TEST(XlaBuilderTest, UnboundedCompare) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[?, 1, ?, 2, ?, <=2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs,
+                          ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs,
+                          ParseShape("f32[?, 1, ?, 2, ?, <=2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
                           ParseShape("pred[?, ?, 2, 2, <=2, <=2, ?]"));
   Compare(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
           /*direction=*/{});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedConcatenate) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand1,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand1,
                           ParseShape("f32[3, ?, 2, ?, <=2, ?, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand2,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand2,
                           ParseShape("f32[?, 4, ?, 2, ?, <=2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand3,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand3,
                           ParseShape("f32[?, ?, 2, 2, <=2, <=2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
                           ParseShape("f32[3, 4, ?, 2, <=2, <=2, ?]"));
   ConcatInDim(&b,
               {Parameter(&b, 0, operand1, "operand1"),
                Parameter(&b, 1, operand2, "operand2"),
                Parameter(&b, 2, operand3, "operand3")},
               2);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedConvert) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("s32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("s32[?]"));
   ConvertElementType(Parameter(&b, 0, operand, "operand"), PrimitiveType::S32);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedConvolution) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[?, 2, ?, 128]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[2, 2, <=128, 8]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 1, ?, 8]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("f32[?, 2, ?, 128]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[2, 2, <=128, 8]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 1, ?, 8]"));
 
   ConvolutionDimensionNumbers dnums;
   dnums.set_input_batch_dimension(0);
@@ -2091,27 +2126,27 @@ TEST(XlaBuilderTest, UnboundedConvolution) {
   ConvWithGeneralDimensions(Parameter(&b, 0, lhs, "lhs"),
                             Parameter(&b, 1, rhs, "rhs"),
                             /*window_strides=*/{1, 1}, Padding::kValid, dnums);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedDot) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 10]"));
   Dot(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedDotGeneral) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[?, <=3, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[2, 4, 5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, <=3, 5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("f32[?, <=3, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[2, 4, 5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, <=3, 5]"));
 
   DotDimensionNumbers dnums;
   dnums.add_lhs_contracting_dimensions(2);
@@ -2120,16 +2155,17 @@ TEST(XlaBuilderTest, UnboundedDotGeneral) {
   dnums.add_rhs_batch_dimensions(0);
 
   DotGeneral(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"), dnums);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedGather) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[3, 4, 2]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape start_indices, ParseShape("s32[?, ?, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, ?, 2, 2]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[3, 4, 2]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape start_indices,
+                          ParseShape("s32[?, ?, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, ?, 2, 2]"));
 
   GatherDimensionNumbers dimension_numbers;
   dimension_numbers.add_offset_dims(2);
@@ -2142,15 +2178,30 @@ TEST(XlaBuilderTest, UnboundedGather) {
   Gather(Parameter(&b, 0, operand, "operand"),
          Parameter(&b, 1, start_indices, "start_indices"), dimension_numbers,
          /*slice_sizes=*/{1, 2, 2});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
+  EXPECT_THAT(GetRoot(*module),
+              GmockMatch(m::Op().WithShapeEqualTo(&expected)));
+}
+
+TEST(XlaBuilderTest, UnboundedOr) {
+  XlaBuilder b(TestName());
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs,
+                          ParseShape("s32[1, ?, 2, ?, <=2, ?, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs,
+                          ParseShape("s32[?, 1, ?, 2, ?, <=2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
+                          ParseShape("s32[?, ?, 2, 2, <=2, <=2, ?]"));
+  Or(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
+     /*broadcast_dimensions=*/empty_array);
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedPad) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 21]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 21]"));
   PaddingConfig padding_config;
   for (int i = 0; i < 2; i++) {
     auto dimension = padding_config.add_dimensions();
@@ -2160,21 +2211,21 @@ TEST(XlaBuilderTest, UnboundedPad) {
   }
   Pad(Parameter(&b, 0, operand, "operand"),
       /*padding_value=*/ConstantR0<float>(&b, 0), padding_config);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedReduce) {
   XlaBuilder b(TestName());
-  Shape shape = ShapeUtil::MakeShape(F32, {7}, {false});
-  Shape expected = ShapeUtil::MakeTupleShape({shape, shape, shape});
+  const Shape shape = ShapeUtil::MakeShape(F32, {7}, {false});
+  const Shape expected = ShapeUtil::MakeTupleShape({shape, shape, shape});
 
-  TF_ASSERT_OK_AND_ASSIGN(Shape input0, ParseShape("f32[7, 5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape input1, ParseShape("f32[?, 5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape input2, ParseShape("f32[7, ?]"));
-  Shape scalar_f32 = ShapeUtil::MakeShape(F32, {});
-  XlaOp init = Parameter(&b, 3, scalar_f32, "init");
+  TF_ASSERT_OK_AND_ASSIGN(const Shape input0, ParseShape("f32[7, 5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape input1, ParseShape("f32[?, 5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape input2, ParseShape("f32[7, ?]"));
+  const Shape scalar_f32 = ShapeUtil::MakeShape(F32, {});
+  const XlaOp init = Parameter(&b, 3, scalar_f32, "init");
 
   XlaBuilder bsum(TestName());
   std::vector<XlaOp> output_operands = {
@@ -2185,50 +2236,50 @@ TEST(XlaBuilderTest, UnboundedReduce) {
       Add(Parameter(&bsum, 4, scalar_f32, "arg4"),
           Parameter(&bsum, 5, scalar_f32, "arg5"))};
   Tuple(&bsum, absl::MakeSpan(output_operands));
-  TF_ASSERT_OK_AND_ASSIGN(XlaComputation sum, bsum.Build());
+  TF_ASSERT_OK_AND_ASSIGN(const XlaComputation sum, bsum.Build());
   Reduce(
       &b,
       {Parameter(&b, 0, input0, "input0"), Parameter(&b, 1, input1, "input1"),
        Parameter(&b, 2, input2, "input2")},
       {init, init, init}, sum, {1});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedReduceWindow) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape input, ParseShape("f32[?, 4, 8]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 3, 5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape input, ParseShape("f32[?, 4, 8]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 3, 5]"));
 
   XlaBuilder bsum(TestName());
   Add(Parameter(&bsum, 0, ShapeUtil::MakeShape(F32, {}), "x"),
       Parameter(&bsum, 1, ShapeUtil::MakeShape(F32, {}), "y"));
-  TF_ASSERT_OK_AND_ASSIGN(XlaComputation sum, bsum.Build());
+  TF_ASSERT_OK_AND_ASSIGN(const XlaComputation sum, bsum.Build());
 
   ReduceWindow(Parameter(&b, 0, input, "input"), ConstantR0<float>(&b, 0.f),
                sum,
                /*window_dimensions=*/{1, 2, 4},
                /*window_strides=*/{1, 1, 1}, Padding::kValid);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedReshape) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[2,3]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[2,3]"));
   Reshape(Parameter(&b, 0, operand, "operand"), /*dimensions=*/{0},
           /*new_sizes=*/{2, 3});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedReshapeUnsupportedOutputShape) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[6]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[6]"));
   Reshape(Parameter(&b, 0, operand, "operand"), /*dimensions=*/{0},
           /*new_sizes=*/{Shape::kUnboundedSize, Shape::kUnboundedSize});
   EXPECT_THAT(
@@ -2240,7 +2291,7 @@ TEST(XlaBuilderTest, UnboundedReshapeUnsupportedOutputShape) {
 
 TEST(XlaBuilderTest, UnboundedReshapeUnsupportedInferredShape) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[?]"));
   Reshape(operand, Parameter(&b, 0, operand, "operand"));
   EXPECT_THAT(
       BuildHloModule(b),
@@ -2251,18 +2302,19 @@ TEST(XlaBuilderTest, UnboundedReshapeUnsupportedInferredShape) {
 
 TEST(XlaBuilderTest, UnboundedScatter) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape input, ParseShape("f32[?, ?, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape scatter_indices, ParseShape("s32[?, ?, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape updates, ParseShape("f32[?, ?, ?, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, ?, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape input, ParseShape("f32[?, ?, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape scatter_indices,
+                          ParseShape("s32[?, ?, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape updates, ParseShape("f32[?, ?, ?, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, ?, ?]"));
 
   XlaComputation update_computation;
   {
-    std::unique_ptr<XlaBuilder> sub_builder = b.CreateSubBuilder("add");
-    XlaOp arg0 = Parameter(sub_builder.get(), 0,
-                           ShapeUtil::MakeScalarShape(F32), "arg0");
-    XlaOp arg1 = Parameter(sub_builder.get(), 1,
-                           ShapeUtil::MakeScalarShape(F32), "arg1");
+    const std::unique_ptr<XlaBuilder> sub_builder = b.CreateSubBuilder("add");
+    const XlaOp arg0 = Parameter(sub_builder.get(), 0,
+                                 ShapeUtil::MakeScalarShape(F32), "arg0");
+    const XlaOp arg1 = Parameter(sub_builder.get(), 1,
+                                 ShapeUtil::MakeScalarShape(F32), "arg1");
     Add(arg0, arg1);
     TF_ASSERT_OK_AND_ASSIGN(update_computation, sub_builder->Build());
   }
@@ -2281,73 +2333,76 @@ TEST(XlaBuilderTest, UnboundedScatter) {
           dimension_numbers, /*indices_are_sorted=*/false,
           /*unique_indices=*/false);
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedSelect) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("pred[1, ?, 2, ?, <=2, ?, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[?, 1, ?, 2, ?, <=2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape ehs, ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs,
+                          ParseShape("pred[1, ?, 2, ?, <=2, ?, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs,
+                          ParseShape("f32[?, 1, ?, 2, ?, <=2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape ehs,
+                          ParseShape("f32[1, ?, 2, ?, <=2, ?, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
                           ParseShape("f32[1, 1, 2, 2, <=2, <=2, ?]"));
   Select(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
          Parameter(&b, 2, ehs, "ehs"));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedSelectScalarPred) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("pred[]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape ehs, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("pred[]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape ehs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 10]"));
   Select(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
          Parameter(&b, 2, ehs, "ehs"));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedSelectScalarOnTrueOnFalseImplicitBroadcast) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("pred[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape ehs, ParseShape("f32[]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("pred[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape ehs, ParseShape("f32[]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 10]"));
   Select(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
          Parameter(&b, 2, ehs, "ehs"));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedSelectScalarPredOnFalseImplicitBroadcast) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("pred[]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape ehs, ParseShape("f32[]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("pred[]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape ehs, ParseShape("f32[]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 10]"));
   Select(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
          Parameter(&b, 2, ehs, "ehs"));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedSelectScalarPredOnTrueImplicitBroadcast) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("pred[]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape ehs, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("pred[]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape ehs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 10]"));
   Select(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
          Parameter(&b, 2, ehs, "ehs"));
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
@@ -2355,9 +2410,9 @@ TEST(XlaBuilderTest, UnboundedSelectScalarPredOnTrueImplicitBroadcast) {
 TEST(XlaBuilderTest,
      UnboundedSelectUnsupportedDegenerateOperandImplicitBroadcast) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("pred[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[1]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape ehs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("pred[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[1]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape ehs, ParseShape("f32[?, 10]"));
   Select(Parameter(&b, 0, lhs, "lhs"), Parameter(&b, 1, rhs, "rhs"),
          Parameter(&b, 2, ehs, "ehs"));
   EXPECT_THAT(BuildHloModule(b),
@@ -2366,26 +2421,26 @@ TEST(XlaBuilderTest,
 
 TEST(XlaBuilderTest, UnboundedSlice) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[1, <=3, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[1, <=2, 3]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[1, <=3, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[1, <=2, 3]"));
   Slice(Parameter(&b, 0, operand, "operand"),
         /*start_indices=*/{0, 1, 2},
         /*limit_indices=*/{1, 3, 5},
         /*strides=*/{1, 1, 1});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
 
 TEST(XlaBuilderTest, UnboundedTranspose) {
   XlaBuilder b(TestName());
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand,
                           ParseShape("f32[1, ?, 2, ?, <=2]{4,3,2,1,0}"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
                           ParseShape("f32[<=2, 1, ?, 2, ?]{0,2,3,4,1}"));
   Transpose(Parameter(&b, 0, operand, "operand"),
             /*permutation=*/{4, 0, 3, 2, 1});
-  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(b));
+  TF_ASSERT_OK_AND_ASSIGN(const auto module, BuildHloModule(b));
   EXPECT_THAT(GetRoot(*module),
               GmockMatch(m::Op().WithShapeEqualTo(&expected)));
 }
@@ -2421,27 +2476,38 @@ INSTANTIATE_TEST_SUITE_P(
     UnboundedDynamism, XlaBuilderUnboundedBinaryOpTest,
     ::testing::ValuesIn<BinaryOpTestCase>({
         {"f32[1, ?, 2, ?, <=2, ?, ?]", "f32[?, 1, ?, 2, ?, <=2, ?]",
-         /*broadcast_dimensions=*/{}, "f32[?, ?, 2, 2, <=2, <=2, ?]", &Add},
+         /*broadcast_dimensions=*/empty_array, "f32[?, ?, 2, 2, <=2, <=2, ?]",
+         &Add},
         {"f32[?, 10]", "f32[1]", /*broadcast_dimensions=*/zero_array,
          "f32[?, 10]", &Add},
         {"f32[1, ?, 2, ?, <=2, ?, ?]", "f32[?, 1, ?, 2, ?, <=2, ?]",
-         /*broadcast_dimensions=*/{}, "f32[?, ?, 2, 2, <=2, <=2, ?]", &Div},
+         /*broadcast_dimensions=*/empty_array, "f32[?, ?, 2, 2, <=2, <=2, ?]",
+         &Atan2},
+        {"f32[1, ?, 2, ?, <=2, ?, ?]", "f32[?, 1, ?, 2, ?, <=2, ?]",
+         /*broadcast_dimensions=*/empty_array, "f32[?, ?, 2, 2, <=2, <=2, ?]",
+         &Div},
         {"f32[?, 10]", "f32[1]", /*broadcast_dimensions=*/zero_array,
          "f32[?, 10]", &Div},
         {"f32[1, ?, 2, ?, <=2, ?, ?]", "f32[?, 1, ?, 2, ?, <=2, ?]",
-         /*broadcast_dimensions=*/{}, "f32[?, ?, 2, 2, <=2, <=2, ?]", &Max},
+         /*broadcast_dimensions=*/empty_array, "f32[?, ?, 2, 2, <=2, <=2, ?]",
+         &Max},
         {"f32[?, 10]", "f32[1]", /*broadcast_dimensions=*/zero_array,
          "f32[?, 10]", &Max},
         {"f32[1, ?, 2, ?, <=2, ?, ?]", "f32[?, 1, ?, 2, ?, <=2, ?]",
-         /*broadcast_dimensions=*/{}, "f32[?, ?, 2, 2, <=2, <=2, ?]", &Mul},
+         /*broadcast_dimensions=*/empty_array, "f32[?, ?, 2, 2, <=2, <=2, ?]",
+         &Mul},
         {"f32[?, 10]", "f32[1]", /*broadcast_dimensions=*/zero_array,
          "f32[?, 10]", &Mul},
+        {"f32[?, 10]", "f32[1]", /*broadcast_dimensions=*/zero_array,
+         "pred[?, 10]", &Ne},
         {"f32[1, ?, 2, ?, <=2, ?, ?]", "f32[?, 1, ?, 2, ?, <=2, ?]",
-         /*broadcast_dimensions=*/{}, "f32[?, ?, 2, 2, <=2, <=2, ?]", &Pow},
+         /*broadcast_dimensions=*/empty_array, "f32[?, ?, 2, 2, <=2, <=2, ?]",
+         &Pow},
         {"f32[?, 10]", "f32[1]", /*broadcast_dimensions=*/zero_array,
          "f32[?, 10]", &Pow},
         {"f32[1, ?, 2, ?, <=2, ?, ?]", "f32[?, 1, ?, 2, ?, <=2, ?]",
-         /*broadcast_dimensions=*/{}, "f32[?, ?, 2, 2, <=2, <=2, ?]", &Sub},
+         /*broadcast_dimensions=*/empty_array, "f32[?, ?, 2, 2, <=2, <=2, ?]",
+         &Sub},
         {"f32[?, 10]", "f32[1]", /*broadcast_dimensions=*/zero_array,
          "f32[?, 10]", &Sub},
     }));
diff --git a/third_party/xla/xla/client/xla_computation.cc b/third_party/xla/xla/client/xla_computation.cc
index 67ef21bd820642..c92de63495d190 100644
--- a/third_party/xla/xla/client/xla_computation.cc
+++ b/third_party/xla/xla/client/xla_computation.cc
@@ -23,12 +23,12 @@ limitations under the License.
 
 namespace xla {
 
-StatusOr<ProgramShape> XlaComputation::GetProgramShape() const {
+absl::StatusOr<ProgramShape> XlaComputation::GetProgramShape() const {
   TF_RET_CHECK(proto_.has_host_program_shape());
   return ProgramShape(proto_.host_program_shape());
 }
 
-StatusOr<std::unique_ptr<HloSnapshot>> XlaComputation::Snapshot() const {
+absl::StatusOr<std::unique_ptr<HloSnapshot>> XlaComputation::Snapshot() const {
   if (IsNull()) {
     return InvalidArgument("Computation is invalid.");
   }
diff --git a/third_party/xla/xla/client/xla_computation.h b/third_party/xla/xla/client/xla_computation.h
index 73b44fd4e1a040..e21a92d6300654 100644
--- a/third_party/xla/xla/client/xla_computation.h
+++ b/third_party/xla/xla/client/xla_computation.h
@@ -45,7 +45,7 @@ class XlaComputation {
 
   // Returns the "program shape" (parameter and return shapes) for this
   // computation.
-  StatusOr<ProgramShape> GetProgramShape() const;
+  absl::StatusOr<ProgramShape> GetProgramShape() const;
 
   const std::string& name() const { return proto().name(); }
 
@@ -54,7 +54,7 @@ class XlaComputation {
 
   // Requests that we snapshot the computation into a serializable protocol
   // buffer form.
-  StatusOr<std::unique_ptr<HloSnapshot>> Snapshot() const;
+  absl::StatusOr<std::unique_ptr<HloSnapshot>> Snapshot() const;
 
   // Returns true if this object is a null Computation.
   bool IsNull() const { return unique_id_ == -1; }
diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index f4f9f865e4d889..8b29c823b1d921 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -36,9 +36,9 @@ limitations under the License.
 #include "xla/debug_options_parsers.h"
 #include "xla/parse_flags_from_env.h"
 #include "xla/stream_executor/cuda/ptx_compiler_support.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "xla/xla.pb.h"
 #include "tsl/platform/protobuf.h"  // IWYU pragma: keep
-#include "tsl/util/command_line_flags.h"
 
 namespace xla {
 
diff --git a/third_party/xla/xla/debug_options_flags.h b/third_party/xla/xla/debug_options_flags.h
index 15cc0fb8b448f5..4bc8420441af1e 100644
--- a/third_party/xla/xla/debug_options_flags.h
+++ b/third_party/xla/xla/debug_options_flags.h
@@ -19,9 +19,9 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/string_view.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "xla/xla.pb.h"
 #include "tsl/platform/logging.h"
-#include "tsl/util/command_line_flags.h"
 
 namespace xla {
 
diff --git a/third_party/xla/xla/ffi/BUILD b/third_party/xla/xla/ffi/BUILD
index e220f8a1d46c77..fc0f1066e73f16 100644
--- a/third_party/xla/xla/ffi/BUILD
+++ b/third_party/xla/xla/ffi/BUILD
@@ -1,5 +1,5 @@
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -81,6 +81,8 @@ xla_cc_test(
         "//xla/stream_executor",
         "//xla/stream_executor:device_memory",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/lib/core:status_test_util",
         "@local_tsl//tsl/platform:status_matchers",
         "@local_tsl//tsl/platform:test",
diff --git a/third_party/xla/xla/ffi/api/BUILD b/third_party/xla/xla/ffi/api/BUILD
index 399f233a211f38..fcd33d23fef49a 100644
--- a/third_party/xla/xla/ffi/api/BUILD
+++ b/third_party/xla/xla/ffi/api/BUILD
@@ -1,6 +1,6 @@
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl:tsl.default.bzl", "filegroup")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/ffi/api/api.h b/third_party/xla/xla/ffi/api/api.h
index bc21d1856a8f85..f3595cc20195c3 100644
--- a/third_party/xla/xla/ffi/api/api.h
+++ b/third_party/xla/xla/ffi/api/api.h
@@ -57,20 +57,34 @@ limitations under the License.
 
 #include "xla/ffi/api/c_api.h"
 
+#ifdef __has_builtin
+#define XLA_FFI_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define XLA_FFI_HAS_BUILTIN(x) 0
+#endif
+
 #if __has_attribute(always_inline)
-#define XLA_ATTRIBUTE_ALWAYS_INLINE inline __attribute__((always_inline))
+#define XLA_FFI_ATTRIBUTE_ALWAYS_INLINE inline __attribute__((always_inline))
 #elif defined(_MSC_VER)
-#define XLA_ATTRIBUTE_ALWAYS_INLINE __forceinline
+#define XLA_FFI_ATTRIBUTE_ALWAYS_INLINE __forceinline
 #else
-#define XLA_ATTRIBUTE_ALWAYS_INLINE inline
+#define XLA_FFI_ATTRIBUTE_ALWAYS_INLINE inline
 #endif
 
 #if __has_attribute(noinline)
-#define XLA_ATTRIBUTE_NEVER_INLINE __attribute__((noinline))
+#define XLA_FFI_ATTRIBUTE_NEVER_INLINE __attribute__((noinline))
 #elif defined(_MSC_VER)
-#define XLA_ATTRIBUTE_NEVER_INLINE __declspec(noinline)
+#define XLA_FFI_ATTRIBUTE_NEVER_INLINE __declspec(noinline)
+#else
+#define XLA_FFI_ATTRIBUTE_NEVER_INLINE
+#endif
+
+#if XLA_FFI_HAS_BUILTIN(__builtin_expect)
+#define XLA_FFI_PREDICT_FALSE(x) (__builtin_expect(false || (x), false))
+#define XLA_FFI_PREDICT_TRUE(x) (__builtin_expect(false || (x), true))
 #else
-#define XLA_ATTRIBUTE_NEVER_INLINE
+#define XLA_FFI_PREDICT_FALSE(x) (x)
+#define XLA_FFI_PREDICT_TRUE(x) (x)
 #endif
 
 namespace xla::ffi {
@@ -107,10 +121,9 @@ class Ffi {
 
   // Registers handler with an XLA runtime under the given name on a given
   // platform.
-  static inline XLA_FFI_Error* RegisterStaticHandler(const XLA_FFI_Api* api,
-                                                     std::string_view name,
-                                                     std::string_view platform,
-                                                     XLA_FFI_Handler* handler);
+  static inline XLA_FFI_Error* RegisterStaticHandler(
+      const XLA_FFI_Api* api, std::string_view name, std::string_view platform,
+      XLA_FFI_Handler* handler, XLA_FFI_Handler_Traits traits = 0);
 
  protected:
   template <typename... Args>
@@ -131,7 +144,8 @@ class Ffi {
 XLA_FFI_Error* Ffi::RegisterStaticHandler(const XLA_FFI_Api* api,
                                           std::string_view name,
                                           std::string_view platform,
-                                          XLA_FFI_Handler* handler) {
+                                          XLA_FFI_Handler* handler,
+                                          XLA_FFI_Handler_Traits traits) {
   // Make copies of string views to guarantee they are null terminated.
   std::string name_str(name);
   std::string platform_str(platform);
@@ -142,6 +156,7 @@ XLA_FFI_Error* Ffi::RegisterStaticHandler(const XLA_FFI_Api* api,
   args.name = name_str.c_str();
   args.platform = platform_str.c_str();
   args.handler = handler;
+  args.traits = traits;
   return api->XLA_FFI_Handler_Register(&args);
 }
 
@@ -651,7 +666,7 @@ struct DecodingContext {
 
 template <typename T>
 struct Decode {
-  XLA_ATTRIBUTE_ALWAYS_INLINE
+  XLA_FFI_ATTRIBUTE_ALWAYS_INLINE
   static std::optional<T> call(DecodingOffsets& offsets, DecodingContext& ctx,
                                DiagnosticEngine& diagnostic) {
     int64_t idx = offsets.args++;
@@ -685,7 +700,10 @@ struct internal::Decode<internal::AttrTag<T>> {
 
     // Attribute name does not match.
     std::string_view attr_name_view = {attr_name->ptr, attr_name->len};
-    if (attr_name_view != ctx.attrs_names[i]) return std::nullopt;
+    if (attr_name_view != ctx.attrs_names[i]) {
+      return diagnostic.Emit("Attribute name mismatch: ")
+             << attr_name_view << " vs " << ctx.attrs_names[i];
+    }
 
     return AttrDecoding<T>::Decode(attr_type, attr, diagnostic);
   }
@@ -968,14 +986,14 @@ class Handler : public Ffi {
     // Check that the number of passed arguments matches the signature. Each
     // individual argument decoding will check the actual type.
     if (internal::HasRemainingArgsTag<Ts...>::value) {
-      if (call_frame->args.num_args < kNumArgs) {
+      if (XLA_FFI_PREDICT_FALSE(call_frame->args.num_args < kNumArgs)) {
         return InvalidArgument(
             call_frame->api,
             StrCat("Wrong number of arguments: expected at least ",
                    kNumArgs - 1, " but got ", call_frame->args.num_args));
       }
     } else {
-      if (call_frame->args.num_args != kNumArgs) {
+      if (XLA_FFI_PREDICT_FALSE(call_frame->args.num_args != kNumArgs)) {
         return InvalidArgument(
             call_frame->api,
             StrCat("Wrong number of arguments: expected ", kNumArgs,
@@ -988,7 +1006,8 @@ class Handler : public Ffi {
     // attributes into a dictionary (or a custom struct decoded from a
     // dictionary), then there is no need to check attributes, as the FFI
     // handler (or a struct decoding) should be responsible for it.
-    if (kNumDictAttrs == 0 && call_frame->attrs.num_attrs != kNumAttrs) {
+    if (XLA_FFI_PREDICT_FALSE(kNumDictAttrs == 0 &&
+                              call_frame->attrs.num_attrs != kNumAttrs)) {
       return InvalidArgument(
           call_frame->api,
           StrCat("Wrong number of attributes: expected ", kNumAttrs,
@@ -1003,7 +1022,7 @@ class Handler : public Ffi {
 
  private:
   template <size_t... Is>
-  XLA_ATTRIBUTE_ALWAYS_INLINE XLA_FFI_Error* Call(
+  XLA_FFI_ATTRIBUTE_ALWAYS_INLINE XLA_FFI_Error* Call(
       const XLA_FFI_CallFrame* call_frame, std::index_sequence<Is...>) const {
     // A helper structure to allow each decoder find the correct offset.
     internal::DecodingOffsets offsets;
@@ -1018,7 +1037,7 @@ class Handler : public Ffi {
         internal::Decode<Ts>::call(offsets, ctx, diagnostic)...};
 
     bool all_decoded = (std::get<Is>(args).has_value() && ...);
-    if (!all_decoded) {
+    if (XLA_FFI_PREDICT_FALSE(!all_decoded)) {
       return FailedDecodeError(call_frame, {std::get<Is>(args).has_value()...},
                                diagnostic);
     }
@@ -1086,16 +1105,14 @@ class Handler : public Ffi {
 
 inline std::ostream& operator<<(std::ostream& os, const XLA_FFI_AttrType type) {
   switch (type) {
-    case XLA_FFI_AttrType_I32:
-      return os << "int32";
-    case XLA_FFI_AttrType_I64:
-      return os << "int64";
-    case XLA_FFI_AttrType_F32:
-      return os << "float";
-    case XLA_FFI_AttrType_STRING:
-      return os << "string";
+    case XLA_FFI_AttrType_ARRAY:
+      return os << "array";
     case XLA_FFI_AttrType_DICTIONARY:
       return os << "dictionary";
+    case XLA_FFI_AttrType_SCALAR:
+      return os << "scalar";
+    case XLA_FFI_AttrType_STRING:
+      return os << "string";
   }
 }
 
@@ -1105,18 +1122,24 @@ inline std::ostream& operator<<(std::ostream& os, const XLA_FFI_AttrType type) {
     using Type = T;                                                   \
     static std::optional<T> Decode(XLA_FFI_AttrType type, void* attr, \
                                    DiagnosticEngine& diagnostic) {    \
-      if (type != TYPE) {                                             \
+      if (XLA_FFI_PREDICT_FALSE(type != XLA_FFI_AttrType_SCALAR)) {   \
         return diagnostic.Emit("Wrong attribute type: expected ")     \
-               << TYPE << " but got " << type;                        \
+               << XLA_FFI_AttrType_SCALAR << " but got " << type;     \
       }                                                               \
                                                                       \
-      return *reinterpret_cast<T*>(attr);                             \
+      auto* scalar = reinterpret_cast<XLA_FFI_Scalar*>(attr);         \
+      if (XLA_FFI_PREDICT_FALSE(scalar->dtype != TYPE)) {             \
+        return diagnostic.Emit("Wrong scalar data type: expected ")   \
+               << TYPE << " but got " << scalar->dtype;               \
+      }                                                               \
+                                                                      \
+      return *reinterpret_cast<T*>(scalar->value);                    \
     }                                                                 \
   }
 
-XLA_FFI_REGISTER_SCALAR_ATTR_DECODING(int32_t, XLA_FFI_AttrType_I32);
-XLA_FFI_REGISTER_SCALAR_ATTR_DECODING(int64_t, XLA_FFI_AttrType_I64);
-XLA_FFI_REGISTER_SCALAR_ATTR_DECODING(float, XLA_FFI_AttrType_F32);
+XLA_FFI_REGISTER_SCALAR_ATTR_DECODING(int32_t, XLA_FFI_DataType_S32);
+XLA_FFI_REGISTER_SCALAR_ATTR_DECODING(int64_t, XLA_FFI_DataType_S64);
+XLA_FFI_REGISTER_SCALAR_ATTR_DECODING(float, XLA_FFI_DataType_F32);
 
 #undef XLA_FFI_REGISTER_SCALAR_ATTR_DECODING
 
@@ -1126,7 +1149,7 @@ struct AttrDecoding<std::string_view> {
   static std::optional<std::string_view> Decode(XLA_FFI_AttrType type,
                                                 void* attr,
                                                 DiagnosticEngine& diagnostic) {
-    if (type != XLA_FFI_AttrType_STRING) {
+    if (XLA_FFI_PREDICT_FALSE(type != XLA_FFI_AttrType_STRING)) {
       return diagnostic.Emit("Wrong attribute type: expected ")
              << XLA_FFI_AttrType_STRING << " but got " << type;
     }
@@ -1141,7 +1164,7 @@ struct AttrDecoding<Dictionary> {
   using Type = Dictionary;
   static std::optional<Dictionary> Decode(XLA_FFI_AttrType type, void* attr,
                                           DiagnosticEngine& diagnostic) {
-    if (type != XLA_FFI_AttrType_DICTIONARY) {
+    if (XLA_FFI_PREDICT_FALSE(type != XLA_FFI_AttrType_DICTIONARY)) {
       return diagnostic.Emit("Wrong attribute type: expected ")
              << XLA_FFI_AttrType_DICTIONARY << " but got " << type;
     }
@@ -1171,7 +1194,7 @@ template <typename T, typename... Ts>
 struct DecodeDictionaryAttr {
   static constexpr size_t kSize = sizeof...(Ts);
 
-  XLA_ATTRIBUTE_ALWAYS_INLINE
+  XLA_FFI_ATTRIBUTE_ALWAYS_INLINE
   static std::optional<T> Decode(const XLA_FFI_Attrs* attrs,
                                  std::array<std::string_view, kSize> names,
                                  DiagnosticEngine& diagnostic) {
@@ -1179,10 +1202,10 @@ struct DecodeDictionaryAttr {
   }
 
   template <size_t... Is>
-  XLA_ATTRIBUTE_ALWAYS_INLINE static std::optional<T> Decode(
+  XLA_FFI_ATTRIBUTE_ALWAYS_INLINE static std::optional<T> Decode(
       const XLA_FFI_Attrs* attrs, std::array<std::string_view, kSize> names,
       std::index_sequence<Is...>, DiagnosticEngine& diagnostic) {
-    if (kSize != attrs->num_attrs) {
+    if (XLA_FFI_PREDICT_FALSE(kSize != attrs->num_attrs)) {
       return diagnostic.Emit("Wrong number of attributes: expected ")
              << kSize << " attributes but got " << attrs->num_attrs;
     }
@@ -1200,7 +1223,7 @@ struct DecodeDictionaryAttr {
     std::tuple<std::optional<Ts>...> members = {
         dict.get<Ts>(names[Is], diagnostic)...};
     bool all_decoded = (std::get<Is>(members).has_value() && ...);
-    if (!all_decoded) return std::nullopt;
+    if (XLA_FFI_PREDICT_FALSE(!all_decoded)) return std::nullopt;
 
     return T{std::move(*std::get<Is>(members))...};
   }
@@ -1230,28 +1253,28 @@ auto DictionaryDecoder(Members... m) {
 // Automatically registers attributes binding for a struct that allows automatic
 // binding specification inference from a callable signature.
 //
-#define XLA_FFI_REGISTER_STRUCT_ATTR_DECODING(T, ...)                 \
-  template <>                                                         \
-  struct AttrsBinding<T> {                                            \
-    using Attrs = T;                                                  \
-  };                                                                  \
-                                                                      \
-  template <>                                                         \
-  struct AttrDecoding<T> {                                            \
-    using Type = T;                                                   \
-    static std::optional<T> Decode(XLA_FFI_AttrType type, void* attr, \
-                                   DiagnosticEngine& diagnostic) {    \
-      if (type != XLA_FFI_AttrType_DICTIONARY) {                      \
-        diagnostic.Emit("Wrong attribute type: expected ")            \
-            << XLA_FFI_AttrType_DICTIONARY << " but got " << type;    \
-        return std::nullopt;                                          \
-      }                                                               \
-                                                                      \
-      auto decoder = internal::DictionaryDecoder<T>(__VA_ARGS__);     \
-      return decltype(decoder)::Decode(                               \
-          reinterpret_cast<const XLA_FFI_Attrs*>(attr),               \
-          internal::StructMemberNames(__VA_ARGS__), diagnostic);      \
-    }                                                                 \
+#define XLA_FFI_REGISTER_STRUCT_ATTR_DECODING(T, ...)                   \
+  template <>                                                           \
+  struct AttrsBinding<T> {                                              \
+    using Attrs = T;                                                    \
+  };                                                                    \
+                                                                        \
+  template <>                                                           \
+  struct AttrDecoding<T> {                                              \
+    using Type = T;                                                     \
+    static std::optional<T> Decode(XLA_FFI_AttrType type, void* attr,   \
+                                   DiagnosticEngine& diagnostic) {      \
+      if (XLA_FFI_PREDICT_FALSE(type != XLA_FFI_AttrType_DICTIONARY)) { \
+        diagnostic.Emit("Wrong attribute type: expected ")              \
+            << XLA_FFI_AttrType_DICTIONARY << " but got " << type;      \
+        return std::nullopt;                                            \
+      }                                                                 \
+                                                                        \
+      auto decoder = internal::DictionaryDecoder<T>(__VA_ARGS__);       \
+      return decltype(decoder)::Decode(                                 \
+          reinterpret_cast<const XLA_FFI_Attrs*>(attr),                 \
+          internal::StructMemberNames(__VA_ARGS__), diagnostic);        \
+    }                                                                   \
   }
 
 //===----------------------------------------------------------------------===//
@@ -1294,15 +1317,16 @@ auto DictionaryDecoder(Members... m) {
 // TODO(ezhulenev): Add a callback so that end users can log registration error
 // to appropriate logging destination, e.g. LOG(FATAL) for duplicate internal
 // FFI handlers.
-#define XLA_FFI_REGISTER_HANDLER(API, NAME, PLATFORM, FUNC) \
-  XLA_FFI_REGISTER_HANDLER_(API, NAME, PLATFORM, FUNC, __COUNTER__)
-#define XLA_FFI_REGISTER_HANDLER_(API, NAME, PLATFORM, FUNC, N) \
-  XLA_FFI_REGISTER_HANDLER__(API, NAME, PLATFORM, FUNC, N)
-#define XLA_FFI_REGISTER_HANDLER__(API, NAME, PLATFORM, FUNC, N)           \
-  XLA_FFI_ATTRIBUTE_UNUSED static const XLA_FFI_Error*                     \
-      xla_ffi_static_handler_##N##_registered_ = [] {                      \
-        return ::xla::ffi::Ffi::RegisterStaticHandler(API, NAME, PLATFORM, \
-                                                      FUNC);               \
+#define XLA_FFI_REGISTER_HANDLER(API, NAME, PLATFORM, FUNC, ...)    \
+  XLA_FFI_REGISTER_HANDLER_(API, NAME, PLATFORM, FUNC, __COUNTER__, \
+                            ##__VA_ARGS__)
+#define XLA_FFI_REGISTER_HANDLER_(API, NAME, PLATFORM, FUNC, N, ...) \
+  XLA_FFI_REGISTER_HANDLER__(API, NAME, PLATFORM, FUNC, N, ##__VA_ARGS__)
+#define XLA_FFI_REGISTER_HANDLER__(API, NAME, PLATFORM, FUNC, N, ...)       \
+  XLA_FFI_ATTRIBUTE_UNUSED static const XLA_FFI_Error*                      \
+      xla_ffi_static_handler_##N##_registered_ = [] {                       \
+        return ::xla::ffi::Ffi::RegisterStaticHandler(API, NAME, PLATFORM,  \
+                                                      FUNC, ##__VA_ARGS__); \
       }()
 
 }  // namespace xla::ffi
diff --git a/third_party/xla/xla/ffi/api/c_api.h b/third_party/xla/xla/ffi/api/c_api.h
index 114b2b4f6fbf1a..e7ebd8da060796 100644
--- a/third_party/xla/xla/ffi/api/c_api.h
+++ b/third_party/xla/xla/ffi/api/c_api.h
@@ -192,11 +192,10 @@ typedef enum {
 //===----------------------------------------------------------------------===//
 
 typedef enum {
-  XLA_FFI_AttrType_I32 = 1,
-  XLA_FFI_AttrType_I64 = 2,
-  XLA_FFI_AttrType_F32 = 3,
+  XLA_FFI_AttrType_ARRAY = 1,
+  XLA_FFI_AttrType_DICTIONARY = 2,
+  XLA_FFI_AttrType_SCALAR = 3,
   XLA_FFI_AttrType_STRING = 4,
-  XLA_FFI_AttrType_DICTIONARY = 5,
 } XLA_FFI_AttrType;
 
 //===----------------------------------------------------------------------===//
@@ -223,6 +222,29 @@ struct XLA_FFI_ByteSpan {
 
 XLA_FFI_DEFINE_STRUCT_TRAITS(XLA_FFI_ByteSpan, len);
 
+// A struct to pass a scalar value to FFI handler.
+struct XLA_FFI_Scalar {
+  size_t struct_size;
+  void* priv;
+
+  XLA_FFI_DataType dtype;
+  void* value;
+};
+
+XLA_FFI_DEFINE_STRUCT_TRAITS(XLA_FFI_Scalar, value);
+
+// A struct to pass a dense array to FFI handler.
+struct XLA_FFI_Array {
+  size_t struct_size;
+  void* priv;
+
+  XLA_FFI_DataType dtype;
+  size_t size;
+  void* data;
+};
+
+XLA_FFI_DEFINE_STRUCT_TRAITS(XLA_FFI_Array, data);
+
 struct XLA_FFI_Args {
   size_t struct_size;
   void* priv;
@@ -267,6 +289,15 @@ XLA_FFI_DEFINE_STRUCT_TRAITS(XLA_FFI_CallFrame, attrs);
 // External functions registered with XLA as FFI handlers.
 typedef XLA_FFI_Error* XLA_FFI_Handler(XLA_FFI_CallFrame* call_frame);
 
+enum XLA_FFI_Handler_TraitsBits {
+  // Calls to FFI handler are safe to trace into the command buffer. It means
+  // that calls to FFI handler always launch exactly the same device operations
+  // (can depend on attribute values) that can be captured and then replayed.
+  XLA_FFI_HANDLER_TRAITS_COMMAND_BUFFER_COMPATIBLE = 1u << 0,
+};
+
+typedef uint32_t XLA_FFI_Handler_Traits;
+
 struct XLA_FFI_Handler_Register_Args {
   size_t struct_size;
   void* priv;
@@ -274,9 +305,10 @@ struct XLA_FFI_Handler_Register_Args {
   const char* name;      // null terminated
   const char* platform;  // null terminated
   XLA_FFI_Handler* handler;
+  XLA_FFI_Handler_Traits traits;
 };
 
-XLA_FFI_DEFINE_STRUCT_TRAITS(XLA_FFI_Handler_Register_Args, handler);
+XLA_FFI_DEFINE_STRUCT_TRAITS(XLA_FFI_Handler_Register_Args, traits);
 
 typedef XLA_FFI_Error* XLA_FFI_Handler_Register(
     XLA_FFI_Handler_Register_Args* args);
diff --git a/third_party/xla/xla/ffi/api/ffi.h b/third_party/xla/xla/ffi/api/ffi.h
index f6d742656a2a79..ab7a539c707215 100644
--- a/third_party/xla/xla/ffi/api/ffi.h
+++ b/third_party/xla/xla/ffi/api/ffi.h
@@ -198,10 +198,10 @@ inline std::ostream& operator<<(std::ostream& os, const XLA_FFI_ArgType type) {
 
 template <>
 struct ArgDecoding<BufferBase> {
-  XLA_ATTRIBUTE_ALWAYS_INLINE
+  XLA_FFI_ATTRIBUTE_ALWAYS_INLINE
   static std::optional<BufferBase> Decode(XLA_FFI_ArgType type, void* arg,
                                           DiagnosticEngine& diagnostic) {
-    if (type != XLA_FFI_ArgType_BUFFER) {
+    if (XLA_FFI_PREDICT_FALSE(type != XLA_FFI_ArgType_BUFFER)) {
       return diagnostic.Emit("Wrong argument type: expected ")
              << XLA_FFI_ArgType_BUFFER << " but got " << type;
     }
@@ -213,10 +213,10 @@ struct ArgDecoding<BufferBase> {
 
 template <DataType dtype, size_t rank>
 struct ArgDecoding<Buffer<dtype, rank>> {
-  XLA_ATTRIBUTE_ALWAYS_INLINE
+  XLA_FFI_ATTRIBUTE_ALWAYS_INLINE
   static std::optional<Buffer<dtype, rank>> Decode(
       XLA_FFI_ArgType type, void* arg, DiagnosticEngine& diagnostic) {
-    if (type != XLA_FFI_ArgType_BUFFER) {
+    if (XLA_FFI_PREDICT_FALSE(type != XLA_FFI_ArgType_BUFFER)) {
       return diagnostic.Emit("Wrong argument type: expected ")
              << XLA_FFI_ArgType_BUFFER << " but got " << type;
     }
@@ -224,13 +224,13 @@ struct ArgDecoding<Buffer<dtype, rank>> {
     auto* buf = reinterpret_cast<XLA_FFI_Buffer*>(arg);
 
     if (auto actual_dtype = static_cast<DataType>(buf->dtype);
-        actual_dtype != dtype) {
+        XLA_FFI_PREDICT_FALSE(actual_dtype != dtype)) {
       return diagnostic.Emit("Wrong buffer dtype: expected ")
              << dtype << " but got " << actual_dtype;
     }
 
     if constexpr (rank != internal::kDynamicRank) {
-      if (buf->rank != rank) {
+      if (XLA_FFI_PREDICT_FALSE(buf->rank != rank)) {
         return diagnostic.Emit("Wrong buffer rank: expected ")
                << rank << " but got " << buf->rank;
       }
@@ -243,6 +243,60 @@ struct ArgDecoding<Buffer<dtype, rank>> {
   }
 };
 
+//===----------------------------------------------------------------------===//
+// Attributes decoding
+//===----------------------------------------------------------------------===//
+
+#define XLA_FFI_REGISTER_ARRRAY_ATTR_DECODING(T, TYPE)                      \
+  template <>                                                               \
+  struct AttrDecoding<Span<const T>> {                                      \
+    using Type = Span<const T>;                                             \
+    static std::optional<Type> Decode(XLA_FFI_AttrType type, void* attr,    \
+                                      DiagnosticEngine& diagnostic) {       \
+      if (XLA_FFI_PREDICT_FALSE(type != XLA_FFI_AttrType_ARRAY)) {          \
+        return diagnostic.Emit("Wrong attribute type: expected ")           \
+               << XLA_FFI_AttrType_ARRAY << " but got " << type;            \
+      }                                                                     \
+                                                                            \
+      auto* array = reinterpret_cast<XLA_FFI_Array*>(attr);                 \
+      if (XLA_FFI_PREDICT_FALSE(array->dtype != TYPE)) {                    \
+        return diagnostic.Emit("Wrong array data type: expected ")          \
+               << TYPE << " but got " << array->dtype;                      \
+      }                                                                     \
+                                                                            \
+      return Span<const T>(reinterpret_cast<T*>(array->data), array->size); \
+    }                                                                       \
+  }
+
+XLA_FFI_REGISTER_ARRRAY_ATTR_DECODING(int32_t, XLA_FFI_DataType_S32);
+XLA_FFI_REGISTER_ARRRAY_ATTR_DECODING(int64_t, XLA_FFI_DataType_S64);
+XLA_FFI_REGISTER_ARRRAY_ATTR_DECODING(float, XLA_FFI_DataType_F32);
+
+#undef XLA_FFI_REGISTER_SCALAR_ATTR_DECODING
+
+// A type tag to mark i64 attributes as pointers to `T`.
+template <typename T>
+struct Pointer {};
+
+template <typename T>
+struct AttrDecoding<Pointer<T>> {
+  using Type = T*;
+
+  static std::optional<Type> Decode(XLA_FFI_AttrType type, void* attr,
+                                    DiagnosticEngine& diagnostic) {
+    auto* scalar = reinterpret_cast<XLA_FFI_Scalar*>(attr);
+    if (XLA_FFI_PREDICT_FALSE(type != XLA_FFI_AttrType_SCALAR ||
+                              scalar->dtype != XLA_FFI_DataType_S64)) {
+      return diagnostic.Emit("Wrong attribute type: ")
+             << "expected i64 scalar for passing pointer but got " << type;
+    }
+
+    static_assert(sizeof(uintptr_t) == sizeof(int64_t));
+    uintptr_t ptr = *reinterpret_cast<uintptr_t*>(scalar->value);
+    return reinterpret_cast<Type>(ptr);
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // Result encoding
 //===----------------------------------------------------------------------===//
diff --git a/third_party/xla/xla/ffi/api/ffi_test.cc b/third_party/xla/xla/ffi/api/ffi_test.cc
index af720f2686370c..f389a2ac05fb19 100644
--- a/third_party/xla/xla/ffi/api/ffi_test.cc
+++ b/third_party/xla/xla/ffi/api/ffi_test.cc
@@ -231,6 +231,55 @@ TEST(FfiTest, BindingPlatformStreamInference) {
   (void)Ffi::BindTo(+[](TestStream stream) { return Error::Success(); });
 }
 
+TEST(FfiTest, ArrayAttr) {
+  CallFrameBuilder::AttributesBuilder attrs;
+  attrs.Insert("arr", std::vector<int32_t>({1, 2, 3, 4}));
+
+  CallFrameBuilder builder;
+  builder.AddAttributes(attrs.Build());
+  auto call_frame = builder.Build();
+
+  auto fn = [&](Span<const int32_t> arr) {
+    EXPECT_EQ(arr.size(), 4);
+    EXPECT_EQ(arr[0], 1);
+    EXPECT_EQ(arr[1], 2);
+    EXPECT_EQ(arr[2], 3);
+    EXPECT_EQ(arr[3], 4);
+    return Error::Success();
+  };
+
+  auto handler = Ffi::Bind().Attr<Span<const int32_t>>("arr").To(fn);
+  auto status = Call(*handler, call_frame);
+
+  TF_ASSERT_OK(status);
+}
+
+TEST(FfiTest, PointerAttr) {
+  std::string foo = "foo";
+
+  // Test for convenience attr binding that casts i64 attribute to user-type
+  // pointers. It's up to the user to guarantee that pointer is valid.
+  auto ptr = reinterpret_cast<uintptr_t>(&foo);
+  static_assert(sizeof(ptr) == sizeof(int64_t));
+
+  CallFrameBuilder::AttributesBuilder attrs;
+  attrs.Insert("ptr", static_cast<int64_t>(ptr));
+
+  CallFrameBuilder builder;
+  builder.AddAttributes(attrs.Build());
+  auto call_frame = builder.Build();
+
+  auto fn = [&](const std::string* str) {
+    EXPECT_EQ(*str, "foo");
+    return Error::Success();
+  };
+
+  auto handler = Ffi::Bind().Attr<Pointer<std::string>>("ptr").To(fn);
+  auto status = Call(*handler, call_frame);
+
+  TF_ASSERT_OK(status);
+}
+
 //===----------------------------------------------------------------------===//
 // Performance benchmarks are below.
 //===----------------------------------------------------------------------===//
diff --git a/third_party/xla/xla/ffi/call_frame.cc b/third_party/xla/xla/ffi/call_frame.cc
index bbdbd7544357e4..06dd10fb0833c0 100644
--- a/third_party/xla/xla/ffi/call_frame.cc
+++ b/third_party/xla/xla/ffi/call_frame.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <functional>
 #include <memory>
 #include <string>
+#include <type_traits>
 #include <utility>
 #include <variant>
 #include <vector>
@@ -125,6 +126,19 @@ struct CallFrame::Dictionary {
   std::unique_ptr<Attributes> attrs;
 };
 
+struct CallFrame::Array {
+  std::variant<std::vector<int32_t>, std::vector<int64_t>, std::vector<float>>
+      value;  // XLA_FFI_Array::data
+
+  XLA_FFI_Array array = {XLA_FFI_Array_STRUCT_SIZE, nullptr};
+};
+
+struct CallFrame::Scalar {
+  std::variant<int32_t, int64_t, float> value;  // XLA_FFI_Scalar::value
+
+  XLA_FFI_Scalar scalar = {XLA_FFI_Scalar_STRUCT_SIZE, nullptr};
+};
+
 struct CallFrame::String {
   std::string value;  // XLA_FFI_ByteSpan::ptr
 
@@ -202,6 +216,7 @@ CallFrame::~CallFrame() = default;
   auto to_data_type = [](PrimitiveType primitive_type) {
     switch (primitive_type) {
       case PrimitiveType::PRIMITIVE_TYPE_INVALID:
+      case PrimitiveType::PRED:
       case PrimitiveType::S8:
       case PrimitiveType::S16:
       case PrimitiveType::S32:
@@ -258,9 +273,12 @@ CallFrame::~CallFrame() = default;
 // An std::visit overload set for converting CallFrameBuilder::Attribute to
 // CallFrame::Attribute.
 struct CallFrame::ConvertAttribute {
-  template <typename T>
-  CallFrame::Attribute operator()(const T& value) {
-    return value;
+  CallFrame::Attribute operator()(const CallFrameBuilder::Array& array) {
+    return CallFrame::Array{array};
+  }
+
+  CallFrame::Attribute operator()(const CallFrameBuilder::Scalar& scalar) {
+    return CallFrame::Scalar{scalar};
   }
 
   CallFrame::Attribute operator()(const std::string& str) {
@@ -272,25 +290,58 @@ struct CallFrame::ConvertAttribute {
   }
 };
 
+template <typename T>
+static XLA_FFI_DataType GetDataType() {
+  if constexpr (std::is_same_v<int32_t, T>) {
+    return XLA_FFI_DataType_S32;
+  } else if constexpr (std::is_same_v<int64_t, T>) {
+    return XLA_FFI_DataType_S64;
+  } else if constexpr (std::is_same_v<float, T>) {
+    return XLA_FFI_DataType_F32;
+  } else {
+    static_assert(sizeof(T) == 0, "unsupported FFI data type");
+  }
+}
+
 // An std::visit overload set to fix up CallFrame::Attribute storage and
 // initialize XLA FFI structs with valid pointers into storage objects.
 struct CallFrame::FixupAttribute {
-  template <typename T>
-  void operator()(T& value) {}
+  void operator()(CallFrame::Array& array) {
+    auto visitor = [&](auto& value) {
+      using T = typename std::remove_reference_t<decltype(value)>::value_type;
+      array.array.dtype = GetDataType<T>();
+      array.array.size = value.size();
+      array.array.data = value.data();
+    };
+    std::visit(visitor, array.value);
+  }
+
+  void operator()(CallFrame::Scalar& scalar) {
+    auto visitor = [&](auto& value) {
+      using T = std::remove_reference_t<decltype(value)>;
+      scalar.scalar.dtype = GetDataType<T>();
+      scalar.scalar.value = &value;
+    };
+    std::visit(visitor, scalar.value);
+  }
 
   void operator()(CallFrame::String& str) {
     str.span.ptr = str.value.data();
     str.span.len = str.value.size();
   }
+
+  void operator()(CallFrame::Dictionary&) {}
 };
 
 // An std::visit overload set to get CallFrame::Attribute XLA FFI type.
 struct CallFrame::AttributeType {
-  XLA_FFI_AttrType operator()(int32_t&) { return XLA_FFI_AttrType_I32; }
-
-  XLA_FFI_AttrType operator()(int64_t&) { return XLA_FFI_AttrType_I64; }
+  XLA_FFI_AttrType operator()(CallFrame::Array&) {
+    return XLA_FFI_AttrType_ARRAY;
+  }
 
-  XLA_FFI_AttrType operator()(float&) { return XLA_FFI_AttrType_F32; }
+  XLA_FFI_AttrType operator()(CallFrame::Scalar&) {
+    return XLA_FFI_AttrType_SCALAR;
+  }
 
   XLA_FFI_AttrType operator()(CallFrame::String&) {
     return XLA_FFI_AttrType_STRING;
@@ -308,6 +359,10 @@ struct CallFrame::AttributeStorage {
     return &value;
   }
 
+  void* operator()(CallFrame::Array& array) { return &array.array; }
+
+  void* operator()(CallFrame::Scalar& scalar) { return &scalar.scalar; }
+
   void* operator()(CallFrame::String& str) { return &str.span; }
 
   void* operator()(CallFrame::Dictionary& dict) {
diff --git a/third_party/xla/xla/ffi/call_frame.h b/third_party/xla/xla/ffi/call_frame.h
index b765dc887d8fcd..b8f05105d92343 100644
--- a/third_party/xla/xla/ffi/call_frame.h
+++ b/third_party/xla/xla/ffi/call_frame.h
@@ -57,11 +57,15 @@ class CallFrameBuilder {
   CallFrameBuilder(CallFrameBuilder&&);
   CallFrameBuilder& operator=(CallFrameBuilder&&);
 
+  using Scalar = std::variant<int32_t, int64_t, float>;
+  using Array = std::variant<std::vector<int32_t>, std::vector<int64_t>,
+                             std::vector<float>>;
+
   // Declare implementation detail structs for call frame builder storage.
   struct Dictionary;
 
   // Attributes that do not support nested dictionaries.
-  using FlatAttribute = std::variant<int32_t, int64_t, float, std::string>;
+  using FlatAttribute = std::variant<Scalar, Array, std::string>;
   using FlatAttributesMap = absl::flat_hash_map<std::string, FlatAttribute>;
 
   // Attributes that support arbitrary nesting.
@@ -123,13 +127,15 @@ class CallFrame {
 
   // Declare implementation detail structs for call frame storage.
   struct Arguments;
+  struct Array;
   struct Attributes;
   struct Buffer;
   struct Dictionary;
   struct NamedAttribute;
+  struct Scalar;
   struct String;
 
-  using Attribute = std::variant<int32_t, int64_t, float, String, Dictionary>;
+  using Attribute = std::variant<Scalar, Array, String, Dictionary>;
 
   CallFrame(absl::Span<const CallFrameBuilder::Buffer> args,
             const CallFrameBuilder::AttributesMap& attrs);
diff --git a/third_party/xla/xla/ffi/ffi.h b/third_party/xla/xla/ffi/ffi.h
index 38939c4fb2f630..6720b641ee25a5 100644
--- a/third_party/xla/xla/ffi/ffi.h
+++ b/third_party/xla/xla/ffi/ffi.h
@@ -109,10 +109,10 @@ struct ArgBinding<Buffer<dtype, rank>> {
 
 template <>
 struct ArgDecoding<BufferBase> {
-  XLA_ATTRIBUTE_ALWAYS_INLINE
+  XLA_FFI_ATTRIBUTE_ALWAYS_INLINE
   static std::optional<BufferBase> Decode(XLA_FFI_ArgType type, void* arg,
                                           DiagnosticEngine& diagnostic) {
-    if (type != XLA_FFI_ArgType_BUFFER) {
+    if (XLA_FFI_PREDICT_FALSE(type != XLA_FFI_ArgType_BUFFER)) {
       return diagnostic.Emit("Wrong argument type: expected ")
              << XLA_FFI_ArgType_BUFFER << " but got " << type;
     }
@@ -129,24 +129,25 @@ struct ArgDecoding<BufferBase> {
 
 template <PrimitiveType dtype, size_t rank>
 struct ArgDecoding<Buffer<dtype, rank>> {
-  XLA_ATTRIBUTE_ALWAYS_INLINE
+  XLA_FFI_ATTRIBUTE_ALWAYS_INLINE
   static std::optional<Buffer<dtype, rank>> Decode(
       XLA_FFI_ArgType type, void* arg, DiagnosticEngine& diagnostic) {
-    if (type != XLA_FFI_ArgType_BUFFER) {
+    if (XLA_FFI_PREDICT_FALSE(type != XLA_FFI_ArgType_BUFFER)) {
       return diagnostic.Emit("Wrong argument type: expected ")
              << XLA_FFI_ArgType_BUFFER << " but got " << type;
     }
 
     auto* buf = reinterpret_cast<XLA_FFI_Buffer*>(arg);
 
-    if (auto actual_dtype = PrimitiveType(buf->dtype); actual_dtype != dtype) {
+    if (auto actual_dtype = PrimitiveType(buf->dtype);
+        XLA_FFI_PREDICT_FALSE(actual_dtype != dtype)) {
       return diagnostic.Emit("Wrong buffer dtype: expected ")
              << primitive_util::LowercasePrimitiveTypeName(dtype) << " but got "
              << primitive_util::LowercasePrimitiveTypeName(actual_dtype);
     }
 
     if constexpr (rank != internal::kDynamicRank) {
-      if (buf->rank != rank) {
+      if (XLA_FFI_PREDICT_FALSE(buf->rank != rank)) {
         return diagnostic.Emit("Wrong buffer rank: expected ")
                << rank << " but got " << buf->rank;
       }
@@ -164,6 +165,34 @@ struct ArgDecoding<Buffer<dtype, rank>> {
 // Attributes decoding
 //===----------------------------------------------------------------------===//
 
+#define XLA_FFI_REGISTER_ARRRAY_ATTR_DECODING(T, TYPE)                   \
+  template <>                                                            \
+  struct AttrDecoding<absl::Span<const T>> {                             \
+    using Type = absl::Span<const T>;                                    \
+    static std::optional<Type> Decode(XLA_FFI_AttrType type, void* attr, \
+                                      DiagnosticEngine& diagnostic) {    \
+      if (XLA_FFI_PREDICT_FALSE(type != XLA_FFI_AttrType_ARRAY)) {       \
+        return diagnostic.Emit("Wrong attribute type: expected ")        \
+               << XLA_FFI_AttrType_ARRAY << " but got " << type;         \
+      }                                                                  \
+                                                                         \
+      auto* array = reinterpret_cast<XLA_FFI_Array*>(attr);              \
+      if (XLA_FFI_PREDICT_FALSE(array->dtype != TYPE)) {                 \
+        return diagnostic.Emit("Wrong array data type: expected ")       \
+               << TYPE << " but got " << array->dtype;                   \
+      }                                                                  \
+                                                                         \
+      return absl::Span<const T>(reinterpret_cast<T*>(array->data),      \
+                                 array->size);                           \
+    }                                                                    \
+  }
+
+XLA_FFI_REGISTER_ARRRAY_ATTR_DECODING(int32_t, XLA_FFI_DataType_S32);
+XLA_FFI_REGISTER_ARRRAY_ATTR_DECODING(int64_t, XLA_FFI_DataType_S64);
+XLA_FFI_REGISTER_ARRRAY_ATTR_DECODING(float, XLA_FFI_DataType_F32);
+
+#undef XLA_FFI_REGISTER_SCALAR_ATTR_DECODING
+
 // A type tag to mark i64 attributes as pointers to `T`.
 template <typename T>
 struct Pointer {};
@@ -174,13 +203,15 @@ struct AttrDecoding<Pointer<T>> {
 
   static std::optional<Type> Decode(XLA_FFI_AttrType type, void* attr,
                                     DiagnosticEngine& diagnostic) {
-    if (type != XLA_FFI_AttrType_I64) {
+    auto* scalar = reinterpret_cast<XLA_FFI_Scalar*>(attr);
+    if (XLA_FFI_PREDICT_FALSE(type != XLA_FFI_AttrType_SCALAR ||
+                              scalar->dtype != XLA_FFI_DataType_S64)) {
       return diagnostic.Emit("Wrong attribute type: ")
-             << "expected i64 for passing user data but got " << type;
+             << "expected i64 scalar for passing pointer but got " << type;
     }
 
     static_assert(sizeof(uintptr_t) == sizeof(int64_t));
-    uintptr_t ptr = *reinterpret_cast<uintptr_t*>(attr);
+    uintptr_t ptr = *reinterpret_cast<uintptr_t*>(scalar->value);
     return reinterpret_cast<Type>(ptr);
   }
 };
diff --git a/third_party/xla/xla/ffi/ffi_api.cc b/third_party/xla/xla/ffi/ffi_api.cc
index 3173157a10ba90..af8dbaa5d50b65 100644
--- a/third_party/xla/xla/ffi/ffi_api.cc
+++ b/third_party/xla/xla/ffi/ffi_api.cc
@@ -50,6 +50,10 @@ struct XLA_FFI_ExecutionContext {
 
 namespace xla::ffi {
 
+bool IsCommandBufferCompatible(XLA_FFI_Handler_Traits traits) {
+  return traits & XLA_FFI_HANDLER_TRAITS_COMMAND_BUFFER_COMPATIBLE;
+}
+
 //===----------------------------------------------------------------------===//
 // Calling XLA FFI handlers
 //===----------------------------------------------------------------------===//
@@ -84,7 +88,7 @@ Status Call(XLA_FFI_Handler* handler, CallFrame& call_frame,
 //===----------------------------------------------------------------------===//
 
 using HandlerKey = std::pair<std::string, std::string>;
-using HandlerRegistry = absl::flat_hash_map<HandlerKey, XLA_FFI_Handler*>;
+using HandlerRegistry = absl::flat_hash_map<HandlerKey, HandlerRegistration>;
 
 static HandlerKey MakeHandlerKey(std::string_view name,
                                  std::string_view platform) {
@@ -97,9 +101,10 @@ static HandlerRegistry& GetHandlerRegistry() {
 }
 
 static Status RegisterHandler(std::string_view name, std::string_view platform,
-                              XLA_FFI_Handler* handler) {
-  auto emplaced =
-      GetHandlerRegistry().try_emplace(MakeHandlerKey(name, platform), handler);
+                              XLA_FFI_Handler* handler,
+                              XLA_FFI_Handler_Traits traits) {
+  auto emplaced = GetHandlerRegistry().try_emplace(
+      MakeHandlerKey(name, platform), HandlerRegistration{handler, traits});
   if (!emplaced.second)
     return absl::InvalidArgumentError(
         absl::StrCat("Duplicate FFI handler registration for ", name,
@@ -107,8 +112,8 @@ static Status RegisterHandler(std::string_view name, std::string_view platform,
   return OkStatus();
 }
 
-absl::StatusOr<XLA_FFI_Handler*> FindHandler(std::string_view name,
-                                             std::string_view platform) {
+absl::StatusOr<HandlerRegistration> FindHandler(std::string_view name,
+                                                std::string_view platform) {
   auto it = GetHandlerRegistry().find(MakeHandlerKey(name, platform));
   if (it == GetHandlerRegistry().end())
     return absl::NotFoundError(absl::StrCat("No FFI handler registered for ",
@@ -116,9 +121,9 @@ absl::StatusOr<XLA_FFI_Handler*> FindHandler(std::string_view name,
   return it->second;
 }
 
-absl::flat_hash_map<std::string, XLA_FFI_Handler*> StaticRegisteredHandlers(
+absl::flat_hash_map<std::string, HandlerRegistration> StaticRegisteredHandlers(
     std::string_view platform) {
-  absl::flat_hash_map<std::string, XLA_FFI_Handler*> calls;
+  absl::flat_hash_map<std::string, HandlerRegistration> calls;
   for (const auto& [metadata, handler] : GetHandlerRegistry()) {
     if (absl::AsciiStrToLower(platform) == metadata.second) {
       calls[metadata.first] = handler;
@@ -236,7 +241,8 @@ static XLA_FFI_Error* XLA_FFI_Handler_Register(
       "XLA_FFI_Handler_Register", XLA_FFI_Handler_Register_Args_STRUCT_SIZE,
       args->struct_size));
 
-  if (auto status = RegisterHandler(args->name, args->platform, args->handler);
+  if (auto status = RegisterHandler(args->name, args->platform, args->handler,
+                                    args->traits);
       !status.ok()) {
     return new XLA_FFI_Error{std::move(status)};
   }
diff --git a/third_party/xla/xla/ffi/ffi_api.h b/third_party/xla/xla/ffi/ffi_api.h
index eae9eeda0a34c3..a90f19182ffc6e 100644
--- a/third_party/xla/xla/ffi/ffi_api.h
+++ b/third_party/xla/xla/ffi/ffi_api.h
@@ -27,7 +27,6 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/status.h"
-#include "xla/statusor.h"
 
 namespace xla::ffi {
 
@@ -62,12 +61,20 @@ Status Call(XLA_FFI_Handler* handler, CallFrame& call_frame,
 // XLA FFI registry
 //===----------------------------------------------------------------------===//
 
+struct HandlerRegistration {
+  XLA_FFI_Handler* handler = nullptr;
+  XLA_FFI_Handler_Traits traits = 0;
+};
+
+bool IsCommandBufferCompatible(XLA_FFI_Handler_Traits traits);
+
 // Returns registered FFI handler for a given name and platform, or an error if
 // it's not found in the static registry.
-absl::StatusOr<XLA_FFI_Handler*> FindHandler(std::string_view name,
-                                             std::string_view platform);
+absl::StatusOr<HandlerRegistration> FindHandler(std::string_view name,
+                                                std::string_view platform);
+
 // Returns all registered calls in the static registry for a given platform.
-absl::flat_hash_map<std::string, XLA_FFI_Handler*> StaticRegisteredHandlers(
+absl::flat_hash_map<std::string, HandlerRegistration> StaticRegisteredHandlers(
     std::string_view platform);
 
 //===----------------------------------------------------------------------===//
diff --git a/third_party/xla/xla/ffi/ffi_test.cc b/third_party/xla/xla/ffi/ffi_test.cc
index c28da195f7ba57..56c49fc8063b7a 100644
--- a/third_party/xla/xla/ffi/ffi_test.cc
+++ b/third_party/xla/xla/ffi/ffi_test.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include <vector>
 
 #include "absl/status/status.h"
+#include "absl/strings/match.h"
+#include "absl/types/span.h"
 #include "xla/ffi/call_frame.h"
 #include "xla/ffi/ffi_api.h"
 #include "xla/service/service_executable_run_options.h"
@@ -49,7 +51,8 @@ TEST(FfiTest, StaticRegistration) {
   XLA_FFI_DEFINE_HANDLER(NoOp1, noop);
 
   XLA_FFI_REGISTER_HANDLER(GetXlaFfiApi(), "no-op-0", "Host", NoOp0);
-  XLA_FFI_REGISTER_HANDLER(GetXlaFfiApi(), "no-op-1", "Host", NoOp1);
+  XLA_FFI_REGISTER_HANDLER(GetXlaFfiApi(), "no-op-1", "Host", NoOp1,
+                           XLA_FFI_HANDLER_TRAITS_COMMAND_BUFFER_COMPATIBLE);
 
   auto handler0 = FindHandler("no-op-0", "Host");
   auto handler1 = FindHandler("no-op-1", "Host");
@@ -57,6 +60,9 @@ TEST(FfiTest, StaticRegistration) {
   TF_ASSERT_OK(handler0.status());
   TF_ASSERT_OK(handler1.status());
 
+  ASSERT_EQ(handler0->traits, 0);
+  ASSERT_EQ(handler1->traits, XLA_FFI_HANDLER_TRAITS_COMMAND_BUFFER_COMPATIBLE);
+
   EXPECT_THAT(StaticRegisteredHandlers("Host"),
               UnorderedElementsAre(Pair("no-op-0", _), Pair("no-op-1", _)));
 }
@@ -155,6 +161,29 @@ TEST(FfiTest, BuiltinAttributesAutoBinding) {
   TF_ASSERT_OK(status);
 }
 
+TEST(FfiTest, ArrayAttr) {
+  CallFrameBuilder::AttributesBuilder attrs;
+  attrs.Insert("arr", std::vector<int32_t>({1, 2, 3, 4}));
+
+  CallFrameBuilder builder;
+  builder.AddAttributes(attrs.Build());
+  auto call_frame = builder.Build();
+
+  auto fn = [&](absl::Span<const int32_t> arr) {
+    EXPECT_EQ(arr.size(), 4);
+    EXPECT_EQ(arr[0], 1);
+    EXPECT_EQ(arr[1], 2);
+    EXPECT_EQ(arr[2], 3);
+    EXPECT_EQ(arr[3], 4);
+    return absl::OkStatus();
+  };
+
+  auto handler = Ffi::Bind().Attr<absl::Span<const int32_t>>("arr").To(fn);
+  auto status = Call(*handler, call_frame);
+
+  TF_ASSERT_OK(status);
+}
+
 TEST(FfiTest, PointerAttr) {
   std::string foo = "foo";
 
@@ -349,9 +378,18 @@ TEST(FfiTest, DecodingErrors) {
 
   auto status = Call(*handler, call_frame);
 
-  ASSERT_EQ(
+  EXPECT_TRUE(absl::StrContains(
       status.message(),
-      "Failed to decode all FFI handler operands (bad operands at: 0, 1, 3)");
+      "Failed to decode all FFI handler operands (bad operands at: 0, 1, 3)"));
+
+  EXPECT_TRUE(absl::StrContains(
+      status.message(), "Attribute name mismatch: i32 vs not_i32_should_fail"));
+
+  EXPECT_TRUE(absl::StrContains(
+      status.message(), "Attribute name mismatch: i64 vs not_i64_should_fail"));
+
+  EXPECT_TRUE(absl::StrContains(
+      status.message(), "Attribute name mismatch: str vs not_str_should_fail"));
 }
 
 TEST(FfiTest, BufferBaseArgument) {
diff --git a/third_party/xla/xla/hlo/evaluator/BUILD b/third_party/xla/xla/hlo/evaluator/BUILD
index 0d6b67083576f7..52ce1414644efa 100644
--- a/third_party/xla/xla/hlo/evaluator/BUILD
+++ b/third_party/xla/xla/hlo/evaluator/BUILD
@@ -1,8 +1,8 @@
 # Description:
 #   XLA evaluator implementation.
 
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/BUILD b/third_party/xla/xla/hlo/experimental/auto_sharding/BUILD
index d661e1290b1469..aa8135cc69ea94 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/BUILD
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/BUILD
@@ -1,9 +1,9 @@
 # Automatic sharding annotation
 
 load("@bazel_skylib//rules:build_test.bzl", "build_test")
-load("//xla:xla.bzl", "auto_sharding_deps", "auto_sharding_solver_deps", "xla_cc_binary", "xla_cc_test")
 load("@local_tsl//tsl:tsl.default.bzl", "get_compatible_with_libtpu_portable")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
+load("//xla:xla.bzl", "auto_sharding_deps", "auto_sharding_solver_deps", "xla_cc_binary", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -318,12 +318,17 @@ xla_cc_test(
     deps = [
         ":auto_sharding",
         ":auto_sharding_option",
+        ":auto_sharding_strategy",
         ":auto_sharding_util",
         "//xla:statusor",
-        "//xla:test_helpers",
         "//xla/hlo/ir:hlo",
+        "//xla/hlo/utils:hlo_live_range",
         "//xla/hlo/utils:hlo_matchers",
+        "//xla/service:buffer_value",
+        "//xla/service:hlo_alias_analysis",
+        "//xla/service:hlo_memory_scheduler",
         "//xla/service:hlo_parser",
+        "//xla/service:hlo_value",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/container:flat_hash_map",
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
index dc7a4eb01edf79..49d4807266603b 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
-#include "absl/container/inlined_vector.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
@@ -162,7 +161,8 @@ std::vector<double> MemoryReshardingCostVector(
   auto required_sharding_for_resharding = required_sharding.IsTileMaximal()
                                               ? HloSharding::Replicate()
                                               : required_sharding;
-  CHECK_OK(required_sharding.Validate(operand_shape));
+  CHECK_OK(required_sharding.Validate(operand_shape))
+      << strategy_group->ToString();
   for (const auto& x : strategy_group->strategies) {
     ret.push_back(ComputeMemoryReshardingCost(operand_shape, x.output_sharding,
                                               required_sharding_for_resharding,
@@ -1421,6 +1421,9 @@ void TrimOrGenerateStrategiesBasedOnExistingSharding(
           cluster_env, pretrimmed_strategy_map, call_graph, strict);
     }
   } else {
+    if (existing_sharding.IsUnknown()) {
+      return;
+    }
     if (ShardingIsComplete(existing_sharding,
                            cluster_env.device_mesh_.num_elements())) {
       // Sharding provided by XLA users, we need to keep them.
@@ -1452,27 +1455,38 @@ void TrimOrGenerateStrategiesBasedOnExistingSharding(
           for (size_t i = 0; i < strategy_group->in_nodes.size(); i++) {
             HloInstruction* operand =
                 instructions.at(strategy_group->in_nodes.at(i)->instruction_id);
-            std::optional<HloSharding> input_sharding_or =
+            std::optional<HloSharding> input_sharding =
                 ShardingPropagation::GetShardingFromUser(*operand, *ins, 10,
                                                          true, call_graph);
-            if (input_sharding_or.has_value()) {
-              input_shardings.push_back(input_sharding_or.value());
-            }
-
             StrategyGroup* operand_strategy_group =
                 strategy_map.at(operand).get();
             Shape operand_shape = operand->shape();
             if (ins->opcode() == HloOpcode::kGetTupleElement) {
+              if (input_sharding && input_sharding->IsTuple()) {
+                input_sharding = input_sharding->GetSubSharding(
+                    operand->shape(), {ins->tuple_index()});
+              }
               operand_strategy_group =
                   operand_strategy_group->childs[ins->tuple_index()].get();
-              operand_shape = operand_shape.tuple_shapes(ins->tuple_index());
+              operand_shape = operand->shape().tuple_shapes(ins->tuple_index());
             }
+
+            if (input_sharding.has_value()) {
+              input_sharding = *input_sharding;
+            } else if (existing_sharding.Validate(operand_shape).ok()) {
+              input_sharding = existing_sharding;
+            } else {
+              input_sharding = HloSharding::Replicate();
+            }
+            CHECK(input_sharding.has_value());
+
+            input_shardings.push_back(*input_sharding);
             communication_resharding_costs.push_back(
                 CommunicationReshardingCostVector(
-                    operand_strategy_group, operand_shape, existing_sharding,
+                    operand_strategy_group, operand_shape, *input_sharding,
                     cluster_env));
             memory_resharding_costs.push_back(MemoryReshardingCostVector(
-                operand_strategy_group, operand_shape, existing_sharding,
+                operand_strategy_group, operand_shape, *input_sharding,
                 cluster_env));
           }
         }
@@ -1838,8 +1852,7 @@ AutoShardingSolverResult CallSolver(
   if (max_cost) {
     request.mutable_max_cost()->set_coeff(*max_cost);
   }
-  for (const auto& [edge, edge_cost] : cost_graph.edge_communication_costs_) {
-    const auto& edge_memory_cost = cost_graph.edge_memory_costs_.at(edge);
+  for (const auto& [edge, edge_cost] : cost_graph.edge_costs_) {
     AutoShardingSolverRequest_Pair raw_edge;
     raw_edge.set_first(edge.first);
     raw_edge.set_second(edge.second);
@@ -1848,8 +1861,8 @@ AutoShardingSolverResult CallSolver(
     AutoShardingSolverRequest_Costs mij;
     for (NodeStrategyIdx i = 0; i < edge_cost.n_; i++) {
       for (NodeStrategyIdx j = 0; j < edge_cost.m_; j++) {
-        rij.add_costs(edge_cost(i, j));
-        mij.add_costs(edge_memory_cost(i, j));
+        rij.add_costs(edge_cost(i, j).communication_cost);
+        mij.add_costs(edge_cost(i, j).memory_cost);
       }
     }
     request.mutable_resharding_costs()->Add(std::move(rij));
@@ -1915,8 +1928,8 @@ AutoShardingSolverResult CallSolver(
   for (const auto& pair : alias_set) {
     const StrategyGroup* src_strategy_group = strategy_groups[pair.first];
     const StrategyGroup* dst_strategy_group = strategy_groups[pair.second];
-    Matrix raw_cost(src_strategy_group->strategies.size(),
-                    dst_strategy_group->strategies.size());
+    Matrix<double> raw_cost(src_strategy_group->strategies.size(),
+                            dst_strategy_group->strategies.size());
     for (NodeStrategyIdx i = 0; i < src_strategy_group->strategies.size();
          ++i) {
       for (NodeStrategyIdx j = 0; j < dst_strategy_group->strategies.size();
@@ -2165,7 +2178,7 @@ Status SetHloShardingPostProcessing(
     const HloInstructionSequence& sequence, const StrategyMap& strategy_map,
     const CostGraph& cost_graph, absl::Span<const NodeStrategyIdx> s_val,
     const ClusterEnvironment& cluster_env, const bool crash_at_error,
-    absl::flat_hash_map<std::string, std::vector<HloSharding>>*
+    absl::flat_hash_map<std::string, std::vector<HloSharding>>&
         preserve_shardings) {
   const std::vector<HloInstruction*>& instructions = sequence.instructions();
   const Array<int64_t>& device_mesh = cluster_env.device_mesh_;
@@ -2255,8 +2268,8 @@ Status SetHloShardingPostProcessing(
       // In the analysis itself, we use replicated strategies as a stand-in for
       // the (expected) maximal sharding annotations that send-done ops usually
       // have. Here we restore these maximal shardings if present.
-      auto preserved_sharding_iter = preserve_shardings->find(inst->name());
-      if (preserved_sharding_iter != preserve_shardings->end()) {
+      auto preserved_sharding_iter = preserve_shardings.find(inst->name());
+      if (preserved_sharding_iter != preserve_shardings.end()) {
         const auto& preserved_sharding = preserved_sharding_iter->second;
         if (preserved_sharding.size() > 1) {
           std::vector<Shape> tuple_elements_shape(
@@ -2281,8 +2294,8 @@ Status SetHloShardingPostProcessing(
       // In the analysis itself, we use replicated strategies as a stand-in for
       // the (expected) maximal sharding annotations that send ops usually
       // have. Here we restore these maximal shardings if present.
-      auto preserved_sharding_iter = preserve_shardings->find(inst->name());
-      if (preserved_sharding_iter != preserve_shardings->end()) {
+      auto preserved_sharding_iter = preserve_shardings.find(inst->name());
+      if (preserved_sharding_iter != preserve_shardings.end()) {
         const auto& preserved_sharding = preserved_sharding_iter->second;
         if (preserved_sharding.size() > 1) {
           inst->set_sharding(
@@ -2295,6 +2308,15 @@ Status SetHloShardingPostProcessing(
       continue;
     } else {
       if (inst->shape().IsTuple()) {
+        // While we do not support nested tuples fully, this is a hack to get
+        // things to work in some cases (specifically observed for the llama and
+        // gemma models) where nested tuples as used as inputs/outputs of the
+        // kOptimizationBarrier instruction.
+        if (absl::c_any_of(
+                inst->shape().tuple_shapes(),
+                [](const Shape& shape) { return shape.IsTuple(); })) {
+          continue;
+        }
         switch (inst->opcode()) {
           case HloOpcode::kReduce:
           case HloOpcode::kCustomCall:
@@ -2342,7 +2364,7 @@ Status SetHloShardingPostProcessing(
               }
             }
             FixMixedMeshShapeReshardingGetTupleElementWithTupleOutput(
-                inst, dst_shardings, device_mesh, preserve_shardings);
+                inst, dst_shardings, device_mesh);
             break;
           }
 
@@ -2595,6 +2617,7 @@ void CheckUserShardingPreservation(
                    << preserve_shardings.at(inst->name())[0].ToString()
                    << "\nbut it's empty.";
       } else if (!inst->sharding().IsTuple() &&
+                 !preserve_shardings.at(inst->name())[0].IsUnknown() &&
                  preserve_shardings.at(inst->name())[0] != inst->sharding()) {
         LOG(FATAL) << "User sharding is not preserved! Instruction with name "
                    << inst->name() << " should be: "
@@ -2604,8 +2627,9 @@ void CheckUserShardingPreservation(
         const std::vector<HloSharding>* preserve_shardings_tuple =
             &preserve_shardings.at(inst->name());
         for (size_t i = 0; i < inst->shape().tuple_shapes_size(); i++) {
-          if (preserve_shardings_tuple->at(i) !=
-              inst->sharding().tuple_elements().at(i)) {
+          if (!preserve_shardings_tuple->at(i).IsUnknown() &&
+              preserve_shardings_tuple->at(i) !=
+                  inst->sharding().tuple_elements().at(i)) {
             LOG(FATAL) << "Tuple sharding is not preserved! Instruction "
                           "with name "
                        << inst->name() << " " << i << "th tuple element "
@@ -2620,11 +2644,12 @@ void CheckUserShardingPreservation(
   }
 }
 
-int64_t MemoryBudgetLowerBound(const HloModule& module,
-                               const LivenessSet& liveness_set,
-                               const HloAliasAnalysis* alias_analysis,
-                               const int64_t num_devices) {
-  auto get_value_sharding = [](const HloValue* value) {
+int64_t MemoryBudgetLowerBound(
+    const HloModule& module, const LivenessSet& liveness_set,
+    const HloAliasAnalysis& alias_analysis, const int64_t num_devices,
+    const absl::flat_hash_map<std::string, std::vector<HloSharding>>&
+        preserved_shardings) {
+  auto get_value_sharding = [](const HloValue* value) -> HloSharding {
     return !value->index().empty()
                ? value->instruction()->sharding().GetSubSharding(
                      value->instruction()->shape(), value->index())
@@ -2637,24 +2662,28 @@ int64_t MemoryBudgetLowerBound(const HloModule& module,
   // as aliasing HloValues are mapped to the same buffer.
   absl::flat_hash_map<HloBuffer::Id, const HloValue*>
       buffer_to_sharded_value_mapping;
-  for (LivenessIdx time_idx = 0; time_idx < liveness_set.size(); ++time_idx) {
-    for (const HloValue* value : liveness_set[time_idx]) {
-      const auto& buffer = alias_analysis->GetBufferContainingValue(*value);
+  bool vlog_is_on_5 = VLOG_IS_ON(5);
+  for (const HloBuffer& buffer : alias_analysis.buffers()) {
+    for (const HloValue* value : buffer.values()) {
       if (value->instruction()->has_sharding()) {
-        auto this_value_sharding = get_value_sharding(value);
-        auto iter = buffer_to_sharded_value_mapping.find(buffer.id());
-        if (iter != buffer_to_sharded_value_mapping.end()) {
-          auto buffer_value_sharding = get_value_sharding(iter->second);
-          if (this_value_sharding != buffer_value_sharding) {
-            // TODO(pratikf): This is an unavoidable situation, but possibly
-            // there is a better design decision that can be made here.
-            VLOG(1) << "We have a situation where two HloValues alias, but "
-                       "they have different shardings. This can happen in the "
-                       "presence of user-specified shardings, and is expected. "
-                       "This, however, means that the memory budget estimate "
-                       "is not very accurate. The aliasing HLOs are "
-                    << value->ToShortString() << " and "
-                    << iter->second->ToShortString();
+        if (vlog_is_on_5) {
+          const HloSharding& this_value_sharding = get_value_sharding(value);
+          auto iter = buffer_to_sharded_value_mapping.find(buffer.id());
+          if (iter != buffer_to_sharded_value_mapping.end()) {
+            const HloSharding& buffer_value_sharding =
+                get_value_sharding(iter->second);
+            if (this_value_sharding != buffer_value_sharding) {
+              // TODO(pratikf): This is an unavoidable situation, but possibly
+              // there is a better design decision that can be made here.
+              VLOG(1)
+                  << "We have a situation where two HloValues alias, but "
+                     "they have different shardings. This can happen in the "
+                     "presence of user-specified shardings, and is expected. "
+                     "This, however, means that the memory budget estimate "
+                     "is not very accurate. The aliasing HLOs are "
+                  << value->ToShortString() << " and "
+                  << iter->second->ToShortString();
+            }
           }
         }
         buffer_to_sharded_value_mapping[buffer.id()] = value;
@@ -2663,25 +2692,51 @@ int64_t MemoryBudgetLowerBound(const HloModule& module,
   }
 
   int64_t max_memory_usage = 0;
+  absl::flat_hash_map<const HloValue*, int64_t> value_to_memory_size_mapping;
   for (LivenessIdx time_idx = 0; time_idx < liveness_set.size(); ++time_idx) {
     int64_t memory_usage = 0;
     for (const HloValue* value : liveness_set[time_idx]) {
       if (value->instruction()->shape().IsTuple() && value->index().empty()) {
         continue;
       }
-      Shape shape =
-          ShapeUtil::GetSubshape(value->instruction()->shape(), value->index());
-      const auto& buffer = alias_analysis->GetBufferContainingValue(*value);
-      auto iter = buffer_to_sharded_value_mapping.find(buffer.id());
+
+      auto iter1 = value_to_memory_size_mapping.find(value);
+      if (iter1 != value_to_memory_size_mapping.end()) {
+        memory_usage += iter1->second;
+        continue;
+      }
+
       std::optional<HloSharding> optional_sharding = std::nullopt;
-      if (iter != buffer_to_sharded_value_mapping.end()) {
-        optional_sharding = get_value_sharding(iter->second);
+      const HloBuffer& buffer = alias_analysis.GetBufferContainingValue(*value);
+      auto iter2 = buffer_to_sharded_value_mapping.find(buffer.id());
+      if (iter2 != buffer_to_sharded_value_mapping.end()) {
+        // The instructions here can have partial sharding annotations from
+        // previous iterations with partial mesh shapes when
+        // solve_nd_sharding_iteratively is true. To exclude these, we only
+        // utilize those shardings which corresponding to the current device
+        // mesh.
+        if (preserved_shardings.find(value->instruction()->name()) !=
+            preserved_shardings.end()) {
+          optional_sharding = get_value_sharding(iter2->second);
+        } else {
+          const HloSharding& value_sharding = get_value_sharding(iter2->second);
+          if (!value_sharding.IsTiled() ||
+              value_sharding.TotalNumTiles() == num_devices) {
+            optional_sharding = value_sharding;
+          }
+        }
       }
-      memory_usage +=
+
+      const Shape& shape =
+          ShapeUtil::GetSubshape(value->instruction()->shape(), value->index());
+      int64_t value_memory_usage =
           GetShardedInstructionSize(shape, num_devices, optional_sharding);
+      value_to_memory_size_mapping[value] = value_memory_usage;
+      memory_usage += value_memory_usage;
     }
     max_memory_usage = std::max(max_memory_usage, memory_usage);
   }
+
   return max_memory_usage;
 }
 
@@ -3399,6 +3454,14 @@ AutoShardingImplementation::SaveAndRemoveShardingAnnotation(
                                          /* save_for_copy_users */ false,
                                          preserve_shardings);
       }
+      if (inst->has_sharding() &&
+          spmd::IsShardingMisaligned(inst->sharding(), inst->shape())) {
+        LOG(WARNING)
+            << "Instruction " << inst->name()
+            << " has a user sharding annotation that is misaligned. Shape: "
+            << inst->shape().ToString()
+            << ". Sharding:" << inst->sharding().ToString();
+      }
     }
   }
 
@@ -3566,9 +3629,9 @@ absl::StatusOr<AutoShardingResult> AutoShardingImplementation::RunAutoSharding(
   absl::flat_hash_map<const HloValue*, HloLiveRange::TimeBound>&
       buffer_live_ranges = hlo_live_range->buffer_live_ranges();
   spmd::LivenessSet liveness_set(hlo_live_range->schedule_end_time() + 1);
-  for (const auto& iter : buffer_live_ranges) {
-    for (spmd::LivenessIdx i = iter.second.start; i <= iter.second.end; ++i) {
-      liveness_set[i].push_back(iter.first);
+  for (const auto& [hlo_value, live_range] : buffer_live_ranges) {
+    for (spmd::LivenessIdx i = live_range.start; i <= live_range.end; ++i) {
+      liveness_set[i].push_back(hlo_value);
     }
   }
   VLOG(10) << hlo_live_range->ToString();
@@ -3607,7 +3670,6 @@ absl::StatusOr<AutoShardingResult> AutoShardingImplementation::RunAutoSharding(
       .shape_size = [](const Shape& shape) { return spmd::GetBytes(shape); }};
   HloCostAnalysis hlo_cost_analysis(hlo_cost_analysis_options);
   CHECK_OK(module->entry_computation()->Accept(&hlo_cost_analysis));
-
   for (size_t mesh_idx = 0; mesh_idx < partial_mesh_shapes.size(); ++mesh_idx) {
     // Adjust existing shardings with current partial mesh shapes.
     std::vector<int64_t> mesh_shape = partial_mesh_shapes[mesh_idx];
@@ -3631,9 +3693,16 @@ absl::StatusOr<AutoShardingResult> AutoShardingImplementation::RunAutoSharding(
         return changed.status();
       }
     }
-    std::vector<int64_t> device_mesh_ids = std::vector<int64_t>(total_devices);
-    std::iota(device_mesh_ids.begin(), device_mesh_ids.end(), 0);
-    device_mesh.SetValues(device_mesh_ids);
+    if (option_.device_mesh_ids.size() == total_devices) {
+      // It is unclear what device order to use for partial meshes. So we only
+      // use the actual device order only for the final full mesh.
+      device_mesh.SetValues(option_.device_mesh_ids);
+    } else {
+      std::vector<int64_t> device_mesh_ids =
+          std::vector<int64_t>(total_devices);
+      std::iota(device_mesh_ids.begin(), device_mesh_ids.end(), 0);
+      device_mesh.SetValues(device_mesh_ids);
+    }
 
     // TODO (zhuohan): Include the prof result as an option.
     spmd::ProfilingResult prof_result;
@@ -3643,8 +3712,8 @@ absl::StatusOr<AutoShardingResult> AutoShardingImplementation::RunAutoSharding(
 
     XLA_VLOG_LINES(6, module->ToString());
     const int64_t memory_lower_bound = spmd::MemoryBudgetLowerBound(
-        *module, liveness_set, alias_analysis.get(),
-        device_mesh.num_elements());
+        *module, liveness_set, *alias_analysis, device_mesh.num_elements(),
+        preserve_shardings);
     const float memory_lower_bound_gb =
         static_cast<float>(memory_lower_bound) / (1024 * 1024 * 1024);
     LOG(INFO) << "Memory consumption lower bound is " << memory_lower_bound_gb
@@ -3712,7 +3781,7 @@ absl::StatusOr<AutoShardingResult> AutoShardingImplementation::RunAutoSharding(
     std::vector<absl::flat_hash_set<spmd::EdgeIdx>> node_to_edges(
         strategy_groups.size());
     spmd::EdgeIdx edge_idx = 0;
-    for (const auto& [edge, _] : cost_graph.edge_communication_costs_) {
+    for (const auto& [edge, _] : cost_graph.edge_costs_) {
       node_to_edges[edge.second].insert(edge_idx);
       ++edge_idx;
     }
@@ -3761,7 +3830,7 @@ absl::StatusOr<AutoShardingResult> AutoShardingImplementation::RunAutoSharding(
     XLA_VLOG_LINES(5, PrintAutoShardingSolution(sequence, liveness_set,
                                                 strategy_map, strategy_groups,
                                                 cost_graph, s_val, objective));
-    XLA_VLOG_LINES(1, PrintSolutionMemoryUsage(liveness_set, strategy_map,
+    XLA_VLOG_LINES(6, PrintSolutionMemoryUsage(liveness_set, strategy_map,
                                                cost_graph, s_val));
 
     // ----- Substitute all-reduce with reduce-scatter -----
@@ -3776,7 +3845,7 @@ absl::StatusOr<AutoShardingResult> AutoShardingImplementation::RunAutoSharding(
       if (!SetHloShardingPostProcessing(
                sequence, strategy_map, cost_graph, s_val, cluster_env,
                /* crash_at_error */ !option_.try_multiple_mesh_shapes,
-               &preserve_shardings)
+               preserve_shardings)
                .ok()) {
         return AutoShardingResult::kModuleUnchanged;
       }
@@ -3817,9 +3886,6 @@ bool ModuleHasUserShardings(const HloModule* module) {
   return has_shardings;
 }
 
-AutoSharding::AutoSharding(const AutoShardingOption& option)
-    : option_(option) {}
-
 bool IsSmallTensor(const HloInstruction* ins,
                    const AutoShardingOption& option) {
   return spmd::GetInstructionSize(ins->shape()) <=
@@ -3857,6 +3923,9 @@ std::unique_ptr<HloModule> CloneModule(const HloModule* module) {
   return module_clone;
 }
 
+AutoSharding::AutoSharding(const AutoShardingOption& option)
+    : option_(option) {}
+
 absl::StatusOr<bool> AutoSharding::Run(
     HloModule* module,
     const absl::flat_hash_set<absl::string_view>& execution_threads) {
@@ -3931,13 +4000,33 @@ absl::StatusOr<bool> AutoSharding::Run(
     mesh_shapes.push_back(option_.device_mesh_shape);
   }
 
+  HloInstruction* parameter_instruction =
+      module->entry_computation()->parameter_instruction(0);
+  if (parameter_instruction->shape().IsTuple() &&
+      parameter_instruction->has_sharding()) {
+    CHECK_EQ(module->entry_computation()->num_parameters(), 1);
+    parameter_instruction->set_sharding(
+        spmd::ReplaceGivenShardingsWithUnknownForTuple(
+            parameter_instruction->sharding(), parameter_instruction->shape(),
+            module->config().allow_spmd_sharding_propagation_to_parameters()));
+  }
+
+  HloInstruction* root_instruction =
+      module->entry_computation()->root_instruction();
+  if (root_instruction->shape().IsTuple() && root_instruction->has_sharding()) {
+    root_instruction->set_sharding(
+        spmd::ReplaceGivenShardingsWithUnknownForTuple(
+            root_instruction->sharding(), root_instruction->shape(),
+            module->config().allow_spmd_sharding_propagation_to_output()));
+  }
+
   absl::flat_hash_map<std::string, const HloInstruction*>
       sharding_propagation_solution;
   std::unique_ptr<HloModule> module_with_default_solution = nullptr;
   if (option_.use_sharding_propagation_for_default_shardings) {
     module_with_default_solution = CloneModule(module);
-    // TODO(pratikf): Ensure that we're passing the correct custom call sharding
-    // helper to the sharding propagation pass.
+    // TODO(pratikf): Ensure that we're passing the correct custom call
+    // sharding helper to the sharding propagation pass.
     auto sharding_prop = ShardingPropagation(
         /*is_spmd */ true, /*propagate_metadata */ false,
         /*allow_spmd_sharding_propagation_to_output*/
@@ -4011,16 +4100,7 @@ absl::StatusOr<bool> AutoSharding::Run(
 
   absl::StatusOr<bool> module_is_changed;
   if (skip_auto_sharding) {
-    VLOG(1) << "Solver timed out. Will now rely on sharding propagation to "
-               "perform sharding.";
-    if (!ModuleHasUserShardings(module)) {
-      LOG(WARNING)
-          << "The auto-sharding solver has timed out without a solution. "
-             "Further, as the input module does not contain any sharding "
-             "annotations, we cannot rely on sharding propagation to perform "
-             "heuristic-guided sharding. The module therefore may not be "
-             "sharded leading to low performance.";
-    }
+    LOG(FATAL) << "The auto-sharding solver has timed out without a solution.";
     module_is_changed = false;
   } else {
     std::string trying_to_find;
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.h b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.h
index 61d38b293bdd75..4a0dbe0902fe9c 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.h
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding.h
@@ -40,11 +40,11 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_sharding.h"
 #include "xla/hlo/utils/hlo_live_range.h"
 #include "xla/service/call_graph.h"
+#include "xla/service/hlo_alias_analysis.h"
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/service/hlo_pass_interface.h"
 #include "xla/shape.h"
 #include "xla/status.h"
-#include "xla/statusor.h"
 
 namespace xla {
 
@@ -385,6 +385,15 @@ BuildStrategyAndCost(const HloInstructionSequence& sequence,
                      const HloCostAnalysis& hlo_cost_analysis,
                      bool trying_multiple_mesh_shapes);
 
+// Computes an approximate lower bound on the per-device memory usage of a
+// module once it has been sharded. This quantity is multiplied with
+// memory_budget_ratio to obtain the memory budget using in our ILP formulation.
+int64_t MemoryBudgetLowerBound(
+    const HloModule& module, const LivenessSet& liveness_set,
+    const HloAliasAnalysis& alias_analysis, int64_t num_devices,
+    const absl::flat_hash_map<std::string, std::vector<HloSharding>>&
+        preserved_shardings);
+
 }  // namespace spmd
 }  // namespace xla
 
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_cost_graph.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_cost_graph.cc
index 5db37cc868bc2f..1156e0b80c3027 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_cost_graph.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_cost_graph.cc
@@ -51,22 +51,16 @@ CostGraph::CostGraph(const StrategyGroups& strategy_groups,
       if (!in_nodes[i]->is_tuple) {
         NodeIdx src_idx = in_nodes[i]->node_idx;
         NodeIdx dst_idx = strategy_group->node_idx;
-        Matrix edge_communication_cost =
-            CreateEdgeCommunicationCost(src_idx, dst_idx, i, strategy_group);
-        Matrix edge_memory_cost =
-            CreateEdgeMemoryCost(src_idx, dst_idx, i, strategy_group);
-        AddEdgeCost(src_idx, dst_idx, edge_communication_cost,
-                    edge_memory_cost);
+        EdgeReshardingCostMatrix edge_cost =
+            CreateEdgeCost(src_idx, dst_idx, i, strategy_group);
+        AddEdgeCost(src_idx, dst_idx, edge_cost);
       } else if (in_nodes[i]->is_tuple && in_nodes.size() > 1) {
         for (size_t l = 0; l < in_nodes[i]->childs.size(); ++l) {
           NodeIdx src_idx = in_nodes[i]->childs[l]->node_idx;
           NodeIdx dst_idx = strategy_group->node_idx;
-          Matrix edge_communication_cost = CreateEdgeCommunicationCost(
-              src_idx, dst_idx, i, strategy_group, true);
-          Matrix edge_memory_cost =
-              CreateEdgeMemoryCost(src_idx, dst_idx, i, strategy_group, true);
-          AddEdgeCost(src_idx, dst_idx, edge_communication_cost,
-                      edge_memory_cost);
+          EdgeReshardingCostMatrix edge_cost =
+              CreateEdgeCost(src_idx, dst_idx, i, strategy_group, true);
+          AddEdgeCost(src_idx, dst_idx, edge_cost);
         }
       } else {
         CHECK_EQ(in_nodes.size(), 1)
@@ -80,12 +74,9 @@ CostGraph::CostGraph(const StrategyGroups& strategy_groups,
           // operands. If there is only one operand and it's a tuple, the
           // first index of communication_resharding_costs is for the tuple
           // element.
-          Matrix edge_communication_cost = CreateEdgeCommunicationCost(
+          EdgeReshardingCostMatrix edge_cost = CreateEdgeCost(
               src_idx, dst_idx, /*in_node_idx=*/l, strategy_group);
-          Matrix edge_memory_cost = CreateEdgeMemoryCost(
-              src_idx, dst_idx, /*in_node_idx=*/l, strategy_group);
-          AddEdgeCost(src_idx, dst_idx, edge_communication_cost,
-                      edge_memory_cost);
+          AddEdgeCost(src_idx, dst_idx, edge_cost);
         }
       }
     }
@@ -110,8 +101,8 @@ CostGraph::CostGraph(const StrategyGroups& strategy_groups,
     NodeIdx src_idx = pair.first->node_idx;
     NodeIdx dst_idx = pair.second->node_idx;
 
-    Matrix edge_communication_cost(node_lens_[src_idx], node_lens_[dst_idx]);
-    Matrix edge_memory_cost(node_lens_[src_idx], node_lens_[dst_idx]);
+    EdgeReshardingCostMatrix edge_cost(node_lens_[src_idx],
+                                       node_lens_[dst_idx]);
     absl::flat_hash_map<std::string, NodeStrategyIdx>
         src_strategy_name_to_idx_map;
     for (NodeStrategyIdx i = 0; i < node_lens_[src_idx]; ++i) {
@@ -132,49 +123,21 @@ CostGraph::CostGraph(const StrategyGroups& strategy_groups,
           CHECK_LE(std::abs(src_strategy.communication_cost -
                             dst_strategy.communication_cost),
                    1e-6);
-          edge_communication_cost(it->second, i) =
+          edge_cost(it->second, i).communication_cost =
               -src_strategy.communication_cost;
         }
       }
     }
-    AddEdgeCost(src_idx, dst_idx, edge_communication_cost, edge_memory_cost);
-  }
-}
-
-Matrix CostGraph::CreateEdgeCommunicationCost(const NodeIdx src_idx,
-                                              const NodeIdx dst_idx,
-                                              const size_t in_node_idx,
-                                              StrategyGroup* strategy_group,
-                                              const bool zero_cost) {
-  CHECK_LT(src_idx, node_lens_.size());
-  CHECK_LT(dst_idx, node_lens_.size());
-  Matrix edge_communication_cost(node_lens_[src_idx], node_lens_[dst_idx]);
-  for (NodeStrategyIdx k = 0; k < strategy_group->strategies.size(); ++k) {
-    const ShardingStrategy& strategy = strategy_group->strategies[k];
-    size_t start_idx = 0;
-    if (strategy.communication_resharding_costs[in_node_idx].size() >
-        node_lens_[src_idx]) {
-      start_idx = strategy.communication_resharding_costs[in_node_idx].size() -
-                  node_lens_[src_idx];
-    }
-    for (size_t j = start_idx;
-         j < strategy.communication_resharding_costs[in_node_idx].size(); ++j) {
-      edge_communication_cost(j - start_idx, k) =
-          zero_cost ? 0
-                    : strategy.communication_resharding_costs[in_node_idx][j];
-    }
+    AddEdgeCost(src_idx, dst_idx, edge_cost);
   }
-  return edge_communication_cost;
 }
 
-Matrix CostGraph::CreateEdgeMemoryCost(const NodeIdx src_idx,
-                                       const NodeIdx dst_idx,
-                                       const size_t in_node_idx,
-                                       StrategyGroup* strategy_group,
-                                       const bool zero_cost) {
+EdgeReshardingCostMatrix CostGraph::CreateEdgeCost(
+    const NodeIdx src_idx, const NodeIdx dst_idx, const size_t in_node_idx,
+    StrategyGroup* strategy_group, const bool zero_cost) {
   CHECK_LT(src_idx, node_lens_.size());
   CHECK_LT(dst_idx, node_lens_.size());
-  Matrix edge_communication_cost(node_lens_[src_idx], node_lens_[dst_idx]);
+  EdgeReshardingCostMatrix edge_cost(node_lens_[src_idx], node_lens_[dst_idx]);
   for (NodeStrategyIdx k = 0; k < strategy_group->strategies.size(); ++k) {
     const ShardingStrategy& strategy = strategy_group->strategies[k];
     size_t start_idx = 0;
@@ -187,46 +150,43 @@ Matrix CostGraph::CreateEdgeMemoryCost(const NodeIdx src_idx,
     }
     for (size_t j = start_idx;
          j < strategy.memory_resharding_costs[in_node_idx].size(); ++j) {
-      edge_communication_cost(j - start_idx, k) =
-          zero_cost ? 0 : strategy.memory_resharding_costs[in_node_idx][j];
+      double communication_cost = 0;
+      double memory_cost = 0;
+      if (!zero_cost) {
+        communication_cost =
+            strategy.communication_resharding_costs[in_node_idx][j];
+        memory_cost = strategy.memory_resharding_costs[in_node_idx][j];
+      }
+      edge_cost(j - start_idx, k) =
+          EdgeReshardingCost(communication_cost, memory_cost);
     }
   }
-  return edge_communication_cost;
+  return edge_cost;
 }
 
-Matrix CostGraph::GetEdgeCommunicationCost(const NodeIdx i, const NodeIdx j) {
+EdgeReshardingCostMatrix CostGraph::GetEdgeCost(const NodeIdx i,
+                                                const NodeIdx j) {
   if (i <= j) {
-    return edge_communication_costs_[{i, j}];
+    return edge_costs_[{i, j}];
   }
-  return edge_communication_costs_[{j, i}].Transpose();
+  return edge_costs_[{j, i}].Transpose();
 }
 
-Matrix CostGraph::GetEdgeMemoryCost(const NodeIdx i, const NodeIdx j) {
-  if (i <= j) {
-    return edge_memory_costs_[{i, j}];
-  }
-  return edge_memory_costs_[{j, i}].Transpose();
-}
-
-void CostGraph::AddEdgeCost(NodeIdx i, NodeIdx j, Matrix& cost,
-                            Matrix& memory_cost) {
+void CostGraph::AddEdgeCost(NodeIdx i, NodeIdx j,
+                            EdgeReshardingCostMatrix& cost) {
   if (i > j) {
     std::swap(i, j);
     cost = cost.Transpose();
-    memory_cost = memory_cost.Transpose();
   }
 
-  if (edge_communication_costs_.contains({i, j})) {
+  if (edge_costs_.contains({i, j})) {
     CHECK(adjacency_[i].contains(j));
     CHECK(adjacency_[j].contains(i));
-    edge_communication_costs_[{i, j}] =
-        edge_communication_costs_[{i, j}] + cost;
-    edge_memory_costs_[{i, j}] = edge_memory_costs_[{i, j}] + memory_cost;
+    edge_costs_[{i, j}] = edge_costs_[{i, j}] + cost;
   } else {
     adjacency_[i].insert(j);
     adjacency_[j].insert(i);
-    edge_communication_costs_[{i, j}] = cost;
-    edge_memory_costs_[{i, j}] = memory_cost;
+    edge_costs_[{i, j}] = cost;
   }
 }
 
@@ -237,13 +197,11 @@ void CostGraph::RemoveEdge(NodeIdx i, NodeIdx j) {
 
   CHECK(adjacency_[i].contains(j));
   CHECK(adjacency_[j].contains(i));
-  CHECK(edge_communication_costs_.contains({i, j}));
-  CHECK(edge_memory_costs_.contains({i, j}));
+  CHECK(edge_costs_.contains({i, j}));
 
   adjacency_[i].erase(j);
   adjacency_[j].erase(i);
-  edge_communication_costs_.erase({i, j});
-  edge_memory_costs_.erase({i, j});
+  edge_costs_.erase({i, j});
 }
 
 void CostGraph::MergeNode(const NodeIdx src, const NodeIdx dst) {
@@ -253,7 +211,7 @@ void CostGraph::MergeNode(const NodeIdx src, const NodeIdx dst) {
   CHECK(!merged_to_.contains(dst));
   CHECK_NE(src, dst);
 
-  Matrix edge_communication_cost = GetEdgeCommunicationCost(dst, src);
+  EdgeReshardingCostMatrix edge_cost = GetEdgeCost(dst, src);
 
   std::vector<NodeStrategyIdx> reindexing(node_lens_[dst]);
   if (node_lens_[dst] == node_lens_[src]) {
@@ -277,7 +235,7 @@ void CostGraph::MergeNode(const NodeIdx src, const NodeIdx dst) {
       // as the last strategy in BuildStrategyAndCost.
       keys.reserve(node_lens_[src]);
       for (NodeStrategyIdx j = 0; j < node_lens_[src]; ++j) {
-        keys.push_back({edge_communication_cost(i, j), -j});
+        keys.push_back({edge_cost(i, j).communication_cost, -j});
       }
 
       std::sort(arange.begin(), arange.end(), [&keys](int l, int r) {
@@ -296,25 +254,19 @@ void CostGraph::MergeNode(const NodeIdx src, const NodeIdx dst) {
   for (const NodeIdx adj : adj_list) {
     if (adj == dst) {
       for (NodeStrategyIdx i = 0; i < node_lens_[dst]; ++i) {
-        extra_node_costs_[dst][i] += edge_communication_cost(i, reindexing[i]);
+        extra_node_costs_[dst][i] +=
+            edge_cost(i, reindexing[i]).communication_cost;
       }
     } else {
-      Matrix added_edge_communication_cost(node_lens_[dst], node_lens_[adj]);
-      Matrix added_edge_memory_cost(node_lens_[dst], node_lens_[adj]);
-      Matrix edge_communication_cost_src_adj =
-          GetEdgeCommunicationCost(src, adj);
-      Matrix edge_memory_cost_src_adj = GetEdgeMemoryCost(src, adj);
-
+      EdgeReshardingCostMatrix added_edge_cost(node_lens_[dst],
+                                               node_lens_[adj]);
+      EdgeReshardingCostMatrix edge_cost_src_adj = GetEdgeCost(src, adj);
       for (NodeStrategyIdx i = 0; i < node_lens_[dst]; ++i) {
         for (NodeStrategyIdx k = 0; k < node_lens_[adj]; ++k) {
-          added_edge_communication_cost(i, k) =
-              edge_communication_cost_src_adj(reindexing[i], k);
-          added_edge_memory_cost(i, k) =
-              edge_memory_cost_src_adj(reindexing[i], k);
+          added_edge_cost(i, k) = edge_cost_src_adj(reindexing[i], k);
         }
       }
-      AddEdgeCost(dst, adj, added_edge_communication_cost,
-                  added_edge_memory_cost);
+      AddEdgeCost(dst, adj, added_edge_cost);
     }
   }
   // Remove edges
@@ -380,7 +332,7 @@ std::string CostGraph::ToString() const {
   }
   absl::StrAppend(&str, "\n");
 
-  for (const auto& iter : edge_communication_costs_) {
+  for (const auto& iter : edge_costs_) {
     absl::StrAppend(&str, "Edge (", iter.first.first, ", ", iter.first.second,
                     "):\n");
     absl::StrAppend(&str, iter.second.ToString(), "\n");
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_cost_graph.h b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_cost_graph.h
index 6ef20f764cbca5..08b0bd968b6d4c 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_cost_graph.h
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_cost_graph.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/log/check.h"
+#include "absl/strings/str_cat.h"
 #include "absl/types/span.h"
 #include "xla/hlo/experimental/auto_sharding/auto_sharding_strategy.h"
 #include "xla/hlo/experimental/auto_sharding/matrix.h"
@@ -32,6 +33,28 @@ limitations under the License.
 namespace xla {
 namespace spmd {
 
+struct EdgeReshardingCost {
+  double communication_cost = 0;
+  double memory_cost = 0;
+
+  EdgeReshardingCost() : communication_cost(0), memory_cost(0) {}
+
+  EdgeReshardingCost(double communication_cost_, double memory_cost_)
+      : communication_cost(communication_cost_), memory_cost(memory_cost_) {}
+
+  EdgeReshardingCost operator+(const EdgeReshardingCost& other) const {
+    return EdgeReshardingCost(other.communication_cost + communication_cost,
+                              other.memory_cost + memory_cost);
+  }
+
+  std::string ToString() const {
+    return absl::StrCat("{communication_cost=", communication_cost,
+                        ", memory_cost=", memory_cost, "}");
+  }
+};
+
+using EdgeReshardingCostMatrix = Matrix<EdgeReshardingCost>;
+
 // A graph data structure to simplify the edge cost graph. It merges nodes and
 // performs path compression.
 class CostGraph {
@@ -39,20 +62,14 @@ class CostGraph {
   CostGraph(const StrategyGroups& strategy_groups,
             const AssociativeDotPairs& associative_dot_pairs);
 
-  Matrix CreateEdgeCommunicationCost(NodeIdx src_idx, NodeIdx dst_idx,
-                                     size_t in_node_idx,
-                                     StrategyGroup* strategy_group,
-                                     bool zero_cost = false);
-
-  Matrix CreateEdgeMemoryCost(NodeIdx src_idx, NodeIdx dst_idx,
-                              size_t in_node_idx, StrategyGroup* strategy_group,
-                              bool zero_cost = false);
-
-  Matrix GetEdgeCommunicationCost(NodeIdx i, NodeIdx j);
+  EdgeReshardingCostMatrix CreateEdgeCost(NodeIdx src_idx, NodeIdx dst_idx,
+                                          size_t in_node_idx,
+                                          StrategyGroup* strategy_group,
+                                          bool zero_cost = false);
 
-  Matrix GetEdgeMemoryCost(NodeIdx i, NodeIdx j);
+  EdgeReshardingCostMatrix GetEdgeCost(NodeIdx i, NodeIdx j);
 
-  void AddEdgeCost(NodeIdx i, NodeIdx j, Matrix& cost, Matrix& memory_cost);
+  void AddEdgeCost(NodeIdx i, NodeIdx j, EdgeReshardingCostMatrix& cost);
 
   void RemoveEdge(NodeIdx i, NodeIdx j);
 
@@ -90,8 +107,8 @@ class CostGraph {
   std::vector<StableHashSet<int>> adjacency_;
   // The cost matrix between two nodes.
 
-  StableHashMap<std::pair<NodeIdx, NodeIdx>, Matrix> edge_communication_costs_;
-  StableHashMap<std::pair<NodeIdx, NodeIdx>, Matrix> edge_memory_costs_;
+  StableHashMap<std::pair<NodeIdx, NodeIdx>, EdgeReshardingCostMatrix>
+      edge_costs_;
   // The extra node costs introduced by merging nodes.
   std::vector<std::vector<double>> extra_node_costs_;
   // The reindexing vector of the node.
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_solver.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_solver.cc
index 600a9053f5d3da..618073f9900f8a 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_solver.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_solver.cc
@@ -73,6 +73,9 @@ constexpr double kMemoryMultiplier = 1e-6;
 // Any memory terms below this threshold will be dropped (to reduce MIP size).
 constexpr double kTinyTermThreshold = 1e-6;
 
+// Any memory segments differing by this amount are skipped (reduces MIP size).
+constexpr double kSimilarityThreshold = 1e-2;
+
 bool AutoShardingSolverResult::operator==(
     const AutoShardingSolverResult& other) const {
   return status == other.status &&
@@ -213,6 +216,24 @@ AutoShardingSolverRequest ScaleRequest(
   return scaled_request;
 }
 
+double MemoryDifference(
+    const AutoShardingSolverRequest& request,
+    const tsl::protobuf::RepeatedPtrField<AutoShardingSolverRequest_Costs>& c,
+    const absl::flat_hash_set<int64_t>& live_prev,
+    const absl::flat_hash_set<int64_t>& live_curr) {
+  double memory_diff = 0.0;  // How much this segment differs from the last.
+  absl::flat_hash_set<int64_t> live_union;
+  live_union.insert(live_prev.begin(), live_prev.end());
+  live_union.insert(live_curr.begin(), live_curr.end());
+  for (int64_t idx : live_union) {
+    if (!live_prev.contains(idx) || !live_curr.contains(idx)) {
+      memory_diff +=
+          *std::max_element(c.at(idx).costs().begin(), c.at(idx).costs().end());
+    }
+  }
+  return memory_diff;
+}
+
 // Taking an auto-sharding problem (`request`) as an input, calls the OR tools
 // CP-SAT solver and outputs a solution to the input problem.
 //
@@ -276,6 +297,12 @@ AutoShardingSolverRequest ScaleRequest(
 // 3. If request.makespan_coeff is present, the objective additionally includes
 //    a makespan term. This is experimental and turned off by default.
 // 4. request.max_departures is used only for debugging and can be ignored.
+// 5. Note that due to our modeling of XLA's AllReduceReassociate optimization
+//    (more details in CostGraph::CostGraph() in auto_sharding_cost_graph.cc,
+//    and in CreateElementwiseOperatorStrategies() in auto_sharding.cc), there
+//    can be a few (usually < 10) edges in the problem with negative costs. This
+//    is guaranteed to never produce a negative overall cost for the graph,
+//    however.
 AutoShardingSolverResult CallORToolsSolver(
     const AutoShardingSolverRequest& unscaled_request) {
   const AutoShardingSolverRequest& request = ScaleRequest(unscaled_request);
@@ -460,7 +487,27 @@ AutoShardingSolverResult CallORToolsSolver(
   // c.
   if (request.memory_budget() > 0) {
     int tiny_term_count = 0;
+    int segment_similarity_skips = 0;
+    absl::flat_hash_set<int64_t> live_nodes_prev, live_edges_prev;
     for (LivenessIdx time_idx = 0; time_idx < request.live_size(); ++time_idx) {
+      // Decide whether this segment is similar enough to be skipped.
+      absl::flat_hash_set<int64_t> live_nodes_curr, live_edges_curr;
+      const auto& live_nodes = request.live(time_idx).nodes();
+      live_nodes_curr.insert(live_nodes.begin(), live_nodes.end());
+      double memory_diff = MemoryDifference(request, request.memory_costs(),
+                                            live_nodes_prev, live_nodes_curr);
+      if (!request.live_edges().empty() && request.enable_memory_edge_costs()) {
+        const auto& live_edges = request.live_edges(time_idx).edges();
+        live_edges_curr.insert(live_edges.begin(), live_edges.end());
+        memory_diff += MemoryDifference(request, request.memory_edge_costs(),
+                                        live_edges_prev, live_edges_curr);
+      }
+      if (memory_diff < kSimilarityThreshold * request.memory_budget()) {
+        ++segment_similarity_skips;
+        continue;
+      }
+      live_nodes_prev = live_nodes_curr;
+      live_edges_prev = live_edges_curr;
       MPConstraint* constraint =
           solver->MakeRowConstraint(-MPSolver::infinity(), MPSolver::infinity(),
                                     absl::StrCat("mem[", time_idx, "]"));
@@ -506,6 +553,8 @@ AutoShardingSolverResult CallORToolsSolver(
                         (request.memory_budget() - tiny_term_total));
     }
     LOG(INFO) << "Number of tiny terms: " << tiny_term_count;
+    LOG(INFO) << "Skipped " << segment_similarity_skips << " segments out of "
+              << request.live().size() << " due to similarity";
     if (overbudget_var) {
       solver->MutableObjective()->SetCoefficient(
           overbudget_var,
@@ -513,7 +562,8 @@ AutoShardingSolverResult CallORToolsSolver(
     }
     LOG(INFO) << "Minimum memory budget estimate: "
               << MinimumMemoryBudgetRequired(request);
-    LOG(INFO) << "Using memory budget: " << request.memory_budget();
+    LOG(INFO) << "Using memory budget: "
+              << static_cast<double>(request.memory_budget());
   }
 
   // d. specified via "BoolVarArray"
@@ -1040,9 +1090,6 @@ Status ValidateRequest(const AutoShardingSolverRequest& request) {
     const int num_u_strategies = request.computation_costs(u).costs_size();
     const int num_v_strategies = request.computation_costs(v).costs_size();
     CHECK_EQ(num_strategies, num_u_strategies * num_v_strategies);
-    for (EdgeStrategyIdx strategy = 0; strategy < num_strategies; ++strategy) {
-      TF_RET_CHECK(request.resharding_costs(e).costs(strategy) >= 0.0);
-    }
   }
   return OkStatus();
 }
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_strategy.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_strategy.cc
index 8dfe3877e3b7b2..a4a5149d0338f6 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_strategy.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_strategy.cc
@@ -277,17 +277,13 @@ BuildStrategyAndCost(const HloInstructionSequence& sequence,
         break;
       }
       case HloOpcode::kBroadcast: {
-        // For an unknown reason, we do not generate partially replicated
-        // strategies for >1D broadcast ops. This can be changed if we find that
-        // our search isn't exhaustive enough for certain ops.
         strategy_group =
             CreateAllStrategiesGroup(
                 ins, ins->shape(), instruction_id, strategy_groups, cluster_env,
                 strategy_map, option, replicated_penalty, batch_dim_map,
                 call_graph, only_allow_divisible,
                 /* create_replicated_strategies */ true,
-                /* create_partially_replicated_strategies */
-                (ins->shape().rank() == 1))
+                /* create_partially_replicated_strategies */ true)
                 .value();
         break;
       }
@@ -393,13 +389,32 @@ BuildStrategyAndCost(const HloInstructionSequence& sequence,
           // Find output shardings.
           switch (opcode) {
             case HloOpcode::kSlice: {
+              // When solve_nd_sharding_iteratively is true, in some cases, we
+              // can have 1D shardings where the total number of tiles is larger
+              // than the number of elements in the partial mesh (and is
+              // actually equal to the number of devices in the original
+              // mesh). Below, we use the correct mesh depending on the number
+              // of elements in the 1D sharding.
               bool is_1d_sharding =
                   VectorGreaterThanOneElementCount(
                       input_spec.tile_assignment().dimensions()) == 1;
-              output_spec = PropagateDimwiseShardingSlice(
-                  input_spec, operand->shape(), ins->shape(),
-                  is_1d_sharding ? cluster_env.device_mesh_1d_
-                                 : cluster_env.device_mesh_);
+              if (is_1d_sharding &&
+                  input_spec.TotalNumTiles() ==
+                      cluster_env.device_mesh_1d_.num_elements()) {
+                output_spec = PropagateDimwiseShardingSlice(
+                    input_spec, operand->shape(), ins->shape(),
+                    cluster_env.device_mesh_1d_);
+              } else if (is_1d_sharding) {
+                CHECK_EQ(input_spec.TotalNumTiles(),
+                         cluster_env.original_device_mesh_1d_.num_elements());
+                output_spec = PropagateDimwiseShardingSlice(
+                    input_spec, operand->shape(), ins->shape(),
+                    cluster_env.original_device_mesh_1d_);
+              } else {
+                output_spec = PropagateDimwiseShardingSlice(
+                    input_spec, operand->shape(), ins->shape(),
+                    cluster_env.device_mesh_);
+              }
               break;
             }
             case HloOpcode::kPad:
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_test.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_test.cc
index aa1167dfac33bb..359549ee07a872 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_test.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_test.cc
@@ -28,13 +28,19 @@ limitations under the License.
 #include "absl/log/log.h"
 #include "absl/strings/string_view.h"
 #include "xla/hlo/experimental/auto_sharding/auto_sharding_option.h"
+#include "xla/hlo/experimental/auto_sharding/auto_sharding_strategy.h"
 #include "xla/hlo/experimental/auto_sharding/auto_sharding_util.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_schedule.h"
 #include "xla/hlo/ir/hlo_sharding.h"
+#include "xla/hlo/utils/hlo_live_range.h"
 #include "xla/hlo/utils/hlo_matchers.h"
+#include "xla/service/buffer_value.h"
+#include "xla/service/hlo_alias_analysis.h"
+#include "xla/service/hlo_memory_scheduler.h"
 #include "xla/service/hlo_parser.h"
+#include "xla/service/hlo_value.h"
 #include "xla/statusor.h"
 #include "xla/tests/hlo_test_base.h"
 #include "tsl/lib/core/status_test_util.h"
@@ -59,7 +65,7 @@ using ::testing::UnorderedElementsAre;
 using DummyAutoShardingTest = HloTestBase;
 
 TEST_F(DummyAutoShardingTest, ReplicatedShardingDummy) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 ENTRY %elementwise {
   %param0 = f32[5,7,11,13]{3,2,1,0} parameter(0)
@@ -68,8 +74,8 @@ ENTRY %elementwise {
   ROOT %copy = f32[5,7,11,13]{3,2,1,0} copy(%add)
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   TF_ASSERT_OK_AND_ASSIGN(bool changed, DummyAutoSharding().Run(module.get()));
   EXPECT_TRUE(changed);
   auto* instruction = FindInstruction(module.get(), "param0");
@@ -79,14 +85,14 @@ ENTRY %elementwise {
 
 class AutoShardingTest : public HloTestBase {
  protected:
-  absl::string_view dot_hlo_string_ = R"(
+  const absl::string_view kDotHloString = R"(
 HloModule module
 ENTRY matmul {
   parameter.1 = f32[32,64]{1,0} parameter(0)
   parameter.2 = f32[64,128]{1,0} parameter(1)
   ROOT root = f32[32,128]{1,0} dot(parameter.1, parameter.2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
 })";
-  absl::string_view add_hlo_string_ = R"(
+  const absl::string_view kAddHloString = R"(
 HloModule module
 ENTRY %elementwise {
   %param0 = f32[16,32,64]{2,1,0} parameter(0)
@@ -96,8 +102,8 @@ ENTRY %elementwise {
   void RunMatMulAutoShardingWithOptions(
       AutoShardingOption option, size_t expected_num_tiles,
       size_t expected_sharded_dimensions = 1) {
-    TF_ASSERT_OK_AND_ASSIGN(auto module,
-                            ParseAndReturnVerifiedModule(dot_hlo_string_));
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                            ParseAndReturnVerifiedModule(kDotHloString));
     RunAutoShardingWithOptions(module.get(), option, expected_num_tiles,
                                expected_sharded_dimensions);
   }
@@ -105,8 +111,8 @@ ENTRY %elementwise {
   void RunAddAutoShardingWithOptions(AutoShardingOption option,
                                      size_t expected_num_tiles,
                                      size_t expected_sharded_dimensions = 1) {
-    TF_ASSERT_OK_AND_ASSIGN(auto module,
-                            ParseAndReturnVerifiedModule(add_hlo_string_));
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                            ParseAndReturnVerifiedModule(kAddHloString));
     RunAutoShardingWithOptions(module.get(), option, expected_num_tiles,
                                expected_sharded_dimensions);
   }
@@ -127,8 +133,8 @@ ENTRY %elementwise {
   }
 
   void RunMatMulAutoShardingWithOptionsExpectFail(AutoShardingOption option) {
-    TF_ASSERT_OK_AND_ASSIGN(auto module,
-                            ParseAndReturnVerifiedModule(dot_hlo_string_));
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                            ParseAndReturnVerifiedModule(kDotHloString));
     RunAutoShardingWithOptionsExpectFail(module.get(), option);
   }
 
@@ -140,8 +146,8 @@ ENTRY %elementwise {
   void RunMatMulAutoShardingWithOptionsNoDeviceIds(
       AutoShardingOption option, std::vector<int64_t> expected_tile,
       bool expeted_last_dim_replicate = false) {
-    TF_ASSERT_OK_AND_ASSIGN(auto module,
-                            ParseAndReturnVerifiedModule(dot_hlo_string_));
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                            ParseAndReturnVerifiedModule(kDotHloString));
     RunAutoShardingWithOptionsNoDeviceIds(module.get(), option, expected_tile,
                                           expeted_last_dim_replicate);
   }
@@ -162,8 +168,70 @@ ENTRY %elementwise {
   }
 };
 
+TEST_F(AutoShardingTest, MemoryBudgetTest) {
+  auto compute_memory_budget_lower_bound =
+      [](const HloModule& module, int64_t num_devices,
+         const absl::flat_hash_map<std::string, std::vector<HloSharding>>&
+             preserved_shardings = {}) -> absl::StatusOr<int64_t> {
+    auto size_fn = [](const BufferValue& buffer) {
+      return spmd::GetBytes(buffer.shape());
+    };
+    TF_ASSIGN_OR_RETURN(HloSchedule schedule,
+                        ScheduleModule(&module, size_fn,
+                                       ComputationSchedulerToModuleScheduler(
+                                           DFSMemoryScheduler),
+                                       /* execution_threads */ {}));
+    const HloComputation* entry_computation = module.entry_computation();
+    std::unique_ptr<HloAliasAnalysis> alias_analysis =
+        HloAliasAnalysis::Run(&module).value();
+
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloLiveRange> hlo_live_range,
+        HloLiveRange::Run(schedule, *alias_analysis, entry_computation));
+    absl::flat_hash_map<const HloValue*, HloLiveRange::TimeBound>&
+        buffer_live_ranges = hlo_live_range->buffer_live_ranges();
+    spmd::LivenessSet liveness_set(hlo_live_range->schedule_end_time() + 1);
+    for (const auto& [hlo_value, live_range] : buffer_live_ranges) {
+      for (spmd::LivenessIdx i = live_range.start; i <= live_range.end; ++i) {
+        liveness_set[i].push_back(hlo_value);
+      }
+    }
+    return spmd::MemoryBudgetLowerBound(module, liveness_set, *alias_analysis,
+                                        num_devices, preserved_shardings);
+  };
+
+  constexpr absl::string_view kHloString = R"(
+HloModule module
+ENTRY %elementwise {
+  %param0 = f32[16384,16384]{0,1} parameter(0)
+  %param1 = f32[16384,16384]{0,1} parameter(1)
+  %add = f32[16384,16384]{0,1} add(%param0, %param1)
+  ROOT %copy = f32[16384,16384]{0,1} copy(%add)
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kHloString));
+  TF_ASSERT_OK_AND_ASSIGN(HloSharding partial_sharding,
+                          ParseSharding("{devices=[64,1]<=[64]}"));
+  TF_ASSERT_OK_AND_ASSIGN(
+      int64_t partial_mesh_64x1_budget_lower_bound,
+      compute_memory_budget_lower_bound(*module, /* num_devices */ 64));
+  for (HloInstruction* ins : module->entry_computation()->instructions()) {
+    ins->set_sharding(partial_sharding);
+  }
+  TF_ASSERT_OK_AND_ASSIGN(
+      int64_t full_mesh_64x8_budget_lower_bound,
+      compute_memory_budget_lower_bound(*module, /* num_devices */ 512));
+  CHECK_LT(full_mesh_64x8_budget_lower_bound,
+           partial_mesh_64x1_budget_lower_bound)
+      << "The memory budget lower bound per device should be lower with a "
+         "larger number of devices. Instead, the bound was "
+      << partial_mesh_64x1_budget_lower_bound << " bytes for 64 devices and "
+      << full_mesh_64x8_budget_lower_bound << " bytes for 512 devices.";
+}
+
 TEST_F(AutoShardingTest, DISABLED_ElementWiseOperator) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 ENTRY %elementwise {
   %param0 = f32[128,128]{0,1} parameter(0)
@@ -172,8 +240,8 @@ ENTRY %elementwise {
   ROOT %copy = f32[128,128]{0,1} copy(%add)
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -189,7 +257,7 @@ ENTRY %elementwise {
 }
 
 TEST_F(AutoShardingTest, Unsupported3DShardingTest) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 ENTRY %elementwise {
   %param0 = f32[32,32,32,32] parameter(0)
@@ -198,8 +266,8 @@ ENTRY %elementwise {
   ROOT %copy = f32[32,32,32,32] copy(%add)
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   // The case of a fleet HLO when run with try_multiple_mesh_shapes = true
@@ -211,7 +279,7 @@ ENTRY %elementwise {
 }
 
 TEST_F(AutoShardingTest, NDIterativeSolveTest) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 ENTRY %elementwise {
@@ -230,7 +298,7 @@ ENTRY %elementwise {
   option.device_mesh_beta = {0.01, 1.0};
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+                          ParseAndReturnVerifiedModule(kHloString));
   TF_ASSERT_OK_AND_ASSIGN(bool changed, AutoSharding(option).Run(module.get()));
   VLOG(10) << module->ToString();
   EXPECT_TRUE(changed);
@@ -240,7 +308,7 @@ ENTRY %elementwise {
 }
 
 TEST_F(AutoShardingTest, SliceDeviceMeshTest) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 ENTRY %elementwise {
@@ -250,15 +318,15 @@ ENTRY %elementwise {
 })";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+                          ParseAndReturnVerifiedModule(kHloString));
   TF_ASSERT_OK_AND_ASSIGN(
-      bool changed,
-      AutoSharding(/* option */ {.enable = true,
-                                 .solve_nd_sharding_iteratively = true,
-                                 .device_mesh_shape = {2, 2},
-                                 .device_mesh_alpha = {1.0, 1.0},
-                                 .device_mesh_beta = {0.01, 1.0}})
-          .Run(module.get()));
+      bool changed, AutoSharding(/* option */ AutoShardingOption{
+                                     .enable = true,
+                                     .solve_nd_sharding_iteratively = true,
+                                     .device_mesh_shape = {2, 2},
+                                     .device_mesh_alpha = {1.0, 1.0},
+                                     .device_mesh_beta = {0.01, 1.0}})
+                        .Run(module.get()));
   VLOG(10) << module->ToString();
   EXPECT_TRUE(changed);
   const HloInstruction* slice = FindInstruction(module.get(), "slice");
@@ -269,8 +337,73 @@ ENTRY %elementwise {
             op::Sharding("{devices=[2,1,2]0,2,1,3 last_tile_dim_replicate}")));
 }
 
+TEST_F(AutoShardingTest, SliceMixedUserShardingTest) {
+  constexpr absl::string_view kHloString = R"(
+HloModule module
+
+ENTRY %elementwise {
+  param = s32[512,3084]{1,0} parameter(0), sharding={devices=[4,1]0,2,1,3}
+  slice = s32[512,2048]{1,0} slice(param), slice={[0:512], [0:2048]}
+  ROOT copy = s32[512,2048]{1,0} copy(slice)
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed,
+      AutoSharding(
+          /* option */ {
+              .enable = true,
+              .preserve_shardings =
+                  AutoShardingOption::PreserveShardingsType::kKeepAllShardings,
+              .solve_nd_sharding_iteratively = true,
+              .device_mesh_shape = {2, 2},
+              .device_mesh_ids = {0, 2, 1, 3},
+              .device_mesh_alpha = {1.0, 1.0},
+              .device_mesh_beta = {0.01, 1.0}})
+          .Run(module.get()));
+  VLOG(10) << module->ToString();
+  EXPECT_TRUE(changed);
+
+  std::vector<HloInstruction*> instructions =
+      module->entry_computation()->MakeInstructionPostOrder();
+  EXPECT_THAT(instructions,
+              Each(ResultOf(
+                  [](const HloInstruction* ins) { return ins->has_sharding(); },
+                  IsTrue())));
+  EXPECT_THAT(instructions, Each(op::Sharding("{devices=[4,1]0,2,1,3}")));
+}
+
+TEST_F(AutoShardingTest, UserShardingTest) {
+  constexpr absl::string_view kHloString = R"(
+HloModule module
+
+ENTRY %elementwise {
+  concatenate.76306 = bf16[1,4096,8,256]{3,2,1,0} parameter(0)
+  constant.15158 = bf16[] constant(0)
+  pad.70 = bf16[1,4352,8,256]{3,2,1,0} pad(concatenate.76306, constant.15158), padding=0_0x0_256x0_0x0_0, sharding={devices=[1,1,128,1]<=[128]}
+  ROOT copy.45 = bf16[1,4352,8,256]{3,2,1,0} copy(pad.70)
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed,
+      AutoSharding(
+          /* option */ AutoShardingOption{
+              .enable = true,
+              .preserve_shardings =
+                  AutoShardingOption::PreserveShardingsType::kKeepAllShardings,
+              .device_mesh_shape = {128, 1},
+              .device_mesh_alpha = {1.0, 1.0},
+              .device_mesh_beta = {0.01, 1.0}})
+          .Run(module.get()));
+  VLOG(10) << module->ToString();
+  EXPECT_TRUE(changed);
+}
+
 TEST_F(AutoShardingTest, RngBitGeneratorArrayInput) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule rng_bit_generator
 
 ENTRY %RngBitGenerator (p0: u64[2]) -> (u64[2], u32[16,16]) {
@@ -278,8 +411,8 @@ ENTRY %RngBitGenerator (p0: u64[2]) -> (u64[2], u32[16,16]) {
   ROOT %rand = (u64[2]{0}, u32[16,16]{1,0}) rng-bit-generator(u64[2]{0} %p0), algorithm=rng_three_fry
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -295,7 +428,7 @@ ENTRY %RngBitGenerator (p0: u64[2]) -> (u64[2], u32[16,16]) {
 }
 
 TEST_F(AutoShardingTest, RngBitGeneratorTupleInput) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule rng_bit_generator
 
 ENTRY %RngBitGenerator {
@@ -305,8 +438,8 @@ ENTRY %RngBitGenerator {
   ROOT rng-bit-generator = u32[100,100]{1,0:T(8,128)} rng-bit-generator(tuple.3), algorithm=rng_default
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -325,7 +458,7 @@ ENTRY %RngBitGenerator {
 }
 
 TEST_F(AutoShardingTest, DotLHSTwoNonContractingDims) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 ENTRY %entry {
   %param0 = f32[4,256,64]{2,1,0} parameter(0)
@@ -334,8 +467,8 @@ ENTRY %entry {
   ROOT %copy = f32[4,256,32]{2,1,0} copy(%dot)
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -377,7 +510,7 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, DotRHSTwoNonContractingDims) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 ENTRY %entry {
   %param0 = f32[4,256,32]{2,1,0} parameter(0)
@@ -386,8 +519,8 @@ ENTRY %entry {
   ROOT %copy = f32[32,4,8]{2,1,0} copy(%dot)
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -429,7 +562,7 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, DotTwoContractingDims) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 ENTRY %entry {
   %param0 = f32[4,256,64]{2,1,0} parameter(0)
@@ -438,8 +571,8 @@ ENTRY %entry {
   ROOT %copy = f32[64,32]{1,0} copy(%dot)
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -470,7 +603,7 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, TwoMatmul) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 ENTRY twomatmul {
   parameter.1 = f32[64,64]{1,0} parameter(0)
@@ -480,8 +613,8 @@ ENTRY twomatmul {
   ROOT dot.5 = f32[64,64]{1,0} dot(dot.4, parameter.3), lhs_contracting_dims={1}, rhs_contracting_dims={0}
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.allow_recompute_heavy_op = false;
@@ -513,7 +646,7 @@ ENTRY twomatmul {
               op::Sharding("{devices=[2,1,2]0,2,1,3 last_tile_dim_replicate}"));
 
   // Test with replicated strategies on for dot
-  TF_ASSERT_OK_AND_ASSIGN(module, ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(module, ParseAndReturnVerifiedModule(kHloString));
   option.enable = true;
   option.allow_recompute_heavy_op = true;
   option.device_mesh_shape = {2, 2};
@@ -549,7 +682,7 @@ ENTRY twomatmul {
 }
 
 TEST_F(AutoShardingTest, ProcessCustomCallShardings) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 ENTRY %entry {
@@ -561,8 +694,8 @@ ENTRY %entry {
   %copy.2 = f32[6,3] copy(%annotate)
   ROOT %copy.3 = f32[6,3] copy(%copy.2)
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -579,7 +712,7 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, SaveAndRemoveShardingAnnotationKeepAll) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
@@ -588,8 +721,8 @@ ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
   %dot = f32[64,32]{1,0} dot(f32[4,256,64]{2,1,0} %param0, f32[4,256,32]{2,1,0} %param1), lhs_contracting_dims={0,1}, rhs_contracting_dims={0,1}, sharding={devices=[2,2]0,1,2,3}
   ROOT %copy = f32[64,32]{1,0} copy(f32[64,32]{1,0} %dot), sharding={devices=[2,2]0,1,2,3}
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   // Keep all user shardings
   option.preserve_shardings =
@@ -633,7 +766,7 @@ ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
 
 TEST_F(AutoShardingTest,
        SaveAndRemoveShardingAnnotationKeepInputOutputSmallTensor) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
@@ -642,8 +775,8 @@ ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
   %dot = f32[64,32]{1,0} dot(f32[4,256,64]{2,1,0} %param0, f32[4,256,32]{2,1,0} %param1), lhs_contracting_dims={0,1}, rhs_contracting_dims={0,1}, sharding={replicated}
   ROOT %copy = f32[64,32]{1,0} copy(f32[64,32]{1,0} %dot), sharding={devices=[2,2]0,1,2,3}
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   // Keep all user shardings
   option.preserve_shardings =
@@ -683,7 +816,7 @@ ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
 }
 
 TEST_F(AutoShardingTest, SaveAndRemoveShardingAnnotationKeepInputOutput) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
@@ -694,8 +827,8 @@ ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
   %dot = f32[64,32]{1,0} dot(f32[4,256,64]{2,1,0} %param0_copy, f32[4,256,32]{2,1,0} %param1_copy), lhs_contracting_dims={0,1}, rhs_contracting_dims={0,1}, sharding={devices=[2,2]0,1,2,3}
   ROOT %copy = f32[64,32]{1,0} copy(f32[64,32]{1,0} %dot), sharding={devices=[2,2]0,1,2,3}
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.preserve_shardings =
       AutoShardingOption::PreserveShardingsType::kKeepInputOutputShardings;
@@ -761,7 +894,7 @@ ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
 }
 
 TEST_F(AutoShardingTest, SaveAndRemoveShardingAnnotationRemoveAll) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
@@ -773,8 +906,8 @@ ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
   rhs_contracting_dims={0,1}, sharding={devices=[2,2]0,1,2,3} ROOT %copy =
   f32[64,32]{1,0} copy(f32[64,32]{1,0} %dot), sharding={devices=[2,2]0,1,2,3}
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   // Remove all user shardings
   option.preserve_shardings =
@@ -798,7 +931,7 @@ ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
 }
 
 TEST_F(AutoShardingTest, SaveAndRemoveShardingAnnotationRemoveAllSmallTensor) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
@@ -807,8 +940,8 @@ ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
   %dot = f32[64,32]{1,0} dot(f32[4,256,64]{2,1,0} %param0, f32[4,256,32]{2,1,0} %param1), lhs_contracting_dims={0,1}, rhs_contracting_dims={0,1}, sharding={replicated}
   ROOT %copy = f32[64,32]{1,0} copy(f32[64,32]{1,0} %dot), sharding={replicated}
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   // Remove all user shardings
   option.preserve_shardings =
@@ -851,7 +984,7 @@ ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
 }
 
 TEST_F(AutoShardingTest, TupleReduceTest) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 %func (lhs_value: f32[], lhs_index: s32[], rhs_value: f32[], rhs_index: s32[]) -> (f32[], s32[]) {
   %lhs_value = f32[] parameter(0)
@@ -874,8 +1007,8 @@ ENTRY %entry {
   %constant.b = s32[] constant(0)
   %reduce = (f32[1,16]{1,0}, s32[1,16]{1,0}) reduce(f32[1,16,40]{2,1,0} %param0, s32[1,16,40]{2,1,0} %iota, f32[] %constant.a, s32[] %constant.b), dimensions={2}, to_apply=%func
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -897,7 +1030,7 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, ReduceTest) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 %func (x: f32[], y: f32[]) -> f32[] {
@@ -911,8 +1044,8 @@ ENTRY %entry {
   %param1 = f32[] parameter(1)
   %reduce = f32[1,16]{1,0} reduce(f32[1,16,128]{2,1,0} %param0, f32[] %param1), dimensions={2}, to_apply=%func
   })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -940,7 +1073,7 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, ScatterTest2D) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 region {
@@ -955,8 +1088,8 @@ ENTRY %Scatter {
   ROOT scatter = s32[4,128]{1,0} scatter(call, clamp, broadcast), update_window_dims={1}, inserted_window_dims={0}, scatter_dims_to_operand_dims={0,1}, index_vector_dim=1, indices_are_sorted=true, unique_indices=true, to_apply=region
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -979,7 +1112,7 @@ ENTRY %Scatter {
 }
 
 TEST_F(AutoShardingTest, ScatterTest3D) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 region {
@@ -994,8 +1127,8 @@ ENTRY %Scatter {
   ROOT scatter = f32[4,128,128]{2,1,0} scatter(call, clamp, multiply), update_window_dims={1,2}, inserted_window_dims={0}, scatter_dims_to_operand_dims={0,1,2}, index_vector_dim=1, indices_are_sorted=true, unique_indices=true, to_apply=region
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -1022,15 +1155,15 @@ ENTRY %Scatter {
 }
 
 TEST_F(AutoShardingTest, GatherTest) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 ENTRY %entry {
   %param0 = f32[256,1024]{0,1} parameter(0)
   %param1 = s32[128,512,1]{2,1,0} parameter(1)
   ROOT %gather = f32[128,512,1024]{2,1,0} gather(f32[256,1024]{0,1} %param0, s32[128,512,1]{2,1,0} %param1), offset_dims={2}, collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=2, slice_sizes={1,1024}
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -1053,15 +1186,15 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, GatherTestNoReshard) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 ENTRY %entry {
   get-tuple-element = s8[1000,128]{1,0} parameter(0)
   reshape = s32[8,1,1]{2,1,0} parameter(1)
   gather = s8[8,1,128]{2,1,0} gather(get-tuple-element, reshape), offset_dims={2}, collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=2, slice_sizes={1,128}
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {1, 1, 8};
@@ -1084,7 +1217,7 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, GatherConvTest) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 ENTRY %entry {
   %param0 = f32[1024,1024]{0,1} parameter(0)
@@ -1097,8 +1230,8 @@ ENTRY %entry {
   ROOT convolution = f32[128,1024,1024]{2,1,0} convolution(gather, reshape),
   window={size=1}, dim_labels=b0f_io0->b0f
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {4, 1, 1};
@@ -1383,7 +1516,7 @@ TEST_F(AutoShardingTest, InvalidOptions) {
 
 TEST_F(AutoShardingTest, AutoShardingKeepUserShardingInputOutput) {
   // An HLO Module with sharding for all instructions.
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
@@ -1393,8 +1526,8 @@ ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
   ROOT %copy = f32[64,32]{1,0} copy(f32[64,32]{1,0} %dot), sharding={devices=[2,2]0,1,2,3}
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   // Remove the sharding in dot
   auto* dot = FindInstruction(module.get(), "dot");
   dot->clear_sharding();
@@ -1415,7 +1548,7 @@ ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
 
 TEST_F(AutoShardingTest, AutoShardingKeepUserShardingAdd) {
   // An HLO Module with sharding for all instructions.
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 ENTRY %elementwise {
   %param0 = f32[128,128]{0,1} parameter(0)
@@ -1423,8 +1556,8 @@ ENTRY %elementwise {
   %add = f32[128,128]{0,1} add(%param0, %param1), sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate}
   ROOT %copy = f32[128,128]{0,1} copy(%add)
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   // Run AutoSharding
   AutoShardingOption option;
   option.enable = true;
@@ -1449,7 +1582,7 @@ ENTRY %elementwise {
 
 TEST_F(AutoShardingTest, AutoShardingKeepUserShardingDot) {
   // An HLO Module with sharding for all instructions.
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
@@ -1459,8 +1592,8 @@ ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
   ROOT %copy = f32[64,32]{1,0} copy(f32[64,32]{1,0} %dot), sharding={devices=[2,2]0,1,2,3}
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   // Remove the sharding in param0, param1 and copy
   auto* param0 = FindInstruction(module.get(), "param0");
   param0->clear_sharding();
@@ -1495,7 +1628,7 @@ ENTRY %entry (param0: f32[4,256,64], param1: f32[4,256,32]) -> f32[64,32] {
 }
 
 TEST_F(AutoShardingTest, DISABLED_AutoShardingKeepUserShardingTupleReduce) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 %func (lhs_value: f32[], lhs_index: s32[], rhs_value: f32[], rhs_index: s32[]) -> (f32[], s32[]) {
   %lhs_value = f32[] parameter(0)
@@ -1519,8 +1652,8 @@ ENTRY %entry {
   %reduce = (f32[1,16]{1,0}, s32[1,16]{1,0}) reduce(f32[1,16,40]{2,1,0} %param0, s32[1,16,40]{2,1,0} %iota, f32[] %constant.a, s32[] %constant.b), dimensions={2}, to_apply=%func,
     sharding={{devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}, {devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}}
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -1542,8 +1675,42 @@ ENTRY %entry {
   EXPECT_FALSE(param0->sharding().IsReplicated());
 }
 
+TEST_F(AutoShardingTest, GetTupleElementUserShardingsParameter) {
+  constexpr absl::string_view kHloString = R"(
+HloModule module
+ENTRY %tupleparameter {
+  %param0 = f32[32,64]{1,0} parameter(0)
+  %param1 = f32[32,64]{1,0} parameter(1), sharding={devices=[2,2]<=[4]}
+  %tuple1 = (f32[32,64]{1,0}, f32[32,64]{1,0}) tuple(f32[32,64]{1,0} %param0, f32[32,64]{1,0} %param1)
+  %first = f32[32,64]{1,0} get-tuple-element((f32[32,64]{1,0}, f32[32,64]{1,0}) %tuple1), index=0
+  %second = f32[32,64]{1,0} get-tuple-element((f32[32,64]{1,0}, f32[32,64]{1,0}) %tuple1), index=1, sharding={devices=[4,1]<=[4]}
+  ROOT root = f32[32,64]{1,0} add(%first, %second)
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
+  AutoShardingOption option;
+  option.enable = true;
+  option.preserve_shardings =
+      AutoShardingOption::PreserveShardingsType::kKeepAllShardings;
+  option.device_mesh_shape = {2, 2};
+  option.device_mesh_ids = {0, 1, 2, 3};
+  option.device_mesh_alpha = {1.0, 1.0};
+  option.device_mesh_beta = {0.01, 1.0};
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, AutoSharding(option).Run(module.get()));
+  VLOG(10) << module->ToString();
+  EXPECT_TRUE(changed);
+  const HloInstruction* param1 = FindInstruction(module.get(), "param1");
+  ASSERT_NE(param1, nullptr);
+  EXPECT_THAT(param1, op::Sharding("{devices=[2,2]<=[4]}"));
+
+  const HloInstruction* second = FindInstruction(module.get(), "root");
+  ASSERT_NE(second, nullptr);
+  EXPECT_THAT(second, op::Sharding("{devices=[4,1]<=[4]}"));
+}
+
 TEST_F(AutoShardingTest, DISABLED_TupleParameter) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 ENTRY %tupleparameter {
   %tuple_param = (f32[16,32,64]{2,1,0}, f32[16,32,64]{2,1,0}) parameter(0)
@@ -1552,8 +1719,8 @@ ENTRY %tupleparameter {
   ROOT root = f32[16,32,64]{2,1,0} add(%first, %second)
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -1573,7 +1740,7 @@ ENTRY %tupleparameter {
 
 // CRASHES
 TEST_F(AutoShardingTest, DISABLED_GetTupleElementWithUserShardingTest) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 %while_cond {
@@ -1605,8 +1772,8 @@ ENTRY %entry (param0: f32[16,256,256], param1: f32[16,256,256]) -> f32[16,256,25
   %tuple1 = f32[16,256,256]{2,1,0} get-tuple-element((u32[], f32[16,256,256]{2,1,0}, f32[16,256,256]{2,1,0}) %while.1), index=1, sharding={devices=[2,2,1]0,2,1,3}
   ROOT %tanh = f32[16,256,256]{2,1,0} tanh(f32[16,256,256]{2,1,0} %tuple1)
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.preserve_shardings =
       AutoShardingOption::PreserveShardingsType::kKeepAllShardings;
@@ -1620,7 +1787,7 @@ ENTRY %entry (param0: f32[16,256,256], param1: f32[16,256,256]) -> f32[16,256,25
 }
 
 TEST_F(AutoShardingTest, While) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 %cond {
@@ -1656,8 +1823,8 @@ ENTRY %entry {
   ROOT %result = bf16[128,512,768] get-tuple-element(%while), index=3
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -1700,7 +1867,7 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, DynamicSlice) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 ENTRY %entry {
   %param0 = s32[] parameter(0)
@@ -1715,8 +1882,8 @@ ENTRY %entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -1729,7 +1896,7 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, Alias) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module, input_output_alias={ {0}: (0, {}, may-alias), {1}: (1, {}, may-alias), {2}: (2, {}, may-alias), {3}: (3, {}, may-alias)}
 
 ENTRY %entry {
@@ -1741,8 +1908,8 @@ ENTRY %entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -1755,7 +1922,7 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, AliasTupleParameter) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module, input_output_alias={ {0}: (0, {0}, may-alias), {1}: (0, {1}, may-alias), {2}: (0, {2}, may-alias), {3}: (0, {3}, may-alias)}
 
 ENTRY %entry {
@@ -1768,8 +1935,8 @@ ENTRY %entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -1782,7 +1949,7 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, JaxRandomUniform) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 clone {
   lhs.1 = u32[] parameter(0)
@@ -1815,8 +1982,8 @@ ENTRY %entry {
   ROOT maximum = f32[8,512]{1,0} maximum(subtract, broadcast.d)
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -1833,7 +2000,7 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, Reshape) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 ENTRY %entry {
@@ -1846,8 +2013,8 @@ ENTRY %entry {
   %dot = bf16[512,1024,16,128]{3,2,1,0} dot(bf16[512,1024,2048]{2,1,0} %param.2, bf16[2048,16,128]{2,1,0} %reshape), lhs_contracting_dims={2}, rhs_contracting_dims={0}
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {64, 1};
@@ -1861,7 +2028,7 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, ReshapeWithInvalidUserSharding) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 ENTRY %entry {
@@ -1870,8 +2037,8 @@ ENTRY %entry {
   %copy = bf16[1,24,16,16]{3,2,1,0} copy(%reshape)
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {32, 1};
@@ -1887,15 +2054,15 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, Broadcast) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 ENTRY %entry {
   %param.0 = s32[32]{0} parameter(0)
   ROOT broadcast = s32[512,1024,1024,32]{3,2,1,0} broadcast(s32[32]{0} %param.0), dimensions={3}
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {1, 1, 64};
@@ -1906,7 +2073,7 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, TestReshardingCostsForUserAnnotatedSharding) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module
 
 ENTRY %entry {
@@ -1915,8 +2082,8 @@ ENTRY %entry {
   %dot = f32[256,256] dot(%param0, %param1), lhs_contracting_dims={1}, rhs_contracting_dims={1}
   ROOT %result = f32[256,256] tanh(%dot), sharding={devices=[1,4]0,1,2,3}
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -1932,7 +2099,7 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, AllowAliasToFollowerConversion) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module, input_output_alias={ {0}: (0, {}, may-alias), {1}: (1, {}, may-alias), {2}: (2, {}, may-alias), {3}: (3, {}, may-alias)}
 
 ENTRY %entry {
@@ -1944,8 +2111,8 @@ ENTRY %entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
@@ -1959,7 +2126,7 @@ ENTRY %entry {
 }
 
 TEST_F(AutoShardingTest, DisallowAliasToFollowerConversion) {
-  constexpr absl::string_view hlo_string = R"(
+  constexpr absl::string_view kHloString = R"(
 HloModule module, input_output_alias={ {0}: (0, {}, may-alias), {1}: (1, {}, may-alias), {2}: (2, {}, may-alias), {3}: (3, {}, may-alias)}
 
 ENTRY %entry {
@@ -1971,8 +2138,8 @@ ENTRY %entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
   AutoShardingOption option;
   option.enable = true;
   option.device_mesh_shape = {2, 2};
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.cc
index f2cf5dab656ecd..baa827febc909a 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.cc
@@ -1351,9 +1351,7 @@ HloInstruction* ReshardTensor(HloInstruction* tensor,
 void FixMixedMeshShapeReshardingGetTupleElementWithTupleOutput(
     HloInstruction* inst,
     const std::vector<std::optional<HloSharding>>& dst_shardings,
-    const Array<int64_t>& device_mesh,
-    absl::flat_hash_map<std::string, std::vector<HloSharding>>*
-        preserve_shardings) {
+    const Array<int64_t>& device_mesh) {
   size_t tuple_size = inst->shape().tuple_shapes_size();
   auto current_sharding = inst->sharding();
 
@@ -1414,7 +1412,7 @@ void FixMixedMeshShapeReshardingGetTupleElementWithTupleOutput(
 void FixMixedMeshShapeReshardingGetTupleElement(
     HloInstruction* inst, const HloSharding& dst_sharding,
     const Array<int64_t>& device_mesh,
-    absl::flat_hash_map<std::string, std::vector<HloSharding>>*
+    absl::flat_hash_map<std::string, std::vector<HloSharding>>&
         preserve_shardings) {
   HloInstruction* operand = inst->mutable_operand(0);
   auto input_tuple_sharding = operand->sharding();
@@ -1444,11 +1442,11 @@ void FixMixedMeshShapeReshardingGetTupleElement(
     TF_CHECK_OK(inst->ReplaceUseWith(user, replace_with));
   }
 
-  CHECK_NE(preserve_shardings, nullptr);
-  if (preserve_shardings->contains(inst->name())) {
-    (*preserve_shardings)[replace_with->name()] =
-        preserve_shardings->at(inst->name());
-    preserve_shardings->erase(inst->name());
+  auto iter = preserve_shardings.find(inst->name());
+  if (iter != preserve_shardings.end()) {
+    preserve_shardings[replace_with->name()] =
+        std::vector<HloSharding>(iter->second);
+    preserve_shardings.erase(inst->name());
   }
 }
 
@@ -1880,6 +1878,9 @@ int64_t GetInstructionSize(const Shape& shape) {
 
 int64_t GetShardedInstructionSize(const Shape& shape, int64_t num_devices,
                                   std::optional<HloSharding> sharding) {
+  if (sharding && sharding->IsUnknown()) {
+    sharding = HloSharding::Replicate();
+  }
   if (shape.IsTuple()) {
     int64_t size = 0;
     for (size_t i = 0; i < shape.tuple_shapes_size(); i++) {
@@ -2031,6 +2032,10 @@ absl::StatusOr<bool> AdjustShardingsWithPartialMeshShape(
       for (size_t i = 0; i < inst->shape().tuple_shapes_size(); i++) {
         auto shape = inst->shape().tuple_shapes(i);
         auto sharding = inst->sharding().tuple_elements()[i];
+        if (sharding.IsUnknown()) {
+          output_flattened_shardings.push_back(sharding);
+          continue;
+        }
         absl::StatusOr<std::optional<HloSharding>> new_sharding_result =
             AdjustShardingWithPartialMeshShapePerElement(
                 sharding, valid_shards, total_num_devices, crash_on_error);
@@ -2194,29 +2199,6 @@ void EnumerateAllPossibleMeshShapesHelper(
   }
 }
 
-std::vector<std::vector<int64_t>> EnumerateAllPossibleMeshShapes(
-    const int64_t num_devices, int num_mesh_dims, bool symmetrical_mesh_dims) {
-  std::vector<std::vector<int64_t>> result;
-  EnumerateAllPossibleMeshShapesHelper(num_devices, num_mesh_dims, {}, result);
-
-  if (symmetrical_mesh_dims) {
-    absl::flat_hash_set<absl::btree_multiset<int64_t>> dedup_result;
-    for (const std::vector<int64_t>& mesh_shape : result) {
-      dedup_result.insert(
-          absl::btree_multiset<int64_t>(mesh_shape.begin(), mesh_shape.end()));
-    }
-
-    result.clear();
-
-    for (const absl::btree_multiset<int64_t>& mesh_shape_set : dedup_result) {
-      result.push_back(
-          std::vector<int64_t>(mesh_shape_set.begin(), mesh_shape_set.end()));
-    }
-  }
-
-  return result;
-}
-
 std::vector<std::vector<int64_t>> InferMeshShapesToTry(
     const HloModule& module) {
   int64_t sharding_1d = -1;
@@ -2275,12 +2257,74 @@ std::vector<std::vector<int64_t>> InferOrEnumerateMeshShapesToTry(
     bool symmetrical_mesh_dims) {
   std::vector<std::vector<int64_t>> mesh_shapes = InferMeshShapesToTry(module);
   if (mesh_shapes.empty()) {
-    mesh_shapes = spmd::EnumerateAllPossibleMeshShapes(
-        num_devices, num_mesh_dims,
-        /* symmetrical_mesh_dims */ symmetrical_mesh_dims);
+    EnumerateAllPossibleMeshShapesHelper(num_devices, num_mesh_dims, {},
+                                         mesh_shapes);
+  }
+  if (symmetrical_mesh_dims) {
+    absl::flat_hash_set<absl::btree_multiset<int64_t>> dedup_result;
+    for (const std::vector<int64_t>& mesh_shape : mesh_shapes) {
+      dedup_result.insert(
+          absl::btree_multiset<int64_t>(mesh_shape.begin(), mesh_shape.end()));
+    }
+
+    mesh_shapes.clear();
+
+    for (const absl::btree_multiset<int64_t>& mesh_shape_set : dedup_result) {
+      mesh_shapes.push_back(
+          std::vector<int64_t>(mesh_shape_set.begin(), mesh_shape_set.end()));
+    }
   }
+
   return mesh_shapes;
 }
 
+bool IsShardingMisaligned(const HloSharding& sharding, const Shape& shape) {
+  if (shape.IsTuple()) {
+    for (size_t i = 0; i < shape.tuple_shapes_size(); ++i) {
+      if (IsShardingMisaligned(
+              sharding.IsTuple()
+                  ? sharding.GetSubSharding(shape, {static_cast<int64_t>(i)})
+                  : sharding,
+              shape.tuple_shapes(i))) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  if (sharding.IsReplicated() || sharding.IsManual() || sharding.IsUnknown() ||
+      sharding.IsTileMaximal()) {
+    return false;
+  }
+
+  for (size_t i = 0; i < shape.rank(); ++i) {
+    int64_t shape_dim = shape.dimensions()[i];
+    int64_t sharding_dim = sharding.tile_assignment().dim(i);
+    if (shape_dim % sharding_dim != 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+HloSharding ReplaceGivenShardingsWithUnknownForTuple(
+    const HloSharding& sharding, const Shape& shape,
+    absl::Span<const bool> to_replace_sharding_ids) {
+  std::vector<HloSharding> new_tuple_shardings;
+  int64_t num_elements = sharding.tuple_elements().size();
+  for (int32_t i = 0; i < num_elements; ++i) {
+    bool can_change_sharding = to_replace_sharding_ids.size() == 1
+                                   ? to_replace_sharding_ids[0]
+                                   : to_replace_sharding_ids[i];
+    if (can_change_sharding) {
+      new_tuple_shardings.push_back(HloSharding::Unknown());
+    } else {
+      new_tuple_shardings.push_back(sharding.tuple_elements()[i]);
+    }
+  }
+
+  return HloSharding::Tuple(shape, new_tuple_shardings);
+}
+
 }  // namespace spmd
 }  // namespace xla
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.h b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.h
index b3781ce3534f6f..4a129bb159076e 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.h
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_util.h
@@ -465,15 +465,13 @@ Shape ComputeIntermediateShape(const HloSharding& src_sharding,
 void FixMixedMeshShapeReshardingGetTupleElement(
     HloInstruction* inst, const HloSharding& dst_sharding,
     const Array<int64_t>& device_mesh,
-    absl::flat_hash_map<std::string, std::vector<HloSharding>>*
+    absl::flat_hash_map<std::string, std::vector<HloSharding>>&
         preserve_shardings);
 
 void FixMixedMeshShapeReshardingGetTupleElementWithTupleOutput(
     HloInstruction* inst,
     const std::vector<std::optional<HloSharding>>& dst_sharding,
-    const Array<int64_t>& device_mesh,
-    absl::flat_hash_map<std::string, std::vector<HloSharding>>*
-        preserve_shardings);
+    const Array<int64_t>& device_mesh);
 
 void FixMixedMeshShapeResharding(HloInstruction* inst, int operand_num,
                                  const HloSharding& dst_sharding,
@@ -655,6 +653,17 @@ std::vector<std::vector<int64_t>> InferOrEnumerateMeshShapesToTry(
     const HloModule& module, int64_t num_devices, int num_mesh_dims,
     bool symmetrical_mesh_dims);
 
+// Check if the sharding is "misaligned" wrt the shape. This is true if there is
+// at least one dimension of the tensor that is sharded over a number of devices
+// that do not complete divide the size of the tensor dimension.
+bool IsShardingMisaligned(const HloSharding& sharding, const Shape& shape);
+
+// In a given tuple sharding, replace certain leaves with
+// HloSharding::Unknown()
+HloSharding ReplaceGivenShardingsWithUnknownForTuple(
+    const HloSharding& sharding, const Shape& shape,
+    absl::Span<const bool> to_replace_sharding_ids);
+
 }  // namespace spmd
 }  // namespace xla
 
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/cluster_environment.h b/third_party/xla/xla/hlo/experimental/auto_sharding/cluster_environment.h
index 7bab542bdbd2e3..19736d19e25f0a 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/cluster_environment.h
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/cluster_environment.h
@@ -51,6 +51,7 @@ class ClusterEnvironment {
         prof_result_(prof_result),
         total_devices_(device_mesh.num_elements()),
         device_mesh_1d_(device_mesh),
+        original_device_mesh_1d_(original_device_mesh),
         auto_sharding_option_(auto_sharding_option) {
     // Build replica group for each dimension.
     non_zero_mesh_dims_ =
@@ -71,6 +72,12 @@ class ClusterEnvironment {
     std::vector<int64_t> device_mesh_1d_shape(device_mesh.num_dimensions(), 1);
     device_mesh_1d_shape[largest_dim_idx] = device_mesh.num_elements();
     device_mesh_1d_.Reshape(device_mesh_1d_shape);
+
+    std::vector<int64_t> original_device_mesh_1d_shape(
+        original_device_mesh.num_dimensions(), 1);
+    original_device_mesh_1d_shape[largest_dim_idx] =
+        original_device_mesh.num_elements();
+    original_device_mesh_1d_.Reshape(original_device_mesh_1d_shape);
   }
 
   size_t NumDevices() const { return total_devices_; }
@@ -171,6 +178,10 @@ class ClusterEnvironment {
   // Used for mixed mesh shape strategies.
   Array<int64_t> device_mesh_1d_;
 
+  // Cache a flatten 1d version of the original device mesh.
+  // Used for mixed mesh shape strategies.
+  Array<int64_t> original_device_mesh_1d_;
+
   // The option may override the cost of communication primitives
   const AutoShardingOption& auto_sharding_option_;
 
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/matrix.h b/third_party/xla/xla/hlo/experimental/auto_sharding/matrix.h
index 40eb8d35887685..903973eea5a3a6 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/matrix.h
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/matrix.h
@@ -36,6 +36,7 @@ namespace spmd {
 // It can create a view for matrix transpose without copying the memory.
 // TODO (zhuohan): Inherit from Array2D and add Transpose and operator+ (See
 // tensorflow/compiler/xla/array2d.h;l=39)
+template <typename T>
 class Matrix {
  public:
   Matrix() : n_(0), m_(0), transpose_(false), data_(nullptr) {}
@@ -44,11 +45,11 @@ class Matrix {
     this->n_ = n;
     this->m_ = m;
     transpose_ = false;
-    data_ = std::make_shared<std::vector<double>>(n * m, 0.0);
+    data_ = std::make_shared<std::vector<T>>(n * m, T());
   }
 
   Matrix(size_t n, size_t m, bool transpose,
-         std::shared_ptr<std::vector<double>> data) {
+         std::shared_ptr<std::vector<T>> data) {
     this->n_ = n;
     this->m_ = m;
     this->transpose_ = transpose;
@@ -57,7 +58,7 @@ class Matrix {
 
   Matrix Transpose() { return Matrix(m_, n_, !transpose_, data_); }
 
-  double operator()(size_t i, size_t j) const {
+  T operator()(size_t i, size_t j) const {
     size_t idx;
     if (transpose_) {
       idx = j * n_ + i;
@@ -69,7 +70,7 @@ class Matrix {
     return (*data_)[idx];
   }
 
-  double& operator()(size_t i, size_t j) {
+  T& operator()(size_t i, size_t j) {
     size_t idx;
     if (transpose_) {
       idx = j * n_ + i;
@@ -81,7 +82,7 @@ class Matrix {
     return (*data_)[idx];
   }
 
-  Matrix operator+(const Matrix& other) {
+  Matrix<T> operator+(const Matrix<T>& other) {
     CHECK_EQ(n_, other.n_);
     CHECK_EQ(m_, other.m_);
     Matrix ret = Matrix(n_, m_);
@@ -98,7 +99,7 @@ class Matrix {
 
     for (size_t i = 0; i < n_; ++i) {
       for (size_t j = 0; j < m_; ++j) {
-        absl::StrAppend(&str, operator()(i, j), " ");
+        absl::StrAppend(&str, operator()(i, j).ToString(), " ");
       }
       absl::StrAppend(&str, "\n");
     }
@@ -109,7 +110,7 @@ class Matrix {
   size_t n_;
   size_t m_;
   bool transpose_;
-  std::shared_ptr<std::vector<double>> data_;
+  std::shared_ptr<std::vector<T>> data_;
 };
 }  // namespace spmd
 }  // namespace xla
diff --git a/third_party/xla/xla/hlo/ir/hlo_computation.cc b/third_party/xla/xla/hlo/ir/hlo_computation.cc
index 16c9e0ce37b1c8..449a86db314cc3 100644
--- a/third_party/xla/xla/hlo/ir/hlo_computation.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_computation.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <iterator>
 #include <memory>
 #include <optional>
+#include <ostream>
 #include <queue>
 #include <stack>
 #include <string>
@@ -31,6 +32,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
+#include "absl/functional/function_ref.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
@@ -44,9 +46,11 @@ limitations under the License.
 #include "xla/printer.h"
 #include "xla/service/mapped_ptr_container_sorter.h"
 #include "xla/service/name_uniquer.h"
+#include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/status_macros.h"
 #include "xla/util.h"
+#include "tsl/lib/gtl/iterator_range.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/status.h"
@@ -55,7 +59,22 @@ namespace xla {
 
 using absl::StrCat;
 
-enum VisitState { kNew = 0, kVisiting = 1, kVisited = 2 };
+enum class VisitState { kNew = 0, kVisiting = 1, kVisited = 2 };
+
+static std::ostream& operator<<(std::ostream& os, const VisitState& state) {
+  switch (state) {
+    case VisitState::kNew:
+      os << "new";
+      break;
+    case VisitState::kVisiting:
+      os << "visiting";
+      break;
+    case VisitState::kVisited:
+      os << "visited";
+      break;
+  }
+  return os;
+}
 
 class HloComputation::VisitMap {
  public:
@@ -506,24 +525,32 @@ void HloComputation::ForEachInstructionPostOrderImpl(
     absl::FunctionRef<void(HloInstruction*)> func, HloInstruction* root,
     const ChannelDependencies& channel_dependencies, VisitMap& visited,
     std::vector<HloInstruction*>* dfs_stack_scratch) const {
+  bool has_channel_dependencies = !channel_dependencies.empty();
   auto* dfs_stack = dfs_stack_scratch;
   dfs_stack->clear();
-  dfs_stack->push_back(root);
+
+  // Pushes instruction to dfs stack only if it was not already processed.
+  auto dfs_stack_push = [&](HloInstruction* instr) {
+    VisitState state = visited.GetState(instr->index_in_parent_);
+    if (state != VisitState::kVisited) dfs_stack->push_back(instr);
+  };
+
+  dfs_stack_push(root);
   while (!dfs_stack->empty()) {
-    HloInstruction& current = *dfs_stack->back();
+    HloInstruction* current = dfs_stack->back();
+    DCHECK_EQ(current->parent(), this)
+        << "Instruction " << current->name()
+        << " is not in the current computation (" << name() << ").";
 
-    VisitMap::Handle h = current.index_in_parent_;
+    VisitMap::Handle h = current->index_in_parent_;
     VisitState state = visited.GetState(h);
-    if (state == kNew) {
-      visited.SetState(h, kVisiting);
+    if (state == VisitState::kNew) {
+      visited.SetState(h, VisitState::kVisiting);
     } else {
       dfs_stack->pop_back();
-      if (state != kVisited) {
-        DCHECK_EQ(current.parent(), this)
-            << "Instruction " << current.name()
-            << " is not in the current computation (" << name() << ").";
-        func(&current);
-        visited.SetState(h, kVisited);
+      if (state != VisitState::kVisited) {
+        visited.SetState(h, VisitState::kVisited);
+        func(current);
       }
       continue;
     }
@@ -532,34 +559,22 @@ void HloComputation::ForEachInstructionPostOrderImpl(
     // Collectives with the same channel ID must be performed together, as these
     // represent MPMD-partitioned that will later be split into separate modules
     // and the order must be preserved.
-    if (&current != root) {
-      auto it = channel_dependencies.find(&current);
+    if (has_channel_dependencies && current != root) {
+      auto it = channel_dependencies.find(current);
       if (it != channel_dependencies.end()) {
-        dfs_stack->insert(dfs_stack->end(), it->second.begin(),
-                          it->second.end());
+        absl::c_for_each(it->second, dfs_stack_push);
       }
     }
 
     // Add the operands to the stack in reverse order so the first operand is
     // processed first. This will produce a more natural ordering and a nicer
     // result for things like HLO stringification.
-    const HloInstruction::InstructionVector& operands = current.operands();
+    const HloInstruction::InstructionVector& operands = current->operands();
+    absl::c_for_each(tsl::gtl::make_range(operands.rbegin(), operands.rend()),
+                     dfs_stack_push);
 
-    for (auto it = operands.rbegin(); it != operands.rend(); ++it) {
-      HloInstruction* operand = *it;
-      if (visited.GetState(operand->index_in_parent_) != kVisited) {
-        dfs_stack->push_back(operand);
-      } else {
-        // Already fully visited, so we avoid pushing onto the stack
-      }
-    }
-
-    const PtrVec<HloInstruction*>& predecessors =
-        current.control_predecessors();
-    if (!predecessors.empty()) {
-      dfs_stack->insert(dfs_stack->end(), predecessors.begin(),
-                        predecessors.end());
-    }
+    // Add control predecessors to the stack.
+    absl::c_for_each(current->control_predecessors(), dfs_stack_push);
   }
 }
 
@@ -1569,16 +1584,16 @@ std::unique_ptr<HloComputation> HloComputation::CloneInContext(
       auto it = visited.find(cur);
       if (it != visited.end()) {
         dfs_stack.pop_back();
-        if (it->second == kVisited) {
+        if (it->second == VisitState::kVisited) {
           continue;
         }
-        CHECK_EQ(it->second, kVisiting);
+        CHECK_EQ(it->second, VisitState::kVisiting);
         postorder.push_back(cur);
-        it->second = kVisited;
+        it->second = VisitState::kVisited;
         continue;
       }
 
-      visited.insert({cur, kVisiting});
+      visited.insert({cur, VisitState::kVisiting});
       for (HloInstruction* operand : cur->operands()) {
         const HloInstruction* new_operand = replace(operand);
         if (new_operand) {
diff --git a/third_party/xla/xla/hlo/ir/hlo_dfs_reachability.cc b/third_party/xla/xla/hlo/ir/hlo_dfs_reachability.cc
index 2e6bd0e8495369..c831f31cec03f1 100644
--- a/third_party/xla/xla/hlo/ir/hlo_dfs_reachability.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_dfs_reachability.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/algorithm/container.h"
-#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
@@ -45,16 +45,20 @@ bool HloDfsReachability::IsReachable(const HloInstruction* from,
 
   // Note that the DFS goes from the "uses" root towards the "defs", i.e. from
   // `to` node to `from` node, so the node indices are decreasing.
-  if (target_node_idx > dfs_root_idx) {
+  if (dfs_root_idx < target_node_idx) {
     return false;
   }
 
-  // We use LLVM support library here because it has stack-allocated maps (in
-  // contrast to absl) which significantly improves performance by avoiding heap
-  // allocations when instructions are reachable via a short chain.
-  llvm::SmallDenseSet<size_t, 8> visited_idxs{dfs_root_idx};
+  // We use LLVM support library here because it has stack-allocated bit vector
+  // which significantly improves performance by avoiding heap allocations when
+  // instructions are reachable via a short chain.
   llvm::SmallVector<const HloInstruction*> stack{to};
 
+  // We will visit instructions in the [target_node_idx, dfs_root_idx] range, so
+  // we can construct a smaller bit vector.
+  llvm::BitVector visited_idxs(1 + (dfs_root_idx - target_node_idx));
+  visited_idxs.set(dfs_root_idx - target_node_idx);
+
   auto check_and_enqueue = [&](const HloInstruction* instr) {
     if (instr == from) {
       return true;
@@ -63,9 +67,11 @@ bool HloDfsReachability::IsReachable(const HloInstruction* from,
     if (instr_idx < target_node_idx) {
       return false;
     }
-    if (auto [_, inserted] = visited_idxs.insert(instr_idx); !inserted) {
+    size_t visited_idx = instr_idx - target_node_idx;
+    if (visited_idxs.test(visited_idx)) {
       return false;
     }
+    visited_idxs.set(visited_idx);
     stack.push_back(instr);
     return false;
   };
@@ -90,10 +96,11 @@ std::unique_ptr<HloDfsReachability> HloDfsReachability::Build(
     const HloComputation* computation) {
   auto res = std::make_unique<HloDfsReachability>();
 
-  HloComputation::ChannelDependencies channel_dependencies =
-      computation->ComputeChannelDependencies();
+  // For instruction reachability we do not care about correct order of
+  // collective operations as we only care about use-def chains.
+  HloComputation::ChannelDependencies empty_channel_dependencies;
   std::vector<HloInstruction*> instructions =
-      computation->MakeInstructionPostOrder(channel_dependencies);
+      computation->MakeInstructionPostOrder(empty_channel_dependencies);
 
   res->instruction_to_idx_.reserve(instructions.size());
   for (size_t i = 0; i < instructions.size(); ++i) {
diff --git a/third_party/xla/xla/hlo/ir/hlo_instruction.h b/third_party/xla/xla/hlo/ir/hlo_instruction.h
index c1229e46491ab6..50c12af36a452d 100644
--- a/third_party/xla/xla/hlo/ir/hlo_instruction.h
+++ b/third_party/xla/xla/hlo/ir/hlo_instruction.h
@@ -2858,14 +2858,8 @@ using ConstHloInstructionSet =
 
 template <HloOpcode op, HloOpcode... rest>
 bool HloPredicateIsOp(const HloInstruction* instruction) {
-  if (instruction->opcode() == op) {
-    return true;
-  }
-  if constexpr (sizeof...(rest) == 0) {
-    return false;
-  } else {
-    return HloPredicateIsOp<rest...>(instruction);
-  }
+  return (instruction->opcode() == op) ||
+         ((instruction->opcode() == rest) || ...);
 }
 
 /* static */ inline bool HloInstruction::MightHaveCalledComputations(
diff --git a/third_party/xla/xla/hlo/ir/hlo_instructions.h b/third_party/xla/xla/hlo/ir/hlo_instructions.h
index 5399190bb5af97..0a2364355dbd4c 100644
--- a/third_party/xla/xla/hlo/ir/hlo_instructions.h
+++ b/third_party/xla/xla/hlo/ir/hlo_instructions.h
@@ -2215,6 +2215,8 @@ class HloDynamicUpdateSliceInstruction : public HloDynamicIndexInstruction {
 
   int64_t first_index_operand_number() const override { return 2; }
 
+  const HloInstruction* update() const { return operand(1); }
+
   static bool ClassOf(const HloInstruction* hlo) {
     return hlo->opcode() == HloOpcode::kDynamicUpdateSlice;
   }
diff --git a/third_party/xla/xla/hlo/ir/hlo_module.cc b/third_party/xla/xla/hlo/ir/hlo_module.cc
index e7a6edbd193843..28610dac086498 100644
--- a/third_party/xla/xla/hlo/ir/hlo_module.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_module.cc
@@ -700,16 +700,12 @@ StatusOr<HloModuleConfig> HloModule::CreateModuleConfigFromShape(
         execution_options->use_spmd_partitioning());
     module_config.set_use_auto_spmd_partitioning(
         execution_options->use_auto_spmd_partitioning());
-    std::vector<int64_t> mesh_shape;
-    for (auto t : execution_options->auto_spmd_partitioning_mesh_shape()) {
-      mesh_shape.push_back(t);
-    }
-    module_config.set_auto_spmd_partitioning_mesh_shape(mesh_shape);
-    std::vector<int64_t> mesh_ids;
-    for (auto t : execution_options->auto_spmd_partitioning_mesh_ids()) {
-      mesh_ids.push_back(t);
-    }
-    module_config.set_auto_spmd_partitioning_mesh_ids(mesh_ids);
+    module_config.set_auto_spmd_partitioning_mesh_shape(std::vector<int64_t>(
+        execution_options->auto_spmd_partitioning_mesh_shape().begin(),
+        execution_options->auto_spmd_partitioning_mesh_shape().end()));
+    module_config.set_auto_spmd_partitioning_mesh_ids(std::vector<int64_t>(
+        execution_options->auto_spmd_partitioning_mesh_ids().begin(),
+        execution_options->auto_spmd_partitioning_mesh_ids().end()));
     module_config.set_deduplicate_hlo(execution_options->deduplicate_hlo());
     if (!execution_options->allow_spmd_sharding_propagation_to_parameters()
              .empty()) {
@@ -735,11 +731,10 @@ StatusOr<HloModuleConfig> HloModule::CreateModuleConfigFromShape(
                  module_config.num_partitions());
       }
     }
-    std::vector<bool> param_requires_broadcast_via_collectives(
+    module_config.set_param_requires_broadcast_via_collectives(std::vector<
+                                                               bool>(
         execution_options->param_requires_broadcast_via_collectives().begin(),
-        execution_options->param_requires_broadcast_via_collectives().end());
-    module_config.set_param_requires_broadcast_via_collectives(
-        param_requires_broadcast_via_collectives);
+        execution_options->param_requires_broadcast_via_collectives().end()));
     module_config.set_allow_separate_sharding_programs(
         execution_options->allow_separate_sharding_programs());
     HloModuleConfig::AssignStructShardableValueUpdatePairs(
diff --git a/third_party/xla/xla/hlo/ir/hlo_module.h b/third_party/xla/xla/hlo/ir/hlo_module.h
index 6311929ac533a5..0f6d449127e68d 100644
--- a/third_party/xla/xla/hlo/ir/hlo_module.h
+++ b/third_party/xla/xla/hlo/ir/hlo_module.h
@@ -505,6 +505,9 @@ class HloModule {
   const HloInputOutputAliasConfig& input_output_alias_config() const {
     return input_output_alias_config_;
   }
+  void set_input_output_alias_config(HloInputOutputAliasConfig config) {
+    input_output_alias_config_ = std::move(config);
+  }
 
   // buffer_donor_config_ indicates the set of input buffer donors that are
   // expected from the module.
@@ -512,6 +515,9 @@ class HloModule {
   const HloBufferDonorConfig& buffer_donor_config() const {
     return buffer_donor_config_;
   }
+  void set_buffer_donor_config(HloBufferDonorConfig config) {
+    buffer_donor_config_ = std::move(config);
+  }
 
   // Returns an id that is unique to this module across all modules created over
   // the lifetime of this process.
diff --git a/third_party/xla/xla/hlo/transforms/BUILD b/third_party/xla/xla/hlo/transforms/BUILD
index 16b4c0cf384094..2f778ccfd80c93 100644
--- a/third_party/xla/xla/hlo/transforms/BUILD
+++ b/third_party/xla/xla/hlo/transforms/BUILD
@@ -1,8 +1,8 @@
 # Description:
 #   Implementation of XLA’s HLO transformations.
 
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/hlo/utils/BUILD b/third_party/xla/xla/hlo/utils/BUILD
index cec66c2bd7be4f..98913c1cef2d4b 100644
--- a/third_party/xla/xla/hlo/utils/BUILD
+++ b/third_party/xla/xla/hlo/utils/BUILD
@@ -1,11 +1,11 @@
 # Description:
 #   Implementation of XLA’s HLO utilities used for higher-level transformations.
 
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load(
     "//xla:xla.bzl",
     "xla_cc_test",
 )
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc b/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
index 0661c620c4d5f5..0177abc48f506f 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
@@ -3063,13 +3063,52 @@ Shape UntileLeafShape(const HloSharding& sharding, const Shape& shape) {
   if (sharding.IsTileMaximal() || sharding.IsManual() || sharding.IsUnknown()) {
     return shape;
   }
+  if (!shape.IsArray()) {
+    return shape;
+  }
   Shape result_shape = shape;
-  for (int64_t i = 0; i < sharding.TiledDataRank(); ++i) {
+  // sharding.TiledDataRank() == i < shape.rank() is not always true?
+  for (int64_t i = 0; i < sharding.TiledDataRank() && i < shape.rank(); ++i) {
     result_shape.set_dimensions(
         i, shape.dimensions(i) * sharding.tile_assignment().dim(i));
   }
   return result_shape;
 }
 
+Shape TileShape(const HloSharding& sharding, const Shape& shape) {
+  if (!sharding.IsTuple()) {
+    return TileLeafShape(sharding, shape);
+  }
+  Shape result_shape = shape;
+  ShapeUtil::ForEachMutableSubshape(
+      &result_shape,
+      [&shape, &sharding](Shape* subshape, const ShapeIndex& index) {
+        if (!ShapeUtil::IsLeafIndex(shape, index)) {
+          return;
+        }
+        const HloSharding& subshape_sharding =
+            sharding.GetSubSharding(shape, index);
+        *subshape = TileLeafShape(subshape_sharding, *subshape);
+      });
+
+  return result_shape;
+}
+
+Shape TileLeafShape(const HloSharding& sharding, const Shape& shape) {
+  if (sharding.IsTileMaximal() || sharding.IsManual() || sharding.IsUnknown()) {
+    return shape;
+  }
+  if (!shape.IsArray()) {
+    return shape;
+  }
+  Shape result_shape = shape;
+  for (int64_t i = 0; i < sharding.TiledDataRank() && i < shape.rank(); ++i) {
+    CHECK_EQ(shape.dimensions(i) % sharding.tile_assignment().dim(i), 0);
+    result_shape.set_dimensions(
+        i, shape.dimensions(i) / sharding.tile_assignment().dim(i));
+  }
+  return result_shape;
+}
+
 }  // namespace hlo_sharding_util
 }  // namespace xla
diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util.h b/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
index bbf074c408a4a7..8671ebfe2554f2 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
@@ -479,6 +479,13 @@ Shape UntileShape(const HloSharding& sharding, const Shape& shape);
 // REQUIRES: !sharding.IsTuple()
 Shape UntileLeafShape(const HloSharding& sharding, const Shape& shape);
 
+// Returns the tiled shape.
+Shape TileShape(const HloSharding& sharding, const Shape& shape);
+
+// Returns the tiled shape.
+// REQUIRES: !sharding.IsTuple()
+Shape TileLeafShape(const HloSharding& sharding, const Shape& shape);
+
 }  // namespace hlo_sharding_util
 }  // namespace xla
 
diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc b/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc
index bcf9d1690cda3c..ad042361a7bf27 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc
@@ -823,6 +823,37 @@ TEST(HloShardingUtilTest, IsSortOperandShardingMovableSortDimUnsharded) {
   iota.set_sharding(HloSharding::IotaTile({1, 2}));
   EXPECT_FALSE(IsSortOperandShardingMovable(&iota, 0));
 }
+
+TEST(HloShardingUtilTest, TileShape) {
+  HloSharding sharding = HloSharding::Tile(TileAssignment({4, 1}));
+  Shape shape_0 = ShapeUtil::MakeShape(F32, {80, 128});
+  auto tile_shape_0 = hlo_sharding_util::TileShape(sharding, shape_0);
+  auto expected_shape_0 = ShapeUtil::MakeShape(F32, {20, 128});
+  EXPECT_EQ(tile_shape_0, expected_shape_0);
+  Shape shape_1 = ShapeUtil::MakeShape(F32, {40, 128});
+  auto tile_shape_1 = hlo_sharding_util::TileShape(sharding, shape_1);
+  auto expected_shape_1 = ShapeUtil::MakeShape(F32, {10, 128});
+  EXPECT_EQ(tile_shape_1, expected_shape_1);
+  const Shape tuple = ShapeUtil::MakeTupleShape({tile_shape_0, tile_shape_1});
+  EXPECT_EQ(hlo_sharding_util::TileShape(sharding, tuple),
+            ShapeUtil::MakeTupleShape({expected_shape_0, expected_shape_1}));
+}
+
+TEST(HloShardingUtilTest, UntileShape) {
+  HloSharding sharding = HloSharding::Tile(TileAssignment({4, 1}));
+  Shape shape_0 = ShapeUtil::MakeShape(F32, {80, 128});
+  auto tile_shape_0 = hlo_sharding_util::UntileShape(sharding, shape_0);
+  auto expected_shape_0 = ShapeUtil::MakeShape(F32, {320, 128});
+  EXPECT_EQ(tile_shape_0, expected_shape_0);
+  Shape shape_1 = ShapeUtil::MakeShape(F32, {40, 128});
+  auto tile_shape_1 = hlo_sharding_util::UntileShape(sharding, shape_1);
+  auto expected_shape_1 = ShapeUtil::MakeShape(F32, {160, 128});
+  EXPECT_EQ(tile_shape_1, expected_shape_1);
+  const Shape tuple = ShapeUtil::MakeTupleShape({tile_shape_0, tile_shape_1});
+  EXPECT_EQ(hlo_sharding_util::UntileShape(sharding, tuple),
+            ShapeUtil::MakeTupleShape({expected_shape_0, expected_shape_1}));
+}
+
 }  // namespace
 }  // namespace hlo_sharding_util
 }  // namespace xla
diff --git a/third_party/xla/xla/literal.cc b/third_party/xla/xla/literal.cc
index 7c1ae28e8a9d58..1f3768c1cc1a2b 100644
--- a/third_party/xla/xla/literal.cc
+++ b/third_party/xla/xla/literal.cc
@@ -49,6 +49,7 @@ limitations under the License.
 #include "xla/status.h"
 #include "xla/status_macros.h"
 #include "xla/statusor.h"
+#include "xla/tsl/util/byte_swap_array.h"
 #include "xla/types.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
@@ -59,7 +60,6 @@ limitations under the License.
 #include "tsl/platform/ml_dtypes.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
-#include "tsl/util/byte_swap_array.h"
 
 namespace xla {
 namespace {
@@ -249,6 +249,21 @@ Literal::Literal() : Literal(NilShape()) {}
 Literal::Literal(const Shape& shape)
     : Literal(shape, /*allocate_arrays=*/true) {}
 
+void Literal::SetShape(const Shape& shape) {
+  Shape shape_storage;
+  const Shape* shape_ptr = &shape;
+  if (LayoutUtil::HasCustomElementSizeInBits(shape)) {
+    shape_storage = shape;
+    shape_storage.mutable_layout()->set_element_size_in_bits(0);
+    shape_ptr = &shape_storage;
+  }
+  if (const Shape* intered_shape_ptr = TryInternShape(*shape_ptr)) {
+    shape_ = intered_shape_ptr;
+  } else {
+    shape_ = std::make_unique<Shape>(*shape_ptr);
+  }
+}
+
 void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays,
                        ArrayValueState leaf_array_value_state) {
   if (shape.IsTuple()) {
@@ -276,16 +291,9 @@ void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays,
 Literal::Literal(const Shape& shape, bool allocate_arrays,
                  ArrayValueState leaf_array_value_state)
     : MutableLiteralBase() {
-  if (const Shape* intered_shape_ptr = TryInternShape(shape)) {
-    shape_ = intered_shape_ptr;
-  } else {
-    shape_ = std::make_unique<Shape>(shape);
-  }
+  SetShape(shape);
   CHECK(leaf_array_value_state != ArrayValueState::kKnown ||
         LayoutUtil::HasLayout(*shape_));
-  // Currently we do nibble packing/unpacking in TPU host/device transfer.
-  CHECK(!LayoutUtil::HasCustomElementSizeInBits(*shape_))
-      << "Literal does not support layouts with custom bit size: " << *shape_;
   root_piece_.set_subshape(shape_.get());
   CHECK(&root_piece_.subshape() == shape_.get());
 
@@ -2701,107 +2709,4 @@ BorrowingLiteral::BorrowingLiteral(absl::Span<const char* const> src_buf_ptrs,
   }
 }
 
-BorrowingLiteral::BorrowingLiteral(const LiteralProto& proto)
-    : LiteralBase(), shape_(std::make_unique<Shape>(proto.shape())) {
-  root_piece_ = Piece();
-  root_piece_.set_subshape(shape_.get());
-
-  if (shape().IsArray()) {
-    absl::Span<const char> data;
-    switch (shape_->element_type()) {
-#define BORROWING_LITERAL_CAST_DATA_(FIELD)                                   \
-  absl::Span<const char>(reinterpret_cast<const char*>(proto.FIELD().data()), \
-                         proto.FIELD().size() * sizeof *proto.FIELD().data())
-      case PRED:
-        data = BORROWING_LITERAL_CAST_DATA_(preds);
-        break;
-      case S4:
-        data = proto.s4s();
-        break;
-      case S8:
-        data = proto.s8s();
-        break;
-      case S16:
-        data = proto.s16s();
-        break;
-      case S32:
-        data = BORROWING_LITERAL_CAST_DATA_(s32s);
-        break;
-      case S64:
-        data = BORROWING_LITERAL_CAST_DATA_(s64s);
-        break;
-      case U4:
-        data = proto.u4s();
-        break;
-      case U8:
-        data = proto.u8s();
-        break;
-      case U16:
-        data = proto.u16s();
-        break;
-      case U32:
-        data = BORROWING_LITERAL_CAST_DATA_(u32s);
-        break;
-      case U64:
-        data = BORROWING_LITERAL_CAST_DATA_(u64s);
-        break;
-      case F16:
-        data = proto.f16s();
-        break;
-      case F32:
-        data = BORROWING_LITERAL_CAST_DATA_(f32s);
-        break;
-      case BF16:
-        data = proto.bf16s();
-        break;
-      case F64:
-        data = BORROWING_LITERAL_CAST_DATA_(f64s);
-        break;
-      case F8E5M2:
-        data = proto.f8e5m2s();
-        break;
-      case F8E4M3FN:
-        data = proto.f8e4m3fns();
-        break;
-      case F8E4M3B11FNUZ:
-        data = proto.f8e4m3b11fnuzs();
-        break;
-      case F8E5M2FNUZ:
-        data = proto.f8e5m2fnuzs();
-        break;
-      case F8E4M3FNUZ:
-        data = proto.f8e4m3fnuzs();
-        break;
-      case C64:
-        data = BORROWING_LITERAL_CAST_DATA_(c64s);
-        break;
-      case C128:
-        data = BORROWING_LITERAL_CAST_DATA_(c128s);
-        break;
-#undef BORROWING_LITERAL_CAST_DATA_
-      default:
-        LOG(FATAL) << "Invalid element type for array: " << shape();
-    }
-    CHECK_EQ(data.size(), ShapeUtil::ByteSizeOfElements(*shape_));
-    root_piece_.set_buffer(const_cast<char*>(data.data()));
-  } else if (shape_->IsTuple()) {
-    CHECK_EQ(shape().tuple_shapes_size(), proto.tuple_literals_size());
-    BuildPieceSubtree(*shape_, &root_piece_);
-    for (int i = 0; i < shape_->tuple_shapes_size(); ++i) {
-      BorrowingLiteral child(proto.tuple_literals(i));
-      child.root_piece_.ForEachMutableSubpiece(
-          [&](const ShapeIndex& child_index, Piece* child_piece) {
-            if (!child_piece->subshape().IsArray()) {
-              return;
-            }
-            ShapeIndex index = {i};
-            index.insert(index.end(), child_index.begin(), child_index.end());
-            root_piece_.child(index).set_buffer(child_piece->buffer());
-          });
-    }
-  } else {
-    LOG(FATAL) << "Invalid shape: " << *shape_;
-  }
-}
-
 }  // namespace xla
diff --git a/third_party/xla/xla/literal.h b/third_party/xla/xla/literal.h
index 8f8894dbc26ea8..2ebe0c2d727174 100644
--- a/third_party/xla/xla/literal.h
+++ b/third_party/xla/xla/literal.h
@@ -35,7 +35,9 @@ limitations under the License.
 #include "absl/base/attributes.h"
 #include "absl/base/casts.h"
 #include "absl/base/config.h"
+#include "absl/base/optimization.h"
 #include "absl/functional/function_ref.h"
+#include "absl/hash/hash.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "xla/array.h"
@@ -353,6 +355,20 @@ class LiteralBase {
     return LiteralBase::Hash(std::move(state), value);
   }
 
+ private:
+  // With C++20, we can use `requires { absl::Hash<NativeT>(); }`.
+  template <typename T>
+  static constexpr bool IsAbslHashable() {
+#ifdef _MSC_VER
+    // `std::is_invocable_v<absl::Hash<T>, T>` doesn't work on MSVC.
+    // See https://godbolt.org/z/Wj9d7zrav.
+    return std::is_arithmetic_v<T>;
+#else
+    return std::is_invocable_v<absl::Hash<T>, T>;
+#endif
+  }
+
+ public:
   template <typename H, bool kIsLayoutSensitive = true,
             int64_t kByteLimit = std::numeric_limits<int64_t>::max()>
   static H Hash(H state, const LiteralBase& literal) {
@@ -366,10 +382,36 @@ class LiteralBase {
           }
 
           CHECK(LayoutUtil::IsDenseArray(subshape));
-          auto data = absl::MakeConstSpan(
-              static_cast<const char*>(literal.untyped_data(index)),
-              std::min(kByteLimit, literal.size_bytes(index)));
-          state = H::combine(std::move(state), data);
+          const auto hash_func = [&](auto primitive_type_constant) {
+            using NativeT =
+                primitive_util::NativeTypeOf<primitive_type_constant>;
+            // If we can hash NativeT, then do so. Otherwise, hash raw buffer
+            // data taking care to avoid invalid parts of 4-bit type data.
+            if constexpr (IsAbslHashable<NativeT>()) {
+              state = H::combine(std::move(state),
+                                 literal.piece(index).data<NativeT>());
+            } else {
+              const int64_t num_bytes =
+                  std::min(kByteLimit, literal.size_bytes(index));
+              const char* buffer =
+                  static_cast<const char*>(literal.untyped_data(index));
+              if (primitive_util::Is4BitType(subshape.element_type())) {
+                // Note: in this case, we could potentially read 8 bytes at a
+                // time, mask out the upper 4 bits of each byte, and then hash 8
+                // bytes, but it adds complexity and needs special handling for
+                // the non-divisible-by-8 leftover bytes.
+                for (int64_t i = 0; i < num_bytes; ++i) {
+                  state =
+                      H::combine(std::move(state), buffer[i] & uint8_t{0xf});
+                }
+              } else {
+                auto data = absl::MakeConstSpan(buffer, num_bytes);
+                state = H::combine(std::move(state), data);
+              }
+            }
+          };
+          primitive_util::ArrayTypeSwitch<void>(hash_func,
+                                                subshape.element_type());
         });
 
     return std::move(state);
@@ -870,18 +912,6 @@ class LiteralBase {
       return tuple_rep->children[index];
     }
 
-    Piece& child(ShapeIndexView index) {
-      return const_cast<Piece&>(const_cast<const Piece*>(this)->child(index));
-    }
-    const Piece& child(ShapeIndexView index) const {
-      const Piece* result = this;
-      while (!index.empty()) {
-        result = &result->child(index.front());
-        index.remove_prefix(1);
-      }
-      return *result;
-    }
-
     // Adds a child piece to this piece's children.
     void emplace_back(Piece child_piece) {
       auto* tuple_rep = GetTupleRep();
@@ -1469,6 +1499,11 @@ class Literal : public MutableLiteralBase {
   // Deallocate the buffers held by this literal.
   void DeallocateBuffers();
 
+  // Sets the shape_ field from a Shape. shape_'s element_size_in_bits field
+  // on the layout is always set to 0 since Literals do not support packed
+  // subbyte elements.
+  void SetShape(const Shape& shape);
+
   // Recursively sets the subshapes and buffers of all subpieces rooted at
   // 'piece'. If 'allocate_array' is true, memory is allocated for the arrays in
   // the shape.
@@ -1543,10 +1578,6 @@ class BorrowingLiteral : public LiteralBase {
                    const Shape& shape);
   // TODO(b/79707221): adding constructors for nested tuples as well.
 
-  // Construct a BorrowingLiteral from a LiteralProto.  The proto must not be
-  // modified during the lifetime of the BorrowingLiteral.
-  explicit BorrowingLiteral(const LiteralProto& proto);
-
  private:
   // Recursively builds the subtree for the given piece and sets the subshapes
   // of the given piece with the given shape.
diff --git a/third_party/xla/xla/literal_test.cc b/third_party/xla/xla/literal_test.cc
index e61ba78d95e2ba..24c12bb92d6d84 100644
--- a/third_party/xla/xla/literal_test.cc
+++ b/third_party/xla/xla/literal_test.cc
@@ -2197,32 +2197,30 @@ TEST_F(LiteralUtilTest, ProtoRoundTrip) {
   auto nested_tuple =
       LiteralUtil::MakeTuple({&tuple, &vector_bfloat16, &tuple, &nil_literal});
 
-  auto test_proto = [](const Literal& literal) {
-    LiteralProto proto = literal.ToProto();
-    EXPECT_EQ(literal, Literal::CreateFromProto(proto).value());
-    EXPECT_EQ(literal, BorrowingLiteral(proto));
+  auto to_from_proto = [](const Literal& literal) -> Literal {
+    return Literal::CreateFromProto(literal.ToProto()).value();
   };
 
-  test_proto(one_f32);
-  test_proto(vector_int8);
-  test_proto(vector_uint8);
-  test_proto(vector_c64);
-  test_proto(vector_c128);
-  test_proto(vector_bfloat16);
-  test_proto(vector_f8e5m2);
-  test_proto(vector_f8e4m3);
-  test_proto(vector_f8e4m3b11);
-  test_proto(vector_f8e5m2fnuz);
-  test_proto(vector_f8e4m3fnuz);
-  test_proto(matrix_pred);
-  test_proto(vector_s4);
-  test_proto(vector_u4);
-  test_proto(tuple);
-  test_proto(nested_tuple);
-  test_proto(nil_literal);
+  EXPECT_EQ(one_f32, to_from_proto(one_f32));
+  EXPECT_EQ(vector_int8, to_from_proto(vector_int8));
+  EXPECT_EQ(vector_uint8, to_from_proto(vector_uint8));
+  EXPECT_EQ(vector_c64, to_from_proto(vector_c64));
+  EXPECT_EQ(vector_c128, to_from_proto(vector_c128));
+  EXPECT_EQ(vector_bfloat16, to_from_proto(vector_bfloat16));
+  EXPECT_EQ(vector_f8e5m2, to_from_proto(vector_f8e5m2));
+  EXPECT_EQ(vector_f8e4m3, to_from_proto(vector_f8e4m3));
+  EXPECT_EQ(vector_f8e4m3b11, to_from_proto(vector_f8e4m3b11));
+  EXPECT_EQ(vector_f8e5m2fnuz, to_from_proto(vector_f8e5m2fnuz));
+  EXPECT_EQ(vector_f8e4m3fnuz, to_from_proto(vector_f8e4m3fnuz));
+  EXPECT_EQ(matrix_pred, to_from_proto(matrix_pred));
+  EXPECT_EQ(vector_s4, to_from_proto(vector_s4));
+  EXPECT_EQ(vector_u4, to_from_proto(vector_u4));
+  EXPECT_EQ(tuple, to_from_proto(tuple));
+  EXPECT_EQ(nested_tuple, to_from_proto(nested_tuple));
+  EXPECT_EQ(nil_literal, to_from_proto(nil_literal));
 
   EXPECT_NE(one_f32, two_f32);
-  EXPECT_NE(one_f32, Literal::CreateFromProto(two_f32.ToProto()).value());
+  EXPECT_NE(one_f32, to_from_proto(two_f32));
 }
 
 TEST_F(LiteralUtilTest, InvalidProtoNoValues) {
diff --git a/third_party/xla/xla/mlir/backends/cpu/transforms/BUILD b/third_party/xla/xla/mlir/backends/cpu/transforms/BUILD
index 9fb60b8442c698..ff81d082104b3a 100644
--- a/third_party/xla/xla/mlir/backends/cpu/transforms/BUILD
+++ b/third_party/xla/xla/mlir/backends/cpu/transforms/BUILD
@@ -44,7 +44,6 @@ cc_library(
         "//xla/mlir/runtime/utils:custom_calls",
         "//xla/mlir/xla_cpu/ir:xla_cpu",
         "//xla/mlir_hlo",
-        "//xla/mlir_hlo:lhlo",
         "//xla/service:hlo_parser",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:Analysis",
diff --git a/third_party/xla/xla/mlir/backends/cpu/transforms/xla_cpu_to_cpu_runtime.cc b/third_party/xla/xla/mlir/backends/cpu/transforms/xla_cpu_to_cpu_runtime.cc
index 16223cead19f7c..fb3bb71548c6f9 100644
--- a/third_party/xla/xla/mlir/backends/cpu/transforms/xla_cpu_to_cpu_runtime.cc
+++ b/third_party/xla/xla/mlir/backends/cpu/transforms/xla_cpu_to_cpu_runtime.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "xla/mlir/runtime/transforms/type_converter.h"
 #include "xla/mlir/runtime/utils/custom_calls.h"
 #include "xla/mlir/xla_cpu/ir/xla_cpu.h"
-#include "xla/mlir_hlo/lhlo/IR/lhlo_ops.h"
 #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
 #include "xla/service/hlo_parser.h"
 
@@ -48,8 +47,6 @@ namespace {
 
 using namespace mlir;  // NOLINT
 
-using mlir::lmhlo::CustomCallOp;
-
 using xla_cpu::PartitionIdOp;
 using xla_cpu::ReplicaIdOp;
 
@@ -115,133 +112,6 @@ func::CallOp CreateCallForDpsCollectiveOp(Operation* op,
 
 //===----------------------------------------------------------------------===//
 
-class CustomCallOpLowering : public OpRewritePattern<CustomCallOp> {
- private:
-  static constexpr const char kCustomCallTarget[] = "xla.cpu.custom_call";
-
- public:
-  CustomCallOpLowering(MLIRContext* ctx, CustomCallDeclarations& custom_calls)
-      : OpRewritePattern(ctx), custom_calls_(custom_calls) {}
-
-  // Rewrite custom call with `API_VERSION_TYPED_FFI` version into XLA runtime
-  // custom calls bypassing custom call adaptor.
-  LogicalResult rewriteTypedCustomCall(CustomCallOp op,
-                                       PatternRewriter& rewriter) const {
-    // TODO(ezhulenev): Support target arg mapping, or explain why we do not
-    // need them for typed custom calls.
-    if (op.getTargetArgMapping())
-      return op.emitOpError(
-          "API_VERSION_TYPED_FFI custom calls do not "
-          "support target arg mapping");
-
-    // Create a custom call function declaration.
-    ImplicitLocOpBuilder b(op.getLoc(), rewriter);
-    func::FuncOp callee =
-        custom_calls_.GetOrCreate(b, op.getCallTargetName(), op);
-    callee->setAttr("rt.dynamic", UnitAttr::get(b.getContext()));
-
-    // Forward backend config to the custom call implementation.
-    auto config = op.getBackendConfig();
-    if (!config) return op.emitOpError("Failed to get backend config");
-    auto dict = config->cast<mlir::DictionaryAttr>();
-    llvm::SmallVector<NamedAttribute> backend_config(dict.begin(), dict.end());
-
-    // Call the custom call function forwarding user-defined attributes.
-    auto call = rewriter.replaceOpWithNewOp<func::CallOp>(
-        op, callee.getName(), TypeRange(), op.getOperands());
-    AppendCustomCallAttrs(call, backend_config);
-
-    return success();
-  }
-
-  LogicalResult matchAndRewrite(CustomCallOp op,
-                                PatternRewriter& rewriter) const override {
-    // Typed custom calls lowered directly to XLA runtime custom calls.
-    if (op.getApiVersion() == mhlo::CustomCallApiVersion::API_VERSION_TYPED_FFI)
-      return rewriteTypedCustomCall(op, rewriter);
-
-    ImplicitLocOpBuilder b(op.getLoc(), rewriter);
-
-    // By default all operands passed to the custom call handler.
-    llvm::SmallVector<Value> operands = op.getOperands();
-
-    // Get the number of outputs from operand_segment_sizes.
-    int64_t num_results = op->getAttrOfType<DenseI32ArrayAttr>(
-        op.getOperandSegmentSizesAttrName())[1];
-
-    // If custom call has target arguments mapping, then we need to pass empty
-    // memrefs in place of holes.
-    if (op.getTargetArgMapping().has_value()) {
-      auto mapping = *op.getTargetArgMapping();
-      int64_t num_args = mapping.getNumArgs();
-      num_results = mapping.getNumResults();
-
-      // Always create an `alloca` in the parent function entry block.
-      // See: https://llvm.org/docs/Frontend/PerformanceTips.html#use-of-allocas
-      Value hole = [&]() -> Value {
-        OpBuilder::InsertionGuard guard(b);
-        b.setInsertionPointToStart(
-            &op->getParentOfType<func::FuncOp>().front());
-        return b.create<memref::AllocaOp>(MemRefType::get({0}, b.getI8Type()));
-      }();
-
-      // We represent holes as empty i8 memrefs.
-      operands = llvm::SmallVector<Value>(num_args + num_results, hole);
-
-      // Update operands to mapped custom call arguments.
-      auto args = mapping.getArgsToTargetArgs();
-      for (const auto& indexed : llvm::enumerate(args))
-        operands[indexed.value()] = op.getArgs()[indexed.index()];
-
-      // Update operands to mapped custom call results.
-      auto res = mapping.getResultsToTargetResults();
-      for (const auto& indexed : llvm::enumerate(res))
-        operands[num_args + indexed.value()] = op.getOutput()[indexed.index()];
-    }
-
-    // TODO(jreiffers): This will break if an output has a non-default layout.
-    operands = EnsureFlatMemrefs(operands, b);
-    // Create a custom call function declaration.
-    func::FuncOp callee = custom_calls_.GetOrCreate(
-        b, kCustomCallTarget, TypeRange(ValueRange(operands)), TypeRange());
-
-    // The ABI is different depending on whether the original op was outputting
-    // a tuple or not. For multiple outputs this is trivial but for a single
-    // output we rely on the xla_shape attribute to distinguish the ABIs.
-    bool output_tuple = num_results > 1;
-    if (auto xla_shape = op->getAttrOfType<StringAttr>("xla_shape"))
-      output_tuple = ParseShape(xla_shape.strref())->IsTuple();
-
-    // This is not equivalent to op.getApiVersionAttr() - that call returns null
-    // if the attribute is absent. getApiVersion returns the default.
-    Attribute api_version =
-        mhlo::CustomCallApiVersionAttr::get(getContext(), op.getApiVersion());
-    llvm::SmallVector<NamedAttribute> custom_call_attrs = {
-        {b.getStringAttr("num_results"),
-         b.getI32IntegerAttr(static_cast<int32_t>(num_results))},
-        {b.getStringAttr("output_tuple"), b.getBoolAttr(output_tuple)},
-        {b.getStringAttr("api_version"), api_version},
-        {b.getStringAttr("call_target_name"), op.getCallTargetNameAttr()}};
-
-    if (auto backend_config = op.getBackendConfigAttr()) {
-      custom_call_attrs.emplace_back(b.getStringAttr("backend_config"),
-                                     op.getBackendConfigAttr());
-    }
-
-    // Call the runtime intrinsic with the original operands.
-    auto call = rewriter.replaceOpWithNewOp<func::CallOp>(
-        op, callee.getName(), TypeRange(), operands);
-    AppendCustomCallAttrs(call, custom_call_attrs);
-
-    return success();
-  }
-
- private:
-  CustomCallDeclarations& custom_calls_;
-};
-
-//===----------------------------------------------------------------------===//
-
 template <typename IdOp>
 class IdOpLowering : public OpRewritePattern<IdOp> {
  public:
@@ -542,11 +412,10 @@ void ConvertXlaCpuToCpuRuntimePass::runOnOperation() {
 
   // Convert xla_cpu operations to XLA cpu runtime custom calls.
   RewritePatternSet patterns(ctx);
-  patterns
-      .insert<AllReduceLowering, AllToAllLowering, CollectivePermuteLowering,
-              ConvolutionLowering, CustomCallOpLowering, FftLowering,
-              InfeedLowering, OutfeedLowering, RngBitGeneratorLowering>(
-          ctx, custom_calls);
+  patterns.insert<AllReduceLowering, AllToAllLowering,
+                  CollectivePermuteLowering, ConvolutionLowering, FftLowering,
+                  InfeedLowering, OutfeedLowering, RngBitGeneratorLowering>(
+      ctx, custom_calls);
   patterns.insert<IdOpLowering<PartitionIdOp>>(ctx, "xla.cpu.partition_id",
                                                custom_calls);
   patterns.insert<IdOpLowering<ReplicaIdOp>>(ctx, "xla.cpu.replica_id",
diff --git a/third_party/xla/xla/mlir/runtime/ir/BUILD b/third_party/xla/xla/mlir/runtime/ir/BUILD
index 50e28fcf876897..703d2f83a194b0 100644
--- a/third_party/xla/xla/mlir/runtime/ir/BUILD
+++ b/third_party/xla/xla/mlir/runtime/ir/BUILD
@@ -106,5 +106,6 @@ cc_library(
         "@llvm-project//mlir:ControlFlowInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:SideEffectInterfaces",
     ],
 )
diff --git a/third_party/xla/xla/mlir/runtime/transforms/BUILD b/third_party/xla/xla/mlir/runtime/transforms/BUILD
index 0af4d67c910f7c..fa29f9ec740de2 100644
--- a/third_party/xla/xla/mlir/runtime/transforms/BUILD
+++ b/third_party/xla/xla/mlir/runtime/transforms/BUILD
@@ -1,5 +1,4 @@
 load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library")
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl:tsl.default.bzl", "get_compatible_with_portable")
 load(
     "@local_tsl//tsl/platform:build_config_root.bzl",
@@ -10,6 +9,7 @@ load(
     "if_llvm_x86_available",
 )
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -80,6 +80,7 @@ cc_library(
     deps = [
         "//xla/mlir/runtime/ir:rt",
         "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
 )
@@ -93,6 +94,7 @@ xla_cc_test(
         "//xla/mlir/runtime/ir:rt",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:MemRefDialect",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@local_tsl//tsl/platform:test",
         "@local_tsl//tsl/platform:test_main",
@@ -170,6 +172,7 @@ cc_library(
     deps = [
         ":custom_call_encoding",
         "//xla/runtime:type_id",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
 )
diff --git a/third_party/xla/xla/mlir/runtime/utils/BUILD b/third_party/xla/xla/mlir/runtime/utils/BUILD
index 08021de8b0da53..b45fcb04ffdbd8 100644
--- a/third_party/xla/xla/mlir/runtime/utils/BUILD
+++ b/third_party/xla/xla/mlir/runtime/utils/BUILD
@@ -17,8 +17,11 @@ cc_library(
         "@com_google_absl//absl/base:dynamic_annotations",
         "@com_google_absl//absl/functional:any_invocable",
         "@llvm-project//llvm:OrcJIT",
+        "@llvm-project//llvm:OrcShared",
+        "@llvm-project//llvm:Support",
         "@llvm-project//mlir:mlir_async_runtime_api",
         "@local_tsl//tsl/concurrency:async_value",
+        "@local_tsl//tsl/concurrency:ref_count",
         "@local_tsl//tsl/platform:platform_port",
     ],
 )
diff --git a/third_party/xla/xla/mlir/runtime/utils/async_runtime_api.cc b/third_party/xla/xla/mlir/runtime/utils/async_runtime_api.cc
index f7bb04af20e0a3..2e7967a9e2ebad 100644
--- a/third_party/xla/xla/mlir/runtime/utils/async_runtime_api.cc
+++ b/third_party/xla/xla/mlir/runtime/utils/async_runtime_api.cc
@@ -25,11 +25,18 @@ limitations under the License.
 #include <utility>
 
 #include "absl/base/dynamic_annotations.h"
+#include "absl/functional/any_invocable.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/Mangling.h"
+#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
 #include "mlir/ExecutionEngine/AsyncRuntime.h"  // from @llvm-project
 #include "xla/runtime/async_runtime.h"
 #include "tsl/concurrency/async_value.h"
 #include "tsl/concurrency/async_value_ref.h"
 #include "tsl/concurrency/chain.h"
+#include "tsl/concurrency/ref_count.h"
 #include "tsl/platform/mem.h"
 
 namespace xla {
diff --git a/third_party/xla/xla/mlir/runtime/utils/async_runtime_api.h b/third_party/xla/xla/mlir/runtime/utils/async_runtime_api.h
index 486fbb185ad8db..94f3dcd2f46495 100644
--- a/third_party/xla/xla/mlir/runtime/utils/async_runtime_api.h
+++ b/third_party/xla/xla/mlir/runtime/utils/async_runtime_api.h
@@ -17,9 +17,11 @@ limitations under the License.
 #define XLA_MLIR_RUNTIME_UTILS_ASYNC_RUNTIME_API_H_
 
 #include "absl/functional/any_invocable.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ExecutionEngine/Orc/Core.h"
 #include "llvm/ExecutionEngine/Orc/Mangling.h"
 #include "xla/runtime/async_runtime.h"
+#include "tsl/concurrency/async_value.h"
 #include "tsl/concurrency/async_value_ref.h"
 #include "tsl/concurrency/chain.h"
 
diff --git a/third_party/xla/xla/mlir/runtime/utils/constraints.cc b/third_party/xla/xla/mlir/runtime/utils/constraints.cc
index 6a38f6bf9f6d8f..da0023ea62b4de 100644
--- a/third_party/xla/xla/mlir/runtime/utils/constraints.cc
+++ b/third_party/xla/xla/mlir/runtime/utils/constraints.cc
@@ -21,8 +21,15 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Casting.h"
+#include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinTypeInterfaces.h"  // from @llvm-project
+#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
+#include "mlir/IR/Types.h"  // from @llvm-project
 #include "mlir/Interfaces/FunctionInterfaces.h"  // from @llvm-project
 #include "mlir/Support/DebugStringHelper.h"  // from @llvm-project
+#include "xla/runtime/constraints.h"
 
 namespace xla {
 namespace runtime {
diff --git a/third_party/xla/xla/mlir/runtime/utils/constraints.h b/third_party/xla/xla/mlir/runtime/utils/constraints.h
index bbecc4995347a2..f88c3f267e0d38 100644
--- a/third_party/xla/xla/mlir/runtime/utils/constraints.h
+++ b/third_party/xla/xla/mlir/runtime/utils/constraints.h
@@ -17,7 +17,9 @@ limitations under the License.
 #define XLA_MLIR_RUNTIME_UTILS_CONSTRAINTS_H_
 
 #include "absl/status/statusor.h"
+#include "llvm/ADT/SmallVector.h"
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
+#include "mlir/IR/Types.h"  // from @llvm-project
 #include "mlir/Interfaces/FunctionInterfaces.h"  // from @llvm-project
 #include "xla/runtime/constraints.h"
 
diff --git a/third_party/xla/xla/mlir/runtime/utils/custom_calls.cc b/third_party/xla/xla/mlir/runtime/utils/custom_calls.cc
index fec85d3d2c156f..41991bc8f50b14 100644
--- a/third_party/xla/xla/mlir/runtime/utils/custom_calls.cc
+++ b/third_party/xla/xla/mlir/runtime/utils/custom_calls.cc
@@ -15,12 +15,15 @@ limitations under the License.
 
 #include "xla/mlir/runtime/utils/custom_calls.h"
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/Builders.h"  // from @llvm-project
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
 #include "mlir/IR/ImplicitLocOpBuilder.h"  // from @llvm-project
-#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/SymbolTable.h"  // from @llvm-project
 
 namespace xla {
 namespace runtime {
diff --git a/third_party/xla/xla/mlir/runtime/utils/custom_calls.h b/third_party/xla/xla/mlir/runtime/utils/custom_calls.h
index 0c89f922350b79..02a85d2c38ff62 100644
--- a/third_party/xla/xla/mlir/runtime/utils/custom_calls.h
+++ b/third_party/xla/xla/mlir/runtime/utils/custom_calls.h
@@ -19,6 +19,8 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/Attributes.h"  // from @llvm-project
diff --git a/third_party/xla/xla/mlir/utils/BUILD b/third_party/xla/xla/mlir/utils/BUILD
index ac80b27c79b810..cabd61a0d739a5 100644
--- a/third_party/xla/xla/mlir/utils/BUILD
+++ b/third_party/xla/xla/mlir/utils/BUILD
@@ -1,7 +1,7 @@
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load("@local_tsl//tsl:tsl.default.bzl", "get_compatible_with_portable")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/mlir/xla_cpu/ir/BUILD b/third_party/xla/xla/mlir/xla_cpu/ir/BUILD
index 9cb3de61b2118e..bb09094c14dca8 100644
--- a/third_party/xla/xla/mlir/xla_cpu/ir/BUILD
+++ b/third_party/xla/xla/mlir/xla_cpu/ir/BUILD
@@ -104,6 +104,7 @@ cc_library(
         "//xla/mlir_hlo",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:BufferizationDialect",
+        "@llvm-project//mlir:BufferizationInterfaces",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:InferTypeOpInterface",
     ],
diff --git a/third_party/xla/xla/mlir_hlo/BUILD b/third_party/xla/xla/mlir_hlo/BUILD
index 3c82352fa32ee3..dfed55f77e5fd3 100644
--- a/third_party/xla/xla/mlir_hlo/BUILD
+++ b/third_party/xla/xla/mlir_hlo/BUILD
@@ -306,6 +306,7 @@ cc_library(
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:SCFDialect",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
 )
@@ -439,6 +440,7 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:Analysis",
         "@llvm-project//mlir:ArithDialect",
+        "@llvm-project//mlir:BytecodeOpInterface",
         "@llvm-project//mlir:ComplexDialect",
         "@llvm-project//mlir:ControlFlowInterfaces",
         "@llvm-project//mlir:Dialect",
@@ -446,6 +448,7 @@ cc_library(
         "@llvm-project//mlir:FunctionInterfaces",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:InferTypeOpInterface",
+        "@llvm-project//mlir:InliningUtils",
         "@llvm-project//mlir:LLVMDialect",
         "@llvm-project//mlir:MemRefDialect",
         "@llvm-project//mlir:Pass",
@@ -488,6 +491,7 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LoopLikeInterface",
         "@llvm-project//mlir:MemRefDialect",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:ViewLikeInterface",
         "@stablehlo//:stablehlo_type_inference",
     ],
@@ -572,7 +576,7 @@ cc_library(
     ],
     strip_include_prefix = ".",
     deps = [
-        ":chlo_legalize_to_hlo",
+        ":chlo_legalize_to_hlo_inc_gen",
         ":hlo_legalize_to_stablehlo",
         ":legalize_to_linalg_utils",
         ":legalize_to_standard_inc_gen",
@@ -594,6 +598,7 @@ cc_library(
         "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:ArithUtils",
         "@llvm-project//mlir:BufferizationDialect",
+        "@llvm-project//mlir:BufferizationInterfaces",
         "@llvm-project//mlir:BufferizationTransforms",
         "@llvm-project//mlir:ComplexDialect",
         "@llvm-project//mlir:ControlFlowDialect",
@@ -617,10 +622,12 @@ cc_library(
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TensorDialect",
         "@llvm-project//mlir:TensorUtils",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@stablehlo//:base",
         "@stablehlo//:chlo_ops",
         "@stablehlo//:stablehlo_ops",
+        "@stablehlo//:stablehlo_passes",
     ],
 )
 
@@ -635,6 +642,7 @@ cc_library(
         "@llvm-project//mlir:FuncTransforms",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:TensorDialect",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@stablehlo//:stablehlo_ops",
     ],
@@ -788,6 +796,7 @@ cc_library(
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TensorDialect",
         "@llvm-project//mlir:TensorUtils",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@stablehlo//:chlo_ops",
     ],
@@ -875,32 +884,11 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:ShapeDialect",
         "@llvm-project//mlir:TensorDialect",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
     ],
 )
 
-cc_library(
-    name = "chlo_legalize_to_hlo",
-    srcs = ["mhlo/transforms/chlo_legalize_to_hlo/chlo_legalize_to_hlo.cc"],
-    hdrs = ["mhlo/transforms/rewriters.h"],
-    strip_include_prefix = ".",
-    deps = [
-        ":chlo_legalize_to_hlo_inc_gen",
-        ":map_chlo_to_hlo_op",
-        ":mlir_hlo",
-        "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:ComplexDialect",
-        "@llvm-project//mlir:FuncDialect",
-        "@llvm-project//mlir:IR",
-        "@llvm-project//mlir:SCFDialect",
-        "@llvm-project//mlir:ShapeDialect",
-        "@llvm-project//mlir:TensorDialect",
-        "@llvm-project//mlir:Transforms",
-        "@stablehlo//:broadcast_utils",
-        "@stablehlo//:chlo_ops",
-    ],
-)
-
 gentbl_cc_library(
     name = "chlo_legalize_to_hlo_inc_gen",
     compatible_with = get_compatible_with_portable(),
@@ -928,6 +916,7 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@stablehlo//:stablehlo_ops",
         "@stablehlo//:stablehlo_ops_inc_gen",
@@ -947,6 +936,7 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@stablehlo//:stablehlo_ops",
         "@stablehlo//:stablehlo_ops_inc_gen",
@@ -970,7 +960,6 @@ cc_library(
     ],
     strip_include_prefix = ".",
     deps = [
-        ":chlo_legalize_to_hlo",
         ":deallocation_passes",
         ":deallocation_passes_inc_gen",
         ":lhlo",
@@ -1032,6 +1021,7 @@ cc_library(
         "@llvm-project//mlir:ArithUtils",
         "@llvm-project//mlir:AsmParser",
         "@llvm-project//mlir:BufferizationDialect",
+        "@llvm-project//mlir:BufferizationInterfaces",
         "@llvm-project//mlir:BufferizationTransforms",
         "@llvm-project//mlir:ComplexDialect",
         "@llvm-project//mlir:ComplexToLLVM",
@@ -1073,6 +1063,7 @@ cc_library(
         "@llvm-project//mlir:TensorTransforms",
         "@llvm-project//mlir:TensorUtils",
         "@llvm-project//mlir:TransformDialect",
+        "@llvm-project//mlir:TransformDialectInterfaces",
         "@llvm-project//mlir:TransformDialectTransforms",
         "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
@@ -1114,6 +1105,7 @@ cc_library(
         "@llvm-project//mlir:FuncToLLVM",
         "@llvm-project//mlir:GPUCommonTransforms",
         "@llvm-project//mlir:GPUDialect",
+        "@llvm-project//mlir:GPUToGPURuntimeTransforms",
         "@llvm-project//mlir:GPUToNVVMTransforms",
         "@llvm-project//mlir:GPUToROCDLTransforms",
         "@llvm-project//mlir:GPUTransforms",
@@ -1133,6 +1125,7 @@ cc_library(
         "@llvm-project//mlir:SCFTransforms",
         "@llvm-project//mlir:ShapeToStandard",
         "@llvm-project//mlir:TensorInferTypeOpInterfaceImpl",
+        "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:Transforms",
         "@llvm-project//mlir:VectorDialect",
         "@llvm-project//mlir:VectorToLLVM",
diff --git a/third_party/xla/xla/mlir_hlo/WORKSPACE b/third_party/xla/xla/mlir_hlo/WORKSPACE
index 7078b6cc7bd6c4..bab4b26e9a648d 100644
--- a/third_party/xla/xla/mlir_hlo/WORKSPACE
+++ b/third_party/xla/xla/mlir_hlo/WORKSPACE
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Workspace for MLIR HLO."""
+# buildifier: disable=load-on-top
 
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/CMakeLists.txt b/third_party/xla/xla/mlir_hlo/mhlo/transforms/CMakeLists.txt
index ba8ab2e65ad68e..b7d447620a22b7 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/CMakeLists.txt
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/CMakeLists.txt
@@ -192,7 +192,6 @@ add_mlir_library(MhloToStandard
 )
 
 add_mlir_library(ChloPasses
-  chlo_legalize_to_hlo/chlo_legalize_to_hlo.cc
   chlo_legalize_to_hlo/chlo_legalize_to_hlo_pass.cc
 
   DEPENDS
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/chlo_legalize_to_hlo/chlo_legalize_to_hlo.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/chlo_legalize_to_hlo/chlo_legalize_to_hlo.cc
deleted file mode 100644
index 958d137a41214d..00000000000000
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/chlo_legalize_to_hlo/chlo_legalize_to_hlo.cc
+++ /dev/null
@@ -1,1976 +0,0 @@
-/* Copyright 2020 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Enable the use of M_* math constants.
-// NOTE: this must be first in the file to ensure that if cmath is transitively
-// included by any other header it has the define set on first processing.
-// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants
-#define _USE_MATH_DEFINES
-#include <algorithm>
-#include <array>
-#include <cmath>
-#include <limits>
-#include <numeric>
-#include <vector>
-
-#include "llvm/ADT/SmallVector.h"
-#include "mhlo/IR/hlo_ops.h"
-#include "mhlo/transforms/map_chlo_to_hlo_op.h"
-#include "mhlo/transforms/rewriters.h"
-#include "mlir/Dialect/Complex/IR/Complex.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/SCF/IR/SCF.h"
-#include "mlir/Dialect/Shape/IR/Shape.h"
-#include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/IR/Attributes.h"
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/ImplicitLocOpBuilder.h"
-#include "mlir/IR/MLIRContext.h"
-#include "mlir/IR/OperationSupport.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Transforms/DialectConversion.h"
-#include "stablehlo/dialect/BroadcastUtils.h"
-#include "stablehlo/dialect/ChloOps.h"
-#include "utils/hlo_utils.h"
-
-namespace mlir {
-namespace chlo {
-namespace {
-
-struct ConvertConstantLikeOp : public OpConversionPattern<ConstantLikeOp> {
-  using OpConversionPattern<ConstantLikeOp>::OpConversionPattern;
-  LogicalResult matchAndRewrite(
-      ConstantLikeOp op, OpAdaptor adaptor,
-      ConversionPatternRewriter &rewriter) const override {
-    auto resultTy = op.getType().cast<ShapedType>();
-
-    // Unranked uses are not supported.
-    if (!resultTy.hasRank()) return failure();
-
-    // Lower to MHLO constant if statically shaped.
-    if (resultTy.hasStaticShape()) {
-      auto complexAttr = op.getValue().dyn_cast<complex::NumberAttr>();
-      auto attr = complexAttr
-                      ? DenseElementsAttr::get(resultTy, complexAttr.getValue())
-                      : DenseElementsAttr::get(resultTy, op.getValue());
-      rewriter.replaceOpWithNewOp<mhlo::ConstantOp>(op, attr);
-      return success();
-    }
-
-    // Lower to broadcasted constant.
-    auto loc = op.getLoc();
-    Value constant = rewriter.create<mhlo::ConstantOp>(loc, op.getValue());
-    Value shape = rewriter.create<shape::ShapeOfOp>(loc, adaptor.getOperand());
-    rewriter.replaceOpWithNewOp<mhlo::DynamicBroadcastInDimOp>(
-        op, resultTy, constant, shape, rewriter.getI64TensorAttr({}));
-    return success();
-  }
-};
-
-template <typename FTy>
-Value materializeChebyshevPolynomialApproximation(
-    ConversionPatternRewriter &rewriter, Location loc, Value x,
-    ArrayRef<FTy> coefficients) {
-  Value b0 = chlo::getConstantLike(rewriter, loc, 0.0, x);
-  Value b1 = chlo::getConstantLike(rewriter, loc, 0.0, x);
-  Value b2 = chlo::getConstantLike(rewriter, loc, 0.0, x);
-  for (FTy c : coefficients) {
-    b2 = b1;
-    b1 = b0;
-    b0 = rewriter.create<mhlo::MulOp>(loc, x.getType(), x, b1);
-    b0 = rewriter.create<mhlo::SubtractOp>(loc, x.getType(), b0, b2);
-    b0 = rewriter.create<mhlo::AddOp>(
-        loc, x.getType(), b0, chlo::getConstantLike(rewriter, loc, c, x));
-  }
-  Value result = rewriter.create<mhlo::SubtractOp>(loc, x.getType(), b0, b2);
-  result = rewriter.create<mhlo::MulOp>(
-      loc, x.getType(), result, chlo::getConstantLike(rewriter, loc, 0.5, x));
-  return result;
-}
-
-template <typename FTy>
-Value materializeBesselI1eApproximation(ConversionPatternRewriter &rewriter,
-                                        Location loc, Value x,
-                                        ArrayRef<FTy> kI1eCoeffsA,
-                                        ArrayRef<FTy> kI1eCoeffsB) {
-  Value z = rewriter.create<mhlo::AbsOp>(loc, x);
-  Value half = chlo::getConstantLike(rewriter, loc, 0.5, x);
-  Value two = chlo::getConstantLike(rewriter, loc, 2.0, x);
-  Value thirtyTwo = chlo::getConstantLike(rewriter, loc, 32.0, x);
-  Value eight = chlo::getConstantLike(rewriter, loc, 8.0, x);
-
-  Value tmp = rewriter.create<mhlo::MulOp>(loc, half, z);
-  tmp = rewriter.create<mhlo::SubtractOp>(loc, tmp, two);
-
-  Value xLe8 = materializeChebyshevPolynomialApproximation(rewriter, loc, tmp,
-                                                           kI1eCoeffsA);
-  xLe8 = rewriter.create<mhlo::MulOp>(loc, z, xLe8);
-
-  tmp = rewriter.create<mhlo::DivOp>(loc, thirtyTwo, z);
-  tmp = rewriter.create<mhlo::SubtractOp>(loc, tmp, two);
-  Value xGt8 = materializeChebyshevPolynomialApproximation(rewriter, loc, tmp,
-                                                           kI1eCoeffsB);
-  xGt8 = rewriter.create<mhlo::DivOp>(loc, xGt8,
-                                      rewriter.create<mhlo::SqrtOp>(loc, z));
-
-  Value isLe8 = rewriter.create<mhlo::CompareOp>(loc, z, eight,
-                                                 mhlo::ComparisonDirection::LE);
-
-  Value select = rewriter.create<mhlo::SelectOp>(loc, isLe8, xLe8, xGt8);
-  return rewriter.create<mhlo::MulOp>(
-      loc, rewriter.create<mhlo::SignOp>(loc, x), select);
-}
-
-Value materializeBesselI1eApproximationF32(ConversionPatternRewriter &rewriter,
-                                           Location loc, ValueRange args) {
-  Value x = args.front();
-  assert(x.getType().cast<ShapedType>().getElementType().isF32() &&
-         "expect f32 element type");
-  const float kI1eCoeffsA[] = {
-      9.38153738649577178388E-9f, -4.44505912879632808065E-8f,
-      2.00329475355213526229E-7f, -8.56872026469545474066E-7f,
-      3.47025130813767847674E-6f, -1.32731636560394358279E-5f,
-      4.78156510755005422638E-5f, -1.61760815825896745588E-4f,
-      5.12285956168575772895E-4f, -1.51357245063125314899E-3f,
-      4.15642294431288815669E-3f, -1.05640848946261981558E-2f,
-      2.47264490306265168283E-2f, -5.29459812080949914269E-2f,
-      1.02643658689847095384E-1f, -1.76416518357834055153E-1f,
-      2.52587186443633654823E-1f};
-
-  const float kI1eCoeffsB[] = {
-      -3.83538038596423702205E-9f, -2.63146884688951950684E-8f,
-      -2.51223623787020892529E-7f, -3.88256480887769039346E-6f,
-      -1.10588938762623716291E-4f, -9.76109749136146840777E-3f,
-      7.78576235018280120474E-1f};
-
-  return materializeBesselI1eApproximation<float>(rewriter, loc, x, kI1eCoeffsA,
-                                                  kI1eCoeffsB);
-}
-
-Value materializeBesselI1eApproximationF64(ConversionPatternRewriter &rewriter,
-                                           Location loc, ValueRange args) {
-  Value x = args.front();
-  assert(x.getType().cast<ShapedType>().getElementType().isF64() &&
-         "expect f64 element type");
-
-  const double kI1eCoeffsA[] = {
-      2.77791411276104639959E-18, -2.11142121435816608115E-17,
-      1.55363195773620046921E-16, -1.10559694773538630805E-15,
-      7.60068429473540693410E-15, -5.04218550472791168711E-14,
-      3.22379336594557470981E-13, -1.98397439776494371520E-12,
-      1.17361862988909016308E-11, -6.66348972350202774223E-11,
-      3.62559028155211703701E-10, -1.88724975172282928790E-9,
-      9.38153738649577178388E-9,  -4.44505912879632808065E-8,
-      2.00329475355213526229E-7,  -8.56872026469545474066E-7,
-      3.47025130813767847674E-6,  -1.32731636560394358279E-5,
-      4.78156510755005422638E-5,  -1.61760815825896745588E-4,
-      5.12285956168575772895E-4,  -1.51357245063125314899E-3,
-      4.15642294431288815669E-3,  -1.05640848946261981558E-2,
-      2.47264490306265168283E-2,  -5.29459812080949914269E-2,
-      1.02643658689847095384E-1,  -1.76416518357834055153E-1,
-      2.52587186443633654823E-1};
-
-  const double kI1eCoeffsB[] = {
-      7.51729631084210481353E-18,  4.41434832307170791151E-18,
-      -4.65030536848935832153E-17, -3.20952592199342395980E-17,
-      2.96262899764595013876E-16,  3.30820231092092828324E-16,
-      -1.88035477551078244854E-15, -3.81440307243700780478E-15,
-      1.04202769841288027642E-14,  4.27244001671195135429E-14,
-      -2.10154184277266431302E-14, -4.08355111109219731823E-13,
-      -7.19855177624590851209E-13, 2.03562854414708950722E-12,
-      1.41258074366137813316E-11,  3.25260358301548823856E-11,
-      -1.89749581235054123450E-11, -5.58974346219658380687E-10,
-      -3.83538038596423702205E-9,  -2.63146884688951950684E-8,
-      -2.51223623787020892529E-7,  -3.88256480887769039346E-6,
-      -1.10588938762623716291E-4,  -9.76109749136146840777E-3,
-      7.78576235018280120474E-1};
-
-  return materializeBesselI1eApproximation<double>(rewriter, loc, x,
-                                                   kI1eCoeffsA, kI1eCoeffsB);
-}
-
-Value materializeWithUpcast(ConversionPatternRewriter &rewriter, Location loc,
-                            ValueRange args, FloatType minPrecisionTy,
-                            Value callback(ConversionPatternRewriter &,
-                                           Location, ValueRange)) {
-  auto originalTy = getElementTypeOrSelf(args.front().getType());
-  auto floatOriginalTy = originalTy.dyn_cast<FloatType>();
-  bool needsUpcast =
-      floatOriginalTy && floatOriginalTy.getWidth() < minPrecisionTy.getWidth();
-
-  // Upcast arguments if necessary.
-  llvm::SmallVector<Value, 2> castedArgs;
-  if (needsUpcast) {
-    for (Value a : args) {
-      castedArgs.push_back(
-          rewriter.create<mhlo::ConvertOp>(loc, a, minPrecisionTy));
-    }
-    args = castedArgs;
-  }
-
-  Value result = callback(rewriter, loc, args);
-
-  // Cast back if necessary.
-  if (needsUpcast) {
-    result = rewriter.create<mhlo::ConvertOp>(loc, result, originalTy);
-  }
-
-  return result;
-}
-
-struct ConvertBesselI1eOp : public OpConversionPattern<BesselI1eOp> {
-  using OpConversionPattern<BesselI1eOp>::OpConversionPattern;
-  LogicalResult matchAndRewrite(
-      BesselI1eOp op, OpAdaptor adaptor,
-      ConversionPatternRewriter &rewriter) const override {
-    Location loc = op.getLoc();
-    Value x = adaptor.getOperand();
-    Type ty = x.getType().cast<ShapedType>().getElementType();
-
-    // For now, we support only f64, f32, f16 and bf16.
-    // See https://www.tensorflow.org/api_docs/python/tf/math/bessel_i1e
-    if (!ty.isF64() && !ty.isF32() && !ty.isF16() && !ty.isBF16())
-      return failure();
-
-    if (ty.isF64()) {
-      rewriter.replaceOp(
-          op, materializeBesselI1eApproximationF64(rewriter, loc, x));
-      return success();
-    }
-
-    rewriter.replaceOp(
-        op, materializeWithUpcast(rewriter, loc, adaptor.getOperands(),
-                                  rewriter.getF32Type(),
-                                  &materializeBesselI1eApproximationF32));
-    return success();
-  }
-};
-
-template <typename FTy>
-Value materializePolynomialApproximation(ConversionPatternRewriter &rewriter,
-                                         Location loc, Value x,
-                                         ArrayRef<FTy> coefficients) {
-  if (coefficients.empty()) return chlo::getConstantLike(rewriter, loc, 0.0, x);
-
-  Value poly = chlo::getConstantLike(rewriter, loc, coefficients[0], x);
-  for (size_t i = 1; i < coefficients.size(); ++i) {
-    poly = rewriter.create<mhlo::MulOp>(loc, x.getType(), poly, x);
-    poly = rewriter.create<mhlo::AddOp>(
-        loc, x.getType(), poly,
-        chlo::getConstantLike(rewriter, loc, coefficients[i], x));
-  }
-  return poly;
-}
-
-// Precondition is |x| >= 1. Use erf approximation, otherwise.
-//
-// We rely on multiple polynomial approximations for x >= 1. We pass |x| as an
-// argument and derive the final approximation for all |x| >= 1.
-// This implementation is based on Cephes.
-Value materializeErfcApproximationF64ForMagnituteGeOne(
-    ConversionPatternRewriter &rewriter, Location loc, ValueRange args) {
-  Value x = args.front();
-  assert(x.getType().cast<ShapedType>().getElementType().isF64() &&
-         "expect f64 element type");
-  const double kMaxlog = 7.09782712893383996843E2;
-  const double kErfcPCoefficients[] = {
-      2.46196981473530512524E-10, 5.64189564831068821977E-1,
-      7.46321056442269912687E0,   4.86371970985681366614E1,
-      1.96520832956077098242E2,   5.26445194995477358631E2,
-      9.34528527171957607540E2,   1.02755188689515710272E3,
-      5.57535335369399327526E2};
-  const double kErfcQCoefficients[] = {
-      1.00000000000000000000E0, 1.32281951154744992508E1,
-      8.67072140885989742329E1, 3.54937778887819891062E2,
-      9.75708501743205489753E2, 1.82390916687909736289E3,
-      2.24633760818710981792E3, 1.65666309194161350182E3,
-      5.57535340817727675546E2};
-  const double kErfcRCoefficients[] = {
-      5.64189583547755073984E-1, 1.27536670759978104416E0,
-      5.01905042251180477414E0,  6.16021097993053585195E0,
-      7.40974269950448939160E0,  2.97886665372100240670E0};
-  const double kErfcSCoefficients[] = {
-      1.00000000000000000000E0, 2.26052863220117276590E0,
-      9.39603524938001434673E0, 1.20489539808096656605E1,
-      1.70814450747565897222E1, 9.60896809063285878198E0,
-      3.36907645100081516050E0};
-
-  // Let z = -x^2.
-  Value xSq = rewriter.create<mhlo::MulOp>(loc, x, x);
-  Value z = rewriter.create<mhlo::NegOp>(loc, xSq);
-
-  // Materialize polynomial approximation for x in [1, 8) as
-  //   erfc(x) = exp(z) P(|x|) / Q(|x|).
-  Value expZ = rewriter.create<mhlo::ExpOp>(loc, z);
-  Value absX = rewriter.create<mhlo::AbsOp>(loc, x);
-  Value polP = materializePolynomialApproximation(
-      rewriter, loc, absX, llvm::ArrayRef(kErfcPCoefficients));
-  Value expZMulPolyP = rewriter.create<mhlo::MulOp>(loc, expZ, polP);
-  Value polQ = materializePolynomialApproximation(
-      rewriter, loc, absX, llvm::ArrayRef(kErfcQCoefficients));
-  Value erfcApprox18 = rewriter.create<mhlo::DivOp>(loc, expZMulPolyP, polQ);
-
-  // Materialize polynomial approximation for x in >= 8 as
-  //   erfc(x) exp(z) R(|x|) / S(|x|).
-  Value polR = materializePolynomialApproximation(
-      rewriter, loc, absX, llvm::ArrayRef(kErfcRCoefficients));
-  Value expZMulPolyR = rewriter.create<mhlo::MulOp>(loc, expZ, polR);
-  Value polS = materializePolynomialApproximation(
-      rewriter, loc, absX, llvm::ArrayRef(kErfcSCoefficients));
-  Value erfcApprox8Inf = rewriter.create<mhlo::DivOp>(loc, expZMulPolyR, polS);
-
-  // Combine polynomial approximations for x >= 1.
-  Value eight = chlo::getConstantLike(rewriter, loc, 8.0, x);
-  Value absXLt8 = rewriter.create<mhlo::CompareOp>(
-      loc, absX, eight, mhlo::ComparisonDirection::LT);
-  Value erfcApprox = rewriter.create<mhlo::SelectOp>(loc, absXLt8, erfcApprox18,
-                                                     erfcApprox8Inf);
-
-  // Clamp to prevent overflow and materialize approximation for large x as
-  //   erfc(x) = 0.
-  Value zLtNegMaxlog = rewriter.create<mhlo::CompareOp>(
-      loc, z, chlo::getConstantLike(rewriter, loc, -kMaxlog, x),
-      mhlo::ComparisonDirection::LT);
-  Value zero = chlo::getConstantLike(rewriter, loc, 0.0, x);
-  Value erfcApproxClamped =
-      rewriter.create<mhlo::SelectOp>(loc, zLtNegMaxlog, zero, erfcApprox);
-
-  // Derive approximation for x <= -1 as
-  //   erfc(x) = 2 - erfc(-x).
-  // Reuse previously materialized approximations all of which take |x| as their
-  // argument.
-  Value xLtZero = rewriter.create<mhlo::CompareOp>(
-      loc, x, zero, mhlo::ComparisonDirection::LT);
-  Value two = chlo::getConstantLike(rewriter, loc, 2.0, x);
-  Value twoSubErfcApproxClamped =
-      rewriter.create<mhlo::SubtractOp>(loc, two, erfcApproxClamped);
-  return rewriter.create<mhlo::SelectOp>(loc, xLtZero, twoSubErfcApproxClamped,
-                                         erfcApproxClamped);
-}
-
-// Precondition is |x| <= 1. Use erfc approximation, otherwise.
-// This implementation is based on Cephes.
-Value materializeErfApproximationF64ForMagnituteLeOne(
-    ConversionPatternRewriter &rewriter, Location loc, ValueRange args) {
-  Value x = args.front();
-  assert(x.getType().cast<ShapedType>().getElementType().isF64() &&
-         "expect f64 element type");
-  const double kErfTCoefficients[] = {
-      9.60497373987051638749E0, 9.00260197203842689217E1,
-      2.23200534594684319226E3, 7.00332514112805075473E3,
-      5.55923013010394962768E4};
-  const double kErfUCoefficients[] = {
-      1.00000000000000000000E0, 3.35617141647503099647E1,
-      5.21357949780152679795E2, 4.59432382970980127987E3,
-      2.26290000613890934246E4, 4.92673942608635921086E4};
-
-  // Materialize polynomial approximation for |x| <= 1 as
-  //   erf(x) = x T(x^2) / U(x^2).
-  Value xSq = rewriter.create<mhlo::MulOp>(loc, x, x);
-  Value polyT = materializePolynomialApproximation(
-      rewriter, loc, xSq, llvm::ArrayRef(kErfTCoefficients));
-  Value xMulPolyT = rewriter.create<mhlo::MulOp>(loc, x, polyT);
-  Value polyU = materializePolynomialApproximation(
-      rewriter, loc, xSq, llvm::ArrayRef(kErfUCoefficients));
-  return rewriter.create<mhlo::DivOp>(loc, xMulPolyT, polyU);
-}
-
-// This implementation is based on Cephes.
-Value materializeErfApproximationF64(ConversionPatternRewriter &rewriter,
-                                     Location loc, ValueRange args) {
-  Value x = args.front();
-  assert(x.getType().cast<ShapedType>().getElementType().isF64() &&
-         "expect f64 element type");
-
-  // Rely on erf approximation for |x| < 1
-  //   erf(x) = erf_approx(x)
-  Value erfApprox =
-      materializeErfApproximationF64ForMagnituteLeOne(rewriter, loc, x);
-
-  // Rely on erfc approximation for |x| >= 1 and materialize erf as
-  //   erf(x) = 1 - erfc_approx(x)
-  Value one = chlo::getConstantLike(rewriter, loc, 1.0, x);
-  Value erfcApprox =
-      materializeErfcApproximationF64ForMagnituteGeOne(rewriter, loc, x);
-  Value erfcBasedApprox =
-      rewriter.create<mhlo::SubtractOp>(loc, one, erfcApprox);
-
-  // Materialize approximation selection based on argument.
-  Value absX = rewriter.create<mhlo::AbsOp>(loc, x);
-  Value absXLtOne = rewriter.create<mhlo::CompareOp>(
-      loc, absX, one, mhlo::ComparisonDirection::LT);
-  return rewriter.create<mhlo::SelectOp>(loc, absXLtOne, erfApprox,
-                                         erfcBasedApprox);
-}
-
-Value materializeErfcApproximationF64(ConversionPatternRewriter &rewriter,
-                                      Location loc, ValueRange args) {
-  Value x = args.front();
-  assert(x.getType().cast<ShapedType>().getElementType().isF64() &&
-         "expect f64 element type");
-
-  // Rely on erfc approximation for |x| >= 1
-  //   erfc(x) = erfc_approx(x)
-  Value erfcApprox =
-      materializeErfcApproximationF64ForMagnituteGeOne(rewriter, loc, x);
-
-  // Rely on erf approximation for |x| < 1 and materialize erfc as
-  //   erfc(x) = 1 - erf_approx(x)
-  Value one = chlo::getConstantLike(rewriter, loc, 1.0, x);
-  Value erfApprox =
-      materializeErfApproximationF64ForMagnituteLeOne(rewriter, loc, x);
-  Value erfBasedApprox = rewriter.create<mhlo::SubtractOp>(loc, one, erfApprox);
-
-  // Materialize approximation selection based on argument.
-  Value absX = rewriter.create<mhlo::AbsOp>(loc, x);
-  Value absXLtOne = rewriter.create<mhlo::CompareOp>(
-      loc, absX, one, mhlo::ComparisonDirection::LT);
-  return rewriter.create<mhlo::SelectOp>(loc, absXLtOne, erfBasedApprox,
-                                         erfcApprox);
-}
-
-// Precondition is |x| >= 1. Use erf approximation, otherwise.
-//
-// We rely on multiple polynomial approximations for x >= 1. We pass |x| as an
-// argument and derive the final approximation for all |x| >= 1.
-// This implementation is based on Cephes.
-Value materializeErfcApproximationF32ForMagnitudeGeOne(
-    ConversionPatternRewriter &rewriter, Location loc, ValueRange args) {
-  Value x = args.front();
-  assert(x.getType().cast<ShapedType>().getElementType().isF32() &&
-         "expect f32 element type");
-  const double kMaxlog = 88.72283905206835;
-  const float kErfcPCoefficients[] = {
-      +2.326819970068386E-2, -1.387039388740657E-1, +3.687424674597105E-1,
-      -5.824733027278666E-1, +6.210004621745983E-1, -4.944515323274145E-1,
-      +3.404879937665872E-1, -2.741127028184656E-1, +5.638259427386472E-1,
-  };
-  const float kErfcRCoefficients[] = {
-      -1.047766399936249E+1, +1.297719955372516E+1, -7.495518717768503E+0,
-      +2.921019019210786E+0, -1.015265279202700E+0, +4.218463358204948E-1,
-      -2.820767439740514E-1, +5.641895067754075E-1,
-  };
-
-  // Let z = -x^2.
-  Value xSq = rewriter.create<mhlo::MulOp>(loc, x, x);
-  Value z = rewriter.create<mhlo::NegOp>(loc, xSq);
-
-  // Materialize polynomial approximation for x >= 1 as
-  //   erfc(x) = exp(z) 1/x P(1/x^2)   if x in [1, 2)
-  //   erfc(x) = exp(z) 1/x R(1/x^2)   if x >= 2
-  Value absX = rewriter.create<mhlo::AbsOp>(loc, x);
-  Value one = chlo::getConstantLike(rewriter, loc, 1.0, x);
-  Value reciprocalXSq = rewriter.create<mhlo::DivOp>(loc, one, xSq);
-  Value expZ = rewriter.create<mhlo::ExpOp>(loc, z);
-  Value oneDivAbsX = rewriter.create<mhlo::DivOp>(loc, one, absX);
-  Value expZMulOneDivAbsX = rewriter.create<mhlo::MulOp>(loc, expZ, oneDivAbsX);
-  Value two = chlo::getConstantLike(rewriter, loc, 2.0, x);
-  Value absXLtTwo = rewriter.create<mhlo::CompareOp>(
-      loc, absX, two, mhlo::ComparisonDirection::LT);
-  Value polP = materializePolynomialApproximation(
-      rewriter, loc, reciprocalXSq, llvm::ArrayRef(kErfcPCoefficients));
-  Value polR = materializePolynomialApproximation(
-      rewriter, loc, reciprocalXSq, llvm::ArrayRef(kErfcRCoefficients));
-  Value poly = rewriter.create<mhlo::SelectOp>(loc, absXLtTwo, polP, polR);
-  Value erfcApprox = rewriter.create<mhlo::MulOp>(loc, expZMulOneDivAbsX, poly);
-
-  // Clamp to prevent overflow and materialize approximation for large x as
-  //   erfc(x) = 0.
-  Value zLtNeqMaxlog = rewriter.create<mhlo::CompareOp>(
-      loc, z, chlo::getConstantLike(rewriter, loc, -kMaxlog, x),
-      mhlo::ComparisonDirection::LT);
-  Value zero = chlo::getConstantLike(rewriter, loc, 0.0, x);
-  Value erfcApproxClamped =
-      rewriter.create<mhlo::SelectOp>(loc, zLtNeqMaxlog, zero, erfcApprox);
-
-  // Derive approximation for x <= -1 as
-  //   erfc(x) = 2 - erfc(-x).
-  // Reuse previously materialized approximations all of which take |x| as their
-  // argument.
-  Value xLtZero = rewriter.create<mhlo::CompareOp>(
-      loc, x, zero, mhlo::ComparisonDirection::LT);
-  Value twoSubErfcApprox =
-      rewriter.create<mhlo::SubtractOp>(loc, two, erfcApproxClamped);
-  return rewriter.create<mhlo::SelectOp>(loc, xLtZero, twoSubErfcApprox,
-                                         erfcApproxClamped);
-}
-
-// Precondition is |x| <= 1. Use erfc approximation, otherwise.
-// This implementation is based on Cephes.
-Value materializeErfApproximationF32ForMagnitudeLeOne(
-    ConversionPatternRewriter &rewriter, Location loc, ValueRange args) {
-  Value x = args.front();
-  assert(x.getType().cast<ShapedType>().getElementType().isF32() &&
-         "expect f32 element type");
-  const float kErfTCoefficients[] = {
-      +7.853861353153693E-5, -8.010193625184903E-4, +5.188327685732524E-3,
-      -2.685381193529856E-2, +1.128358514861418E-1, -3.761262582423300E-1,
-      +1.128379165726710E+0,
-  };
-
-  // Materialize polynomial approximation for |x| <= 1 as
-  //   erf(x) = x T(x^2).
-  Value xSq = rewriter.create<mhlo::MulOp>(loc, x, x);
-  Value polyT = materializePolynomialApproximation(
-      rewriter, loc, xSq, llvm::ArrayRef(kErfTCoefficients));
-  return rewriter.create<mhlo::MulOp>(loc, x, polyT);
-}
-
-// This is the same approximation as used in Eigen.
-Value materializeErfApproximationF32(ConversionPatternRewriter &rewriter,
-                                     Location loc, ValueRange args) {
-  Value x = args.front();
-  assert(x.getType().cast<ShapedType>().getElementType().isF32() &&
-         "expect f32 element type");
-  const float kAlpha[] = {
-      -2.72614225801306e-10f, 2.77068142495902e-08f,  -2.10102402082508e-06f,
-      -5.69250639462346e-05f, -7.34990630326855e-04f, -2.95459980854025e-03f,
-      -1.60960333262415e-02f,
-  };
-  const float kBeta[] = {
-      -1.45660718464996e-05f, -2.13374055278905e-04f, -1.68282697438203e-03f,
-      -7.37332916720468e-03f, -1.42647390514189e-02f,
-  };
-
-  // Clamp argument between -4 and 4.
-  Value lb = chlo::getConstantLike(rewriter, loc, -4.0, x);
-  Value ub = chlo::getConstantLike(rewriter, loc, 4.0, x);
-  x = rewriter.create<mhlo::ClampOp>(loc, x.getType(), lb, x, ub);
-  Value xSq = rewriter.create<mhlo::MulOp>(loc, x, x);
-
-  // Materialize polynomial approximation for x in [-4, 4] as
-  //   erf(x) = x * Alpha(x^2) / Beta(x^2).
-  Value alphaPoly = materializePolynomialApproximation(rewriter, loc, xSq,
-                                                       llvm::ArrayRef(kAlpha));
-  Value betaPoly = materializePolynomialApproximation(rewriter, loc, xSq,
-                                                      llvm::ArrayRef(kBeta));
-  Value xMulAlphaPoly = rewriter.create<mhlo::MulOp>(loc, x, alphaPoly);
-  Value erf = rewriter.create<mhlo::DivOp>(loc, xMulAlphaPoly, betaPoly);
-  Value lbErf = chlo::getConstantLike(rewriter, loc, -1.0, x);
-  Value ubErf = chlo::getConstantLike(rewriter, loc, 1.0, x);
-  return rewriter.create<mhlo::ClampOp>(loc, erf.getType(), lbErf, erf, ubErf);
-}
-
-Value materializeErfcApproximationF32(ConversionPatternRewriter &rewriter,
-                                      Location loc, ValueRange args) {
-  Value x = args.front();
-  assert(x.getType().cast<ShapedType>().getElementType().isF32() &&
-         "expect f32 element type");
-
-  // Rely on erfc approximation for |x| >= 1
-  //   erfc(x) = erfc_approx(x)
-  Value erfcApprox =
-      materializeErfcApproximationF32ForMagnitudeGeOne(rewriter, loc, x);
-
-  // Rely on erf approximation for |x| < 1 and materialize erfc as
-  //   erfc(x) = 1 - erf_approx(x)
-  Value one = chlo::getConstantLike(rewriter, loc, 1.0, x);
-  Value erfApprox =
-      materializeErfApproximationF32ForMagnitudeLeOne(rewriter, loc, x);
-  Value erfBasedApprox = rewriter.create<mhlo::SubtractOp>(loc, one, erfApprox);
-
-  // Materialize approximation selection based on argument.
-  Value absX = rewriter.create<mhlo::AbsOp>(loc, x);
-  Value absXLtOne = rewriter.create<mhlo::CompareOp>(
-      loc, absX, one, mhlo::ComparisonDirection::LT);
-  return rewriter.create<mhlo::SelectOp>(loc, absXLtOne, erfBasedApprox,
-                                         erfcApprox);
-}
-
-struct BasisConvertErfOp : public OpConversionPattern<ErfOp> {
-  using OpConversionPattern<ErfOp>::OpConversionPattern;
-  LogicalResult matchAndRewrite(
-      ErfOp op, OpAdaptor adaptor,
-      ConversionPatternRewriter &rewriter) const override {
-    Location loc = op.getLoc();
-    Value x = adaptor.getOperand();
-    Type ty = x.getType().cast<ShapedType>().getElementType();
-
-    // For now, we support only f64, f32, f16 and bf16.
-    if (!ty.isF64() && !ty.isF32() && !ty.isF16() && !ty.isBF16())
-      return failure();
-
-    if (ty.isF64()) {
-      rewriter.replaceOp(op, materializeErfApproximationF64(rewriter, loc, x));
-      return success();
-    }
-
-    rewriter.replaceOp(
-        op, materializeWithUpcast(rewriter, loc, adaptor.getOperands(),
-                                  rewriter.getF32Type(),
-                                  &materializeErfApproximationF32));
-    return success();
-  }
-};
-
-struct ConvertErfcOp : public OpConversionPattern<ErfcOp> {
-  using OpConversionPattern<ErfcOp>::OpConversionPattern;
-  LogicalResult matchAndRewrite(
-      ErfcOp op, OpAdaptor adaptor,
-      ConversionPatternRewriter &rewriter) const override {
-    Location loc = op.getLoc();
-    Value x = adaptor.getOperand();
-    Type ty = x.getType().cast<ShapedType>().getElementType();
-
-    // For now, we support only f64, f32, f16 and bf16.
-    if (!ty.isF64() && !ty.isF32() && !ty.isF16() && !ty.isBF16())
-      return failure();
-
-    if (ty.isF64()) {
-      rewriter.replaceOp(op, materializeErfcApproximationF64(rewriter, loc, x));
-      return success();
-    }
-
-    rewriter.replaceOp(
-        op, materializeWithUpcast(rewriter, loc, adaptor.getOperands(),
-                                  rewriter.getF32Type(),
-                                  &materializeErfcApproximationF32));
-    return success();
-  }
-};
-
-Value erfInv32(ConversionPatternRewriter &b, Location loc, ValueRange args) {
-  constexpr int kDegree = 9;
-  constexpr std::array<float, 9> wLessThan5Constants = {
-      2.81022636e-08f,  3.43273939e-07f, -3.5233877e-06f,
-      -4.39150654e-06f, 0.00021858087f,  -0.00125372503f,
-      -0.00417768164f,  0.246640727f,    1.50140941f};
-  constexpr std::array<float, 9> wGreaterThan5Constants = {
-      -0.000200214257f, 0.000100950558f, 0.00134934322f,
-      -0.00367342844f,  0.00573950773f,  -0.0076224613f,
-      0.00943887047f,   1.00167406f,     2.83297682f};
-
-  Value x = args[0];
-  // Compute logarithm of (1+arg) using log1p(arg) which is more precise than
-  // log(1+arg) when arg is close to zero. For more details, see
-  // https://en.cppreference.com/w/cpp/numeric/math/log1p
-  Value minusXSquared =
-      b.create<mhlo::MulOp>(loc, x, b.create<mhlo::NegOp>(loc, x));
-  Value w =
-      b.create<mhlo::NegOp>(loc, b.create<mhlo::Log1pOp>(loc, minusXSquared));
-
-  Value lt = b.create<mhlo::CompareOp>(loc, w, getConstantLike(b, loc, 5.0, x),
-                                       mhlo::ComparisonDirection::LT);
-  auto coefficient = [&](int i) {
-    return b.create<mhlo::SelectOp>(
-        loc, lt, getConstantLike(b, loc, wLessThan5Constants[i], x),
-        getConstantLike(b, loc, wGreaterThan5Constants[i], x));
-  };
-  w = b.create<mhlo::SelectOp>(
-      loc, lt,
-      b.create<mhlo::SubtractOp>(loc, w, getConstantLike(b, loc, 2.5, x)),
-      b.create<mhlo::SubtractOp>(loc, b.create<mhlo::SqrtOp>(loc, w),
-                                 getConstantLike(b, loc, 3.0, x)));
-  Value p = coefficient(0);
-  for (int i = 1; i < kDegree; ++i) {
-    p = b.create<mhlo::AddOp>(loc, coefficient(i),
-                              b.create<mhlo::MulOp>(loc, p, w));
-  }
-
-  // Result modulo edge cases.
-  Value result = b.create<mhlo::MulOp>(loc, p, x);
-
-  // Handle edge cases, namely erfinv(+/-1) = +/-inf.  (The above computation is
-  // indeterminate, and can give nan or -/+inf.)
-  return b.create<mhlo::SelectOp>(
-      loc,
-      b.create<mhlo::CompareOp>(loc, b.create<mhlo::AbsOp>(loc, x),
-                                getConstantLike(b, loc, 1, x),
-                                mhlo::ComparisonDirection::EQ),
-      b.create<mhlo::MulOp>(loc, x, getConstantLikeInfValue(b, loc, x, false)),
-      result);
-}
-
-Value erfInv64(ConversionPatternRewriter &b, Location loc, ValueRange args) {
-  constexpr std::array<double, 23> wLessThan625Constants = {
-      -3.6444120640178196996e-21, -1.685059138182016589e-19,
-      1.2858480715256400167e-18,  1.115787767802518096e-17,
-      -1.333171662854620906e-16,  2.0972767875968561637e-17,
-      6.6376381343583238325e-15,  -4.0545662729752068639e-14,
-      -8.1519341976054721522e-14, 2.6335093153082322977e-12,
-      -1.2975133253453532498e-11, -5.4154120542946279317e-11,
-      1.051212273321532285e-09,   -4.1126339803469836976e-09,
-      -2.9070369957882005086e-08, 4.2347877827932403518e-07,
-      -1.3654692000834678645e-06, -1.3882523362786468719e-05,
-      0.0001867342080340571352,   -0.00074070253416626697512,
-      -0.0060336708714301490533,  0.24015818242558961693,
-      1.6536545626831027356};
-  constexpr std::array<double, 19> wLessThan16Constants = {
-      2.2137376921775787049e-09,  9.0756561938885390979e-08,
-      -2.7517406297064545428e-07, 1.8239629214389227755e-08,
-      1.5027403968909827627e-06,  -4.013867526981545969e-06,
-      2.9234449089955446044e-06,  1.2475304481671778723e-05,
-      -4.7318229009055733981e-05, 6.8284851459573175448e-05,
-      2.4031110387097893999e-05,  -0.0003550375203628474796,
-      0.00095328937973738049703,  -0.0016882755560235047313,
-      0.0024914420961078508066,   -0.0037512085075692412107,
-      0.005370914553590063617,    1.0052589676941592334,
-      3.0838856104922207635,
-  };
-  constexpr std::array<double, 17> wGreaterThan16Constants = {
-      -2.7109920616438573243e-11, -2.5556418169965252055e-10,
-      1.5076572693500548083e-09,  -3.7894654401267369937e-09,
-      7.6157012080783393804e-09,  -1.4960026627149240478e-08,
-      2.9147953450901080826e-08,  -6.7711997758452339498e-08,
-      2.2900482228026654717e-07,  -9.9298272942317002539e-07,
-      4.5260625972231537039e-06,  -1.9681778105531670567e-05,
-      7.5995277030017761139e-05,  -0.00021503011930044477347,
-      -0.00013871931833623122026, 1.0103004648645343977,
-      4.8499064014085844221,
-  };
-
-  Value x = args[0];
-  // Compute logarithm of (1+arg) using log1p(arg) which is more precise than
-  // log(1+arg) when arg is close to zero. For more details, see
-  // https://en.cppreference.com/w/cpp/numeric/math/log1p
-  Value minusXSquared =
-      b.create<mhlo::MulOp>(loc, x, b.create<mhlo::NegOp>(loc, x));
-  Value w =
-      b.create<mhlo::NegOp>(loc, b.create<mhlo::Log1pOp>(loc, minusXSquared));
-
-  Value lt625 = b.create<mhlo::CompareOp>(
-      loc, w, getConstantLike(b, loc, 6.25, x), mhlo::ComparisonDirection::LT);
-  Value lt16 = b.create<mhlo::CompareOp>(loc, w, getConstantLike(b, loc, 16, x),
-                                         mhlo::ComparisonDirection::LT);
-
-  auto coefficient = [&](int i) {
-    Value c = getConstantLike(b, loc, wLessThan625Constants[i], x);
-    if (i < 19) {
-      c = b.create<mhlo::SelectOp>(
-          loc, lt625, c, getConstantLike(b, loc, wLessThan16Constants[i], x));
-    }
-    if (i < 17) {
-      c = b.create<mhlo::SelectOp>(
-          loc, lt16, c, getConstantLike(b, loc, wGreaterThan16Constants[i], x));
-    }
-    return c;
-  };
-
-  Value sqrtW = b.create<mhlo::SqrtOp>(loc, w);
-  Value wMinus3125 =
-      b.create<mhlo::SubtractOp>(loc, w, getConstantLike(b, loc, 3.125, x));
-  Value select2 =
-      b.create<mhlo::SelectOp>(loc, lt16, getConstantLike(b, loc, 3.25, w),
-                               getConstantLike(b, loc, 5.0, w));
-  Value select2Result = b.create<mhlo::SubtractOp>(loc, sqrtW, select2);
-  w = b.create<mhlo::SelectOp>(loc, lt625, wMinus3125, select2Result);
-
-  Value p = coefficient(0);
-  for (int i = 1; i < 17; ++i) {
-    p = b.create<mhlo::AddOp>(loc, coefficient(i),
-                              b.create<mhlo::MulOp>(loc, p, w));
-  }
-  for (int i = 17; i < 19; ++i) {
-    p = b.create<mhlo::SelectOp>(
-        loc, lt16,
-        b.create<mhlo::AddOp>(loc, coefficient(i),
-                              b.create<mhlo::MulOp>(loc, p, w)),
-        p);
-  }
-  for (int i = 19; i < 23; ++i) {
-    p = b.create<mhlo::SelectOp>(
-        loc, lt625,
-        b.create<mhlo::AddOp>(loc, coefficient(i),
-                              b.create<mhlo::MulOp>(loc, p, w)),
-        p);
-  }
-
-  // Result modulo edge cases.
-  Value result = b.create<mhlo::MulOp>(loc, p, x);
-
-  // Handle edge cases, namely erfinv(+/-1) = +/-inf.  (The above computation is
-  // indeterminate, and can give nan or -/+inf.)
-  return b.create<mhlo::SelectOp>(
-      loc,
-      b.create<mhlo::CompareOp>(loc, b.create<mhlo::AbsOp>(loc, x),
-                                getConstantLike(b, loc, 1, x),
-                                mhlo::ComparisonDirection::EQ),
-      b.create<mhlo::MulOp>(loc, x, getConstantLikeInfValue(b, loc, x, false)),
-      result);
-}
-
-struct ConvertErfInvOp : public OpConversionPattern<ErfInvOp> {
-  using OpConversionPattern<ErfInvOp>::OpConversionPattern;
-  LogicalResult matchAndRewrite(
-      ErfInvOp op, OpAdaptor adaptor,
-      ConversionPatternRewriter &rewriter) const override {
-    Location loc = op.getLoc();
-    if (op.getResult().getType().getElementType().isF64()) {
-      rewriter.replaceOp(op, erfInv64(rewriter, loc, adaptor.getOperands()));
-      return success();
-    }
-    FloatType minPrecisionTy = rewriter.getF32Type();
-    rewriter.replaceOp(
-        op, materializeWithUpcast(rewriter, loc, adaptor.getOperands(),
-                                  minPrecisionTy, &erfInv32));
-    return success();
-  }
-};
-
-// Coefficients for the Lanczos approximation of the gamma function. The
-// coefficients are uniquely determined by the choice of g and n (kLanczosGamma
-// and kLanczosCoefficients.size() + 1). The coefficients below correspond to
-// [7, 9]. [5, 7], [7, 9], [9, 10], and [607/128.0, 15] were evaluated and
-// [7, 9] seemed to be the least sensitive to the quality of the log function.
-// In particular, [5, 7] is the only choice where -1.5e-5 <= lgamma(2) <= 1.5e-5
-// for a particularly inaccurate log function.
-constexpr double kLanczosGamma = 7;  // aka g
-constexpr double kBaseLanczosCoeff = 0.99999999999980993227684700473478;
-constexpr std::array<double, 8> kLanczosCoefficients = {
-    676.520368121885098567009190444019, -1259.13921672240287047156078755283,
-    771.3234287776530788486528258894,   -176.61502916214059906584551354,
-    12.507343278686904814458936853,     -0.13857109526572011689554707,
-    9.984369578019570859563e-6,         1.50563273514931155834e-7};
-
-// Compute the Lgamma function using Lanczos' approximation from "A Precision
-// Approximation of the Gamma Function". SIAM Journal on Numerical Analysis
-// series B. Vol. 1:
-//   lgamma(z + 1) = (log(2) + log(pi)) / 2
-//                     + (z + 1/2) * log(t(z))
-//                     - t(z) + log(a(z))
-//   with   t(z) = z + kLanczosGamma + 1/2
-//          a(z) = kBaseLanczosCoeff
-//                   + sum(k = 1, n, kLanczosCoefficients[i] / (z + k))
-Value materializeLgamma(ConversionPatternRewriter &rewriter, Location loc,
-                        ValueRange args) {
-  // If the input is less than 0.5 use Euler's reflection formula.
-  //   gamma(x) = pi / (sin(pi * x) * gamma(1 - x))
-  // Let z be
-  //   z = -x      if x < 1/2
-  //   z = x - 1   otheriwse
-  Value x = args.front();
-  Value half = getConstantLike(rewriter, loc, 0.5, x);
-  Value needToReflect = rewriter.create<mhlo::CompareOp>(
-      loc, x, half, mhlo::ComparisonDirection::LT);
-  Value negX = rewriter.create<mhlo::NegOp>(loc, x);
-  Value one = getConstantLike(rewriter, loc, 1, x);
-  Value xSubOne = rewriter.create<mhlo::SubtractOp>(loc, x, one);
-  Value z = rewriter.create<mhlo::SelectOp>(loc, needToReflect, negX, xSubOne);
-
-  // Materialize
-  //   a(z) = kBaseLanczosCoeff
-  //            + sum(k = 1, n, kLanczosCoefficients[i] / (z + k))
-  Value a = getConstantLike(rewriter, loc, kBaseLanczosCoeff, x);
-  for (int i = 0, end = kLanczosCoefficients.size(); i < end; ++i) {
-    Value coeff = getConstantLike(rewriter, loc, kLanczosCoefficients[i], x);
-    Value oneBasedIndex = getConstantLike(rewriter, loc, i + 1, x);
-    Value quotient = rewriter.create<mhlo::DivOp>(
-        loc, coeff, rewriter.create<mhlo::AddOp>(loc, z, oneBasedIndex));
-    a = rewriter.create<mhlo::AddOp>(loc, a, quotient);
-  }
-
-  // To improve accuracy on platforms with less-precise log implementations,
-  // compute log(kLanczosGamma + 1/2) at compile time and use log1p on the
-  // device.
-  // Materialize as
-  //   log(t) = log(kLanczosGamma + 1/2 + z)
-  //          = log(kLanczosGamma + 1/2) + log1p(z / (kLanczosGamma + 1/2)).
-  Value lanczosPlusHalf =
-      getConstantLike(rewriter, loc, kLanczosGamma + 0.5, x);
-  Value t = rewriter.create<mhlo::AddOp>(loc, lanczosPlusHalf, z);
-  Value logTerm =
-      getConstantLike(rewriter, loc, std::log(kLanczosGamma + 0.5), x);
-  Value log1pTerm = rewriter.create<mhlo::Log1pOp>(
-      loc, rewriter.create<mhlo::DivOp>(loc, z, lanczosPlusHalf));
-  Value logT = rewriter.create<mhlo::AddOp>(loc, logTerm, log1pTerm);
-
-  // Note that t(z) may be large and we need to be careful not to overflow to
-  // infinity in the relevant term
-  //   r = (z + 1/2) * log(t(z)) - t(z).
-  // Therefore, we compute this as
-  //   r = (z + 1/2 - t(z) / log(t(z))) * log(t(z)).
-  Value tDivLogT = rewriter.create<mhlo::DivOp>(loc, t, logT);
-  Value sum = rewriter.create<mhlo::SubtractOp>(
-      loc, rewriter.create<mhlo::AddOp>(loc, z, half), tDivLogT);
-  Value r = rewriter.create<mhlo::MulOp>(loc, sum, logT);
-
-  // Compute the final result (modulo reflection) as
-  //   lgamma(z + 1) = (log(2) + log(pi)) / 2 + r + log(a(z)).
-  Value logA = rewriter.create<mhlo::LogOp>(loc, a);
-  Value lgamma = rewriter.create<mhlo::AddOp>(
-      loc,
-      rewriter.create<mhlo::AddOp>(
-          loc,
-          getConstantLike(rewriter, loc, (std::log(2) + std::log(M_PI)) / 2, x),
-          r),
-      logA);
-
-  // Compute the reflected value for x < 0.5 as
-  //   lgamma(x) = log(pi) - lgamma(1-x) - log(abs(sin(pi * x))).
-  //
-  // The abs is needed because lgamma is the log of the absolute value of the
-  // gamma function.
-  //
-  // We have to be careful when computing the final term above. gamma(x) goes
-  // to +/-inf at every integer x < 0, and this is controlled by the sin(pi * x)
-  // term. The slope is large, so precision is particularly important.
-  //
-  // Because abs(sin(pi * x)) has period of 1 we can equivalently use
-  // abs(sin(pi * frac(x))) where frac(x) is the fractional part of x. This is
-  // more numerically accurate: It doesn't overflow to inf like pi * x would and
-  // if x is an integer it evaluates to exactly 0 which is important because we
-  // then take the log of this value, and log(0) is inf.
-  //
-  // We don't have a frac(x) primitive in HLO and computing it is tricky, but
-  // because abs(sin(pi * x)) = abs(sin(pi * abs(x))), it's good enough for our
-  // purposes to use abs(frac(x)) = abs(x) - floor(abs(x)).
-  //
-  // Furthermore, pi * abs(frac(x)) loses precision when abs(frac(x)) is close
-  // to 1. To remedy this, we can use the fact that sin(pi * x) in the domain
-  // [0, 1] is symmetric across the line Y=0.5.
-  //
-
-  // Convert values of abs_frac > 0.5 to (1 - abs_frac) to improve precision of
-  // pi * abs_frac for values of abs_frac close to 1.
-  Value abs = rewriter.create<mhlo::AbsOp>(loc, x);
-  Value absFrac = rewriter.create<mhlo::SubtractOp>(
-      loc, abs, rewriter.create<mhlo::FloorOp>(loc, abs));
-  Value reduceAbsFrac = rewriter.create<mhlo::CompareOp>(
-      loc, half, absFrac, mhlo::ComparisonDirection::LT);
-  absFrac = rewriter.create<mhlo::SelectOp>(
-      loc, reduceAbsFrac, rewriter.create<mhlo::SubtractOp>(loc, one, absFrac),
-      absFrac);
-
-  // Materialize reflection.
-  Value reflectionDenom = rewriter.create<mhlo::LogOp>(
-      loc,
-      rewriter.create<mhlo::SineOp>(
-          loc, rewriter.create<mhlo::MulOp>(
-                   loc, getConstantLike(rewriter, loc, M_PI, x), absFrac)));
-  Value lgammaReflection = rewriter.create<mhlo::SubtractOp>(
-      loc,
-      rewriter.create<mhlo::SubtractOp>(
-          loc, getConstantLike(rewriter, loc, std::log(M_PI), x),
-          reflectionDenom),
-      lgamma);
-
-  // Avoid computing -inf - inf, which is nan. If reflection_denom is +/-inf,
-  // then it "wins" and the result is +/-inf.
-  Value finiteReflectionDenom =
-      rewriter.create<mhlo::IsFiniteOp>(loc, reflectionDenom);
-  Value negReflectionDenom = rewriter.create<mhlo::NegOp>(loc, reflectionDenom);
-  lgammaReflection = rewriter.create<mhlo::SelectOp>(
-      loc, finiteReflectionDenom, lgammaReflection, negReflectionDenom);
-
-  // Select whether or not to rely on the reflection.
-  lgamma = rewriter.create<mhlo::SelectOp>(loc, needToReflect, lgammaReflection,
-                                           lgamma);
-
-  // Materialize +/-inf behavior as
-  //   lgamma(+/-inf) = +inf.
-  Value xIsInf = rewriter.create<chlo::IsInfOp>(loc, x);
-  return rewriter.create<mhlo::SelectOp>(
-      loc, xIsInf,
-      chlo::getConstantLikeInfValue(rewriter, loc, x, /*negative=*/false),
-      lgamma);
-}
-
-// Express `cosh` as
-//   cosh(x) = (e^x + e^-x) / 2
-//           = e^(x + log(1/2)) + e^(-x + log(1/2))
-//
-// The second formulation avoids overflowing when e^x = inf but (e^x)/2 is not.
-//
-// This incorrectly overflows to inf for two f32 input values, namely
-// +/-89.4159851, due to rounding error when computing x +/- log(1/2).  The
-// correct answer of 3.40281961e+38 (0x7f7fffec) is very close to max-float, so
-// we deem this acceptable.
-Value materializeCoshApproximation(ConversionPatternRewriter &rewriter,
-                                   Location loc, ValueRange operands) {
-  CoshOp::Adaptor transformed(operands);
-  Value x = transformed.getOperand();
-
-  Value logOneHalf =
-      rewriter.create<mhlo::LogOp>(loc, getConstantLike(rewriter, loc, 0.5, x));
-  Value expAdd = rewriter.create<mhlo::ExpOp>(
-      loc, rewriter.create<mhlo::AddOp>(loc, x, logOneHalf));
-  Value expSub = rewriter.create<mhlo::ExpOp>(
-      loc, rewriter.create<mhlo::SubtractOp>(loc, logOneHalf, x));
-  return rewriter.create<mhlo::AddOp>(loc, expAdd, expSub);
-}
-
-struct ConvertCoshOp : public OpConversionPattern<CoshOp> {
-  using OpConversionPattern<CoshOp>::OpConversionPattern;
-  LogicalResult matchAndRewrite(
-      CoshOp op, OpAdaptor adaptor,
-      ConversionPatternRewriter &rewriter) const override {
-    rewriter.replaceOp(
-        op, materializeWithUpcast(rewriter, op.getLoc(), adaptor.getOperands(),
-                                  rewriter.getF32Type(),
-                                  &materializeCoshApproximation));
-    return success();
-  }
-};
-
-// Compute the Digamma function using Lanczos' approximation from "A Precision
-// Approximation of the Gamma Function". SIAM Journal on Numerical Analysis
-// series B. Vol. 1:
-//   digamma(z + 1) = log(t(z)) + a'(z) / a(z) - kLanczosGamma / t(z)
-//   with   t(z) = z + kLanczosGamma + 1/2
-//          a(z) = kBaseLanczosCoeff
-//                   + sum(k = 1, n, kLanczosCoefficients[i] / (z + k))
-//          a'(z) = - sum(k = 1, n, kLanczosCoefficients[i] / (z + k) / (z + k))
-Value materializeDigamma(ConversionPatternRewriter &rewriter, Location loc,
-                         ValueRange args) {
-  // If the input is less than 0.5 use Euler's reflection formula.
-  //   digamma(x) = digamma(1 - x) - pi * cot(pi * x)
-  // Let z be
-  //   z = -x      if x < 1/2
-  //   z = x - 1   otheriwse
-  Value x = args.front();
-  Value half = getConstantLike(rewriter, loc, 0.5, x);
-  Value needToReflect = rewriter.create<mhlo::CompareOp>(
-      loc, x, half, mhlo::ComparisonDirection::LT);
-  Value negX = rewriter.create<mhlo::NegOp>(loc, x);
-  Value one = getConstantLike(rewriter, loc, 1, x);
-  Value xSubOne = rewriter.create<mhlo::SubtractOp>(loc, x, one);
-  Value z = rewriter.create<mhlo::SelectOp>(loc, needToReflect, negX, xSubOne);
-
-  // Materialize
-  //   a(z) = kBaseLanczosCoeff
-  //            + sum(k = 1, n, kLanczosCoefficients[i] / (z + k))
-  //   a'(z) = - sum(k = 1, n, kLanczosCoefficients[i] / (z + k) / (z + k))
-  Value zero = getConstantLike(rewriter, loc, 0.0, x);
-  Value a = getConstantLike(rewriter, loc, kBaseLanczosCoeff, x);
-  Value aPrime = zero;
-  for (int i = 0, end = kLanczosCoefficients.size(); i < end; ++i) {
-    Value coeff = getConstantLike(rewriter, loc, kLanczosCoefficients[i], x);
-    Value oneBasedIndex = getConstantLike(rewriter, loc, i + 1, x);
-    Value zTerm = rewriter.create<mhlo::AddOp>(loc, z, oneBasedIndex);
-    aPrime = rewriter.create<mhlo::SubtractOp>(
-        loc, aPrime,
-        rewriter.create<mhlo::DivOp>(
-            loc, coeff, rewriter.create<mhlo::MulOp>(loc, zTerm, zTerm)));
-    a = rewriter.create<mhlo::AddOp>(
-        loc, a, rewriter.create<mhlo::DivOp>(loc, coeff, zTerm));
-  }
-
-  // To improve accuracy on platforms with less-precise log implementations,
-  // compute log(kLanczosGamma + 1/2) at compile time and use log1p on the
-  // device.
-  // Materialize as
-  //   log(t) = log(kLanczosGamma + 1/2 + z)
-  //          = log(kLanczosGamma + 1/2) + log1p(z / (kLanczosGamma + 1/2)).
-  Value lanczosPlusHalf =
-      getConstantLike(rewriter, loc, kLanczosGamma + 0.5, x);
-  Value t = rewriter.create<mhlo::AddOp>(loc, lanczosPlusHalf, z);
-  Value logTerm =
-      getConstantLike(rewriter, loc, std::log(kLanczosGamma + 0.5), x);
-  Value log1pTerm = rewriter.create<mhlo::Log1pOp>(
-      loc, rewriter.create<mhlo::DivOp>(loc, z, lanczosPlusHalf));
-  Value logT = rewriter.create<mhlo::AddOp>(loc, logTerm, log1pTerm);
-
-  // Materialize the final result (modulo reflection) as
-  //   digamma(z + 1) = log(t(z)) + a'(z) / a(z) - kLanczosGamma / t(z).
-  Value aPrimeDivA = rewriter.create<mhlo::DivOp>(loc, aPrime, a);
-  Value lanczosGammaDivT = rewriter.create<mhlo::DivOp>(
-      loc, getConstantLike(rewriter, loc, kLanczosGamma, x), t);
-  Value digamma = rewriter.create<mhlo::SubtractOp>(
-      loc, rewriter.create<mhlo::AddOp>(loc, logT, aPrimeDivA),
-      lanczosGammaDivT);
-
-  // We need to be careful how we compute cot(pi * input) below: For
-  // near-integral arguments, pi * input can lose precision.
-  //
-  // Input is already known to be less than 0.5 (otherwise we don't have to
-  // reflect). We shift values smaller than -0.5 into the range [-0.5, 0.5] to
-  // increase precision of pi * x and the resulting cotangent.
-  Value reducedX = rewriter.create<mhlo::AddOp>(
-      loc, x,
-      rewriter.create<mhlo::AbsOp>(
-          loc, rewriter.create<mhlo::FloorOp>(
-                   loc, rewriter.create<mhlo::AddOp>(
-                            loc, x, getConstantLike(rewriter, loc, 0.5, x)))));
-
-  // Materialize reflection for inputs less than 0.5 as
-  //   digamma(x) = digamma(1 - x) - pi * cot(pi * x)
-  //              = digamma(1 - x) - pi * cos(pi * x) / sin(pi * x)
-  Value pi = getConstantLike(rewriter, loc, M_PI, x);
-  Value piMulReducedX = rewriter.create<mhlo::MulOp>(loc, pi, reducedX);
-  Value cos = rewriter.create<mhlo::CosineOp>(loc, piMulReducedX);
-  Value sin = rewriter.create<mhlo::SineOp>(loc, piMulReducedX);
-  Value reflection = rewriter.create<mhlo::SubtractOp>(
-      loc, digamma,
-      rewriter.create<mhlo::DivOp>(
-          loc, rewriter.create<mhlo::MulOp>(loc, pi, cos), sin));
-
-  // Select whether or not to rely on the reflection.
-  digamma =
-      rewriter.create<mhlo::SelectOp>(loc, needToReflect, reflection, digamma);
-
-  // Digamma has poles at negative integers and zero; return nan for those.
-  Value isLeZero = rewriter.create<mhlo::CompareOp>(
-      loc, x, zero, mhlo::ComparisonDirection::LE);
-  Value isInt = rewriter.create<mhlo::CompareOp>(
-      loc, x, rewriter.create<mhlo::FloorOp>(loc, x),
-      mhlo::ComparisonDirection::EQ);
-  Value isPole = rewriter.create<mhlo::AndOp>(loc, isLeZero, isInt);
-  return rewriter.create<mhlo::SelectOp>(
-      loc, isPole,
-      getConstantLike(rewriter, loc, std::numeric_limits<double>::quiet_NaN(),
-                      x),
-      digamma);
-}
-
-Value materializeZeta(ConversionPatternRewriter &rewriter, Location loc,
-                      ValueRange args) {
-  // Implementation ported from:
-  // https://github.com/openxla/xla/blob/7a067a7b88d2ffb15b1dc5e3c06f701a15f0391d/xla/client/lib/math.cc#L1912-L1917
-  // Reference: Johansson, Fredrik.
-  // "Rigorous high-precision computation of the Hurwitz zeta function and its
-  // derivatives." Numerical Algorithms 69.2 (2015): 253-270.
-  // https://arxiv.org/abs/1309.2877 - formula (5)
-  // Notation is more or less kept as a reference to the whitepaper.
-  assert(args.size() == 2);
-  Value x = args[0];
-  Value q = args[1];
-  static const std::array<double, 12> kZetaCoeffs{
-      -7.1661652561756670113e18,
-      1.8152105401943546773e17,
-      -4.5979787224074726105e15,
-      1.1646782814350067249e14,
-      -2.950130727918164224e12,
-      7.47242496e10,
-      -1.8924375803183791606e9,
-      47900160.0,
-      -1209600.0,
-      30240.0,
-      -720.0,
-      12.0,
-  };
-
-  // For speed we'll always use 9 iterations for the initial series estimate,
-  // and a 12 term expansion for the Euler-Maclaurin formula.
-  Value a = q;
-  Value zero = chlo::getConstantLike(rewriter, loc, 0.0, a);
-  Value negPower = zero;
-  Value negX = rewriter.create<mhlo::NegOp>(loc, x);
-  Value initialSum = rewriter.create<mhlo::PowOp>(loc, q, negX);
-  Value one = chlo::getConstantLike(rewriter, loc, 1.0, a);
-  for (int i = 0; i < 9; ++i) {
-    a = rewriter.create<mhlo::AddOp>(loc, a, one);
-    negPower = rewriter.create<mhlo::PowOp>(loc, a, negX);
-    initialSum = rewriter.create<mhlo::AddOp>(loc, initialSum, negPower);
-  }
-  a = rewriter.create<mhlo::AddOp>(loc, a, one);
-  negPower = rewriter.create<mhlo::PowOp>(loc, a, negX);
-  Value oneLikeX = chlo::getConstantLike(rewriter, loc, 1.0, x);
-  Value xMinusOne = rewriter.create<mhlo::SubtractOp>(loc, x, oneLikeX);
-  Value negPowerMulA = rewriter.create<mhlo::MulOp>(loc, negPower, a);
-  Value negPowerMulADivXMinusOne =
-      rewriter.create<mhlo::DivOp>(loc, negPowerMulA, xMinusOne);
-  Value s =
-      rewriter.create<mhlo::AddOp>(loc, initialSum, negPowerMulADivXMinusOne);
-  Value aInverseSquare = rewriter.create<mhlo::DivOp>(
-      loc, one, rewriter.create<mhlo::MulOp>(loc, a, a));
-
-  Value hornerSum = zero;
-  Value factor = one;
-  // Use Horner's rule for this.
-  // Note this differs from Cephes which does a 'naive' polynomial evaluation.
-  // Using Horner's rule allows to avoid some NaN's and Infs from happening,
-  // resulting in more numerically stable code.
-  for (int i = 0; i < 11; ++i) {
-    Value factorLhs = rewriter.create<mhlo::AddOp>(
-        loc, x, chlo::getConstantLike(rewriter, loc, 22 - 2 * i, x));
-    Value factorRhs = rewriter.create<mhlo::AddOp>(
-        loc, x, chlo::getConstantLike(rewriter, loc, 21 - 2 * i, x));
-    factor = rewriter.create<mhlo::MulOp>(loc, factorLhs, factorRhs);
-    hornerSum = rewriter.create<mhlo::MulOp>(
-        loc, factor,
-        rewriter.create<mhlo::MulOp>(
-            loc, aInverseSquare,
-            rewriter.create<mhlo::AddOp>(
-                loc, hornerSum,
-                chlo::getConstantLike(rewriter, loc, 1. / kZetaCoeffs[i], a))));
-  }
-  Value zeroPointFiveLikeNegPower =
-      chlo::getConstantLike(rewriter, loc, .5, negPower);
-  Value xDivA = rewriter.create<mhlo::DivOp>(loc, x, a);
-  s = rewriter.create<mhlo::AddOp>(
-      loc, s,
-      rewriter.create<mhlo::MulOp>(
-          loc, negPower,
-          rewriter.create<mhlo::AddOp>(
-              loc, zeroPointFiveLikeNegPower,
-              rewriter.create<mhlo::MulOp>(
-                  loc, xDivA,
-                  rewriter.create<mhlo::AddOp>(
-                      loc,
-                      chlo::getConstantLike(rewriter, loc, 1. / kZetaCoeffs[11],
-                                            a),
-                      hornerSum)))));
-
-  // Use the initial zeta sum without the correction term coming
-  // from Euler-Maclaurin if it is accurate enough.
-  Value absNegPower = rewriter.create<mhlo::AbsOp>(loc, negPower);
-  Value absInitialSum = rewriter.create<mhlo::AbsOp>(loc, initialSum);
-  Value output = rewriter.create<mhlo::SelectOp>(
-      loc,
-      rewriter.create<mhlo::CompareOp>(
-          loc, absNegPower,
-          rewriter.create<mhlo::MulOp>(
-              loc, absInitialSum,
-              chlo::getConstantLikeSmallestFiniteValue(rewriter, loc, a)),
-          mhlo::ComparisonDirection::LT),
-      initialSum, s);
-
-  // Function is not defined for x < 1.
-  Value nan = chlo::getConstantLike(
-      rewriter, loc, std::numeric_limits<double>::quiet_NaN(), x);
-  output = rewriter.create<mhlo::SelectOp>(
-      loc,
-      rewriter.create<mhlo::CompareOp>(loc, x, oneLikeX,
-                                       mhlo::ComparisonDirection::LT),
-      nan, output);
-
-  // For q <= 0, x must be an integer.
-  Value qLeZero = rewriter.create<mhlo::CompareOp>(
-      loc, q, zero, mhlo::ComparisonDirection::LE);
-  Value xNotInt = rewriter.create<mhlo::CompareOp>(
-      loc, x, rewriter.create<mhlo::FloorOp>(loc, x),
-      mhlo::ComparisonDirection::NE);
-  Value xDomainError = rewriter.create<mhlo::AndOp>(loc, qLeZero, xNotInt);
-  output = rewriter.create<mhlo::SelectOp>(loc, xDomainError, nan, output);
-
-  // For all integer q <= 0, zeta has a pole. The limit is only defined as
-  // +inf if x is and even integer.
-  Value inf = chlo::getConstantLike(rewriter, loc,
-                                    std::numeric_limits<double>::infinity(), x);
-  Value qIsInt = rewriter.create<mhlo::CompareOp>(
-      loc, q, rewriter.create<mhlo::FloorOp>(loc, q),
-      mhlo::ComparisonDirection::EQ);
-  Value atPole = rewriter.create<mhlo::AndOp>(loc, qLeZero, qIsInt);
-  Value two = chlo::getConstantLike(rewriter, loc, 2.0, x);
-  Value xIsInt = rewriter.create<mhlo::CompareOp>(
-      loc, x, rewriter.create<mhlo::FloorOp>(loc, x),
-      mhlo::ComparisonDirection::EQ);
-  Value xIsEven = rewriter.create<mhlo::CompareOp>(
-      loc, rewriter.create<mhlo::RemOp>(loc, x, two), zero,
-      mhlo::ComparisonDirection::EQ);
-  Value xIsEvenInt = rewriter.create<mhlo::AndOp>(loc, xIsInt, xIsEven);
-  output = rewriter.create<mhlo::SelectOp>(
-      loc, atPole, rewriter.create<mhlo::SelectOp>(loc, xIsEvenInt, inf, nan),
-      output);
-
-  // For x = 1, this is the harmonic series and diverges.
-  output = rewriter.create<mhlo::SelectOp>(
-      loc,
-      rewriter.create<mhlo::CompareOp>(loc, x, one,
-                                       mhlo::ComparisonDirection::EQ),
-      inf, output);
-
-  return output;
-}
-
-Value materializePolygamma(ConversionPatternRewriter &rewriter, Location loc,
-                           ValueRange args) {
-  PolygammaOp::Adaptor transformed(args);
-  Value n = transformed.getN();
-  Value x = transformed.getX();
-
-  // Handle integer n > 0.
-  Value one = getConstantLike(rewriter, loc, 1.0, x);
-  Value two = getConstantLike(rewriter, loc, 2.0, x);
-  Value sign = rewriter.create<mhlo::SubtractOp>(
-      loc,
-      rewriter.create<mhlo::MulOp>(loc, two,
-                                   rewriter.create<mhlo::RemOp>(loc, n, two)),
-      one);
-  Value nPlusOne = rewriter.create<mhlo::AddOp>(loc, n, one);
-  Value expLgammaNp1 = rewriter.create<mhlo::ExpOp>(
-      loc, rewriter.create<chlo::LgammaOp>(loc, nPlusOne));
-  Value zeta = rewriter.create<chlo::ZetaOp>(loc, nPlusOne, x);
-  Value result = rewriter.create<mhlo::MulOp>(
-      loc, rewriter.create<mhlo::MulOp>(loc, sign, expLgammaNp1), zeta);
-
-  // Handle n = 0.
-  Value zero = getConstantLike(rewriter, loc, 0.0, x);
-  Value nEqZero = rewriter.create<mhlo::CompareOp>(
-      loc, n, zero, mhlo::ComparisonDirection::EQ);
-  result = rewriter.create<mhlo::SelectOp>(
-      loc, nEqZero, rewriter.create<chlo::DigammaOp>(loc, x), result);
-
-  // Check that n is a natural number. Return nan, otherwise.
-  Value nonInt = rewriter.create<mhlo::CompareOp>(
-      loc, n, rewriter.create<mhlo::FloorOp>(loc, n),
-      mhlo::ComparisonDirection::NE);
-  Value negative = rewriter.create<mhlo::CompareOp>(
-      loc, n, zero, mhlo::ComparisonDirection::LT);
-  Value nonNatural = rewriter.create<mhlo::OrOp>(loc, nonInt, negative);
-  return rewriter.create<mhlo::SelectOp>(
-      loc, nonNatural,
-      getConstantLike(rewriter, loc, std::numeric_limits<double>::quiet_NaN(),
-                      x),
-      result);
-}
-
-struct ConvertLgammaOp : public OpConversionPattern<LgammaOp> {
-  using OpConversionPattern<LgammaOp>::OpConversionPattern;
-  LogicalResult matchAndRewrite(
-      LgammaOp op, OpAdaptor adaptor,
-      ConversionPatternRewriter &rewriter) const override {
-    FloatType minPrecisionTy = rewriter.getF32Type();
-    rewriter.replaceOp(
-        op, materializeWithUpcast(rewriter, op.getLoc(), adaptor.getOperands(),
-                                  minPrecisionTy, &materializeLgamma));
-    return success();
-  }
-};
-
-struct ConvertDigammaOp : public OpConversionPattern<DigammaOp> {
-  using OpConversionPattern<DigammaOp>::OpConversionPattern;
-  LogicalResult matchAndRewrite(
-      DigammaOp op, OpAdaptor adaptor,
-      ConversionPatternRewriter &rewriter) const override {
-    FloatType minPrecisionTy = rewriter.getF32Type();
-    rewriter.replaceOp(
-        op, materializeWithUpcast(rewriter, op.getLoc(), adaptor.getOperands(),
-                                  minPrecisionTy, &materializeDigamma));
-    return success();
-  }
-};
-
-Value materializeNextAfter(ConversionPatternRewriter &rewriter, Location loc,
-                           ValueRange operands) {
-  NextAfterOp::Adaptor transformed(operands);
-  Value x = transformed.getX();
-  Value y = transformed.getY();
-  auto resultTy = x.getType().cast<ShapedType>();
-  auto bitwidth = resultTy.getElementType().getIntOrFloatBitWidth();
-  ImplicitLocOpBuilder b(loc, rewriter);
-  auto intTy = resultTy.clone(b.getIntegerType(bitwidth));
-  auto xAsInt = b.create<mhlo::BitcastConvertOp>(intTy, x);
-  auto yAsInt = b.create<mhlo::BitcastConvertOp>(intTy, y);
-
-  // The result is NaN if either "x" or "y" are NaN.
-  auto xIsNan = b.create<mhlo::CompareOp>(x, x, mhlo::ComparisonDirection::NE);
-  auto yIsNan = b.create<mhlo::CompareOp>(y, y, mhlo::ComparisonDirection::NE);
-  auto nanInput = b.create<mhlo::OrOp>(xIsNan, yIsNan);
-  auto resultForNan = getConstantLike(
-      rewriter, loc, std::numeric_limits<double>::quiet_NaN(), x);
-  auto resultForNanAsInt =
-      b.create<mhlo::BitcastConvertOp>(intTy, resultForNan);
-
-  // The sign bit is the MSB.
-  const int64_t signBit = int64_t{1} << (bitwidth - 1);
-  // Discard the sign bit to make the result non-negative.
-  auto signMask = getConstantLike(rewriter, loc, signBit, xAsInt);
-  auto negatedSignMask = getConstantLike(rewriter, loc, ~signBit, xAsInt);
-  auto xAbs = b.create<mhlo::AndOp>(xAsInt, negatedSignMask);
-  auto yAbs = b.create<mhlo::AndOp>(yAsInt, negatedSignMask);
-
-  // When both "x" and "y" are equal, the result is "y".
-  auto xAndYAreEqual =
-      b.create<mhlo::CompareOp>(x, y, mhlo::ComparisonDirection::EQ);
-  auto resultForEqual = yAsInt;
-
-  // When both "x" and "y" are 0, the result is "y". This is a separate case
-  // from above because "x" and "y" might have a different sign.
-  auto zero = getConstantLike(rewriter, loc, 0, xAsInt);
-  auto xIsZero =
-      b.create<mhlo::CompareOp>(xAbs, zero, mhlo::ComparisonDirection::EQ);
-  auto yIsZero =
-      b.create<mhlo::CompareOp>(yAbs, zero, mhlo::ComparisonDirection::EQ);
-  auto resultForBothZero = yAsInt;
-
-  auto xSign = b.create<mhlo::AndOp>(xAsInt, signMask);
-  auto ySign = b.create<mhlo::AndOp>(yAsInt, signMask);
-
-  // If from == 0 && to != 0, we need to return the smallest subnormal number
-  // signed like "to".
-  auto one = getConstantLike(rewriter, loc, 1, xAsInt);
-  auto resultForXZeroYNonZero = b.create<mhlo::OrOp>(ySign, one);
-
-  // If the sign of "x" and "y" disagree:
-  // - we need to make the magnitude of "from" smaller so that it is closer to
-  //   zero.
-  //
-  // Otherwise the signs agree:
-  // - "x" with a magnitude larger than "y" means we need to make the magnitude
-  //   smaller.
-  // - "x" with a magnitude smaller than "y" means we need to make the magnitude
-  //   larger.
-  auto signsDisagree =
-      b.create<mhlo::CompareOp>(xSign, ySign, mhlo::ComparisonDirection::NE);
-  auto xMagnitudeLargerThanY =
-      b.create<mhlo::CompareOp>(xAbs, yAbs, mhlo::ComparisonDirection::GT);
-  auto resultHasSmallerMagnitude =
-      b.create<mhlo::OrOp>(xMagnitudeLargerThanY, signsDisagree);
-  auto minusOne = getConstantLike(rewriter, loc, -1, xAsInt);
-  auto magnitudeAdjustment =
-      b.create<mhlo::SelectOp>(resultHasSmallerMagnitude, minusOne, one);
-  Value result = b.create<mhlo::AddOp>(xAsInt, magnitudeAdjustment);
-  // Handle from == +-0.
-  result = b.create<mhlo::SelectOp>(
-      xIsZero,
-      b.create<mhlo::SelectOp>(yIsZero, resultForBothZero,
-                               resultForXZeroYNonZero),
-      result);
-  // Handle from == to.
-  result = b.create<mhlo::SelectOp>(xAndYAreEqual, resultForEqual, result);
-  // Handle isnan(x) || isnan(y).
-  result = b.create<mhlo::SelectOp>(nanInput, resultForNanAsInt, result);
-
-  // Cast back to the original type.
-  return b.create<mhlo::BitcastConvertOp>(resultTy, result);
-}
-
-struct ConvertNextAfterOp : public OpConversionPattern<NextAfterOp> {
-  using OpConversionPattern<NextAfterOp>::OpConversionPattern;
-  LogicalResult matchAndRewrite(
-      NextAfterOp op, OpAdaptor adaptor,
-      ConversionPatternRewriter &rewriter) const override {
-    rewriter.replaceOp(
-        op, materializeNextAfter(rewriter, op.getLoc(), adaptor.getOperands()));
-    return success();
-  }
-};
-
-struct ConvertPolygammaOp : public OpConversionPattern<PolygammaOp> {
-  using OpConversionPattern<PolygammaOp>::OpConversionPattern;
-  LogicalResult matchAndRewrite(
-      PolygammaOp op, OpAdaptor adaptor,
-      ConversionPatternRewriter &rewriter) const override {
-    Location loc = op.getLoc();
-    FloatType minPrecisionTy = rewriter.getF32Type();
-    rewriter.replaceOp(
-        op, materializeWithUpcast(rewriter, loc, adaptor.getOperands(),
-                                  minPrecisionTy, &materializePolygamma));
-    return success();
-  }
-};
-
-// Sinh(x) = (e^x - e^-x) / 2
-//         = e^(x + log(1/2)) - e^(-x + log(1/2)).
-//
-// The second formulation avoids overflowing when e^x = inf but (e^x)/2 is not
-// inf.
-//
-// This incorrectly overflows to +/-inf for two f32 input values, namely
-// +/-89.4159851, due to rounding error when computing x +/- log(1/2).  The
-// correct answer of 3.40281961e+38 (0x7f7fffec) is very close to max-float, so
-// we deem this acceptable.
-Value materializeSinhApproximationForLargeX(ConversionPatternRewriter &rewriter,
-                                            Location loc, ValueRange operands) {
-  SinhOp::Adaptor transformed(operands);
-  Value x = transformed.getOperand();
-
-  Value logOneHalf =
-      rewriter.create<mhlo::LogOp>(loc, getConstantLike(rewriter, loc, 0.5, x));
-  Value expAdd = rewriter.create<mhlo::ExpOp>(
-      loc, rewriter.create<mhlo::AddOp>(loc, x, logOneHalf));
-  Value expSub = rewriter.create<mhlo::ExpOp>(
-      loc, rewriter.create<mhlo::SubtractOp>(loc, logOneHalf, x));
-  return rewriter.create<mhlo::SubtractOp>(loc, expAdd, expSub);
-}
-
-// Express `sinh` as
-//   sinh(x) = (e^x - e^-x) / 2                     if |x| < 1
-//           = e^(x + log(1/2)) - e^(-x + log(1/2)) otherwise.
-Value materializeSinhApproximation(ConversionPatternRewriter &rewriter,
-                                   Location loc, ValueRange operands) {
-  Value largeSinhResult =
-      materializeSinhApproximationForLargeX(rewriter, loc, operands);
-
-  SinhOp::Adaptor transformed(operands);
-  Value x = transformed.getOperand();
-
-  // For smaller x, we get unwanted cancellations of e^x - e^-x, resulting in
-  // 0.
-  // Rewrite this to avoid that. We use expm1(x) because that preserves the
-  // first order term of the taylor series of e^x.
-  // (e^(x) - e^(-x)) / 2. =
-  // (e^(x) - 1 + 1 - e^(-x)) / 2.
-  // (expm1(x) + (e^(x) - 1) / e^x) / 2.
-  // (expm1(x) + expm1(x) / (expm1(x) + 1)) / 2.
-  Value expm1 = rewriter.create<mhlo::Expm1Op>(loc, x);
-  Value one = getConstantLike(rewriter, loc, 1.0, x);
-  Value oneHalf = getConstantLike(rewriter, loc, 0.5, x);
-  Value expm1PlusOne = rewriter.create<mhlo::AddOp>(loc, expm1, one);
-  Value ratio = rewriter.create<mhlo::DivOp>(loc, expm1, expm1PlusOne);
-  Value sum = rewriter.create<mhlo::AddOp>(loc, expm1, ratio);
-  Value smallSinhResult = rewriter.create<mhlo::MulOp>(loc, oneHalf, sum);
-
-  Value absX = rewriter.create<mhlo::AbsOp>(loc, x);
-  Value absXLtOne = rewriter.create<mhlo::CompareOp>(
-      loc, absX, one, mhlo::ComparisonDirection::LT);
-  return rewriter.create<mhlo::SelectOp>(loc, absXLtOne, smallSinhResult,
-                                         largeSinhResult);
-}
-
-struct ConvertSinhOp : public OpConversionPattern<SinhOp> {
-  using OpConversionPattern<SinhOp>::OpConversionPattern;
-  LogicalResult matchAndRewrite(
-      SinhOp op, OpAdaptor adaptor,
-      ConversionPatternRewriter &rewriter) const override {
-    Value x = adaptor.getOperand();
-    if (x.getType().cast<ShapedType>().getElementType().isa<ComplexType>()) {
-      rewriter.replaceOp(op, materializeSinhApproximationForLargeX(
-                                 rewriter, op.getLoc(), adaptor.getOperands()));
-      return success();
-    }
-    rewriter.replaceOp(
-        op, materializeWithUpcast(rewriter, op.getLoc(), adaptor.getOperands(),
-                                  rewriter.getF32Type(),
-                                  &materializeSinhApproximation));
-    return success();
-  }
-};
-
-// Converts chlo.top_k to MHLO iota, sort, and slice ops.
-//
-// chlo.top_k sorts along last dimension of the input tensor and then returns
-// the top K components' values and indices. This is translated into a few
-// ops in MHLO: first generating an integer sequence for the indices,
-// then sort both the original input tensor and the indices togheter, and
-// at last slice out the top K components.
-//
-// For example, for the following IR:
-//
-// %0:2 = "chlo.top_k"(%input, k=8): tensor<16x16xf32> ->
-//                                   (tensor<16x8xf32>, tensor<16x8xi32>)
-//
-// We will get:
-//
-// %1 = "mhlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<16x16xi32>
-// %2 = "mhlo.sort"(%input, %1) ({
-// ^bb0(%arg1: tensor<f32>, %arg2: tensor<f32>,
-//      %arg3: tensor<i32>, %arg4: tensor<i32>):
-//   %7 = "mhlo.compare"(%arg1, %arg2) {comparison_direction = "GT"}: ...
-//   "mhlo.return"(%7) : (tensor<i1>) -> ()
-// }) {dimension = 1 : i64, is_stable = true} : ...
-// %3 = "mhlo.get_tuple_element"(%2) {index = 0 : i32} : ...
-// %4 = "mhlo.get_tuple_element"(%2) {index = 1 : i32} : ...
-// %5 = "mhlo.slice"(%3) {limit_indices = dense<[16, 8]> : tensor<2xi64>,
-//                           start_indices dense<0> : tensor<2xi64>,
-//                           strides = dense<1> : tensor<2xi64>} :
-//                              (tensor<16x16xf32>) -> tensor<16x8xf32>
-// %6 = "mhlo.slice"(%4) ...
-//
-struct BasisConvertTopKOp : public OpConversionPattern<TopKOp> {
-  using OpConversionPattern<TopKOp>::OpConversionPattern;
-  LogicalResult matchAndRewrite(
-      TopKOp op, OpAdaptor /*adaptor*/,
-      ConversionPatternRewriter &rewriter) const override {
-    auto operandType = op.getOperand().getType().dyn_cast<RankedTensorType>();
-    if (!operandType) return failure();
-    int64_t operandRank = operandType.getRank();
-    int64_t lastDimIndex = operandRank - 1;
-    int64_t lastDimSize = operandType.getDimSize(lastDimIndex);
-    int64_t lastDimResultSize =
-        hlo::isDynamicDimSize(lastDimSize)
-            ? static_cast<int64_t>(op.getK())
-            : std::min(static_cast<int64_t>(op.getK()), lastDimSize);
-    int64_t isDynamic = !operandType.hasStaticShape();
-    auto i32Type = rewriter.getIntegerType(32);
-    Value opShapeValue, resultShapeValue;
-    if (isDynamic) {
-      SmallVector<Value> sizesI32x1;
-      for (auto i = 0; i < operandType.getRank(); ++i) {
-        auto sizeI32 = rewriter.create<mhlo::GetDimensionSizeOp>(
-            op.getLoc(), op.getOperand(), i);
-        auto sizeI32x1 = rewriter.create<mhlo::ReshapeOp>(
-            op.getLoc(), RankedTensorType::get({1}, i32Type), sizeI32);
-        sizesI32x1.push_back(sizeI32x1);
-      }
-      opShapeValue =
-          rewriter.create<mhlo::ConcatenateOp>(op.getLoc(), sizesI32x1,
-                                               /*dimension=*/0);
-      auto lastDimI32 = rewriter.create<mhlo::ConstantOp>(
-          op.getLoc(),
-          rewriter.getI32IntegerAttr(static_cast<int32_t>(lastDimResultSize)));
-      auto lastDimI32x1 = rewriter.create<mhlo::ReshapeOp>(
-          op.getLoc(), RankedTensorType::get({1}, i32Type), lastDimI32);
-      sizesI32x1.back() = lastDimI32x1;
-      resultShapeValue =
-          rewriter.create<mhlo::ConcatenateOp>(op.getLoc(), sizesI32x1,
-                                               /*dimension=*/0);
-    }
-
-    // Create an Iota op for indices.
-    Type iotaType = RankedTensorType::get(operandType.getShape(), i32Type);
-    Value iotaOp;
-    if (isDynamic) {
-      iotaOp = rewriter.create<mhlo::DynamicIotaOp>(
-          op.getLoc(), iotaType, opShapeValue,
-          rewriter.getI64IntegerAttr(lastDimIndex));
-    } else {
-      iotaOp = rewriter.create<mhlo::IotaOp>(
-          op.getLoc(), iotaType, rewriter.getI64IntegerAttr(lastDimIndex));
-    }
-
-    // Create the sort op. It takes two inputs, one for the original input, the
-    // other for the indices. Use TOTALORDER comparison type instead of the
-    // default comparison if the element type is of type float.
-    Type elementType = operandType.getElementType();
-    auto sortOp =
-        createSortOp(&rewriter, op.getLoc(), {op.getOperand(), iotaOp},
-                     {elementType, i32Type}, lastDimIndex,
-                     /*isStable=*/true,
-                     /*direction=*/mhlo::ComparisonDirection::GT);
-
-    // Get the sorted input and index tuple element.
-    auto tupleFirstElement = sortOp.getResult(0);
-    auto tupleSecondElement = sortOp.getResult(1);
-
-    SmallVector<int64_t, 4> beginIndices(operandRank, 0);
-    auto endIndices = llvm::to_vector<4>(operandType.getShape());
-    endIndices.back() = lastDimResultSize;
-    SmallVector<int64_t, 4> strides(operandRank, 1);
-
-    // Get the slice for the top K elements.
-    auto indicesTy = RankedTensorType::get(operandRank, rewriter.getI64Type());
-    Value values, indices;
-    if (isDynamic) {
-      Value startIndices = rewriter.create<mhlo::ConstantOp>(
-          op.getLoc(), DenseIntElementsAttr::get(indicesTy, beginIndices));
-      Value lastIndices = rewriter.create<mhlo::ConvertOp>(
-          op.getLoc(), resultShapeValue, rewriter.getI64Type());
-      Value stridesOp = rewriter.create<mhlo::ConstantOp>(
-          op.getLoc(), DenseIntElementsAttr::get(indicesTy, strides));
-
-      SmallVector<int64_t, 4> resultShape =
-          llvm::to_vector<4>(operandType.getShape());
-      resultShape.back() = lastDimResultSize;
-      RankedTensorType resultType = RankedTensorType::get(
-          resultShape, elementType, operandType.getEncoding());
-      RankedTensorType indexResultType =
-          RankedTensorType::get(resultShape, i32Type);
-
-      values = rewriter.create<mhlo::RealDynamicSliceOp>(
-          op.getLoc(), resultType, tupleFirstElement, startIndices, lastIndices,
-          stridesOp);
-      indices = rewriter.create<mhlo::RealDynamicSliceOp>(
-          op.getLoc(), indexResultType, tupleSecondElement, startIndices,
-          lastIndices, stridesOp);
-    } else {
-      values = rewriter.create<mhlo::SliceOp>(
-          op.getLoc(), tupleFirstElement,
-          DenseIntElementsAttr::get(indicesTy, beginIndices),
-          DenseIntElementsAttr::get(indicesTy, endIndices),
-          DenseIntElementsAttr::get(indicesTy, strides));
-      indices = rewriter.create<mhlo::SliceOp>(
-          op.getLoc(), tupleSecondElement,
-          DenseIntElementsAttr::get(indicesTy, beginIndices),
-          DenseIntElementsAttr::get(indicesTy, endIndices),
-          DenseIntElementsAttr::get(indicesTy, strides));
-    }
-
-    rewriter.replaceOp(op, {values, indices});
-    return success();
-  }
-};
-
-struct ConvertZetaOp : public OpConversionPattern<ZetaOp> {
-  using OpConversionPattern<ZetaOp>::OpConversionPattern;
-  LogicalResult matchAndRewrite(
-      ZetaOp op, OpAdaptor adaptor,
-      ConversionPatternRewriter &rewriter) const override {
-    Location loc = op.getLoc();
-    FloatType minPrecisionTy = rewriter.getF32Type();
-    rewriter.replaceOp(
-        op, materializeWithUpcast(rewriter, loc, adaptor.getOperands(),
-                                  minPrecisionTy, &materializeZeta));
-    return success();
-  }
-};
-
-struct ConvertSelectOp : public OpConversionPattern<BroadcastSelectOp> {
-  using OpConversionPattern<BroadcastSelectOp>::OpConversionPattern;
-  LogicalResult matchAndRewrite(
-      BroadcastSelectOp op, OpAdaptor adaptor,
-      ConversionPatternRewriter &rewriter) const override {
-    // Only support ranked operands.
-    Value pred = adaptor.getPred();
-    Value onTrue = adaptor.getOnTrue();
-    Value onFalse = adaptor.getOnFalse();
-    auto predType = pred.getType().dyn_cast<RankedTensorType>();
-    auto onTrueType = onTrue.getType().dyn_cast<RankedTensorType>();
-    auto onFalseType = onFalse.getType().dyn_cast<RankedTensorType>();
-    auto resultType = op.getResult().getType().dyn_cast<RankedTensorType>();
-    if (!predType || !onTrueType || !onFalseType || !resultType) {
-      return failure();
-    }
-
-    auto loc = op.getLoc();
-
-    Value predShape = rewriter.createOrFold<shape::ShapeOfOp>(loc, pred);
-    Value onTrueShape = rewriter.createOrFold<shape::ShapeOfOp>(loc, onTrue);
-    Value onFalseShape = rewriter.createOrFold<shape::ShapeOfOp>(loc, onFalse);
-    int64_t resultRank = std::max(
-        {predType.getRank(), onTrueType.getRank(), onFalseType.getRank()});
-
-    Value broadcastableCstr = rewriter.createOrFold<shape::CstrBroadcastableOp>(
-        loc, ValueRange{predShape, onTrueShape, onFalseShape});
-    auto assumingOp = rewriter.create<shape::AssumingOp>(
-        loc, ArrayRef<Type>{resultType}, broadcastableCstr);
-
-    OpBuilder::InsertionGuard guard(rewriter);
-    rewriter.createBlock(&assumingOp.getDoRegion());
-
-    Value resultExtents = rewriter.createOrFold<shape::BroadcastOp>(
-        loc, shape::getExtentTensorType(op.getContext()),
-        ValueRange{predShape, onTrueShape, onFalseShape},
-        /*error=*/nullptr);
-    auto shapeType =
-        RankedTensorType::get({resultRank}, rewriter.getIndexType());
-    resultExtents =
-        rewriter.createOrFold<tensor::CastOp>(loc, shapeType, resultExtents);
-
-    Value broadcastedPred = pred;
-    // Pred has an implicit broadcast for scalars, so use that when convenient.
-    if (predType.getRank() > 0) {
-      auto predBroadcastDimensions = llvm::to_vector<4>(
-          llvm::seq<int64_t>(resultRank - predType.getRank(), resultRank));
-      broadcastedPred = rewriter.create<mhlo::DynamicBroadcastInDimOp>(
-          loc,
-          RankedTensorType::get(resultType.getShape(),
-                                predType.getElementType()),
-          pred, resultExtents,
-          rewriter.getI64TensorAttr(predBroadcastDimensions));
-    }
-    auto onTrueBroadcastDimensions = llvm::to_vector<4>(
-        llvm::seq<int64_t>(resultRank - onTrueType.getRank(), resultRank));
-    Value broadcastedOnTrue = rewriter.create<mhlo::DynamicBroadcastInDimOp>(
-        loc,
-        RankedTensorType::get(resultType.getShape(),
-                              onTrueType.getElementType()),
-        onTrue, resultExtents,
-        rewriter.getI64TensorAttr(onTrueBroadcastDimensions));
-    auto onFalseBroadcastDimensions = llvm::to_vector<4>(
-        llvm::seq<int64_t>(resultRank - onFalseType.getRank(), resultRank));
-    Value broadcastedOnFalse = rewriter.create<mhlo::DynamicBroadcastInDimOp>(
-        loc,
-        RankedTensorType::get(resultType.getShape(),
-                              onFalseType.getElementType()),
-        onFalse, resultExtents,
-        rewriter.getI64TensorAttr(onFalseBroadcastDimensions));
-
-    // And generate the final non-broadcasted ternary op.
-    Value finalResult =
-        rewriter.create<mhlo::SelectOp>(loc, resultType, broadcastedPred,
-                                        broadcastedOnTrue, broadcastedOnFalse);
-    rewriter.create<shape::AssumingYieldOp>(loc, finalResult);
-    rewriter.replaceOp(op, {assumingOp.getResult(0)});
-    return success();
-  }
-};
-
-// Converts binary ops that statically are determined to not broadcast directly
-// to the corresponding mhlo non-broadcasting op.
-template <typename ChloOpTy, typename HloOpTy, typename Adaptor>
-struct ConvertTrivialNonBroadcastBinaryOp
-    : public OpConversionPattern<ChloOpTy> {
-  using OpConversionPattern<ChloOpTy>::OpConversionPattern;
-  LogicalResult matchAndRewrite(
-      ChloOpTy op, typename ChloOpTy::Adaptor adaptor,
-      ConversionPatternRewriter &rewriter) const override {
-    // Only rewrite for statically determinable non-broadcasting cases.
-    auto lhsType =
-        adaptor.getLhs().getType().template dyn_cast<RankedTensorType>();
-    auto rhsType =
-        adaptor.getRhs().getType().template dyn_cast<RankedTensorType>();
-    if (!lhsType || !rhsType) return failure();
-
-    // Requires rank broadcast.
-    if (lhsType.getRank() != rhsType.getRank()) return failure();
-    // Any dynamic dimension may require broadcasting and requires more
-    // analysis.
-    if (!lhsType.hasStaticShape() || !rhsType.hasStaticShape())
-      return failure();
-
-    for (auto extents : llvm::zip(lhsType.getShape(), rhsType.getShape())) {
-      auto lhsExtent = std::get<0>(extents);
-      auto rhsExtent = std::get<1>(extents);
-      if (lhsExtent != rhsExtent) {
-        return failure();
-      }
-    }
-
-    rewriter.replaceOp(op, Adaptor::createOp(op, op.getResult().getType(),
-                                             adaptor.getOperands(), rewriter));
-    return success();
-  }
-};
-
-// Converts a binary op with ranked broadcasting operands to explicitly
-// broadcast and invoke the corresponding mhlo non-broadcasting op.
-// Note that dynamic broadcasting supported by this pattern is only valid for
-// "numpy" broadcasting semantics as defined here:
-//   https://docs.scipy.org/doc/numpy/reference/ufuncs.html
-// Specifically, this includes the following cases:
-//   - Same rank broadcast (operands have the same static rank).
-//   - Different-rank broadcast, either without a broadcast_dims attribte or
-//     with the broadcast_dims attribute set to map to a prefix padding.
-//   - Legal combinations of degenerate (1-dim) implicit broadcasting.
-// The restriction on broadcast_dims derives from the definition of the
-// `shape.broadcast` op, which only supports prefix-padding.
-template <typename ChloOpTy, typename HloOpTy, typename Adaptor>
-struct ConvertRankedDynamicBroadcastBinaryOp
-    : public OpConversionPattern<ChloOpTy> {
-  using OpConversionPattern<ChloOpTy>::OpConversionPattern;
-  LogicalResult matchAndRewrite(
-      ChloOpTy op, typename ChloOpTy::Adaptor adaptor,
-      ConversionPatternRewriter &rewriter) const override {
-    // Only support ranked operands.
-    Value lhs = adaptor.getLhs();
-    Value rhs = adaptor.getRhs();
-    auto lhsType = lhs.getType().dyn_cast<RankedTensorType>();
-    auto rhsType = rhs.getType().dyn_cast<RankedTensorType>();
-    auto resultType =
-        op.getResult().getType().template dyn_cast<RankedTensorType>();
-    if (!lhsType || !rhsType || !resultType) return failure();
-
-    // Check for "numpy"-style rank broadcast.
-    auto broadcastDimensions = op.getBroadcastDimensions();
-    if (broadcastDimensions &&
-        !hlo::isLegalNumpyRankedBroadcast(lhs, rhs, *broadcastDimensions)) {
-      // Note: It is unclear whether the general specification of explicit
-      // broadcast_dimensions on binary ops is a feature we want to carry
-      // forward. While it can technically be implemented for ranked-dynamic,
-      // it is incompatible with unranked inputs. If this warning is emitted
-      // in real programs, it is an indication that the feature should be
-      // implemented versus just falling back on the more standard definition
-      // of numpy-like prefix-padding.
-      op.emitWarning() << "unsupported non prefix-padded dynamic rank "
-                       << "broadcast_dimensions = " << *broadcastDimensions;
-      return failure();
-    }
-
-    // Compute result shape.
-    auto loc = op.getLoc();
-
-    // Insert a constraint on the shapes being broadcastable and insert all
-    // future code into an assuming block reliant on the constraint.
-    Value lhsShape = rewriter.create<shape::ShapeOfOp>(loc, lhs);
-    Value rhsShape = rewriter.create<shape::ShapeOfOp>(loc, rhs);
-    auto broadcastableCstr =
-        rewriter.create<shape::CstrBroadcastableOp>(loc, lhsShape, rhsShape);
-    auto assumingOp = rewriter.create<shape::AssumingOp>(
-        loc, ArrayRef<Type>{resultType}, broadcastableCstr.getResult());
-
-    OpBuilder::InsertionGuard guard(rewriter);
-    rewriter.createBlock(&assumingOp.getDoRegion());
-
-    int64_t resultRank = std::max(lhsType.getRank(), rhsType.getRank());
-    Value resultExtents =
-        hlo::computeBinaryElementwiseBroadcastingResultExtents(loc, lhs, rhs,
-                                                               rewriter);
-
-    // Note that we unconditionally emit DynamicBroadcastInDim ops and let
-    // downstream canonicalizations fold them away if possible. This is
-    // because, in the dynamic case, there are many corner cases regarding
-    // when it is safe to omit, and some of them require analysis to prove
-    // properly.
-    auto lhsBroadcastDimensions = llvm::to_vector<4>(
-        llvm::seq<int64_t>(resultRank - lhsType.getRank(), resultRank));
-    Value broadcastedLhs = rewriter.create<mhlo::DynamicBroadcastInDimOp>(
-        loc,
-        RankedTensorType::get(resultType.getShape(), lhsType.getElementType()),
-        lhs, resultExtents, rewriter.getI64TensorAttr(lhsBroadcastDimensions));
-    auto rhsBroadcastDimensions = llvm::to_vector<4>(
-        llvm::seq<int64_t>(resultRank - rhsType.getRank(), resultRank));
-    Value broadcastedRhs = rewriter.create<mhlo::DynamicBroadcastInDimOp>(
-        loc,
-        RankedTensorType::get(resultType.getShape(), rhsType.getElementType()),
-        rhs, resultExtents, rewriter.getI64TensorAttr(rhsBroadcastDimensions));
-
-    // And generate the final non-broadcasted binary op.
-    Value finalResult = Adaptor::createOp(
-        op, resultType, {broadcastedLhs, broadcastedRhs}, rewriter);
-    rewriter.create<shape::AssumingYieldOp>(loc, finalResult);
-    rewriter.replaceOp(op, {assumingOp.getResult(0)});
-    return success();
-  }
-};
-
-class ConvertDynamicReshapeOp
-    : public OpRewritePattern<chlo::DynamicReshapeOp> {
- public:
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(chlo::DynamicReshapeOp op,
-                                PatternRewriter &rewriter) const override {
-    auto loc = op.getLoc();
-    auto tensor = op.getOperand();
-    auto shape = op.getOutputShape();
-
-    auto shapeTy = shape.getType().cast<ShapedType>();
-    auto resultTy = op.getType().cast<ShapedType>();
-
-    Value inputShape = rewriter.create<shape::ShapeOfOp>(loc, tensor);
-    Value numEls = rewriter.create<shape::NumElementsOp>(loc, inputShape);
-    Value cstr = rewriter.create<mhlo::CstrReshapableOp>(loc, numEls, shape);
-    rewriter.replaceOpWithNewOp<shape::AssumingOp>(
-        op, cstr, [&](OpBuilder &b, Location l) {
-          Value computedShape =
-              b.create<mhlo::ComputeReshapeShapeOp>(l, shapeTy, numEls, shape);
-          SmallVector<Value> result;
-          result.push_back(b.create<mhlo::DynamicReshapeOp>(l, resultTy, tensor,
-                                                            computedShape));
-          return result;
-        });
-
-    return success();
-  }
-};
-
-#include "chlo_legalize_to_hlo/generated_chlo_legalize_to_hlo.inc"
-}  // namespace
-
-void populateChloBroadcastingPatterns(MLIRContext *context,
-                                      RewritePatternSet *patterns) {
-  // Instantiate conversion templates for conforming binary elementwise ops
-  // that do not have different dtypes between operands and results and do
-  // not have special attributes that need to be preserved.
-  populateForBroadcastingBinaryOp<ConvertTrivialNonBroadcastBinaryOp>(
-      context, patterns, 10);
-  populateForBroadcastingBinaryOp<ConvertRankedDynamicBroadcastBinaryOp>(
-      context, patterns, 5);
-  patterns
-      ->add<ConvertConstantLikeOp, ConvertDynamicReshapeOp, ConvertSelectOp>(
-          context);
-}
-
-void populateChloLegalizeToHloBasisOpsPatterns(MLIRContext *context,
-                                               RewritePatternSet *patterns) {
-  // Patterns that decompose to a basis set of HLOs
-  // These are guaranteed to be convertible to StableHLO, but discard some
-  // higher level information that is useful to XLA compilation.
-  patterns->add<BasisConvertErfOp, BasisConvertTopKOp>(context);
-}
-
-void populateDecomposeChloPatterns(MLIRContext *context,
-                                   RewritePatternSet *patterns) {
-  populateWithGenerated(*patterns);
-
-  // Other patterns.
-  // clang-format off
-  patterns->add<ConvertBesselI1eOp,
-                   ConvertCoshOp,
-                   ConvertDigammaOp,
-                   ConvertErfcOp,
-                   ConvertErfInvOp,
-                   ConvertLgammaOp,
-                   ConvertNextAfterOp,
-                   ConvertPolygammaOp,
-                   ConvertSinhOp,
-                   ConvertZetaOp>(context);
-  // clang-format on
-}
-
-}  // namespace chlo
-}  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/chlo_legalize_to_hlo/chlo_legalize_to_hlo_pass.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/chlo_legalize_to_hlo/chlo_legalize_to_hlo_pass.cc
index 058e6db42fc321..d03c05880865e4 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/chlo_legalize_to_hlo/chlo_legalize_to_hlo_pass.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/chlo_legalize_to_hlo/chlo_legalize_to_hlo_pass.cc
@@ -19,59 +19,46 @@ limitations under the License.
 #include "mhlo/IR/hlo_ops.h"
 #include "mhlo/transforms/passes.h"
 #include "mhlo/transforms/rewriters.h"
+#include "mhlo/utils/type_conversion.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Shape/IR/Shape.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Pass/Pass.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/PassManager.h"
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "stablehlo/dialect/ChloOps.h"
+#include "stablehlo/dialect/StablehloOps.h"
+#include "stablehlo/transforms/Passes.h"
 
 namespace mlir {
 namespace mhlo {
 
 #define GEN_PASS_DEF_CHLOLEGALIZETOHLOPASS
-#define GEN_PASS_DEF_CHLOLEGALIZETOHLOBASISOPSPASS
+#define GEN_PASS_DEF_CHLOLEGALIZETOHIGHLEVELMHLOPASS
 #include "mhlo/transforms/mhlo_passes.h.inc"
 
 namespace {
 
-struct ChloLegalizeToHloPass
-    : public impl::ChloLegalizeToHloPassBase<ChloLegalizeToHloPass> {
-  explicit ChloLegalizeToHloPass(bool legalizeBroadcasts,
-                                 bool expandCompositions)
-      : ChloLegalizeToHloPassBase<
-            ChloLegalizeToHloPass>::ChloLegalizeToHloPassBase() {
-    this->legalize_broadcasts_ = legalizeBroadcasts;
-    this->expand_compositions_ = expandCompositions;
-  }
+struct ChloLegalizeToHighLevelMhloPass
+    : public impl::ChloLegalizeToHighLevelMhloPassBase<
+          ChloLegalizeToHighLevelMhloPass> {
+  using ChloLegalizeToHighLevelMhloPassBase::
+      ChloLegalizeToHighLevelMhloPassBase;
 
   void runOnOperation() override {
-    ConversionTarget conversionTarget(getContext());
-    RewritePatternSet conversionPatterns(&getContext());
-    conversionTarget.addIllegalDialect<chlo::ChloDialect>();
+    MLIRContext &context = getContext();
+    ConversionTarget conversionTarget(context);
+    RewritePatternSet conversionPatterns(&context);
+
+    chlo::populateChloToHighLevelMhloOpPatterns(&context, &conversionPatterns);
 
     // Consider the mhlo dialect legal for tests. Also add helper dialects
     // that are needed by the patterns.
-    conversionTarget
-        .addLegalDialect<MhloDialect, mlir::arith::ArithDialect,
-                         mlir::func::FuncDialect, mlir::tensor::TensorDialect,
-                         mlir::shape::ShapeDialect, mlir::scf::SCFDialect>();
-    conversionTarget.addLegalOp<chlo::MinimumBroadcastShapesOp>();
-
-    if (legalize_broadcasts_) {
-      chlo::populateChloBroadcastingPatterns(&getContext(),
-                                             &conversionPatterns);
-    }
-
-    if (expand_compositions_) {
-      chlo::populateDecomposeChloPatterns(&getContext(), &conversionPatterns);
-    } else {
-      conversionTarget
-          .addLegalOp<chlo::NextAfterOp, chlo::PolygammaOp, chlo::ZetaOp>();
-    }
+    conversionTarget.addLegalDialect<chlo::ChloDialect, mhlo::MhloDialect>();
+    conversionTarget.addIllegalOp<chlo::TopKOp, chlo::ErfOp, chlo::TanOp>();
 
     if (failed(applyPartialConversion(getOperation(), conversionTarget,
                                       std::move(conversionPatterns)))) {
@@ -80,29 +67,27 @@ struct ChloLegalizeToHloPass
   }
 };
 
-struct ChloLegalizeToHloBasisOpsPass
-    : public impl::ChloLegalizeToHloBasisOpsPassBase<
-          ChloLegalizeToHloBasisOpsPass> {
-  using ChloLegalizeToHloBasisOpsPassBase::ChloLegalizeToHloBasisOpsPassBase;
+struct ChloLegalizeToHloPass
+    : public impl::ChloLegalizeToHloPassBase<ChloLegalizeToHloPass> {
+  using ChloLegalizeToHloPassBase::ChloLegalizeToHloPassBase;
 
   void runOnOperation() override {
-    ConversionTarget conversionTarget(getContext());
-    RewritePatternSet conversionPatterns(&getContext());
+    MLIRContext &context = getContext();
+    ConversionTarget conversionTarget(context);
+    RewritePatternSet conversionPatterns(&context);
 
-    // Patterns will only be applied to these ops
-    conversionTarget.addIllegalOp<chlo::ErfOp, chlo::TopKOp>();
+    stablehlo::StablehloToHloTypeConverter typeConverter;
+    chlo::populateChloToHloPatterns(&context, &typeConverter,
+                                    &conversionPatterns);
 
-    // Programs with MHLO equivalents to the StableHLO ops are likely bugs
-    // for users of this expander pass, so best to disallow.
-    conversionTarget.addIllegalOp<mhlo::TopKOp>();  // TODO: Add ErfOp
-
-    // Given that the resulting patterns should be convertible to StableHLO
-    // Only MHLO should be legal.
+    // Consider the mhlo dialect legal for tests. Also add helper dialects
+    // that are needed by the patterns.
     conversionTarget
-        .addLegalDialect<MhloDialect, chlo::ChloDialect, func::FuncDialect>();
-
-    chlo::populateChloLegalizeToHloBasisOpsPatterns(&getContext(),
-                                                    &conversionPatterns);
+        .addIllegalDialect<chlo::ChloDialect, stablehlo::StablehloDialect>();
+    conversionTarget.addLegalDialect<
+        MhloDialect, mlir::arith::ArithDialect, mlir::func::FuncDialect,
+        mlir::tensor::TensorDialect, mlir::shape::ShapeDialect>();
+    conversionTarget.addLegalOp<chlo::MinimumBroadcastShapesOp>();
 
     if (failed(applyPartialConversion(getOperation(), conversionTarget,
                                       std::move(conversionPatterns)))) {
@@ -113,16 +98,26 @@ struct ChloLegalizeToHloBasisOpsPass
 
 }  // namespace
 
-std::unique_ptr<OperationPass<func::FuncOp>> createChloLegalizeToHloPass(
-    bool legalizeBroadcasts, bool expandCompositions) {
-  return std::make_unique<ChloLegalizeToHloPass>(legalizeBroadcasts,
-                                                 expandCompositions);
+}  // namespace mhlo
+
+namespace chlo {
+namespace {
+#include "chlo_legalize_to_hlo/generated_chlo_legalize_to_hlo.inc"
+
+}  // namespace
+
+void populateChloToHighLevelMhloOpPatterns(MLIRContext *,
+                                           RewritePatternSet *patterns) {
+  populateWithGenerated(*patterns);
 }
 
-std::unique_ptr<OperationPass<func::FuncOp>>
-createChloLegalizeToHloBasisOpsPass() {
-  return std::make_unique<ChloLegalizeToHloBasisOpsPass>();
+void populateChloToHloPatterns(MLIRContext *context,
+                               TypeConverter *typeConverter,
+                               RewritePatternSet *patterns) {
+  chlo::populateChloToHighLevelMhloOpPatterns(context, patterns);
+  stablehlo::populateChloToStablehloPatterns(context, patterns);
+  stablehlo::populateStablehloToHloPatterns(patterns, typeConverter, context);
 }
 
-}  // namespace mhlo
+}  // namespace chlo
 }  // namespace mlir
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/chlo_legalize_to_hlo/chlo_legalize_to_hlo_patterns.td b/third_party/xla/xla/mlir_hlo/mhlo/transforms/chlo_legalize_to_hlo/chlo_legalize_to_hlo_patterns.td
index 937739f2851454..497686bf2e2ab8 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/chlo_legalize_to_hlo/chlo_legalize_to_hlo_patterns.td
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/chlo_legalize_to_hlo/chlo_legalize_to_hlo_patterns.td
@@ -19,345 +19,22 @@ limitations under the License.
 // ambiguous/different for various backends. Avoid patterns that are actually
 // lowering to non-canonical forms.
 
-include "mlir/Dialect/Shape/IR/ShapeOps.td"
 include "mlir/IR/OpBase.td"
 include "mhlo/IR/hlo_ops.td"
 include "stablehlo/dialect/ChloOps.td"
 
-class MHLO_ComparisonDirectionValue<string enumStr> :
-  ConstantAttr<MHLO_ComparisonDirectionAttr, "::mlir::mhlo::ComparisonDirection::" # enumStr>;
-
 //===----------------------------------------------------------------------===//
-// Unary op patterns.
+// Direct CHLO->MHLO conversions
 //===----------------------------------------------------------------------===//
 
-// Expand acos for non-complex arguments to MHLO dialect as follows:
-//   acos(x) = 2 * atan2(sqrt(1 - x^2), (1 + x))  if x != -1
-//           = pi                                 if x == -1
-//
-// TODO(b/237376133): Support operands with complex element types separately
-// using the following formula.
-//   acos(x) = -(i * log(x + i * sqrt((1 + x) * (1 - x))))
-def : Pat<(CHLO_AcosOp NonComplexElementType:$input),
-  (MHLO_SelectOp
-    (MHLO_CompareOp
-      $input,
-      (MHLO_ConstantLike<"-1"> $input),
-      MHLO_ComparisonDirectionValue<"NE">,
-      (MHLO_DEFAULT_COMPARISON_TYPE)
-    ),
-    (MHLO_MulOp
-      (MHLO_ConstantLike<"2"> $input),
-      (MHLO_Atan2Op
-        (MHLO_SqrtOp
-          (MHLO_SubtractOp
-            (MHLO_ConstantLike<"1"> $input),
-            (MHLO_MulOp $input, $input)
-          )
-        ),
-        (MHLO_AddOp
-          (MHLO_ConstantLike<"1"> $input),
-          $input
-        )
-      )
-    ),
-    (MHLO_ConstantLike<"M_PI"> $input)
-  )>;
-
-// Expand acosh to MHLO dialect as follows:
-//   acosh(x) = log(x + sqrt(x^2 - 1))      if x >= -1
-//            = log(x + sqrt((x+1)*(x-1)))
-//   acosh(x) = nan                         if x < -1
-//
-// If x^2 will overflow, we approximate sqrt(x^2 - 1) == x and compute as
-// log(2*x) = log(2) + log(x).  (Note this works because negative x never
-// overflows; x < -1 simply yields nan.
-def : Pat<(CHLO_AcoshOp NonComplexElementType:$input),
-  (MHLO_SelectOp
-    (MHLO_CompareOp
-      $input,
-      (MHLO_ConstantLike<"-1"> $input),
-      MHLO_ComparisonDirectionValue<"LT">,
-      (MHLO_DEFAULT_COMPARISON_TYPE)
-    ),
-    (MHLO_ConstantLike<"NAN"> $input),
-    (MHLO_SelectOp
-      (MHLO_CompareOp
-        $input,
-        (MHLO_SqrtOp
-          (MHLO_ConstantLikeMaxFiniteValue $input)
-        ),
-        MHLO_ComparisonDirectionValue<"GE">,
-        (MHLO_DEFAULT_COMPARISON_TYPE)
-      ),
-      (MHLO_AddOp
-        (MHLO_LogOp $input),
-        (MHLO_LogOp
-          (MHLO_ConstantLike<"2"> $input)
-        )
-      ),
-      (MHLO_LogOp
-        (MHLO_AddOp
-          $input,
-          (MHLO_SqrtOp
-            (MHLO_MulOp
-              (MHLO_AddOp
-                (MHLO_ConstantLike<"1"> $input),
-                $input
-              ),
-              (MHLO_AddOp
-                (MHLO_ConstantLike<"-1"> $input),
-                $input
-              )
-            )
-          )
-        )
-      )
-    )
-  )>;
-
-// Expand acosh for complex arguments to MHLO dialect as
-//   acosh(x) = log(x + sqrt((x+1)*(x-1)))
-//
-// Per tensorflow/compiler/xla/client/lib/math.cc at the time of writing:
-// "For now, we ignore the question of overflow if x is a
-// complex type, because we don't yet have exhaustive tests for complex trig
-// functions".
-def : Pat<(CHLO_AcoshOp ComplexElementType:$input),
-  (MHLO_LogOp
-    (MHLO_AddOp
-      $input,
-      (MHLO_SqrtOp
-        (MHLO_MulOp
-          (MHLO_AddOp
-            $input,
-            (MHLO_ConstantLike<"1"> $input)
-          ),
-          (MHLO_SubtractOp
-            $input,
-            (MHLO_ConstantLike<"1"> $input)
-          )
-        )
-      )
-    )
-  )>;
-
-
-// Expand asin to MHLO dialect as follows:
-//   asin(x) = 2 * atan(x / (1 + sqrt(1 - x^2)))
-def : Pat<(CHLO_AsinOp $input),
-  (MHLO_MulOp
-    (MHLO_ConstantLike<"2"> $input),
-    (MHLO_Atan2Op
-      $input,
-      (MHLO_AddOp
-        (MHLO_ConstantLike<"1"> $input),
-        (MHLO_SqrtOp
-          (MHLO_SubtractOp
-            (MHLO_ConstantLike<"1"> $input),
-            (MHLO_MulOp $input, $input)
-          )
-        )
-      )
-    )
-  )>;
-
-// Expand asinh for non-complex arguments to MHLO dialect as
-//   asinh(x) = log(x + sqrt(x^2 + 1))
-//
-// If x^2 will overflow and x is positive, we can approximate x + sqrt(x^2 + 1)
-// as 2*x and return log(2) + log(x).
-//
-// For small x, sqrt(x^2 + 1) will evaluate to 1 due to floating point
-// arithmetic. However, we would like to retain the low order term of this,
-// which is around 0.5 * x^2 using a binomial expansion.
-// Let z = sqrt(a^2 + 1)
-// The following rewrite retains the lower order term.
-// log(a + sqrt(a^2 + 1))
-//   = log((a + sqrt(a^2 + 1)) * (1 + sqrt(a^2 + 1)) / (1 + sqrt(a^2 + 1)))
-//   = log((a + a^2 + 1 + a * z + z) / (1 + z))
-//   = log(1 + a + a^2 / (1 + z))
-//   = log(1 + a + a^2 / (1 + sqrt(a^2 + 1)))
-//
-// If x is negative, the above would give us some trouble; we can't approximate
-// the result as x + abs(x) = 0 but we are saved by the fact that asinh(-x) =
-// -asinh(x).
-def : Pat<(CHLO_AsinhOp NonComplexElementType:$input),
-  (MHLO_MulOp
-    (MHLO_SignOp $input),
-    (MHLO_SelectOp
-      (MHLO_CompareOp
-        (MHLO_AbsOp $input),
-        (MHLO_SqrtOp
-          (MHLO_ConstantLikeMaxFiniteValue $input)
-        ),
-        MHLO_ComparisonDirectionValue<"GE">,
-        (MHLO_DEFAULT_COMPARISON_TYPE)
-      ),
-      (MHLO_AddOp
-        (MHLO_LogOp
-          (MHLO_AbsOp $input)
-        ),
-        (MHLO_LogOp
-          (MHLO_ConstantLike<"2"> $input)
-        )
-      ),
-      (MHLO_SelectOp
-        (MHLO_CompareOp
-          (MHLO_AbsOp $input),
-          (MHLO_ConstantLike<"1"> $input),
-          MHLO_ComparisonDirectionValue<"LE">,
-          (MHLO_DEFAULT_COMPARISON_TYPE)
-        ),
-        (MHLO_Log1pOp
-          (MHLO_AddOp
-            (MHLO_AbsOp $input),
-            (MHLO_MulOp
-              (MHLO_AbsOp $input),
-              (MHLO_DivOp
-                (MHLO_AbsOp $input),
-                (MHLO_AddOp
-                  (MHLO_ConstantLike<"1"> $input),
-                  (MHLO_SqrtOp
-                    (MHLO_AddOp
-                      (MHLO_MulOp
-                        (MHLO_AbsOp $input),
-                        (MHLO_AbsOp $input)
-                      ),
-                      (MHLO_ConstantLike<"1"> $input)
-                    )
-                  )
-                )
-              )
-            )
-          )
-        ),
-        (MHLO_LogOp
-          (MHLO_AddOp
-            (MHLO_AbsOp $input),
-            (MHLO_SqrtOp
-              (MHLO_AddOp
-                (MHLO_MulOp
-                  (MHLO_AbsOp $input),
-                  (MHLO_AbsOp $input)
-                ),
-                (MHLO_ConstantLike<"1"> $input)
-              )
-            )
-          )
-        )
-      )
-    )
-  )>;
-
-// Expand asinh for complex arguments to MHLO dialect as
-//   asinh(x) = log(x + sqrt(x^2 + 1))
-//
-// Per tensorflow/compiler/xla/client/lib/math.cc at the time of writing:
-// "For now, we ignore the question of overflow if x is a
-// complex type, because we don't yet have exhaustive tests for complex trig
-// functions".
-def : Pat<(CHLO_AsinhOp ComplexElementType:$input),
-  (MHLO_LogOp
-    (MHLO_AddOp
-      $input,
-      (MHLO_SqrtOp
-        (MHLO_AddOp
-          (MHLO_MulOp $input, $input),
-          (MHLO_ConstantLike<"1"> $input)
-        )
-      )
-    )
-  )>;
-
-// Express `atan` as
-//   atan(x) = atan2(x, 1)
-def : Pat<(CHLO_AtanOp $input),
-  (MHLO_Atan2Op
-    $input,
-    (MHLO_ConstantLike<"1"> $input)
-  )>;
-
-// Express `atanh` for non-complex arguments as follows:
-//   atanh(x) = 0.5 * log((1 + x) / (1 - x)) if abs(x) <= 1
-//   atanh(x) = nan                          otherwise
-def : Pat<(CHLO_AtanhOp NonComplexElementType:$input),
-  (MHLO_SelectOp
-    (MHLO_CompareOp
-      (MHLO_AbsOp $input),
-      (MHLO_ConstantLike<"1"> $input),
-      MHLO_ComparisonDirectionValue<"GT">,
-      (MHLO_DEFAULT_COMPARISON_TYPE)
-    ),
-    (MHLO_ConstantLike<"NAN"> $input),
-    (MHLO_MulOp
-      (MHLO_SubtractOp
-        (MHLO_Log1pOp $input),
-        (MHLO_Log1pOp
-          (MHLO_NegOp $input)
-        )
-      ),
-      (MHLO_ConstantLike<"0.5"> $input)
-    )
-  )>;
-
-// Express `atanh` for complex arguments as follows:
-//   atanh(x) = (log(1 + x) - log(1 + (-x))) * 0.5
-//
-// Per tensorflow/compiler/xla/client/lib/math.cc at the time of writing:
-// "For now, we ignore the nan edge case for complex inputs,
-// because we don't yet have exhaustive tests for complex trig functions".
-def : Pat<(CHLO_AtanhOp ComplexElementType:$input),
-  (MHLO_MulOp
-    (MHLO_SubtractOp
-      (MHLO_Log1pOp $input),
-      (MHLO_Log1pOp
-        (MHLO_NegOp $input)
-      )
-    ),
-    (MHLO_ConstantLike<"0.5"> $input)
-  )>;
-
-// Express `conj` as
-//   conj(x) = (re(x), -im(x)).
-def : Pat<(CHLO_ConjOp $v),
-          (MHLO_ComplexOp (MHLO_RealOp $v), (MHLO_NegOp (MHLO_ImagOp $v)))>;
-
-// Express `is_inf` as
-//   is_inf(x) = is_pos_inf(|x|)
-def : Pat<(CHLO_IsInfOp NonComplexElementType:$input),
-  (CHLO_IsPosInfOp
-    (MHLO_AbsOp $input)
-  )>;
-
-// Express `is_pos_inf` as
-//   is_pos_inf(x) = (x == +inf)
-def : Pat<(CHLO_IsPosInfOp NonComplexElementType:$input),
-  (MHLO_CompareOp
-    $input,
-    (MHLO_ConstantLikePosInfValue $input),
-    MHLO_ComparisonDirectionValue<"EQ">,
-    (MHLO_DEFAULT_COMPARISON_TYPE)
-  )>;
-
-// Express `is_neg_inf` as
-//   is_neg_inf(x) = (x == -inf)
-def : Pat<(CHLO_IsNegInfOp NonComplexElementType:$input),
-  (MHLO_CompareOp
-    $input,
-    (MHLO_ConstantLikeNegInfValue $input),
-    MHLO_ComparisonDirectionValue<"EQ">,
-    (MHLO_DEFAULT_COMPARISON_TYPE)
-  )>;
-
-def : Pat<(CHLO_ConstantOp $v),
-          (MHLO_ConstantOp $v)>;
-
 def : Pat<(CHLO_TanOp $v),
-          (MHLO_TanOp $v)>;
+          (MHLO_TanOp $v),
+          [], [], (addBenefit 10)>;
 
 def : Pat<(CHLO_ErfOp $v),
-          (MHLO_ErfOp $v)>;
+          (MHLO_ErfOp $v),
+          [], [], (addBenefit 10)>;
 
 def : Pat<(CHLO_TopKOp AnyRankedTensor:$v, $k),
-          (MHLO_TopKOp $v, $k, ConstBoolAttrTrue)>;
+          (MHLO_TopKOp $v, $k, ConstBoolAttrTrue),
+          [], [], (addBenefit 10)>;
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/mhlo_passes.td b/third_party/xla/xla/mlir_hlo/mhlo/transforms/mhlo_passes.td
index 5531d568a8d179..62868358ffc0e0 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/mhlo_passes.td
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/mhlo_passes.td
@@ -15,29 +15,28 @@ limitations under the License.
 
 include "mlir/Pass/PassBase.td"
 
-def ChloLegalizeToHloPass : Pass<"chlo-legalize-to-hlo", "func::FuncOp"> {
-  let summary = "Legalize CHLO to HLO.";
-  let constructor = "createChloLegalizeToHloPass()";
-  let dependentDialects = ["mhlo::MhloDialect", "chlo::ChloDialect",
-                           "shape::ShapeDialect", "scf::SCFDialect"];
-  let options = [
-    Option<"legalize_broadcasts_", "legalize-broadcasts", "bool",
-           /*default=*/"true", "Legalize implicit broadcasts to explicit HLO broadcasting forms">,
-    Option<"expand_compositions_", "expand-compositions", "bool",
-           /*default=*/"true", "Expands client-centric compositions to HLO primitives">,
-  ];
+def ChloLegalizeToHighLevelMhloPass : Pass<"chlo-legalize-to-high-level-mhlo", "func::FuncOp"> {
+  let summary = "Legalize CHLO's with XLA counterparts, like TopK and Erf.";
+  let description = [{
+    Performs direct legalization of CHLO->MHLO only for high-level (non-basis)
+    ops with XLA support. These are MHLO ops that directly model the CHLO op,
+    such as TopK and Erf.
+  }];
+  let dependentDialects = ["mhlo::MhloDialect"];
 }
 
-def ChloLegalizeToHloBasisOpsPass : Pass<"chlo-legalize-to-hlo-basis-ops", "func::FuncOp"> {
-  let summary = "Legalize specific CHLO ops (e.g. ErfOf and TopKOp) to basis MHLO ops.";
+def ChloLegalizeToHloPass : Pass<"chlo-legalize-to-hlo", "func::FuncOp"> {
+  let summary = "Legalize CHLO to MHLO with XLA-supported ops.";
   let description = [{
-    XLA has specialization for certain CHLO ops (ErfOp, TopKOp), and other
-    backends still require decomposition of these ops into the basis set which
-    can be converted safely to StableHLO. This pass is needed until we have
-    direct CHLO to StableHLO lowerings.
+    Performs legalization of CHLO->StableHLO->MHLO, while also preserving MHLO
+    high level operations when possible (see ChloLegalizeToHighLevelMhloPass).
   }];
-  let constructor = "createChloLegalizeToHloBasisOpsPass()";
-  let dependentDialects = ["mhlo::MhloDialect", "chlo::ChloDialect"];
+  let dependentDialects = [
+    "mhlo::MhloDialect",
+    "mlir::shape::ShapeDialect",
+    "mlir::stablehlo::StablehloDialect",
+    "mlir::tensor::TensorDialect"
+  ];
 }
 
 def HloCanonicalizeScatterPass : Pass<"hlo-canonicalize-scatter", "func::FuncOp"> {
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/mhlo_quant_legalize_to_int/mhlo_quant_legalize_to_int.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/mhlo_quant_legalize_to_int/mhlo_quant_legalize_to_int.cc
index 145713148fa653..d0da0f12e098fb 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/mhlo_quant_legalize_to_int/mhlo_quant_legalize_to_int.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/mhlo_quant_legalize_to_int/mhlo_quant_legalize_to_int.cc
@@ -1040,10 +1040,9 @@ FailureOr<bool> isDotLikeOpHybrid(DotLikeOp op) {
       getElementTypeOrSelf(op.getResult()));
 
   if (isLhsQuant && ((isRhsQuant && isResQuant) ||
-                     (isa<mhlo::ConvolutionOp>(op) && isRhsQuantPerChannel &&
-                      isResQuantPerChannel))) {
-    // For quantized ops, RHS and result must be both per-channel quantized.
-    // For Convolution, we also support per-channel quantized RHS/result.
+                     (isRhsQuantPerChannel && isResQuantPerChannel))) {
+    // For quantized ops, RHS and result must be both per-channel quantized or
+    // both per-tensor quantized.
     return false;
   }
   if (!isLhsQuant && !isLhsQuantPerChannel && isRhsQuant && !isResQuant &&
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/passes.h b/third_party/xla/xla/mlir_hlo/mhlo/transforms/passes.h
index a52c5b4ee7b2a4..b9a025cdb55861 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/passes.h
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/passes.h
@@ -49,14 +49,6 @@ std::unique_ptr<OperationPass<func::FuncOp>> createLegalizeSortPass();
 /// Lowers from HLO dialect to Standard dialect.
 std::unique_ptr<OperationPass<func::FuncOp>> createLegalizeToStdPass();
 
-/// Lowers from the CHLO dialect to the HLO dialect.
-std::unique_ptr<OperationPass<func::FuncOp>> createChloLegalizeToHloPass(
-    bool legalizeBroadcasts = true, bool expandCompositions = true);
-
-/// Lowers specific ops from the CHLO dialect to an HLO basis opset
-std::unique_ptr<OperationPass<func::FuncOp>>
-createChloLegalizeToHloBasisOpsPass();
-
 // Lowers from sparse ops in CHLO dialect to Linalg dialect.
 std::unique_ptr<OperationPass<func::FuncOp>> createLegalizeSparseOperationsPass(
     bool legalizeToCustomCalls = true);
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/rewriters.h b/third_party/xla/xla/mlir_hlo/mhlo/transforms/rewriters.h
index 4de69f27d268e0..c40a087ab52cbd 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/rewriters.h
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/rewriters.h
@@ -166,13 +166,6 @@ void populateGroupReductionDimensionsPatterns(MLIRContext *context,
                                               RewritePatternSet *patterns,
                                               bool preferColumnsReductions);
 
-/// Populate rank specialization clustering and lowering patterns.
-void populateRankSpecializationClusterPatterns(MLIRContext *context,
-                                               RewritePatternSet *patterns);
-void populateRankSpecializationToSCFPatterns(MLIRContext *context,
-                                             RewritePatternSet *patterns,
-                                             int64_t maxTargetRank);
-
 /// Populate sparse tensor specific rewriting patterns.
 void populateSparseRewritingPatterns(RewritePatternSet *patterns,
                                      MLIRContext *ctx);
@@ -191,23 +184,16 @@ void populateLegalizeSparseOpsToCustomCallPatterns(MLIRContext *context,
 
 namespace chlo {
 
-// Populates a collection of conversion patterns for legalizing broadcasting
-// client-HLO to their non-broadcasting counterparts.
-void populateChloBroadcastingPatterns(MLIRContext *context,
-                                      RewritePatternSet *patterns);
+// Populates direct translations between CHLO and MHLO ops for higher level
+// MHLO ops like TopK and Erf.
+void populateChloToHighLevelMhloOpPatterns(MLIRContext *context,
+                                           RewritePatternSet *patterns);
 
-// Populates a collection of conversion patterns for legalizing client-HLO to
-// HLO by decomposing client-operations to corresponding sequences of more
-// primitive operations. This does not include the
-// PopulateChloBroadcastingPatterns above.
-void populateDecomposeChloPatterns(MLIRContext *context,
-                                   RewritePatternSet *patterns);
-
-// Adds pattern to decompose specific CHLO ops like ErfOp and TopKOp to their
-// basis set of operations. These ops have 1:1 corresponding MHLO ops, but for
-// certain backends, they need to be expanded.
-void populateChloLegalizeToHloBasisOpsPatterns(MLIRContext *context,
-                                               RewritePatternSet *patterns);
+// Populates direct translations between CHLO->MHLO high level ops
+// and CHLO->StableHLO->MHLO patterns.
+void populateChloToHloPatterns(MLIRContext *context,
+                               TypeConverter *typeConverter,
+                               RewritePatternSet *patterns);
 
 }  // namespace chlo
 
diff --git a/third_party/xla/xla/mlir_hlo/tests/BUILD b/third_party/xla/xla/mlir_hlo/tests/BUILD
index 4db81efeb2dd52..30c6301afe61ee 100644
--- a/third_party/xla/xla/mlir_hlo/tests/BUILD
+++ b/third_party/xla/xla/mlir_hlo/tests/BUILD
@@ -26,7 +26,9 @@ package(
         tags = [
             "nomsan",  # The execution engine doesn't work with msan, see b/248097619.
         ],
-        deps = ["@pypi_lit//:pkg"],
+        #Build failing due to the double deps definition. Should be resolved by next weekly thanks to 
+        #https://github.com/llvm/llvm-project/commit/dd5797505ebc2dbfdd58927c4f0a11a1256696eb
+        #deps = ["@pypi_lit//:pkg"],
     )
     for src in glob(["**/*.mlir"])
 ]
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/chlo/chlo_legalize_to_hlo_broadcasts.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/chlo/chlo_legalize_to_hlo_broadcasts.mlir
deleted file mode 100644
index 512e1cea6ff752..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/chlo/chlo_legalize_to_hlo_broadcasts.mlir
+++ /dev/null
@@ -1,345 +0,0 @@
-// RUN: mlir-hlo-opt -chlo-legalize-to-hlo="legalize-broadcasts=true expand-compositions=false" -cse -canonicalize -split-input-file -verify-diagnostics %s -o - | FileCheck %s
-
-// Check the non-broadcast case for each registered op, then just check a
-// representative op for detailed broadcast semantics.
-// CHECK-LABEL: @addWithoutBroadcast
-func.func @addWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-  // CHECK: mhlo.add %arg0, %arg1
-  %0 = chlo.broadcast_add %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-  func.return %0 : tensor<4xf32>
-}
-
-// -----
-// CHECK-LABEL: @dynamicBroadcast
-// CHECK-SAME: %[[ARG0:.+]]: tensor<?xf32>
-// CHECK-SAME: %[[ARG1:.+]]: tensor<?x?xf32>
-func.func @dynamicBroadcast(%arg0: tensor<?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
-  // CHECK-DAG:  %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]]
-  // CHECK-DAG:  %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]]
-  // CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[ARG0_S]], %[[ARG1_S]]
-  // CHECK-NEXT: %[[FINAL_RESULT:.+]] = shape.assuming %[[WITNESS]]
-  // CHECK-DAG:    %[[RESULT_EXTENTS:.+]] = shape.broadcast %[[ARG0_S]], %[[ARG1_S]]
-  // CHECK-DAG:    %[[ARG0_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>}
-  // CHECK-DAG:    %[[ARG1_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>}
-  // CHECK-NEXT:   %[[RESULT:.+]] = mhlo.add %[[ARG0_B]], %[[ARG1_B]]
-  // CHECK-NEXT:   shape.assuming_yield %[[RESULT]]
-  // CHECK-NEXT: }
-  // CHECK-NEXT:      return %[[FINAL_RESULT]] : tensor<?x?xf32>
-  %0 = chlo.broadcast_add %arg0, %arg1 : (tensor<?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
-  func.return %0 : tensor<?x?xf32>
-}
-
-// -----
-// CHECK-LABEL: @dynamicBroadcastComplex
-// CHECK-SAME: %[[ARG0:.+]]: tensor<?xf32>
-// CHECK-SAME: %[[ARG1:.+]]: tensor<?x?xf32>
-func.func @dynamicBroadcastComplex(%arg0: tensor<?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xcomplex<f32>> {
-  // CHECK-DAG:  %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]]
-  // CHECK-DAG:  %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]]
-  // CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[ARG0_S]], %[[ARG1_S]]
-  // CHECK-NEXT: %[[FINAL_RESULT:.+]] = shape.assuming %[[WITNESS]]
-  // CHECK-NEXT:   %[[RESULT_EXTENTS:.+]] = shape.broadcast %[[ARG0_S]], %[[ARG1_S]]
-  // CHECK-DAG:    %[[ARG0_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<?xf32>, tensor<2xindex>) -> tensor<?x?xf32>
-  // CHECK-DAG:    %[[ARG1_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<?x?xf32>, tensor<2xindex>) -> tensor<?x?xf32>
-  // CHECK-NEXT:   %[[RESULT:.+]] = mhlo.complex %[[ARG0_B]], %[[ARG1_B]] : tensor<?x?xcomplex<f32>>
-  // CHECK-NEXT:   shape.assuming_yield %[[RESULT]]
-  // CHECK-NEXT: }
-  // CHECK-NEXT: return %[[FINAL_RESULT]] : tensor<?x?xcomplex<f32>>
-  %0 = chlo.broadcast_complex %arg0, %arg1 : (tensor<?xf32>, tensor<?x?xf32>) -> tensor<?x?xcomplex<f32>>
-  func.return %0 : tensor<?x?xcomplex<f32>>
-}
-
-// -----
-// CHECK-LABEL: @dynamicBroadcastCompare
-// CHECK-SAME: %[[ARG0:.+]]: tensor<?xf32>
-// CHECK-SAME: %[[ARG1:.+]]: tensor<?x?xf32>
-func.func @dynamicBroadcastCompare(%arg0: tensor<?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xi1> {
-  // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]]
-  // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]]
-  // CHECK: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[ARG0_S]], %[[ARG1_S]]
-  // CHECK: %[[FINAL_RESULT:.+]] = shape.assuming %[[WITNESS]]
-  // CHECK: %[[RESULT_EXTENTS:.+]] = shape.broadcast %[[ARG0_S]], %[[ARG1_S]]
-  // CHECK-DAG: %[[ARG0_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<?xf32>, tensor<2xindex>) -> tensor<?x?xf32>
-  // CHECK-DAG: %[[ARG1_B:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[ARG1]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<?x?xf32>, tensor<2xindex>) -> tensor<?x?xf32>
-  // CHECK: %[[RESULT:.+]] = mhlo.compare EQ, %[[ARG0_B]], %[[ARG1_B]] : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xi1>
-  // CHECK: shape.assuming_yield %[[RESULT]]
-  // CHECK-NEXT: }
-  // CHECK: return %[[FINAL_RESULT]] : tensor<?x?xi1>
-  %0 = chlo.broadcast_compare %arg0, %arg1 {comparison_direction = #chlo<comparison_direction EQ>} : (tensor<?xf32>, tensor<?x?xf32>) -> tensor<?x?xi1>
-  func.return %0 : tensor<?x?xi1>
-}
-
-// -----
-
-// CHECK-LABEL: func @selectv2
-func.func @selectv2(%arg0: tensor<2xi1>, %arg1: tensor<2xi32>, %arg2: tensor<2xi32>) -> tensor<2xi32> {
-  // CHECK-NEXT: mhlo.select %arg0, %arg1, %arg2
-  %0 = "chlo.broadcast_select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
-  func.return %0: tensor<2xi32>
-}
-
-// CHECK-LABEL: func @selectv2_pred_scalar
-func.func @selectv2_pred_scalar(%arg0: tensor<i1>, %arg1: tensor<2xi32>, %arg2: tensor<2xi32>) -> tensor<2xi32> {
-  // CHECK-NEXT: mhlo.select %arg0, %arg1, %arg2
-  %0 = "chlo.broadcast_select"(%arg0, %arg1, %arg2) : (tensor<i1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32>
-  func.return %0: tensor<2xi32>
-}
-
-// CHECK-LABEL: func @selectv2_broadcast_then
-func.func @selectv2_broadcast_then(%arg0: tensor<i1>, %arg1: tensor<8x1xi32>, %arg2: tensor<2x8x8xi32>) -> tensor<2x8x8xi32> {
-  // CHECK-NEXT: %[[BROADCAST:.*]] = "mhlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<8x1xi32>) -> tensor<2x8x8xi32>
-  // CHECK-NEXT: mhlo.select %arg0, %[[BROADCAST]], %arg2
-  %0 = "chlo.broadcast_select"(%arg0, %arg1, %arg2) : (tensor<i1>, tensor<8x1xi32>, tensor<2x8x8xi32>) -> tensor<2x8x8xi32>
-  func.return %0: tensor<2x8x8xi32>
-}
-
-// CHECK-LABEL: func @selectv2_broadcast_else
-func.func @selectv2_broadcast_else(%arg0: tensor<i1>, %arg1: tensor<2x8x8xi32>, %arg2: tensor<8x1xi32>) -> tensor<2x8x8xi32> {
-  // CHECK-NEXT: %[[BROADCAST:.*]] = "mhlo.broadcast_in_dim"(%arg2) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<8x1xi32>) -> tensor<2x8x8xi32>
-  // CHECK-NEXT: mhlo.select %arg0, %arg1, %[[BROADCAST]]
-  %0 = "chlo.broadcast_select"(%arg0, %arg1, %arg2) : (tensor<i1>, tensor<2x8x8xi32>, tensor<8x1xi32>) -> tensor<2x8x8xi32>
-  func.return %0: tensor<2x8x8xi32>
-}
-
-// CHECK-LABEL: func @selectv2_broadcast_pred
-func.func @selectv2_broadcast_pred(%arg0: tensor<1xi1>, %arg1: tensor<2x8x8xi32>, %arg2: tensor<2x8x8xi32>) -> tensor<2x8x8xi32> {
-  // CHECK-NEXT: %[[BROADCAST:.*]] = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1xi1>) -> tensor<2x8x8xi1>
-  // CHECK-NEXT: mhlo.select %[[BROADCAST]], %arg1, %arg2
-  %0 = "chlo.broadcast_select"(%arg0, %arg1, %arg2) : (tensor<1xi1>, tensor<2x8x8xi32>, tensor<2x8x8xi32>) -> tensor<2x8x8xi32>
-  func.return %0: tensor<2x8x8xi32>
-}
-
-// CHECK-LABEL: func @selectv2_broadcast_tensor_pred
-func.func @selectv2_broadcast_tensor_pred(%arg0: tensor<3xi1>, %arg1: tensor<2x3xf16>, %arg2: tensor<2x3xf16>) -> tensor<2x3xf16> {
-  // CHECK-NEXT: %[[BROADCAST:.*]] = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<3xi1>) -> tensor<2x3xi1>
-  // CHECK-NEXT: mhlo.select %[[BROADCAST]], %arg1, %arg2
-  %0 = "chlo.broadcast_select"(%arg0, %arg1, %arg2) : (tensor<3xi1>, tensor<2x3xf16>, tensor<2x3xf16>) -> tensor<2x3xf16>
-  func.return %0: tensor<2x3xf16>
-}
-
-// CHECK-LABEL: func @selectv2_broadcast_all
-func.func @selectv2_broadcast_all(%arg0: tensor<8x1x1xi1>, %arg1: tensor<1x8x1xi32>, %arg2: tensor<1x1x8xi32>) -> tensor<8x8x8xi32> {
-  // CHECK-DAG: %[[BROADCAST_0:.*]] = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<8x1x1xi1>) -> tensor<8x8x8xi1>
-  // CHECK-DAG: %[[BROADCAST_1:.*]] = "mhlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x8x1xi32>) -> tensor<8x8x8xi32>
-  // CHECK-DAG: %[[BROADCAST_2:.*]] = "mhlo.broadcast_in_dim"(%arg2) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<1x1x8xi32>) -> tensor<8x8x8xi32>
-  // CHECK: mhlo.select %[[BROADCAST_0]], %[[BROADCAST_1]], %[[BROADCAST_2]]
-  %0 = "chlo.broadcast_select"(%arg0, %arg1, %arg2) : (tensor<8x1x1xi1>, tensor<1x8x1xi32>, tensor<1x1x8xi32>) -> tensor<8x8x8xi32>
-  func.return %0: tensor<8x8x8xi32>
-}
-
-// CHECK-LABEL: func @selectv2_dynamic_ranked
-func.func @selectv2_dynamic_ranked(%arg0: tensor<1xi1>, %arg1: tensor<2x?x8xi32>, %arg2: tensor<2x8x8xi32>) -> tensor<2x?x8xi32> {
-  // CHECK-DAG: %[[SHAPE0:.*]] = shape.const_shape [1] : tensor<1xindex>
-  // CHECK-DAG: %[[SHAPE2:.*]] = shape.const_shape [2, 8, 8] : tensor<3xindex>
-  // CHECK-NEXT: %[[SHAPE1:.*]] = shape.shape_of %arg1 : tensor<2x?x8xi32> -> tensor<3xindex>
-  // CHECK-NEXT: %[[CSTR:.*]] = shape.cstr_broadcastable %[[SHAPE1]], %[[SHAPE0]], %[[SHAPE2]] : tensor<3xindex>, tensor<1xindex>, tensor<3xindex>
-  // CHECK-NEXT: %[[ASSUME:.*]] = shape.assuming %[[CSTR]] -> (tensor<2x?x8xi32>) {
-  // CHECK-NEXT:   %[[BCST:.*]] = shape.broadcast %[[SHAPE1]], %[[SHAPE2]] : tensor<3xindex>, tensor<3xindex> -> tensor<3xindex>
-  // CHECK-NEXT:   %[[BCST0:.*]] = "mhlo.dynamic_broadcast_in_dim"(%arg0, %[[BCST]]) {broadcast_dimensions = dense<2> : tensor<1xi64>} : (tensor<1xi1>, tensor<3xindex>) -> tensor<2x?x8xi1>
-  // CHECK-NEXT:   %[[BCST1:.*]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, %[[BCST]]) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<2x?x8xi32>, tensor<3xindex>) -> tensor<2x?x8xi32>
-  // CHECK-NEXT:   %[[BCST2:.*]] = "mhlo.dynamic_broadcast_in_dim"(%arg2, %[[BCST]]) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<2x8x8xi32>, tensor<3xindex>) -> tensor<2x?x8xi32>
-  // CHECK-NEXT:   %[[SELECT:.*]] = mhlo.select %[[BCST0]], %[[BCST1]], %[[BCST2]] : tensor<2x?x8xi1>, tensor<2x?x8xi32>
-  // CHECK-NEXT:   shape.assuming_yield %[[SELECT]] : tensor<2x?x8xi32>
-  // CHECK-NEXT: }
-  // CHECK-NEXT: return %[[ASSUME]] : tensor<2x?x8xi32>
-  %0 = "chlo.broadcast_select"(%arg0, %arg1, %arg2) : (tensor<1xi1>, tensor<2x?x8xi32>, tensor<2x8x8xi32>) -> tensor<2x?x8xi32>
-  func.return %0: tensor<2x?x8xi32>
-}
-
-// -----
-// Verifies that broadcast_dimensions validity checks are valid.
-// CHECK-LABEL: @dynamicNonScalarBroadcastDimensions
-func.func @dynamicNonScalarBroadcastDimensions(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> {
-  // CHECK: mhlo.add
-  %0 = chlo.broadcast_add %arg0, %arg1 {broadcast_dimensions = array<i64: 1>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32>
-  func.return %0 : tensor<1x4xf32>
-}
-
-// -----
-// Verifies that broadcast_dimensions validity checks are valid.
-// CHECK-LABEL: @dynamicNonScalarByScalarBroadcastDimensions
-func.func @dynamicNonScalarByScalarBroadcastDimensions(%arg0: tensor<1x4xf32>, %arg1: tensor<f32>) -> tensor<1x4xf32> {
-  // CHECK: mhlo.add
-  %0 = chlo.broadcast_add %arg0, %arg1 {broadcast_dimensions = array<i64>} : (tensor<1x4xf32>, tensor<f32>) -> tensor<1x4xf32>
-  func.return %0 : tensor<1x4xf32>
-}
-
-// -----
-// Verifies that invalid broadcast dimensions are rejected.
-func.func @dynamicNonScalarBroadcastDimensionsSizeMismatch(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> {
-  // expected-warning @+2 {{unsupported non prefix-padded dynamic rank broadcast_dimensions}}
-  // expected-error @+1 {{failed to legalize operation}}
-  %0 = chlo.broadcast_add %arg0, %arg1 {broadcast_dimensions = array<i64: 1, 2>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32>
-  func.return %0 : tensor<1x4xf32>
-}
-
-// -----
-// Verifies that invalid broadcast dimensions are rejected.
-func.func @dynamicNonScalarBroadcastDimensionsMismatch(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> {
-  // expected-warning @+2 {{unsupported non prefix-padded dynamic rank broadcast_dimensions}}
-  // expected-error @+1 {{failed to legalize operation}}
-  %0 = chlo.broadcast_add %arg0, %arg1 {broadcast_dimensions = array<i64: 2>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32>
-  func.return %0 : tensor<1x4xf32>
-}
-
-// -----
-// Note that broadcast_add is used as a proxy for all of the template
-// expansions. Tests below merely verify that the op has an expansion.
-// CHECK-LABEL: @andWithoutBroadcast
-func.func @andWithoutBroadcast(%arg0: tensor<4xi1>, %arg1: tensor<4xi1>) -> tensor<4xi1> {
-  // CHECK: mhlo.and %arg0, %arg1
-  %0 = chlo.broadcast_and %arg0, %arg1 : (tensor<4xi1>, tensor<4xi1>) -> tensor<4xi1>
-  func.return %0 : tensor<4xi1>
-}
-
-// -----
-// CHECK-LABEL: @atan2WithoutBroadcast
-func.func @atan2WithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-  // CHECK: mhlo.atan2 %arg0, %arg1
-  %0 = chlo.broadcast_atan2 %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-  func.return %0 : tensor<4xf32>
-}
-
-// -----
-// CHECK-LABEL: @compareWithoutBroadcast
-func.func @compareWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xi1> {
-  // CHECK: mhlo.compare EQ, %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1>
-  %0 = chlo.broadcast_compare %arg0, %arg1 {comparison_direction = #chlo<comparison_direction EQ>} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1>
-  func.return %0 : tensor<4xi1>
-}
-
-// -----
-// CHECK-LABEL: @complexWithoutBroadcast
-func.func @complexWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xcomplex<f32>> {
-  // CHECK: mhlo.complex %arg0, %arg1 : tensor<4xcomplex<f32>>
-  %0 = chlo.broadcast_complex %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xcomplex<f32>>
-  func.return %0 : tensor<4xcomplex<f32>>
-}
-
-// -----
-// CHECK-LABEL: @divideWithoutBroadcast
-func.func @divideWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-  // CHECK: mhlo.divide %arg0, %arg1
-  %0 = chlo.broadcast_divide %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-  func.return %0 : tensor<4xf32>
-}
-
-// -----
-// CHECK-LABEL: @maximumWithoutBroadcast
-func.func @maximumWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-  // CHECK: mhlo.maximum %arg0, %arg1
-  %0 = chlo.broadcast_maximum %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-  func.return %0 : tensor<4xf32>
-}
-
-// -----
-// CHECK-LABEL: @minimumWithoutBroadcast
-func.func @minimumWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-  // CHECK: mhlo.minimum %arg0, %arg1
-  %0 = chlo.broadcast_minimum %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-  func.return %0 : tensor<4xf32>
-}
-
-// -----
-// CHECK-LABEL: @multiplyWithoutBroadcast
-func.func @multiplyWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-  // CHECK: mhlo.multiply %arg0, %arg1
-  %0 = chlo.broadcast_multiply %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-  func.return %0 : tensor<4xf32>
-}
-
-// -----
-// CHECK-LABEL: @orWithoutBroadcast
-func.func @orWithoutBroadcast(%arg0: tensor<4xi1>, %arg1: tensor<4xi1>) -> tensor<4xi1> {
-  // CHECK: mhlo.or %arg0, %arg1
-  %0 = chlo.broadcast_or %arg0, %arg1 : (tensor<4xi1>, tensor<4xi1>) -> tensor<4xi1>
-  func.return %0 : tensor<4xi1>
-}
-
-// -----
-// CHECK-LABEL: @powerWithoutBroadcast
-func.func @powerWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-  // CHECK: mhlo.power %arg0, %arg1
-  %0 = chlo.broadcast_power %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-  func.return %0 : tensor<4xf32>
-}
-
-// -----
-// CHECK-LABEL: @remainderWithoutBroadcast
-func.func @remainderWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-  // CHECK: mhlo.remainder %arg0, %arg1
-  %0 = chlo.broadcast_remainder %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-  func.return %0 : tensor<4xf32>
-}
-
-// -----
-// CHECK-LABEL: @shift_leftWithoutBroadcast
-func.func @shift_leftWithoutBroadcast(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> {
-  // CHECK: mhlo.shift_left %arg0, %arg1
-  %0 = chlo.broadcast_shift_left %arg0, %arg1 : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32>
-  func.return %0 : tensor<4xi32>
-}
-
-// -----
-// CHECK-LABEL: @shift_right_arithmeticWithoutBroadcast
-func.func @shift_right_arithmeticWithoutBroadcast(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> {
-  // CHECK: mhlo.shift_right_arithmetic %arg0, %arg1
-  %0 = chlo.broadcast_shift_right_arithmetic %arg0, %arg1 : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32>
-  func.return %0 : tensor<4xi32>
-}
-
-// -----
-// CHECK-LABEL: @shift_right_logicalWithoutBroadcast
-func.func @shift_right_logicalWithoutBroadcast(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> {
-  // CHECK: mhlo.shift_right_logical %arg0, %arg1
-  %0 = chlo.broadcast_shift_right_logical %arg0, %arg1 : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32>
-  func.return %0 : tensor<4xi32>
-}
-
-// -----
-// CHECK-LABEL: @subWithoutBroadcast
-func.func @subWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-  // CHECK: mhlo.subtract %arg0, %arg1
-  %0 = chlo.broadcast_subtract %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-  func.return %0 : tensor<4xf32>
-}
-
-// -----
-// CHECK-LABEL: @xorWithoutBroadcast
-func.func @xorWithoutBroadcast(%arg0: tensor<4xi1>, %arg1: tensor<4xi1>) -> tensor<4xi1> {
-  // CHECK: mhlo.xor %arg0, %arg1
-  %0 = chlo.broadcast_xor %arg0, %arg1 : (tensor<4xi1>, tensor<4xi1>) -> tensor<4xi1>
-  func.return %0 : tensor<4xi1>
-}
-
-// -----
-// CHECK-LABEL: @NextAfterWithoutBroadcast
-// CHECK-SAME: (%[[LHS:.*]]: tensor<4xf32>, %[[RHS:.*]]: tensor<4xf32>)
-func.func @NextAfterWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>)
-    -> tensor<4xf32> {
-  // CHECK: chlo.next_after %[[LHS]], %[[RHS]]
-  %0 = chlo.broadcast_next_after %arg0, %arg1
-      : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-  func.return %0 : tensor<4xf32>
-}
-
-// -----
-// CHECK-LABEL: @PolygammaWithoutBroadcast
-// CHECK-SAME: (%[[LHS:.*]]: tensor<4xf32>, %[[RHS:.*]]: tensor<4xf32>)
-func.func @PolygammaWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>)
-    -> tensor<4xf32> {
-  // CHECK: chlo.polygamma %[[LHS]], %[[RHS]]
-  %0 = chlo.broadcast_polygamma %arg0, %arg1
-      : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-  func.return %0 : tensor<4xf32>
-}
-
-// -----
-// CHECK-LABEL: @ZetaWithoutBroadcast
-func.func @ZetaWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>)
-    -> tensor<4xf32> {
-  // CHECK: chlo.zeta %arg0, %arg1
-  %0 = chlo.broadcast_zeta %arg0, %arg1
-      : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-  func.return %0 : tensor<4xf32>
-}
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/chlo/chlo_legalize_to_hlo_no_broadcasts.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/chlo/chlo_legalize_to_hlo_no_broadcasts.mlir
deleted file mode 100644
index 2a22006f2c9834..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/chlo/chlo_legalize_to_hlo_no_broadcasts.mlir
+++ /dev/null
@@ -1,11 +0,0 @@
-// RUN: mlir-hlo-opt -chlo-legalize-to-hlo="legalize-broadcasts=false" %s | FileCheck %s
-
-// CHECK-LABEL: atan_static
-// CHECK-SAME: %[[ARG:.*]]: tensor<2x3x4xf32>
-func.func @atan_static(%arg0: tensor<2x3x4xf32>) -> tuple<tensor<2x3x4xf32>> {
-  // CHECK: %[[CST:.*]] = mhlo.constant dense<1.000000e+00> : tensor<2x3x4xf32>
-  // CHECK: mhlo.atan2 %[[ARG]], %[[CST]] : tensor<2x3x4xf32>
-  %0 = chlo.atan %arg0 : tensor<2x3x4xf32> -> tensor<2x3x4xf32>
-  %1 = "mhlo.tuple"(%0) : (tensor<2x3x4xf32>) -> tuple<tensor<2x3x4xf32>>
-  func.return %1 : tuple<tensor<2x3x4xf32>>
-}
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/chlo/chlo_legalize_to_mhlo.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/chlo/chlo_legalize_to_mhlo.mlir
index 44b10a5ebddaa9..27a2d6567f50d7 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/chlo/chlo_legalize_to_mhlo.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/chlo/chlo_legalize_to_mhlo.mlir
@@ -1,4 +1,5 @@
 // RUN: mlir-hlo-opt --chlo-legalize-to-hlo --split-input-file -verify-diagnostics %s | FileCheck %s --dump-input-context=20
+// RUN: mlir-hlo-opt --chlo-legalize-to-high-level-mhlo --split-input-file -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-HIGH-LEVEL
 
 // CHECK-LABEL: func.func @asin_bf16(
 // CHECK-SAME:    %[[TMP_arg0:.*]]: tensor<bf16>
@@ -262,6 +263,7 @@ func.func @conj(%arg0: tensor<3xcomplex<f32>>) -> tensor<3xcomplex<f32>> {
 // CHECK-LABEL: @erf_f64
 // CHECK-SAME: %[[ARG:.*]]: tensor<f64>
 func.func @erf_f64(%arg : tensor<f64>) -> tensor<f64> {
+  // CHECK-HIGH-LEVEL: mhlo.erf
   // CHECK: %[[RESULT:.*]] = mhlo.erf %[[ARG]]
   // CHECK: return %[[RESULT]]
   %1 = "chlo.erf"(%arg) : (tensor<f64>) -> tensor<f64>
@@ -273,6 +275,7 @@ func.func @erf_f64(%arg : tensor<f64>) -> tensor<f64> {
 // CHECK-LABEL: @erf_f32
 // CHECK-SAME: %[[ARG:.*]]: tensor<f32>
 func.func @erf_f32(%arg : tensor<f32>) -> tensor<f32> {
+  // CHECK-HIGH-LEVEL: mhlo.erf
   // CHECK: %[[RESULT:.*]] = mhlo.erf %[[ARG]]
   // CHECK: return %[[RESULT]]
   %1 = "chlo.erf"(%arg) : (tensor<f32>) -> tensor<f32>
@@ -284,6 +287,7 @@ func.func @erf_f32(%arg : tensor<f32>) -> tensor<f32> {
 // CHECK-LABEL: @erf_f16
 // CHECK-SAME: %[[ARG:.*]]: tensor<f16>
 func.func @erf_f16(%arg : tensor<f16>) -> tensor<f16> {
+  // CHECK-HIGH-LEVEL: mhlo.erf
   // CHECK: %[[RESULT:.*]] = mhlo.erf %[[ARG]]
   // CHECK: return %[[RESULT]]
   %1 = "chlo.erf"(%arg) : (tensor<f16>) -> tensor<f16>
@@ -295,6 +299,7 @@ func.func @erf_f16(%arg : tensor<f16>) -> tensor<f16> {
 // CHECK-LABEL: @erf_bf16
 // CHECK-SAME: %[[ARG:.*]]: tensor<bf16>
 func.func @erf_bf16(%arg : tensor<bf16>) -> tensor<bf16> {
+  // CHECK-HIGH-LEVEL: mhlo.erf
   // CHECK: %[[RESULT:.*]] = mhlo.erf %[[ARG]]
   // CHECK: return %[[RESULT]]
   %1 = "chlo.erf"(%arg) : (tensor<bf16>) -> tensor<bf16>
@@ -1104,153 +1109,153 @@ func.func @digamma_f16(%arg : tensor<f16>) -> tensor<f16> {
 func.func @zeta_f16(%arg0: tensor<f16>, %arg1: tensor<f16>) -> tensor<f16> {
   // CHECK: %[[TMP_0:.*]] = mhlo.convert %[[X]] : (tensor<f16>) -> tensor<f32>
   // CHECK: %[[TMP_1:.*]] = mhlo.convert %[[Q]] : (tensor<f16>) -> tensor<f32>
-  // CHECK: %[[TMP_2:.*]] = mhlo.constant dense<0.000000e+00>
-  // CHECK: %[[TMP_3:.*]] = mhlo.negate %[[TMP_0]]
-  // CHECK: %[[TMP_4:.*]] = mhlo.power %[[TMP_1]], %[[TMP_3]]
-  // CHECK: %[[TMP_5:.*]] = mhlo.constant dense<1.000000e+00>
-  // CHECK: %[[TMP_6:.*]] = mhlo.add %[[TMP_1]], %[[TMP_5]]
-  // CHECK: %[[TMP_7:.*]] = mhlo.power %[[TMP_6]], %[[TMP_3]]
-  // CHECK: %[[TMP_8:.*]] = mhlo.add %[[TMP_4]], %[[TMP_7]]
-  // CHECK: %[[TMP_9:.*]] = mhlo.add %[[TMP_6]], %[[TMP_5]]
-  // CHECK: %[[TMP_10:.*]] = mhlo.power %[[TMP_9]], %[[TMP_3]]
+  // CHECK-DAG: %[[TMP_2:.*]] = mhlo.constant dense<0.000000e+00>
+  // CHECK-DAG: %[[TMP_3:.*]] = mhlo.constant dense<1.000000e+00>
+  // CHECK: %[[TMP_4:.*]] = mhlo.negate %[[TMP_0]]
+  // CHECK: %[[TMP_5:.*]] = mhlo.power %[[TMP_1]], %[[TMP_4]]
+  // CHECK: %[[TMP_6:.*]] = mhlo.add %[[TMP_1]], %[[TMP_3]]
+  // CHECK: %[[TMP_7:.*]] = mhlo.power %[[TMP_6]], %[[TMP_4]]
+  // CHECK: %[[TMP_8:.*]] = mhlo.add %[[TMP_5]], %[[TMP_7]]
+  // CHECK: %[[TMP_9:.*]] = mhlo.add %[[TMP_6]], %[[TMP_3]]
+  // CHECK: %[[TMP_10:.*]] = mhlo.power %[[TMP_9]], %[[TMP_4]]
   // CHECK: %[[TMP_11:.*]] = mhlo.add %[[TMP_8]], %[[TMP_10]]
-  // CHECK: %[[TMP_12:.*]] = mhlo.add %[[TMP_9]], %[[TMP_5]]
-  // CHECK: %[[TMP_13:.*]] = mhlo.power %[[TMP_12]], %[[TMP_3]]
+  // CHECK: %[[TMP_12:.*]] = mhlo.add %[[TMP_9]], %[[TMP_3]]
+  // CHECK: %[[TMP_13:.*]] = mhlo.power %[[TMP_12]], %[[TMP_4]]
   // CHECK: %[[TMP_14:.*]] = mhlo.add %[[TMP_11]], %[[TMP_13]]
-  // CHECK: %[[TMP_15:.*]] = mhlo.add %[[TMP_12]], %[[TMP_5]]
-  // CHECK: %[[TMP_16:.*]] = mhlo.power %[[TMP_15]], %[[TMP_3]]
+  // CHECK: %[[TMP_15:.*]] = mhlo.add %[[TMP_12]], %[[TMP_3]]
+  // CHECK: %[[TMP_16:.*]] = mhlo.power %[[TMP_15]], %[[TMP_4]]
   // CHECK: %[[TMP_17:.*]] = mhlo.add %[[TMP_14]], %[[TMP_16]]
-  // CHECK: %[[TMP_18:.*]] = mhlo.add %[[TMP_15]], %[[TMP_5]]
-  // CHECK: %[[TMP_19:.*]] = mhlo.power %[[TMP_18]], %[[TMP_3]]
+  // CHECK: %[[TMP_18:.*]] = mhlo.add %[[TMP_15]], %[[TMP_3]]
+  // CHECK: %[[TMP_19:.*]] = mhlo.power %[[TMP_18]], %[[TMP_4]]
   // CHECK: %[[TMP_20:.*]] = mhlo.add %[[TMP_17]], %[[TMP_19]]
-  // CHECK: %[[TMP_21:.*]] = mhlo.add %[[TMP_18]], %[[TMP_5]]
-  // CHECK: %[[TMP_22:.*]] = mhlo.power %[[TMP_21]], %[[TMP_3]]
+  // CHECK: %[[TMP_21:.*]] = mhlo.add %[[TMP_18]], %[[TMP_3]]
+  // CHECK: %[[TMP_22:.*]] = mhlo.power %[[TMP_21]], %[[TMP_4]]
   // CHECK: %[[TMP_23:.*]] = mhlo.add %[[TMP_20]], %[[TMP_22]]
-  // CHECK: %[[TMP_24:.*]] = mhlo.add %[[TMP_21]], %[[TMP_5]]
-  // CHECK: %[[TMP_25:.*]] = mhlo.power %[[TMP_24]], %[[TMP_3]]
+  // CHECK: %[[TMP_24:.*]] = mhlo.add %[[TMP_21]], %[[TMP_3]]
+  // CHECK: %[[TMP_25:.*]] = mhlo.power %[[TMP_24]], %[[TMP_4]]
   // CHECK: %[[TMP_26:.*]] = mhlo.add %[[TMP_23]], %[[TMP_25]]
-  // CHECK: %[[TMP_27:.*]] = mhlo.add %[[TMP_24]], %[[TMP_5]]
-  // CHECK: %[[TMP_28:.*]] = mhlo.power %[[TMP_27]], %[[TMP_3]]
+  // CHECK: %[[TMP_27:.*]] = mhlo.add %[[TMP_24]], %[[TMP_3]]
+  // CHECK: %[[TMP_28:.*]] = mhlo.power %[[TMP_27]], %[[TMP_4]]
   // CHECK: %[[TMP_29:.*]] = mhlo.add %[[TMP_26]], %[[TMP_28]]
-  // CHECK: %[[TMP_30:.*]] = mhlo.add %[[TMP_27]], %[[TMP_5]]
-  // CHECK: %[[TMP_31:.*]] = mhlo.power %[[TMP_30]], %[[TMP_3]]
+  // CHECK: %[[TMP_30:.*]] = mhlo.add %[[TMP_27]], %[[TMP_3]]
+  // CHECK: %[[TMP_31:.*]] = mhlo.power %[[TMP_30]], %[[TMP_4]]
   // CHECK: %[[TMP_32:.*]] = mhlo.add %[[TMP_29]], %[[TMP_31]]
-  // CHECK: %[[TMP_33:.*]] = mhlo.add %[[TMP_30]], %[[TMP_5]]
-  // CHECK: %[[TMP_34:.*]] = mhlo.power %[[TMP_33]], %[[TMP_3]]
+  // CHECK: %[[TMP_33:.*]] = mhlo.add %[[TMP_30]], %[[TMP_3]]
+  // CHECK: %[[TMP_34:.*]] = mhlo.power %[[TMP_33]], %[[TMP_4]]
   // CHECK: %[[TMP_35:.*]] = mhlo.constant dense<1.000000e+00>
-  // CHECK: %[[TMP_36:.*]] = mhlo.subtract %[[TMP_0]], %[[TMP_35]]
-  // CHECK: %[[TMP_37:.*]] = mhlo.multiply %[[TMP_34]], %[[TMP_33]]
-  // CHECK: %[[TMP_38:.*]] = mhlo.divide %[[TMP_37]], %[[TMP_36]]
-  // CHECK: %[[TMP_39:.*]] = mhlo.add %[[TMP_32]], %[[TMP_38]]
-  // CHECK: %[[TMP_40:.*]] = mhlo.multiply %[[TMP_33]], %[[TMP_33]]
-  // CHECK: %[[TMP_41:.*]] = mhlo.divide %[[TMP_5]], %[[TMP_40]]
-  // CHECK: %[[TMP_42:.*]] = mhlo.constant dense<2.200000e+01>
-  // CHECK: %[[TMP_43:.*]] = mhlo.add %[[TMP_0]], %[[TMP_42]]
-  // CHECK: %[[TMP_44:.*]] = mhlo.constant dense<2.100000e+01>
-  // CHECK: %[[TMP_45:.*]] = mhlo.add %[[TMP_0]], %[[TMP_44]]
-  // CHECK: %[[TMP_46:.*]] = mhlo.multiply %[[TMP_43]], %[[TMP_45]]
-  // CHECK: %[[TMP_47:.*]] = mhlo.constant dense<-1.39544646E-19>
-  // CHECK: %[[TMP_48:.*]] = mhlo.add %[[TMP_2]], %[[TMP_47]]
-  // CHECK: %[[TMP_49:.*]] = mhlo.multiply %[[TMP_41]], %[[TMP_48]]
-  // CHECK: %[[TMP_50:.*]] = mhlo.multiply %[[TMP_46]], %[[TMP_49]]
-  // CHECK: %[[TMP_51:.*]] = mhlo.constant dense<2.000000e+01>
-  // CHECK: %[[TMP_52:.*]] = mhlo.add %[[TMP_0]], %[[TMP_51]]
-  // CHECK: %[[TMP_53:.*]] = mhlo.constant dense<1.900000e+01>
-  // CHECK: %[[TMP_54:.*]] = mhlo.add %[[TMP_0]], %[[TMP_53]]
-  // CHECK: %[[TMP_55:.*]] = mhlo.multiply %[[TMP_52]], %[[TMP_54]]
-  // CHECK: %[[TMP_56:.*]] = mhlo.constant dense<5.50900303E-18>
-  // CHECK: %[[TMP_57:.*]] = mhlo.add %[[TMP_50]], %[[TMP_56]]
-  // CHECK: %[[TMP_58:.*]] = mhlo.multiply %[[TMP_41]], %[[TMP_57]]
-  // CHECK: %[[TMP_59:.*]] = mhlo.multiply %[[TMP_55]], %[[TMP_58]]
-  // CHECK: %[[TMP_60:.*]] = mhlo.constant dense<1.800000e+01>
-  // CHECK: %[[TMP_61:.*]] = mhlo.add %[[TMP_0]], %[[TMP_60]]
-  // CHECK: %[[TMP_62:.*]] = mhlo.constant dense<1.700000e+01>
-  // CHECK: %[[TMP_63:.*]] = mhlo.add %[[TMP_0]], %[[TMP_62]]
-  // CHECK: %[[TMP_64:.*]] = mhlo.multiply %[[TMP_61]], %[[TMP_63]]
-  // CHECK: %[[TMP_65:.*]] = mhlo.constant dense<-2.17486866E-16>
-  // CHECK: %[[TMP_66:.*]] = mhlo.add %[[TMP_59]], %[[TMP_65]]
-  // CHECK: %[[TMP_67:.*]] = mhlo.multiply %[[TMP_41]], %[[TMP_66]]
-  // CHECK: %[[TMP_68:.*]] = mhlo.multiply %[[TMP_64]], %[[TMP_67]]
-  // CHECK: %[[TMP_69:.*]] = mhlo.constant dense<1.600000e+01>
-  // CHECK: %[[TMP_70:.*]] = mhlo.add %[[TMP_0]], %[[TMP_69]]
-  // CHECK: %[[TMP_71:.*]] = mhlo.constant dense<1.500000e+01>
-  // CHECK: %[[TMP_72:.*]] = mhlo.add %[[TMP_0]], %[[TMP_71]]
-  // CHECK: %[[TMP_73:.*]] = mhlo.multiply %[[TMP_70]], %[[TMP_72]]
-  // CHECK: %[[TMP_74:.*]] = mhlo.constant dense<8.58606213E-15>
-  // CHECK: %[[TMP_75:.*]] = mhlo.add %[[TMP_68]], %[[TMP_74]]
-  // CHECK: %[[TMP_76:.*]] = mhlo.multiply %[[TMP_41]], %[[TMP_75]]
-  // CHECK: %[[TMP_77:.*]] = mhlo.multiply %[[TMP_73]], %[[TMP_76]]
-  // CHECK: %[[TMP_78:.*]] = mhlo.constant dense<1.400000e+01>
-  // CHECK: %[[TMP_79:.*]] = mhlo.add %[[TMP_0]], %[[TMP_78]]
-  // CHECK: %[[TMP_80:.*]] = mhlo.constant dense<1.300000e+01>
-  // CHECK: %[[TMP_81:.*]] = mhlo.add %[[TMP_0]], %[[TMP_80]]
-  // CHECK: %[[TMP_82:.*]] = mhlo.multiply %[[TMP_79]], %[[TMP_81]]
-  // CHECK: %[[TMP_83:.*]] = mhlo.constant dense<-3.3896803E-13>
-  // CHECK: %[[TMP_84:.*]] = mhlo.add %[[TMP_77]], %[[TMP_83]]
-  // CHECK: %[[TMP_85:.*]] = mhlo.multiply %[[TMP_41]], %[[TMP_84]]
-  // CHECK: %[[TMP_86:.*]] = mhlo.multiply %[[TMP_82]], %[[TMP_85]]
-  // CHECK: %[[TMP_87:.*]] = mhlo.constant dense<1.200000e+01>
-  // CHECK: %[[TMP_88:.*]] = mhlo.add %[[TMP_0]], %[[TMP_87]]
-  // CHECK: %[[TMP_89:.*]] = mhlo.constant dense<1.100000e+01>
-  // CHECK: %[[TMP_90:.*]] = mhlo.add %[[TMP_0]], %[[TMP_89]]
-  // CHECK: %[[TMP_91:.*]] = mhlo.multiply %[[TMP_88]], %[[TMP_90]]
-  // CHECK: %[[TMP_92:.*]] = mhlo.constant dense<1.33825364E-11>
-  // CHECK: %[[TMP_93:.*]] = mhlo.add %[[TMP_86]], %[[TMP_92]]
-  // CHECK: %[[TMP_94:.*]] = mhlo.multiply %[[TMP_41]], %[[TMP_93]]
-  // CHECK: %[[TMP_95:.*]] = mhlo.multiply %[[TMP_91]], %[[TMP_94]]
-  // CHECK: %[[TMP_96:.*]] = mhlo.constant dense<1.000000e+01>
-  // CHECK: %[[TMP_97:.*]] = mhlo.add %[[TMP_0]], %[[TMP_96]]
-  // CHECK: %[[TMP_98:.*]] = mhlo.constant dense<9.000000e+00>
-  // CHECK: %[[TMP_99:.*]] = mhlo.add %[[TMP_0]], %[[TMP_98]]
-  // CHECK: %[[TMP_100:.*]] = mhlo.multiply %[[TMP_97]], %[[TMP_99]]
-  // CHECK: %[[TMP_101:.*]] = mhlo.constant dense<-5.28419031E-10>
-  // CHECK: %[[TMP_102:.*]] = mhlo.add %[[TMP_95]], %[[TMP_101]]
-  // CHECK: %[[TMP_103:.*]] = mhlo.multiply %[[TMP_41]], %[[TMP_102]]
-  // CHECK: %[[TMP_104:.*]] = mhlo.multiply %[[TMP_100]], %[[TMP_103]]
-  // CHECK: %[[TMP_105:.*]] = mhlo.constant dense<8.000000e+00>
-  // CHECK: %[[TMP_106:.*]] = mhlo.add %[[TMP_0]], %[[TMP_105]]
-  // CHECK: %[[TMP_107:.*]] = mhlo.constant dense<7.000000e+00>
-  // CHECK: %[[TMP_108:.*]] = mhlo.add %[[TMP_0]], %[[TMP_107]]
-  // CHECK: %[[TMP_109:.*]] = mhlo.multiply %[[TMP_106]], %[[TMP_108]]
-  // CHECK: %[[TMP_110:.*]] = mhlo.constant dense<2.08767563E-8>
-  // CHECK: %[[TMP_111:.*]] = mhlo.add %[[TMP_104]], %[[TMP_110]]
-  // CHECK: %[[TMP_112:.*]] = mhlo.multiply %[[TMP_41]], %[[TMP_111]]
-  // CHECK: %[[TMP_113:.*]] = mhlo.multiply %[[TMP_109]], %[[TMP_112]]
-  // CHECK: %[[TMP_114:.*]] = mhlo.constant dense<6.000000e+00>
-  // CHECK: %[[TMP_115:.*]] = mhlo.add %[[TMP_0]], %[[TMP_114]]
-  // CHECK: %[[TMP_116:.*]] = mhlo.constant dense<5.000000e+00>
-  // CHECK: %[[TMP_117:.*]] = mhlo.add %[[TMP_0]], %[[TMP_116]]
-  // CHECK: %[[TMP_118:.*]] = mhlo.multiply %[[TMP_115]], %[[TMP_117]]
-  // CHECK: %[[TMP_119:.*]] = mhlo.constant dense<-8.26719599E-7>
-  // CHECK: %[[TMP_120:.*]] = mhlo.add %[[TMP_113]], %[[TMP_119]]
-  // CHECK: %[[TMP_121:.*]] = mhlo.multiply %[[TMP_41]], %[[TMP_120]]
-  // CHECK: %[[TMP_122:.*]] = mhlo.multiply %[[TMP_118]], %[[TMP_121]]
-  // CHECK: %[[TMP_123:.*]] = mhlo.constant dense<4.000000e+00>
-  // CHECK: %[[TMP_124:.*]] = mhlo.add %[[TMP_0]], %[[TMP_123]]
-  // CHECK: %[[TMP_125:.*]] = mhlo.constant dense<3.000000e+00>
-  // CHECK: %[[TMP_126:.*]] = mhlo.add %[[TMP_0]], %[[TMP_125]]
-  // CHECK: %[[TMP_127:.*]] = mhlo.multiply %[[TMP_124]], %[[TMP_126]]
-  // CHECK: %[[TMP_128:.*]] = mhlo.constant dense<3.30687835E-5>
-  // CHECK: %[[TMP_129:.*]] = mhlo.add %[[TMP_122]], %[[TMP_128]]
-  // CHECK: %[[TMP_130:.*]] = mhlo.multiply %[[TMP_41]], %[[TMP_129]]
-  // CHECK: %[[TMP_131:.*]] = mhlo.multiply %[[TMP_127]], %[[TMP_130]]
-  // CHECK: %[[TMP_132:.*]] = mhlo.constant dense<2.000000e+00>
-  // CHECK: %[[TMP_133:.*]] = mhlo.add %[[TMP_0]], %[[TMP_132]]
-  // CHECK: %[[TMP_134:.*]] = mhlo.constant dense<1.000000e+00>
-  // CHECK: %[[TMP_135:.*]] = mhlo.add %[[TMP_0]], %[[TMP_134]]
-  // CHECK: %[[TMP_136:.*]] = mhlo.multiply %[[TMP_133]], %[[TMP_135]]
-  // CHECK: %[[TMP_137:.*]] = mhlo.constant dense<-0.00138888892>
-  // CHECK: %[[TMP_138:.*]] = mhlo.add %[[TMP_131]], %[[TMP_137]]
-  // CHECK: %[[TMP_139:.*]] = mhlo.multiply %[[TMP_41]], %[[TMP_138]]
-  // CHECK: %[[TMP_140:.*]] = mhlo.multiply %[[TMP_136]], %[[TMP_139]]
-  // CHECK: %[[TMP_141:.*]] = mhlo.constant dense<5.000000e-01>
-  // CHECK: %[[TMP_142:.*]] = mhlo.divide %[[TMP_0]], %[[TMP_33]]
-  // CHECK: %[[TMP_143:.*]] = mhlo.constant dense<0.0833333358>
-  // CHECK: %[[TMP_144:.*]] = mhlo.add %[[TMP_143]], %[[TMP_140]]
-  // CHECK: %[[TMP_145:.*]] = mhlo.multiply %[[TMP_142]], %[[TMP_144]]
-  // CHECK: %[[TMP_146:.*]] = mhlo.add %[[TMP_141]], %[[TMP_145]]
-  // CHECK: %[[TMP_147:.*]] = mhlo.multiply %[[TMP_34]], %[[TMP_146]]
-  // CHECK: %[[TMP_148:.*]] = mhlo.add %[[TMP_39]], %[[TMP_147]]
+  // CHECK: %[[TMP_36:.*]] = mhlo.multiply %[[TMP_34]], %[[TMP_33]]
+  // CHECK: %[[TMP_37:.*]] = mhlo.subtract %[[TMP_0]], %[[TMP_35]]
+  // CHECK: %[[TMP_38:.*]] = mhlo.divide %[[TMP_36]], %[[TMP_37]]
+  // CHECK: %[[TMP_39:.*]] = mhlo.multiply %[[TMP_33]], %[[TMP_33]]
+  // CHECK: %[[TMP_40:.*]] = mhlo.divide %[[TMP_3]], %[[TMP_39]]
+  // CHECK: %[[TMP_41:.*]] = mhlo.constant dense<2.200000e+01>
+  // CHECK: %[[TMP_42:.*]] = mhlo.add %[[TMP_0]], %[[TMP_41]]
+  // CHECK: %[[TMP_43:.*]] = mhlo.constant dense<2.100000e+01>
+  // CHECK: %[[TMP_44:.*]] = mhlo.add %[[TMP_0]], %[[TMP_43]]
+  // CHECK: %[[TMP_45:.*]] = mhlo.multiply %[[TMP_42]], %[[TMP_44]]
+  // CHECK: %[[TMP_46:.*]] = mhlo.constant dense<-1.39544646E-19>
+  // CHECK: %[[TMP_47:.*]] = mhlo.add %[[TMP_2]], %[[TMP_46]]
+  // CHECK: %[[TMP_48:.*]] = mhlo.multiply %[[TMP_40]], %[[TMP_47]]
+  // CHECK: %[[TMP_49:.*]] = mhlo.multiply %[[TMP_45]], %[[TMP_48]]
+  // CHECK: %[[TMP_50:.*]] = mhlo.constant dense<2.000000e+01>
+  // CHECK: %[[TMP_51:.*]] = mhlo.add %[[TMP_0]], %[[TMP_50]]
+  // CHECK: %[[TMP_52:.*]] = mhlo.constant dense<1.900000e+01>
+  // CHECK: %[[TMP_53:.*]] = mhlo.add %[[TMP_0]], %[[TMP_52]]
+  // CHECK: %[[TMP_54:.*]] = mhlo.multiply %[[TMP_51]], %[[TMP_53]]
+  // CHECK: %[[TMP_55:.*]] = mhlo.constant dense<5.50900303E-18>
+  // CHECK: %[[TMP_56:.*]] = mhlo.add %[[TMP_49]], %[[TMP_55]]
+  // CHECK: %[[TMP_57:.*]] = mhlo.multiply %[[TMP_40]], %[[TMP_56]]
+  // CHECK: %[[TMP_58:.*]] = mhlo.multiply %[[TMP_54]], %[[TMP_57]]
+  // CHECK: %[[TMP_59:.*]] = mhlo.constant dense<1.800000e+01>
+  // CHECK: %[[TMP_60:.*]] = mhlo.add %[[TMP_0]], %[[TMP_59]]
+  // CHECK: %[[TMP_61:.*]] = mhlo.constant dense<1.700000e+01>
+  // CHECK: %[[TMP_62:.*]] = mhlo.add %[[TMP_0]], %[[TMP_61]]
+  // CHECK: %[[TMP_63:.*]] = mhlo.multiply %[[TMP_60]], %[[TMP_62]]
+  // CHECK: %[[TMP_64:.*]] = mhlo.constant dense<-2.17486866E-16>
+  // CHECK: %[[TMP_65:.*]] = mhlo.add %[[TMP_58]], %[[TMP_64]]
+  // CHECK: %[[TMP_66:.*]] = mhlo.multiply %[[TMP_40]], %[[TMP_65]]
+  // CHECK: %[[TMP_67:.*]] = mhlo.multiply %[[TMP_63]], %[[TMP_66]]
+  // CHECK: %[[TMP_68:.*]] = mhlo.constant dense<1.600000e+01>
+  // CHECK: %[[TMP_69:.*]] = mhlo.add %[[TMP_0]], %[[TMP_68]]
+  // CHECK: %[[TMP_70:.*]] = mhlo.constant dense<1.500000e+01>
+  // CHECK: %[[TMP_71:.*]] = mhlo.add %[[TMP_0]], %[[TMP_70]]
+  // CHECK: %[[TMP_72:.*]] = mhlo.multiply %[[TMP_69]], %[[TMP_71]]
+  // CHECK: %[[TMP_73:.*]] = mhlo.constant dense<8.58606213E-15>
+  // CHECK: %[[TMP_74:.*]] = mhlo.add %[[TMP_67]], %[[TMP_73]]
+  // CHECK: %[[TMP_75:.*]] = mhlo.multiply %[[TMP_40]], %[[TMP_74]]
+  // CHECK: %[[TMP_76:.*]] = mhlo.multiply %[[TMP_72]], %[[TMP_75]]
+  // CHECK: %[[TMP_77:.*]] = mhlo.constant dense<1.400000e+01>
+  // CHECK: %[[TMP_78:.*]] = mhlo.add %[[TMP_0]], %[[TMP_77]]
+  // CHECK: %[[TMP_79:.*]] = mhlo.constant dense<1.300000e+01>
+  // CHECK: %[[TMP_80:.*]] = mhlo.add %[[TMP_0]], %[[TMP_79]]
+  // CHECK: %[[TMP_81:.*]] = mhlo.multiply %[[TMP_78]], %[[TMP_80]]
+  // CHECK: %[[TMP_82:.*]] = mhlo.constant dense<-3.3896803E-13>
+  // CHECK: %[[TMP_83:.*]] = mhlo.add %[[TMP_76]], %[[TMP_82]]
+  // CHECK: %[[TMP_84:.*]] = mhlo.multiply %[[TMP_40]], %[[TMP_83]]
+  // CHECK: %[[TMP_85:.*]] = mhlo.multiply %[[TMP_81]], %[[TMP_84]]
+  // CHECK: %[[TMP_86:.*]] = mhlo.constant dense<1.200000e+01>
+  // CHECK: %[[TMP_87:.*]] = mhlo.add %[[TMP_0]], %[[TMP_86]]
+  // CHECK: %[[TMP_88:.*]] = mhlo.constant dense<1.100000e+01>
+  // CHECK: %[[TMP_89:.*]] = mhlo.add %[[TMP_0]], %[[TMP_88]]
+  // CHECK: %[[TMP_90:.*]] = mhlo.multiply %[[TMP_87]], %[[TMP_89]]
+  // CHECK: %[[TMP_91:.*]] = mhlo.constant dense<1.33825364E-11>
+  // CHECK: %[[TMP_92:.*]] = mhlo.add %[[TMP_85]], %[[TMP_91]]
+  // CHECK: %[[TMP_93:.*]] = mhlo.multiply %[[TMP_40]], %[[TMP_92]]
+  // CHECK: %[[TMP_94:.*]] = mhlo.multiply %[[TMP_90]], %[[TMP_93]]
+  // CHECK: %[[TMP_95:.*]] = mhlo.constant dense<1.000000e+01>
+  // CHECK: %[[TMP_96:.*]] = mhlo.add %[[TMP_0]], %[[TMP_95]]
+  // CHECK: %[[TMP_97:.*]] = mhlo.constant dense<9.000000e+00>
+  // CHECK: %[[TMP_98:.*]] = mhlo.add %[[TMP_0]], %[[TMP_97]]
+  // CHECK: %[[TMP_99:.*]] = mhlo.multiply %[[TMP_96]], %[[TMP_98]]
+  // CHECK: %[[TMP_100:.*]] = mhlo.constant dense<-5.28419031E-10>
+  // CHECK: %[[TMP_101:.*]] = mhlo.add %[[TMP_94]], %[[TMP_100]]
+  // CHECK: %[[TMP_102:.*]] = mhlo.multiply %[[TMP_40]], %[[TMP_101]]
+  // CHECK: %[[TMP_103:.*]] = mhlo.multiply %[[TMP_99]], %[[TMP_102]]
+  // CHECK: %[[TMP_104:.*]] = mhlo.constant dense<8.000000e+00>
+  // CHECK: %[[TMP_105:.*]] = mhlo.add %[[TMP_0]], %[[TMP_104]]
+  // CHECK: %[[TMP_106:.*]] = mhlo.constant dense<7.000000e+00>
+  // CHECK: %[[TMP_107:.*]] = mhlo.add %[[TMP_0]], %[[TMP_106]]
+  // CHECK: %[[TMP_108:.*]] = mhlo.multiply %[[TMP_105]], %[[TMP_107]]
+  // CHECK: %[[TMP_109:.*]] = mhlo.constant dense<2.08767563E-8>
+  // CHECK: %[[TMP_110:.*]] = mhlo.add %[[TMP_103]], %[[TMP_109]]
+  // CHECK: %[[TMP_111:.*]] = mhlo.multiply %[[TMP_40]], %[[TMP_110]]
+  // CHECK: %[[TMP_112:.*]] = mhlo.multiply %[[TMP_108]], %[[TMP_111]]
+  // CHECK: %[[TMP_113:.*]] = mhlo.constant dense<6.000000e+00>
+  // CHECK: %[[TMP_114:.*]] = mhlo.add %[[TMP_0]], %[[TMP_113]]
+  // CHECK: %[[TMP_115:.*]] = mhlo.constant dense<5.000000e+00>
+  // CHECK: %[[TMP_116:.*]] = mhlo.add %[[TMP_0]], %[[TMP_115]]
+  // CHECK: %[[TMP_117:.*]] = mhlo.multiply %[[TMP_114]], %[[TMP_116]]
+  // CHECK: %[[TMP_118:.*]] = mhlo.constant dense<-8.26719599E-7>
+  // CHECK: %[[TMP_119:.*]] = mhlo.add %[[TMP_112]], %[[TMP_118]]
+  // CHECK: %[[TMP_120:.*]] = mhlo.multiply %[[TMP_40]], %[[TMP_119]]
+  // CHECK: %[[TMP_121:.*]] = mhlo.multiply %[[TMP_117]], %[[TMP_120]]
+  // CHECK: %[[TMP_122:.*]] = mhlo.constant dense<4.000000e+00>
+  // CHECK: %[[TMP_123:.*]] = mhlo.add %[[TMP_0]], %[[TMP_122]]
+  // CHECK: %[[TMP_124:.*]] = mhlo.constant dense<3.000000e+00>
+  // CHECK: %[[TMP_125:.*]] = mhlo.add %[[TMP_0]], %[[TMP_124]]
+  // CHECK: %[[TMP_126:.*]] = mhlo.multiply %[[TMP_123]], %[[TMP_125]]
+  // CHECK: %[[TMP_127:.*]] = mhlo.constant dense<3.30687835E-5>
+  // CHECK: %[[TMP_128:.*]] = mhlo.add %[[TMP_121]], %[[TMP_127]]
+  // CHECK: %[[TMP_129:.*]] = mhlo.multiply %[[TMP_40]], %[[TMP_128]]
+  // CHECK: %[[TMP_130:.*]] = mhlo.multiply %[[TMP_126]], %[[TMP_129]]
+  // CHECK: %[[TMP_131:.*]] = mhlo.constant dense<2.000000e+00>
+  // CHECK: %[[TMP_132:.*]] = mhlo.add %[[TMP_0]], %[[TMP_131]]
+  // CHECK: %[[TMP_133:.*]] = mhlo.constant dense<1.000000e+00>
+  // CHECK: %[[TMP_134:.*]] = mhlo.add %[[TMP_0]], %[[TMP_133]]
+  // CHECK: %[[TMP_135:.*]] = mhlo.multiply %[[TMP_132]], %[[TMP_134]]
+  // CHECK: %[[TMP_136:.*]] = mhlo.constant dense<-0.00138888892>
+  // CHECK: %[[TMP_137:.*]] = mhlo.add %[[TMP_130]], %[[TMP_136]]
+  // CHECK: %[[TMP_138:.*]] = mhlo.multiply %[[TMP_40]], %[[TMP_137]]
+  // CHECK: %[[TMP_139:.*]] = mhlo.multiply %[[TMP_135]], %[[TMP_138]]
+  // CHECK: %[[TMP_140:.*]] = mhlo.constant dense<5.000000e-01>
+  // CHECK: %[[TMP_141:.*]] = mhlo.divide %[[TMP_0]], %[[TMP_33]]
+  // CHECK: %[[TMP_142:.*]] = mhlo.constant dense<0.0833333358>
+  // CHECK: %[[TMP_143:.*]] = mhlo.add %[[TMP_142]], %[[TMP_139]]
+  // CHECK: %[[TMP_144:.*]] = mhlo.multiply %[[TMP_141]], %[[TMP_143]]
+  // CHECK: %[[TMP_145:.*]] = mhlo.add %[[TMP_140]], %[[TMP_144]]
+  // CHECK: %[[TMP_146:.*]] = mhlo.multiply %[[TMP_34]], %[[TMP_145]]
+  // CHECK: %[[TMP_147:.*]] = mhlo.add %[[TMP_32]], %[[TMP_38]]
+  // CHECK: %[[TMP_148:.*]] = mhlo.add %[[TMP_147]], %[[TMP_146]]
   // CHECK: %[[TMP_149:.*]] = mhlo.abs %[[TMP_34]]
   // CHECK: %[[TMP_150:.*]] = mhlo.abs %[[TMP_32]]
   // CHECK: %[[TMP_151:.*]] = mhlo.constant dense<1.401300e-45>
@@ -1277,7 +1282,7 @@ func.func @zeta_f16(%arg0: tensor<f16>, %arg1: tensor<f16>) -> tensor<f16> {
   // CHECK: %[[TMP_172:.*]] = mhlo.and %[[TMP_169]], %[[TMP_171]] : tensor<i1>
   // CHECK: %[[TMP_173:.*]] = mhlo.select %[[TMP_172]], %[[TMP_163]], %[[TMP_155]]
   // CHECK: %[[TMP_174:.*]] = mhlo.select %[[TMP_166]], %[[TMP_173]], %[[TMP_162]]
-  // CHECK: %[[TMP_175:.*]] = mhlo.compare EQ, %[[TMP_0]], %[[TMP_5]], NOTYPE
+  // CHECK: %[[TMP_175:.*]] = mhlo.compare EQ, %[[TMP_0]], %[[TMP_3]], NOTYPE
   // CHECK: %[[TMP_176:.*]] = mhlo.select %[[TMP_175]], %[[TMP_163]], %[[TMP_174]]
   // CHECK: %[[TMP_177:.*]] = mhlo.convert %[[TMP_176]] : (tensor<f32>) -> tensor<f16>
   %0 = chlo.zeta %arg0, %arg1 : tensor<f16>, tensor<f16> -> tensor<f16>
@@ -1379,153 +1384,153 @@ func.func @polygamma_f32(%lhs : tensor<f32>, %rhs : tensor<f32>) -> tensor<f32>
   // CHECK: %[[TMP_87:.*]] = mhlo.constant dense<0x7F800000>
   // CHECK: %[[TMP_88:.*]] = mhlo.select %[[TMP_86]], %[[TMP_87]], %[[TMP_83]]
   // CHECK: %[[TMP_89:.*]] = mhlo.exponential %[[TMP_88]]
-  // CHECK: %[[TMP_90:.*]] = mhlo.constant dense<0.000000e+00>
-  // CHECK: %[[TMP_91:.*]] = mhlo.negate %[[TMP_5]]
-  // CHECK: %[[TMP_92:.*]] = mhlo.power %[[ARG1]], %[[TMP_91]]
-  // CHECK: %[[TMP_93:.*]] = mhlo.constant dense<1.000000e+00>
-  // CHECK: %[[TMP_94:.*]] = mhlo.add %[[ARG1]], %[[TMP_93]]
-  // CHECK: %[[TMP_95:.*]] = mhlo.power %[[TMP_94]], %[[TMP_91]]
-  // CHECK: %[[TMP_96:.*]] = mhlo.add %[[TMP_92]], %[[TMP_95]]
-  // CHECK: %[[TMP_97:.*]] = mhlo.add %[[TMP_94]], %[[TMP_93]]
-  // CHECK: %[[TMP_98:.*]] = mhlo.power %[[TMP_97]], %[[TMP_91]]
+  // CHECK-DAG: %[[TMP_90:.*]] = mhlo.constant dense<0.000000e+00>
+  // CHECK-DAG: %[[TMP_91:.*]] = mhlo.constant dense<1.000000e+00>
+  // CHECK: %[[TMP_92:.*]] = mhlo.negate %[[TMP_5]]
+  // CHECK: %[[TMP_93:.*]] = mhlo.power %[[ARG1]], %[[TMP_92]]
+  // CHECK: %[[TMP_94:.*]] = mhlo.add %[[ARG1]], %[[TMP_91]]
+  // CHECK: %[[TMP_95:.*]] = mhlo.power %[[TMP_94]], %[[TMP_92]]
+  // CHECK: %[[TMP_96:.*]] = mhlo.add %[[TMP_93]], %[[TMP_95]]
+  // CHECK: %[[TMP_97:.*]] = mhlo.add %[[TMP_94]], %[[TMP_91]]
+  // CHECK: %[[TMP_98:.*]] = mhlo.power %[[TMP_97]], %[[TMP_92]]
   // CHECK: %[[TMP_99:.*]] = mhlo.add %[[TMP_96]], %[[TMP_98]]
-  // CHECK: %[[TMP_100:.*]] = mhlo.add %[[TMP_97]], %[[TMP_93]]
-  // CHECK: %[[TMP_101:.*]] = mhlo.power %[[TMP_100]], %[[TMP_91]]
+  // CHECK: %[[TMP_100:.*]] = mhlo.add %[[TMP_97]], %[[TMP_91]]
+  // CHECK: %[[TMP_101:.*]] = mhlo.power %[[TMP_100]], %[[TMP_92]]
   // CHECK: %[[TMP_102:.*]] = mhlo.add %[[TMP_99]], %[[TMP_101]]
-  // CHECK: %[[TMP_103:.*]] = mhlo.add %[[TMP_100]], %[[TMP_93]]
-  // CHECK: %[[TMP_104:.*]] = mhlo.power %[[TMP_103]], %[[TMP_91]]
+  // CHECK: %[[TMP_103:.*]] = mhlo.add %[[TMP_100]], %[[TMP_91]]
+  // CHECK: %[[TMP_104:.*]] = mhlo.power %[[TMP_103]], %[[TMP_92]]
   // CHECK: %[[TMP_105:.*]] = mhlo.add %[[TMP_102]], %[[TMP_104]]
-  // CHECK: %[[TMP_106:.*]] = mhlo.add %[[TMP_103]], %[[TMP_93]]
-  // CHECK: %[[TMP_107:.*]] = mhlo.power %[[TMP_106]], %[[TMP_91]]
+  // CHECK: %[[TMP_106:.*]] = mhlo.add %[[TMP_103]], %[[TMP_91]]
+  // CHECK: %[[TMP_107:.*]] = mhlo.power %[[TMP_106]], %[[TMP_92]]
   // CHECK: %[[TMP_108:.*]] = mhlo.add %[[TMP_105]], %[[TMP_107]]
-  // CHECK: %[[TMP_109:.*]] = mhlo.add %[[TMP_106]], %[[TMP_93]]
-  // CHECK: %[[TMP_110:.*]] = mhlo.power %[[TMP_109]], %[[TMP_91]]
+  // CHECK: %[[TMP_109:.*]] = mhlo.add %[[TMP_106]], %[[TMP_91]]
+  // CHECK: %[[TMP_110:.*]] = mhlo.power %[[TMP_109]], %[[TMP_92]]
   // CHECK: %[[TMP_111:.*]] = mhlo.add %[[TMP_108]], %[[TMP_110]]
-  // CHECK: %[[TMP_112:.*]] = mhlo.add %[[TMP_109]], %[[TMP_93]]
-  // CHECK: %[[TMP_113:.*]] = mhlo.power %[[TMP_112]], %[[TMP_91]]
+  // CHECK: %[[TMP_112:.*]] = mhlo.add %[[TMP_109]], %[[TMP_91]]
+  // CHECK: %[[TMP_113:.*]] = mhlo.power %[[TMP_112]], %[[TMP_92]]
   // CHECK: %[[TMP_114:.*]] = mhlo.add %[[TMP_111]], %[[TMP_113]]
-  // CHECK: %[[TMP_115:.*]] = mhlo.add %[[TMP_112]], %[[TMP_93]]
-  // CHECK: %[[TMP_116:.*]] = mhlo.power %[[TMP_115]], %[[TMP_91]]
+  // CHECK: %[[TMP_115:.*]] = mhlo.add %[[TMP_112]], %[[TMP_91]]
+  // CHECK: %[[TMP_116:.*]] = mhlo.power %[[TMP_115]], %[[TMP_92]]
   // CHECK: %[[TMP_117:.*]] = mhlo.add %[[TMP_114]], %[[TMP_116]]
-  // CHECK: %[[TMP_118:.*]] = mhlo.add %[[TMP_115]], %[[TMP_93]]
-  // CHECK: %[[TMP_119:.*]] = mhlo.power %[[TMP_118]], %[[TMP_91]]
+  // CHECK: %[[TMP_118:.*]] = mhlo.add %[[TMP_115]], %[[TMP_91]]
+  // CHECK: %[[TMP_119:.*]] = mhlo.power %[[TMP_118]], %[[TMP_92]]
   // CHECK: %[[TMP_120:.*]] = mhlo.add %[[TMP_117]], %[[TMP_119]]
-  // CHECK: %[[TMP_121:.*]] = mhlo.add %[[TMP_118]], %[[TMP_93]]
-  // CHECK: %[[TMP_122:.*]] = mhlo.power %[[TMP_121]], %[[TMP_91]]
+  // CHECK: %[[TMP_121:.*]] = mhlo.add %[[TMP_118]], %[[TMP_91]]
+  // CHECK: %[[TMP_122:.*]] = mhlo.power %[[TMP_121]], %[[TMP_92]]
   // CHECK: %[[TMP_123:.*]] = mhlo.constant dense<1.000000e+00>
-  // CHECK: %[[TMP_124:.*]] = mhlo.subtract %[[TMP_5]], %[[TMP_123]]
-  // CHECK: %[[TMP_125:.*]] = mhlo.multiply %[[TMP_122]], %[[TMP_121]]
-  // CHECK: %[[TMP_126:.*]] = mhlo.divide %[[TMP_125]], %[[TMP_124]]
-  // CHECK: %[[TMP_127:.*]] = mhlo.add %[[TMP_120]], %[[TMP_126]]
-  // CHECK: %[[TMP_128:.*]] = mhlo.multiply %[[TMP_121]], %[[TMP_121]]
-  // CHECK: %[[TMP_129:.*]] = mhlo.divide %[[TMP_93]], %[[TMP_128]]
-  // CHECK: %[[TMP_130:.*]] = mhlo.constant dense<2.200000e+01>
-  // CHECK: %[[TMP_131:.*]] = mhlo.add %[[TMP_5]], %[[TMP_130]]
-  // CHECK: %[[TMP_132:.*]] = mhlo.constant dense<2.100000e+01>
-  // CHECK: %[[TMP_133:.*]] = mhlo.add %[[TMP_5]], %[[TMP_132]]
-  // CHECK: %[[TMP_134:.*]] = mhlo.multiply %[[TMP_131]], %[[TMP_133]]
-  // CHECK: %[[TMP_135:.*]] = mhlo.constant dense<-1.39544646E-19>
-  // CHECK: %[[TMP_136:.*]] = mhlo.add %[[TMP_90]], %[[TMP_135]]
-  // CHECK: %[[TMP_137:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_136]]
-  // CHECK: %[[TMP_138:.*]] = mhlo.multiply %[[TMP_134]], %[[TMP_137]]
-  // CHECK: %[[TMP_139:.*]] = mhlo.constant dense<2.000000e+01>
-  // CHECK: %[[TMP_140:.*]] = mhlo.add %[[TMP_5]], %[[TMP_139]]
-  // CHECK: %[[TMP_141:.*]] = mhlo.constant dense<1.900000e+01>
-  // CHECK: %[[TMP_142:.*]] = mhlo.add %[[TMP_5]], %[[TMP_141]]
-  // CHECK: %[[TMP_143:.*]] = mhlo.multiply %[[TMP_140]], %[[TMP_142]]
-  // CHECK: %[[TMP_144:.*]] = mhlo.constant dense<5.50900303E-18>
-  // CHECK: %[[TMP_145:.*]] = mhlo.add %[[TMP_138]], %[[TMP_144]]
-  // CHECK: %[[TMP_146:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_145]]
-  // CHECK: %[[TMP_147:.*]] = mhlo.multiply %[[TMP_143]], %[[TMP_146]]
-  // CHECK: %[[TMP_148:.*]] = mhlo.constant dense<1.800000e+01>
-  // CHECK: %[[TMP_149:.*]] = mhlo.add %[[TMP_5]], %[[TMP_148]]
-  // CHECK: %[[TMP_150:.*]] = mhlo.constant dense<1.700000e+01>
-  // CHECK: %[[TMP_151:.*]] = mhlo.add %[[TMP_5]], %[[TMP_150]]
-  // CHECK: %[[TMP_152:.*]] = mhlo.multiply %[[TMP_149]], %[[TMP_151]]
-  // CHECK: %[[TMP_153:.*]] = mhlo.constant dense<-2.17486866E-16>
-  // CHECK: %[[TMP_154:.*]] = mhlo.add %[[TMP_147]], %[[TMP_153]]
-  // CHECK: %[[TMP_155:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_154]]
-  // CHECK: %[[TMP_156:.*]] = mhlo.multiply %[[TMP_152]], %[[TMP_155]]
-  // CHECK: %[[TMP_157:.*]] = mhlo.constant dense<1.600000e+01>
-  // CHECK: %[[TMP_158:.*]] = mhlo.add %[[TMP_5]], %[[TMP_157]]
-  // CHECK: %[[TMP_159:.*]] = mhlo.constant dense<1.500000e+01>
-  // CHECK: %[[TMP_160:.*]] = mhlo.add %[[TMP_5]], %[[TMP_159]]
-  // CHECK: %[[TMP_161:.*]] = mhlo.multiply %[[TMP_158]], %[[TMP_160]]
-  // CHECK: %[[TMP_162:.*]] = mhlo.constant dense<8.58606213E-15>
-  // CHECK: %[[TMP_163:.*]] = mhlo.add %[[TMP_156]], %[[TMP_162]]
-  // CHECK: %[[TMP_164:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_163]]
-  // CHECK: %[[TMP_165:.*]] = mhlo.multiply %[[TMP_161]], %[[TMP_164]]
-  // CHECK: %[[TMP_166:.*]] = mhlo.constant dense<1.400000e+01>
-  // CHECK: %[[TMP_167:.*]] = mhlo.add %[[TMP_5]], %[[TMP_166]]
-  // CHECK: %[[TMP_168:.*]] = mhlo.constant dense<1.300000e+01>
-  // CHECK: %[[TMP_169:.*]] = mhlo.add %[[TMP_5]], %[[TMP_168]]
-  // CHECK: %[[TMP_170:.*]] = mhlo.multiply %[[TMP_167]], %[[TMP_169]]
-  // CHECK: %[[TMP_171:.*]] = mhlo.constant dense<-3.3896803E-13>
-  // CHECK: %[[TMP_172:.*]] = mhlo.add %[[TMP_165]], %[[TMP_171]]
-  // CHECK: %[[TMP_173:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_172]]
-  // CHECK: %[[TMP_174:.*]] = mhlo.multiply %[[TMP_170]], %[[TMP_173]]
-  // CHECK: %[[TMP_175:.*]] = mhlo.constant dense<1.200000e+01>
-  // CHECK: %[[TMP_176:.*]] = mhlo.add %[[TMP_5]], %[[TMP_175]]
-  // CHECK: %[[TMP_177:.*]] = mhlo.constant dense<1.100000e+01>
-  // CHECK: %[[TMP_178:.*]] = mhlo.add %[[TMP_5]], %[[TMP_177]]
-  // CHECK: %[[TMP_179:.*]] = mhlo.multiply %[[TMP_176]], %[[TMP_178]]
-  // CHECK: %[[TMP_180:.*]] = mhlo.constant dense<1.33825364E-11>
-  // CHECK: %[[TMP_181:.*]] = mhlo.add %[[TMP_174]], %[[TMP_180]]
-  // CHECK: %[[TMP_182:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_181]]
-  // CHECK: %[[TMP_183:.*]] = mhlo.multiply %[[TMP_179]], %[[TMP_182]]
-  // CHECK: %[[TMP_184:.*]] = mhlo.constant dense<1.000000e+01>
-  // CHECK: %[[TMP_185:.*]] = mhlo.add %[[TMP_5]], %[[TMP_184]]
-  // CHECK: %[[TMP_186:.*]] = mhlo.constant dense<9.000000e+00>
-  // CHECK: %[[TMP_187:.*]] = mhlo.add %[[TMP_5]], %[[TMP_186]]
-  // CHECK: %[[TMP_188:.*]] = mhlo.multiply %[[TMP_185]], %[[TMP_187]]
-  // CHECK: %[[TMP_189:.*]] = mhlo.constant dense<-5.28419031E-10>
-  // CHECK: %[[TMP_190:.*]] = mhlo.add %[[TMP_183]], %[[TMP_189]]
-  // CHECK: %[[TMP_191:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_190]]
-  // CHECK: %[[TMP_192:.*]] = mhlo.multiply %[[TMP_188]], %[[TMP_191]]
-  // CHECK: %[[TMP_193:.*]] = mhlo.constant dense<8.000000e+00>
-  // CHECK: %[[TMP_194:.*]] = mhlo.add %[[TMP_5]], %[[TMP_193]]
-  // CHECK: %[[TMP_195:.*]] = mhlo.constant dense<7.000000e+00>
-  // CHECK: %[[TMP_196:.*]] = mhlo.add %[[TMP_5]], %[[TMP_195]]
-  // CHECK: %[[TMP_197:.*]] = mhlo.multiply %[[TMP_194]], %[[TMP_196]]
-  // CHECK: %[[TMP_198:.*]] = mhlo.constant dense<2.08767563E-8>
-  // CHECK: %[[TMP_199:.*]] = mhlo.add %[[TMP_192]], %[[TMP_198]]
-  // CHECK: %[[TMP_200:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_199]]
-  // CHECK: %[[TMP_201:.*]] = mhlo.multiply %[[TMP_197]], %[[TMP_200]]
-  // CHECK: %[[TMP_202:.*]] = mhlo.constant dense<6.000000e+00>
-  // CHECK: %[[TMP_203:.*]] = mhlo.add %[[TMP_5]], %[[TMP_202]]
-  // CHECK: %[[TMP_204:.*]] = mhlo.constant dense<5.000000e+00>
-  // CHECK: %[[TMP_205:.*]] = mhlo.add %[[TMP_5]], %[[TMP_204]]
-  // CHECK: %[[TMP_206:.*]] = mhlo.multiply %[[TMP_203]], %[[TMP_205]]
-  // CHECK: %[[TMP_207:.*]] = mhlo.constant dense<-8.26719599E-7>
-  // CHECK: %[[TMP_208:.*]] = mhlo.add %[[TMP_201]], %[[TMP_207]]
-  // CHECK: %[[TMP_209:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_208]]
-  // CHECK: %[[TMP_210:.*]] = mhlo.multiply %[[TMP_206]], %[[TMP_209]]
-  // CHECK: %[[TMP_211:.*]] = mhlo.constant dense<4.000000e+00>
-  // CHECK: %[[TMP_212:.*]] = mhlo.add %[[TMP_5]], %[[TMP_211]]
-  // CHECK: %[[TMP_213:.*]] = mhlo.constant dense<3.000000e+00>
-  // CHECK: %[[TMP_214:.*]] = mhlo.add %[[TMP_5]], %[[TMP_213]]
-  // CHECK: %[[TMP_215:.*]] = mhlo.multiply %[[TMP_212]], %[[TMP_214]]
-  // CHECK: %[[TMP_216:.*]] = mhlo.constant dense<3.30687835E-5>
-  // CHECK: %[[TMP_217:.*]] = mhlo.add %[[TMP_210]], %[[TMP_216]]
-  // CHECK: %[[TMP_218:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_217]]
-  // CHECK: %[[TMP_219:.*]] = mhlo.multiply %[[TMP_215]], %[[TMP_218]]
-  // CHECK: %[[TMP_220:.*]] = mhlo.constant dense<2.000000e+00>
-  // CHECK: %[[TMP_221:.*]] = mhlo.add %[[TMP_5]], %[[TMP_220]]
-  // CHECK: %[[TMP_222:.*]] = mhlo.constant dense<1.000000e+00>
-  // CHECK: %[[TMP_223:.*]] = mhlo.add %[[TMP_5]], %[[TMP_222]]
-  // CHECK: %[[TMP_224:.*]] = mhlo.multiply %[[TMP_221]], %[[TMP_223]]
-  // CHECK: %[[TMP_225:.*]] = mhlo.constant dense<-0.00138888892>
-  // CHECK: %[[TMP_226:.*]] = mhlo.add %[[TMP_219]], %[[TMP_225]]
-  // CHECK: %[[TMP_227:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_226]]
-  // CHECK: %[[TMP_228:.*]] = mhlo.multiply %[[TMP_224]], %[[TMP_227]]
-  // CHECK: %[[TMP_229:.*]] = mhlo.constant dense<5.000000e-01>
-  // CHECK: %[[TMP_230:.*]] = mhlo.divide %[[TMP_5]], %[[TMP_121]]
-  // CHECK: %[[TMP_231:.*]] = mhlo.constant dense<0.0833333358>
-  // CHECK: %[[TMP_232:.*]] = mhlo.add %[[TMP_231]], %[[TMP_228]]
-  // CHECK: %[[TMP_233:.*]] = mhlo.multiply %[[TMP_230]], %[[TMP_232]]
-  // CHECK: %[[TMP_234:.*]] = mhlo.add %[[TMP_229]], %[[TMP_233]]
-  // CHECK: %[[TMP_235:.*]] = mhlo.multiply %[[TMP_122]], %[[TMP_234]]
-  // CHECK: %[[TMP_236:.*]] = mhlo.add %[[TMP_127]], %[[TMP_235]]
+  // CHECK: %[[TMP_124:.*]] = mhlo.multiply %[[TMP_122]], %[[TMP_121]]
+  // CHECK: %[[TMP_125:.*]] = mhlo.subtract %[[TMP_5]], %[[TMP_123]]
+  // CHECK: %[[TMP_126:.*]] = mhlo.divide %[[TMP_124]], %[[TMP_125]]
+  // CHECK: %[[TMP_127:.*]] = mhlo.multiply %[[TMP_121]], %[[TMP_121]]
+  // CHECK: %[[TMP_128:.*]] = mhlo.divide %[[TMP_91]], %[[TMP_127]]
+  // CHECK: %[[TMP_129:.*]] = mhlo.constant dense<2.200000e+01>
+  // CHECK: %[[TMP_130:.*]] = mhlo.add %[[TMP_5]], %[[TMP_129]]
+  // CHECK: %[[TMP_131:.*]] = mhlo.constant dense<2.100000e+01>
+  // CHECK: %[[TMP_132:.*]] = mhlo.add %[[TMP_5]], %[[TMP_131]]
+  // CHECK: %[[TMP_133:.*]] = mhlo.multiply %[[TMP_130]], %[[TMP_132]]
+  // CHECK: %[[TMP_134:.*]] = mhlo.constant dense<-1.39544646E-19>
+  // CHECK: %[[TMP_135:.*]] = mhlo.add %[[TMP_90]], %[[TMP_134]]
+  // CHECK: %[[TMP_136:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_135]]
+  // CHECK: %[[TMP_137:.*]] = mhlo.multiply %[[TMP_133]], %[[TMP_136]]
+  // CHECK: %[[TMP_138:.*]] = mhlo.constant dense<2.000000e+01>
+  // CHECK: %[[TMP_139:.*]] = mhlo.add %[[TMP_5]], %[[TMP_138]]
+  // CHECK: %[[TMP_140:.*]] = mhlo.constant dense<1.900000e+01>
+  // CHECK: %[[TMP_141:.*]] = mhlo.add %[[TMP_5]], %[[TMP_140]]
+  // CHECK: %[[TMP_142:.*]] = mhlo.multiply %[[TMP_139]], %[[TMP_141]]
+  // CHECK: %[[TMP_143:.*]] = mhlo.constant dense<5.50900303E-18>
+  // CHECK: %[[TMP_144:.*]] = mhlo.add %[[TMP_137]], %[[TMP_143]]
+  // CHECK: %[[TMP_145:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_144]]
+  // CHECK: %[[TMP_146:.*]] = mhlo.multiply %[[TMP_142]], %[[TMP_145]]
+  // CHECK: %[[TMP_147:.*]] = mhlo.constant dense<1.800000e+01>
+  // CHECK: %[[TMP_148:.*]] = mhlo.add %[[TMP_5]], %[[TMP_147]]
+  // CHECK: %[[TMP_149:.*]] = mhlo.constant dense<1.700000e+01>
+  // CHECK: %[[TMP_150:.*]] = mhlo.add %[[TMP_5]], %[[TMP_149]]
+  // CHECK: %[[TMP_151:.*]] = mhlo.multiply %[[TMP_148]], %[[TMP_150]]
+  // CHECK: %[[TMP_152:.*]] = mhlo.constant dense<-2.17486866E-16>
+  // CHECK: %[[TMP_153:.*]] = mhlo.add %[[TMP_146]], %[[TMP_152]]
+  // CHECK: %[[TMP_154:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_153]]
+  // CHECK: %[[TMP_155:.*]] = mhlo.multiply %[[TMP_151]], %[[TMP_154]]
+  // CHECK: %[[TMP_156:.*]] = mhlo.constant dense<1.600000e+01>
+  // CHECK: %[[TMP_157:.*]] = mhlo.add %[[TMP_5]], %[[TMP_156]]
+  // CHECK: %[[TMP_158:.*]] = mhlo.constant dense<1.500000e+01>
+  // CHECK: %[[TMP_159:.*]] = mhlo.add %[[TMP_5]], %[[TMP_158]]
+  // CHECK: %[[TMP_160:.*]] = mhlo.multiply %[[TMP_157]], %[[TMP_159]]
+  // CHECK: %[[TMP_161:.*]] = mhlo.constant dense<8.58606213E-15>
+  // CHECK: %[[TMP_162:.*]] = mhlo.add %[[TMP_155]], %[[TMP_161]]
+  // CHECK: %[[TMP_163:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_162]]
+  // CHECK: %[[TMP_164:.*]] = mhlo.multiply %[[TMP_160]], %[[TMP_163]]
+  // CHECK: %[[TMP_165:.*]] = mhlo.constant dense<1.400000e+01>
+  // CHECK: %[[TMP_166:.*]] = mhlo.add %[[TMP_5]], %[[TMP_165]]
+  // CHECK: %[[TMP_167:.*]] = mhlo.constant dense<1.300000e+01>
+  // CHECK: %[[TMP_168:.*]] = mhlo.add %[[TMP_5]], %[[TMP_167]]
+  // CHECK: %[[TMP_169:.*]] = mhlo.multiply %[[TMP_166]], %[[TMP_168]]
+  // CHECK: %[[TMP_170:.*]] = mhlo.constant dense<-3.3896803E-13>
+  // CHECK: %[[TMP_171:.*]] = mhlo.add %[[TMP_164]], %[[TMP_170]]
+  // CHECK: %[[TMP_172:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_171]]
+  // CHECK: %[[TMP_173:.*]] = mhlo.multiply %[[TMP_169]], %[[TMP_172]]
+  // CHECK: %[[TMP_174:.*]] = mhlo.constant dense<1.200000e+01>
+  // CHECK: %[[TMP_175:.*]] = mhlo.add %[[TMP_5]], %[[TMP_174]]
+  // CHECK: %[[TMP_176:.*]] = mhlo.constant dense<1.100000e+01>
+  // CHECK: %[[TMP_177:.*]] = mhlo.add %[[TMP_5]], %[[TMP_176]]
+  // CHECK: %[[TMP_178:.*]] = mhlo.multiply %[[TMP_175]], %[[TMP_177]]
+  // CHECK: %[[TMP_179:.*]] = mhlo.constant dense<1.33825364E-11>
+  // CHECK: %[[TMP_180:.*]] = mhlo.add %[[TMP_173]], %[[TMP_179]]
+  // CHECK: %[[TMP_181:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_180]]
+  // CHECK: %[[TMP_182:.*]] = mhlo.multiply %[[TMP_178]], %[[TMP_181]]
+  // CHECK: %[[TMP_183:.*]] = mhlo.constant dense<1.000000e+01>
+  // CHECK: %[[TMP_184:.*]] = mhlo.add %[[TMP_5]], %[[TMP_183]]
+  // CHECK: %[[TMP_185:.*]] = mhlo.constant dense<9.000000e+00>
+  // CHECK: %[[TMP_186:.*]] = mhlo.add %[[TMP_5]], %[[TMP_185]]
+  // CHECK: %[[TMP_187:.*]] = mhlo.multiply %[[TMP_184]], %[[TMP_186]]
+  // CHECK: %[[TMP_188:.*]] = mhlo.constant dense<-5.28419031E-10>
+  // CHECK: %[[TMP_189:.*]] = mhlo.add %[[TMP_182]], %[[TMP_188]]
+  // CHECK: %[[TMP_190:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_189]]
+  // CHECK: %[[TMP_191:.*]] = mhlo.multiply %[[TMP_187]], %[[TMP_190]]
+  // CHECK: %[[TMP_192:.*]] = mhlo.constant dense<8.000000e+00>
+  // CHECK: %[[TMP_193:.*]] = mhlo.add %[[TMP_5]], %[[TMP_192]]
+  // CHECK: %[[TMP_194:.*]] = mhlo.constant dense<7.000000e+00>
+  // CHECK: %[[TMP_195:.*]] = mhlo.add %[[TMP_5]], %[[TMP_194]]
+  // CHECK: %[[TMP_196:.*]] = mhlo.multiply %[[TMP_193]], %[[TMP_195]]
+  // CHECK: %[[TMP_197:.*]] = mhlo.constant dense<2.08767563E-8>
+  // CHECK: %[[TMP_198:.*]] = mhlo.add %[[TMP_191]], %[[TMP_197]]
+  // CHECK: %[[TMP_199:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_198]]
+  // CHECK: %[[TMP_200:.*]] = mhlo.multiply %[[TMP_196]], %[[TMP_199]]
+  // CHECK: %[[TMP_201:.*]] = mhlo.constant dense<6.000000e+00>
+  // CHECK: %[[TMP_202:.*]] = mhlo.add %[[TMP_5]], %[[TMP_201]]
+  // CHECK: %[[TMP_203:.*]] = mhlo.constant dense<5.000000e+00>
+  // CHECK: %[[TMP_204:.*]] = mhlo.add %[[TMP_5]], %[[TMP_203]]
+  // CHECK: %[[TMP_205:.*]] = mhlo.multiply %[[TMP_202]], %[[TMP_204]]
+  // CHECK: %[[TMP_206:.*]] = mhlo.constant dense<-8.26719599E-7>
+  // CHECK: %[[TMP_207:.*]] = mhlo.add %[[TMP_200]], %[[TMP_206]]
+  // CHECK: %[[TMP_208:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_207]]
+  // CHECK: %[[TMP_209:.*]] = mhlo.multiply %[[TMP_205]], %[[TMP_208]]
+  // CHECK: %[[TMP_210:.*]] = mhlo.constant dense<4.000000e+00>
+  // CHECK: %[[TMP_211:.*]] = mhlo.add %[[TMP_5]], %[[TMP_210]]
+  // CHECK: %[[TMP_212:.*]] = mhlo.constant dense<3.000000e+00>
+  // CHECK: %[[TMP_213:.*]] = mhlo.add %[[TMP_5]], %[[TMP_212]]
+  // CHECK: %[[TMP_214:.*]] = mhlo.multiply %[[TMP_211]], %[[TMP_213]]
+  // CHECK: %[[TMP_215:.*]] = mhlo.constant dense<3.30687835E-5>
+  // CHECK: %[[TMP_216:.*]] = mhlo.add %[[TMP_209]], %[[TMP_215]]
+  // CHECK: %[[TMP_217:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_216]]
+  // CHECK: %[[TMP_218:.*]] = mhlo.multiply %[[TMP_214]], %[[TMP_217]]
+  // CHECK: %[[TMP_219:.*]] = mhlo.constant dense<2.000000e+00>
+  // CHECK: %[[TMP_220:.*]] = mhlo.add %[[TMP_5]], %[[TMP_219]]
+  // CHECK: %[[TMP_221:.*]] = mhlo.constant dense<1.000000e+00>
+  // CHECK: %[[TMP_222:.*]] = mhlo.add %[[TMP_5]], %[[TMP_221]]
+  // CHECK: %[[TMP_223:.*]] = mhlo.multiply %[[TMP_220]], %[[TMP_222]]
+  // CHECK: %[[TMP_224:.*]] = mhlo.constant dense<-0.00138888892>
+  // CHECK: %[[TMP_225:.*]] = mhlo.add %[[TMP_218]], %[[TMP_224]]
+  // CHECK: %[[TMP_226:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_225]]
+  // CHECK: %[[TMP_227:.*]] = mhlo.multiply %[[TMP_223]], %[[TMP_226]]
+  // CHECK: %[[TMP_228:.*]] = mhlo.constant dense<5.000000e-01>
+  // CHECK: %[[TMP_229:.*]] = mhlo.divide %[[TMP_5]], %[[TMP_121]]
+  // CHECK: %[[TMP_230:.*]] = mhlo.constant dense<0.0833333358>
+  // CHECK: %[[TMP_231:.*]] = mhlo.add %[[TMP_230]], %[[TMP_227]]
+  // CHECK: %[[TMP_232:.*]] = mhlo.multiply %[[TMP_229]], %[[TMP_231]]
+  // CHECK: %[[TMP_233:.*]] = mhlo.add %[[TMP_228]], %[[TMP_232]]
+  // CHECK: %[[TMP_234:.*]] = mhlo.multiply %[[TMP_122]], %[[TMP_233]]
+  // CHECK: %[[TMP_235:.*]] = mhlo.add %[[TMP_120]], %[[TMP_126]]
+  // CHECK: %[[TMP_236:.*]] = mhlo.add %[[TMP_235]], %[[TMP_234]]
   // CHECK: %[[TMP_237:.*]] = mhlo.abs %[[TMP_122]]
   // CHECK: %[[TMP_238:.*]] = mhlo.abs %[[TMP_120]]
   // CHECK: %[[TMP_239:.*]] = mhlo.constant dense<1.401300e-45>
@@ -1552,7 +1557,7 @@ func.func @polygamma_f32(%lhs : tensor<f32>, %rhs : tensor<f32>) -> tensor<f32>
   // CHECK: %[[TMP_260:.*]] = mhlo.and %[[TMP_257]], %[[TMP_259]]
   // CHECK: %[[TMP_261:.*]] = mhlo.select %[[TMP_260]], %[[TMP_251]], %[[TMP_243]]
   // CHECK: %[[TMP_262:.*]] = mhlo.select %[[TMP_254]], %[[TMP_261]], %[[TMP_250]]
-  // CHECK: %[[TMP_263:.*]] = mhlo.compare EQ, %[[TMP_5]], %[[TMP_93]], NOTYPE
+  // CHECK: %[[TMP_263:.*]] = mhlo.compare EQ, %[[TMP_5]], %[[TMP_91]], NOTYPE
   // CHECK: %[[TMP_264:.*]] = mhlo.select %[[TMP_263]], %[[TMP_251]], %[[TMP_262]]
   // CHECK: %[[TMP_265:.*]] = mhlo.multiply %[[TMP_4]], %[[TMP_89]]
   // CHECK: %[[TMP_266:.*]] = mhlo.multiply %[[TMP_265]], %[[TMP_264]]
@@ -1766,153 +1771,153 @@ func.func @polygamma_f64(%lhs : tensor<f64>, %rhs : tensor<f64>) -> tensor<f64>
   // CHECK: %[[TMP_87:.*]] = mhlo.constant dense<0x7FF0000000000000>
   // CHECK: %[[TMP_88:.*]] = mhlo.select %[[TMP_86]], %[[TMP_87]], %[[TMP_83]]
   // CHECK: %[[TMP_89:.*]] = mhlo.exponential %[[TMP_88]]
-  // CHECK: %[[TMP_90:.*]] = mhlo.constant dense<0.000000e+00>
-  // CHECK: %[[TMP_91:.*]] = mhlo.negate %[[TMP_5]]
-  // CHECK: %[[TMP_92:.*]] = mhlo.power %[[ARG1]], %[[TMP_91]]
-  // CHECK: %[[TMP_93:.*]] = mhlo.constant dense<1.000000e+00>
-  // CHECK: %[[TMP_94:.*]] = mhlo.add %[[ARG1]], %[[TMP_93]]
-  // CHECK: %[[TMP_95:.*]] = mhlo.power %[[TMP_94]], %[[TMP_91]]
-  // CHECK: %[[TMP_96:.*]] = mhlo.add %[[TMP_92]], %[[TMP_95]]
-  // CHECK: %[[TMP_97:.*]] = mhlo.add %[[TMP_94]], %[[TMP_93]]
-  // CHECK: %[[TMP_98:.*]] = mhlo.power %[[TMP_97]], %[[TMP_91]]
+  // CHECK-DAG: %[[TMP_90:.*]] = mhlo.constant dense<0.000000e+00>
+  // CHECK-DAG: %[[TMP_91:.*]] = mhlo.constant dense<1.000000e+00>
+  // CHECK: %[[TMP_92:.*]] = mhlo.negate %[[TMP_5]]
+  // CHECK: %[[TMP_93:.*]] = mhlo.power %[[ARG1]], %[[TMP_92]]
+  // CHECK: %[[TMP_94:.*]] = mhlo.add %[[ARG1]], %[[TMP_91]]
+  // CHECK: %[[TMP_95:.*]] = mhlo.power %[[TMP_94]], %[[TMP_92]]
+  // CHECK: %[[TMP_96:.*]] = mhlo.add %[[TMP_93]], %[[TMP_95]]
+  // CHECK: %[[TMP_97:.*]] = mhlo.add %[[TMP_94]], %[[TMP_91]]
+  // CHECK: %[[TMP_98:.*]] = mhlo.power %[[TMP_97]], %[[TMP_92]]
   // CHECK: %[[TMP_99:.*]] = mhlo.add %[[TMP_96]], %[[TMP_98]]
-  // CHECK: %[[TMP_100:.*]] = mhlo.add %[[TMP_97]], %[[TMP_93]]
-  // CHECK: %[[TMP_101:.*]] = mhlo.power %[[TMP_100]], %[[TMP_91]]
+  // CHECK: %[[TMP_100:.*]] = mhlo.add %[[TMP_97]], %[[TMP_91]]
+  // CHECK: %[[TMP_101:.*]] = mhlo.power %[[TMP_100]], %[[TMP_92]]
   // CHECK: %[[TMP_102:.*]] = mhlo.add %[[TMP_99]], %[[TMP_101]]
-  // CHECK: %[[TMP_103:.*]] = mhlo.add %[[TMP_100]], %[[TMP_93]]
-  // CHECK: %[[TMP_104:.*]] = mhlo.power %[[TMP_103]], %[[TMP_91]]
+  // CHECK: %[[TMP_103:.*]] = mhlo.add %[[TMP_100]], %[[TMP_91]]
+  // CHECK: %[[TMP_104:.*]] = mhlo.power %[[TMP_103]], %[[TMP_92]]
   // CHECK: %[[TMP_105:.*]] = mhlo.add %[[TMP_102]], %[[TMP_104]]
-  // CHECK: %[[TMP_106:.*]] = mhlo.add %[[TMP_103]], %[[TMP_93]]
-  // CHECK: %[[TMP_107:.*]] = mhlo.power %[[TMP_106]], %[[TMP_91]]
+  // CHECK: %[[TMP_106:.*]] = mhlo.add %[[TMP_103]], %[[TMP_91]]
+  // CHECK: %[[TMP_107:.*]] = mhlo.power %[[TMP_106]], %[[TMP_92]]
   // CHECK: %[[TMP_108:.*]] = mhlo.add %[[TMP_105]], %[[TMP_107]]
-  // CHECK: %[[TMP_109:.*]] = mhlo.add %[[TMP_106]], %[[TMP_93]]
-  // CHECK: %[[TMP_110:.*]] = mhlo.power %[[TMP_109]], %[[TMP_91]]
+  // CHECK: %[[TMP_109:.*]] = mhlo.add %[[TMP_106]], %[[TMP_91]]
+  // CHECK: %[[TMP_110:.*]] = mhlo.power %[[TMP_109]], %[[TMP_92]]
   // CHECK: %[[TMP_111:.*]] = mhlo.add %[[TMP_108]], %[[TMP_110]]
-  // CHECK: %[[TMP_112:.*]] = mhlo.add %[[TMP_109]], %[[TMP_93]]
-  // CHECK: %[[TMP_113:.*]] = mhlo.power %[[TMP_112]], %[[TMP_91]]
+  // CHECK: %[[TMP_112:.*]] = mhlo.add %[[TMP_109]], %[[TMP_91]]
+  // CHECK: %[[TMP_113:.*]] = mhlo.power %[[TMP_112]], %[[TMP_92]]
   // CHECK: %[[TMP_114:.*]] = mhlo.add %[[TMP_111]], %[[TMP_113]]
-  // CHECK: %[[TMP_115:.*]] = mhlo.add %[[TMP_112]], %[[TMP_93]]
-  // CHECK: %[[TMP_116:.*]] = mhlo.power %[[TMP_115]], %[[TMP_91]]
+  // CHECK: %[[TMP_115:.*]] = mhlo.add %[[TMP_112]], %[[TMP_91]]
+  // CHECK: %[[TMP_116:.*]] = mhlo.power %[[TMP_115]], %[[TMP_92]]
   // CHECK: %[[TMP_117:.*]] = mhlo.add %[[TMP_114]], %[[TMP_116]]
-  // CHECK: %[[TMP_118:.*]] = mhlo.add %[[TMP_115]], %[[TMP_93]]
-  // CHECK: %[[TMP_119:.*]] = mhlo.power %[[TMP_118]], %[[TMP_91]]
+  // CHECK: %[[TMP_118:.*]] = mhlo.add %[[TMP_115]], %[[TMP_91]]
+  // CHECK: %[[TMP_119:.*]] = mhlo.power %[[TMP_118]], %[[TMP_92]]
   // CHECK: %[[TMP_120:.*]] = mhlo.add %[[TMP_117]], %[[TMP_119]]
-  // CHECK: %[[TMP_121:.*]] = mhlo.add %[[TMP_118]], %[[TMP_93]]
-  // CHECK: %[[TMP_122:.*]] = mhlo.power %[[TMP_121]], %[[TMP_91]]
+  // CHECK: %[[TMP_121:.*]] = mhlo.add %[[TMP_118]], %[[TMP_91]]
+  // CHECK: %[[TMP_122:.*]] = mhlo.power %[[TMP_121]], %[[TMP_92]]
   // CHECK: %[[TMP_123:.*]] = mhlo.constant dense<1.000000e+00>
-  // CHECK: %[[TMP_124:.*]] = mhlo.subtract %[[TMP_5]], %[[TMP_123]]
-  // CHECK: %[[TMP_125:.*]] = mhlo.multiply %[[TMP_122]], %[[TMP_121]]
-  // CHECK: %[[TMP_126:.*]] = mhlo.divide %[[TMP_125]], %[[TMP_124]]
-  // CHECK: %[[TMP_127:.*]] = mhlo.add %[[TMP_120]], %[[TMP_126]]
-  // CHECK: %[[TMP_128:.*]] = mhlo.multiply %[[TMP_121]], %[[TMP_121]]
-  // CHECK: %[[TMP_129:.*]] = mhlo.divide %[[TMP_93]], %[[TMP_128]]
-  // CHECK: %[[TMP_130:.*]] = mhlo.constant dense<2.200000e+01>
-  // CHECK: %[[TMP_131:.*]] = mhlo.add %[[TMP_5]], %[[TMP_130]]
-  // CHECK: %[[TMP_132:.*]] = mhlo.constant dense<2.100000e+01>
-  // CHECK: %[[TMP_133:.*]] = mhlo.add %[[TMP_5]], %[[TMP_132]]
-  // CHECK: %[[TMP_134:.*]] = mhlo.multiply %[[TMP_131]], %[[TMP_133]]
-  // CHECK: %[[TMP_135:.*]] = mhlo.constant dense<-1.3954464685812522E-19>
-  // CHECK: %[[TMP_136:.*]] = mhlo.add %[[TMP_90]], %[[TMP_135]]
-  // CHECK: %[[TMP_137:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_136]]
-  // CHECK: %[[TMP_138:.*]] = mhlo.multiply %[[TMP_134]], %[[TMP_137]]
-  // CHECK: %[[TMP_139:.*]] = mhlo.constant dense<2.000000e+01>
-  // CHECK: %[[TMP_140:.*]] = mhlo.add %[[TMP_5]], %[[TMP_139]]
-  // CHECK: %[[TMP_141:.*]] = mhlo.constant dense<1.900000e+01>
-  // CHECK: %[[TMP_142:.*]] = mhlo.add %[[TMP_5]], %[[TMP_141]]
-  // CHECK: %[[TMP_143:.*]] = mhlo.multiply %[[TMP_140]], %[[TMP_142]]
-  // CHECK: %[[TMP_144:.*]] = mhlo.constant dense<5.5090028283602295E-18>
-  // CHECK: %[[TMP_145:.*]] = mhlo.add %[[TMP_138]], %[[TMP_144]]
-  // CHECK: %[[TMP_146:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_145]]
-  // CHECK: %[[TMP_147:.*]] = mhlo.multiply %[[TMP_143]], %[[TMP_146]]
-  // CHECK: %[[TMP_148:.*]] = mhlo.constant dense<1.800000e+01>
-  // CHECK: %[[TMP_149:.*]] = mhlo.add %[[TMP_5]], %[[TMP_148]]
-  // CHECK: %[[TMP_150:.*]] = mhlo.constant dense<1.700000e+01>
-  // CHECK: %[[TMP_151:.*]] = mhlo.add %[[TMP_5]], %[[TMP_150]]
-  // CHECK: %[[TMP_152:.*]] = mhlo.multiply %[[TMP_149]], %[[TMP_151]]
-  // CHECK: %[[TMP_153:.*]] = mhlo.constant dense<-2.1748686985580617E-16>
-  // CHECK: %[[TMP_154:.*]] = mhlo.add %[[TMP_147]], %[[TMP_153]]
-  // CHECK: %[[TMP_155:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_154]]
-  // CHECK: %[[TMP_156:.*]] = mhlo.multiply %[[TMP_152]], %[[TMP_155]]
-  // CHECK: %[[TMP_157:.*]] = mhlo.constant dense<1.600000e+01>
-  // CHECK: %[[TMP_158:.*]] = mhlo.add %[[TMP_5]], %[[TMP_157]]
-  // CHECK: %[[TMP_159:.*]] = mhlo.constant dense<1.500000e+01>
-  // CHECK: %[[TMP_160:.*]] = mhlo.add %[[TMP_5]], %[[TMP_159]]
-  // CHECK: %[[TMP_161:.*]] = mhlo.multiply %[[TMP_158]], %[[TMP_160]]
-  // CHECK: %[[TMP_162:.*]] = mhlo.constant dense<8.5860620562778452E-15>
-  // CHECK: %[[TMP_163:.*]] = mhlo.add %[[TMP_156]], %[[TMP_162]]
-  // CHECK: %[[TMP_164:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_163]]
-  // CHECK: %[[TMP_165:.*]] = mhlo.multiply %[[TMP_161]], %[[TMP_164]]
-  // CHECK: %[[TMP_166:.*]] = mhlo.constant dense<1.400000e+01>
-  // CHECK: %[[TMP_167:.*]] = mhlo.add %[[TMP_5]], %[[TMP_166]]
-  // CHECK: %[[TMP_168:.*]] = mhlo.constant dense<1.300000e+01>
-  // CHECK: %[[TMP_169:.*]] = mhlo.add %[[TMP_5]], %[[TMP_168]]
-  // CHECK: %[[TMP_170:.*]] = mhlo.multiply %[[TMP_167]], %[[TMP_169]]
-  // CHECK: %[[TMP_171:.*]] = mhlo.constant dense<-3.3896802963225832E-13>
-  // CHECK: %[[TMP_172:.*]] = mhlo.add %[[TMP_165]], %[[TMP_171]]
-  // CHECK: %[[TMP_173:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_172]]
-  // CHECK: %[[TMP_174:.*]] = mhlo.multiply %[[TMP_170]], %[[TMP_173]]
-  // CHECK: %[[TMP_175:.*]] = mhlo.constant dense<1.200000e+01>
-  // CHECK: %[[TMP_176:.*]] = mhlo.add %[[TMP_5]], %[[TMP_175]]
-  // CHECK: %[[TMP_177:.*]] = mhlo.constant dense<1.100000e+01>
-  // CHECK: %[[TMP_178:.*]] = mhlo.add %[[TMP_5]], %[[TMP_177]]
-  // CHECK: %[[TMP_179:.*]] = mhlo.multiply %[[TMP_176]], %[[TMP_178]]
-  // CHECK: %[[TMP_180:.*]] = mhlo.constant dense<1.3382536530684679E-11>
-  // CHECK: %[[TMP_181:.*]] = mhlo.add %[[TMP_174]], %[[TMP_180]]
-  // CHECK: %[[TMP_182:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_181]]
-  // CHECK: %[[TMP_183:.*]] = mhlo.multiply %[[TMP_179]], %[[TMP_182]]
-  // CHECK: %[[TMP_184:.*]] = mhlo.constant dense<1.000000e+01>
-  // CHECK: %[[TMP_185:.*]] = mhlo.add %[[TMP_5]], %[[TMP_184]]
-  // CHECK: %[[TMP_186:.*]] = mhlo.constant dense<9.000000e+00>
-  // CHECK: %[[TMP_187:.*]] = mhlo.add %[[TMP_5]], %[[TMP_186]]
-  // CHECK: %[[TMP_188:.*]] = mhlo.multiply %[[TMP_185]], %[[TMP_187]]
-  // CHECK: %[[TMP_189:.*]] = mhlo.constant dense<-5.2841901386874932E-10>
-  // CHECK: %[[TMP_190:.*]] = mhlo.add %[[TMP_183]], %[[TMP_189]]
-  // CHECK: %[[TMP_191:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_190]]
-  // CHECK: %[[TMP_192:.*]] = mhlo.multiply %[[TMP_188]], %[[TMP_191]]
-  // CHECK: %[[TMP_193:.*]] = mhlo.constant dense<8.000000e+00>
-  // CHECK: %[[TMP_194:.*]] = mhlo.add %[[TMP_5]], %[[TMP_193]]
-  // CHECK: %[[TMP_195:.*]] = mhlo.constant dense<7.000000e+00>
-  // CHECK: %[[TMP_196:.*]] = mhlo.add %[[TMP_5]], %[[TMP_195]]
-  // CHECK: %[[TMP_197:.*]] = mhlo.multiply %[[TMP_194]], %[[TMP_196]]
-  // CHECK: %[[TMP_198:.*]] = mhlo.constant dense<2.08767569878681E-8>
-  // CHECK: %[[TMP_199:.*]] = mhlo.add %[[TMP_192]], %[[TMP_198]]
-  // CHECK: %[[TMP_200:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_199]]
-  // CHECK: %[[TMP_201:.*]] = mhlo.multiply %[[TMP_197]], %[[TMP_200]]
-  // CHECK: %[[TMP_202:.*]] = mhlo.constant dense<6.000000e+00>
-  // CHECK: %[[TMP_203:.*]] = mhlo.add %[[TMP_5]], %[[TMP_202]]
-  // CHECK: %[[TMP_204:.*]] = mhlo.constant dense<5.000000e+00>
-  // CHECK: %[[TMP_205:.*]] = mhlo.add %[[TMP_5]], %[[TMP_204]]
-  // CHECK: %[[TMP_206:.*]] = mhlo.multiply %[[TMP_203]], %[[TMP_205]]
-  // CHECK: %[[TMP_207:.*]] = mhlo.constant dense<-8.2671957671957675E-7>
-  // CHECK: %[[TMP_208:.*]] = mhlo.add %[[TMP_201]], %[[TMP_207]]
-  // CHECK: %[[TMP_209:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_208]]
-  // CHECK: %[[TMP_210:.*]] = mhlo.multiply %[[TMP_206]], %[[TMP_209]]
-  // CHECK: %[[TMP_211:.*]] = mhlo.constant dense<4.000000e+00>
-  // CHECK: %[[TMP_212:.*]] = mhlo.add %[[TMP_5]], %[[TMP_211]]
-  // CHECK: %[[TMP_213:.*]] = mhlo.constant dense<3.000000e+00>
-  // CHECK: %[[TMP_214:.*]] = mhlo.add %[[TMP_5]], %[[TMP_213]]
-  // CHECK: %[[TMP_215:.*]] = mhlo.multiply %[[TMP_212]], %[[TMP_214]]
-  // CHECK: %[[TMP_216:.*]] = mhlo.constant dense<3.3068783068783071E-5>
-  // CHECK: %[[TMP_217:.*]] = mhlo.add %[[TMP_210]], %[[TMP_216]]
-  // CHECK: %[[TMP_218:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_217]]
-  // CHECK: %[[TMP_219:.*]] = mhlo.multiply %[[TMP_215]], %[[TMP_218]]
-  // CHECK: %[[TMP_220:.*]] = mhlo.constant dense<2.000000e+00>
-  // CHECK: %[[TMP_221:.*]] = mhlo.add %[[TMP_5]], %[[TMP_220]]
-  // CHECK: %[[TMP_222:.*]] = mhlo.constant dense<1.000000e+00>
-  // CHECK: %[[TMP_223:.*]] = mhlo.add %[[TMP_5]], %[[TMP_222]]
-  // CHECK: %[[TMP_224:.*]] = mhlo.multiply %[[TMP_221]], %[[TMP_223]]
-  // CHECK: %[[TMP_225:.*]] = mhlo.constant dense<-0.0013888888888888889>
-  // CHECK: %[[TMP_226:.*]] = mhlo.add %[[TMP_219]], %[[TMP_225]]
-  // CHECK: %[[TMP_227:.*]] = mhlo.multiply %[[TMP_129]], %[[TMP_226]]
-  // CHECK: %[[TMP_228:.*]] = mhlo.multiply %[[TMP_224]], %[[TMP_227]]
-  // CHECK: %[[TMP_229:.*]] = mhlo.constant dense<5.000000e-01>
-  // CHECK: %[[TMP_230:.*]] = mhlo.divide %[[TMP_5]], %[[TMP_121]]
-  // CHECK: %[[TMP_231:.*]] = mhlo.constant dense<0.083333333333333329>
-  // CHECK: %[[TMP_232:.*]] = mhlo.add %[[TMP_231]], %[[TMP_228]]
-  // CHECK: %[[TMP_233:.*]] = mhlo.multiply %[[TMP_230]], %[[TMP_232]]
-  // CHECK: %[[TMP_234:.*]] = mhlo.add %[[TMP_229]], %[[TMP_233]]
-  // CHECK: %[[TMP_235:.*]] = mhlo.multiply %[[TMP_122]], %[[TMP_234]]
-  // CHECK: %[[TMP_236:.*]] = mhlo.add %[[TMP_127]], %[[TMP_235]]
+  // CHECK: %[[TMP_124:.*]] = mhlo.multiply %[[TMP_122]], %[[TMP_121]]
+  // CHECK: %[[TMP_125:.*]] = mhlo.subtract %[[TMP_5]], %[[TMP_123]]
+  // CHECK: %[[TMP_126:.*]] = mhlo.divide %[[TMP_124]], %[[TMP_125]]
+  // CHECK: %[[TMP_127:.*]] = mhlo.multiply %[[TMP_121]], %[[TMP_121]]
+  // CHECK: %[[TMP_128:.*]] = mhlo.divide %[[TMP_91]], %[[TMP_127]]
+  // CHECK: %[[TMP_129:.*]] = mhlo.constant dense<2.200000e+01>
+  // CHECK: %[[TMP_130:.*]] = mhlo.add %[[TMP_5]], %[[TMP_129]]
+  // CHECK: %[[TMP_131:.*]] = mhlo.constant dense<2.100000e+01>
+  // CHECK: %[[TMP_132:.*]] = mhlo.add %[[TMP_5]], %[[TMP_131]]
+  // CHECK: %[[TMP_133:.*]] = mhlo.multiply %[[TMP_130]], %[[TMP_132]]
+  // CHECK: %[[TMP_134:.*]] = mhlo.constant dense<-1.3954464685812522E-19>
+  // CHECK: %[[TMP_135:.*]] = mhlo.add %[[TMP_90]], %[[TMP_134]]
+  // CHECK: %[[TMP_136:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_135]]
+  // CHECK: %[[TMP_137:.*]] = mhlo.multiply %[[TMP_133]], %[[TMP_136]]
+  // CHECK: %[[TMP_138:.*]] = mhlo.constant dense<2.000000e+01>
+  // CHECK: %[[TMP_139:.*]] = mhlo.add %[[TMP_5]], %[[TMP_138]]
+  // CHECK: %[[TMP_140:.*]] = mhlo.constant dense<1.900000e+01>
+  // CHECK: %[[TMP_141:.*]] = mhlo.add %[[TMP_5]], %[[TMP_140]]
+  // CHECK: %[[TMP_142:.*]] = mhlo.multiply %[[TMP_139]], %[[TMP_141]]
+  // CHECK: %[[TMP_143:.*]] = mhlo.constant dense<5.5090028283602295E-18>
+  // CHECK: %[[TMP_144:.*]] = mhlo.add %[[TMP_137]], %[[TMP_143]]
+  // CHECK: %[[TMP_145:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_144]]
+  // CHECK: %[[TMP_146:.*]] = mhlo.multiply %[[TMP_142]], %[[TMP_145]]
+  // CHECK: %[[TMP_147:.*]] = mhlo.constant dense<1.800000e+01>
+  // CHECK: %[[TMP_148:.*]] = mhlo.add %[[TMP_5]], %[[TMP_147]]
+  // CHECK: %[[TMP_149:.*]] = mhlo.constant dense<1.700000e+01>
+  // CHECK: %[[TMP_150:.*]] = mhlo.add %[[TMP_5]], %[[TMP_149]]
+  // CHECK: %[[TMP_151:.*]] = mhlo.multiply %[[TMP_148]], %[[TMP_150]]
+  // CHECK: %[[TMP_152:.*]] = mhlo.constant dense<-2.1748686985580617E-16>
+  // CHECK: %[[TMP_153:.*]] = mhlo.add %[[TMP_146]], %[[TMP_152]]
+  // CHECK: %[[TMP_154:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_153]]
+  // CHECK: %[[TMP_155:.*]] = mhlo.multiply %[[TMP_151]], %[[TMP_154]]
+  // CHECK: %[[TMP_156:.*]] = mhlo.constant dense<1.600000e+01>
+  // CHECK: %[[TMP_157:.*]] = mhlo.add %[[TMP_5]], %[[TMP_156]]
+  // CHECK: %[[TMP_158:.*]] = mhlo.constant dense<1.500000e+01>
+  // CHECK: %[[TMP_159:.*]] = mhlo.add %[[TMP_5]], %[[TMP_158]]
+  // CHECK: %[[TMP_160:.*]] = mhlo.multiply %[[TMP_157]], %[[TMP_159]]
+  // CHECK: %[[TMP_161:.*]] = mhlo.constant dense<8.5860620562778452E-15>
+  // CHECK: %[[TMP_162:.*]] = mhlo.add %[[TMP_155]], %[[TMP_161]]
+  // CHECK: %[[TMP_163:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_162]]
+  // CHECK: %[[TMP_164:.*]] = mhlo.multiply %[[TMP_160]], %[[TMP_163]]
+  // CHECK: %[[TMP_165:.*]] = mhlo.constant dense<1.400000e+01>
+  // CHECK: %[[TMP_166:.*]] = mhlo.add %[[TMP_5]], %[[TMP_165]]
+  // CHECK: %[[TMP_167:.*]] = mhlo.constant dense<1.300000e+01>
+  // CHECK: %[[TMP_168:.*]] = mhlo.add %[[TMP_5]], %[[TMP_167]]
+  // CHECK: %[[TMP_169:.*]] = mhlo.multiply %[[TMP_166]], %[[TMP_168]]
+  // CHECK: %[[TMP_170:.*]] = mhlo.constant dense<-3.3896802963225832E-13>
+  // CHECK: %[[TMP_171:.*]] = mhlo.add %[[TMP_164]], %[[TMP_170]]
+  // CHECK: %[[TMP_172:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_171]]
+  // CHECK: %[[TMP_173:.*]] = mhlo.multiply %[[TMP_169]], %[[TMP_172]]
+  // CHECK: %[[TMP_174:.*]] = mhlo.constant dense<1.200000e+01>
+  // CHECK: %[[TMP_175:.*]] = mhlo.add %[[TMP_5]], %[[TMP_174]]
+  // CHECK: %[[TMP_176:.*]] = mhlo.constant dense<1.100000e+01>
+  // CHECK: %[[TMP_177:.*]] = mhlo.add %[[TMP_5]], %[[TMP_176]]
+  // CHECK: %[[TMP_178:.*]] = mhlo.multiply %[[TMP_175]], %[[TMP_177]]
+  // CHECK: %[[TMP_179:.*]] = mhlo.constant dense<1.3382536530684679E-11>
+  // CHECK: %[[TMP_180:.*]] = mhlo.add %[[TMP_173]], %[[TMP_179]]
+  // CHECK: %[[TMP_181:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_180]]
+  // CHECK: %[[TMP_182:.*]] = mhlo.multiply %[[TMP_178]], %[[TMP_181]]
+  // CHECK: %[[TMP_183:.*]] = mhlo.constant dense<1.000000e+01>
+  // CHECK: %[[TMP_184:.*]] = mhlo.add %[[TMP_5]], %[[TMP_183]]
+  // CHECK: %[[TMP_185:.*]] = mhlo.constant dense<9.000000e+00>
+  // CHECK: %[[TMP_186:.*]] = mhlo.add %[[TMP_5]], %[[TMP_185]]
+  // CHECK: %[[TMP_187:.*]] = mhlo.multiply %[[TMP_184]], %[[TMP_186]]
+  // CHECK: %[[TMP_188:.*]] = mhlo.constant dense<-5.2841901386874932E-10>
+  // CHECK: %[[TMP_189:.*]] = mhlo.add %[[TMP_182]], %[[TMP_188]]
+  // CHECK: %[[TMP_190:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_189]]
+  // CHECK: %[[TMP_191:.*]] = mhlo.multiply %[[TMP_187]], %[[TMP_190]]
+  // CHECK: %[[TMP_192:.*]] = mhlo.constant dense<8.000000e+00>
+  // CHECK: %[[TMP_193:.*]] = mhlo.add %[[TMP_5]], %[[TMP_192]]
+  // CHECK: %[[TMP_194:.*]] = mhlo.constant dense<7.000000e+00>
+  // CHECK: %[[TMP_195:.*]] = mhlo.add %[[TMP_5]], %[[TMP_194]]
+  // CHECK: %[[TMP_196:.*]] = mhlo.multiply %[[TMP_193]], %[[TMP_195]]
+  // CHECK: %[[TMP_197:.*]] = mhlo.constant dense<2.08767569878681E-8>
+  // CHECK: %[[TMP_198:.*]] = mhlo.add %[[TMP_191]], %[[TMP_197]]
+  // CHECK: %[[TMP_199:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_198]]
+  // CHECK: %[[TMP_200:.*]] = mhlo.multiply %[[TMP_196]], %[[TMP_199]]
+  // CHECK: %[[TMP_201:.*]] = mhlo.constant dense<6.000000e+00>
+  // CHECK: %[[TMP_202:.*]] = mhlo.add %[[TMP_5]], %[[TMP_201]]
+  // CHECK: %[[TMP_203:.*]] = mhlo.constant dense<5.000000e+00>
+  // CHECK: %[[TMP_204:.*]] = mhlo.add %[[TMP_5]], %[[TMP_203]]
+  // CHECK: %[[TMP_205:.*]] = mhlo.multiply %[[TMP_202]], %[[TMP_204]]
+  // CHECK: %[[TMP_206:.*]] = mhlo.constant dense<-8.2671957671957675E-7>
+  // CHECK: %[[TMP_207:.*]] = mhlo.add %[[TMP_200]], %[[TMP_206]]
+  // CHECK: %[[TMP_208:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_207]]
+  // CHECK: %[[TMP_209:.*]] = mhlo.multiply %[[TMP_205]], %[[TMP_208]]
+  // CHECK: %[[TMP_210:.*]] = mhlo.constant dense<4.000000e+00>
+  // CHECK: %[[TMP_211:.*]] = mhlo.add %[[TMP_5]], %[[TMP_210]]
+  // CHECK: %[[TMP_212:.*]] = mhlo.constant dense<3.000000e+00>
+  // CHECK: %[[TMP_213:.*]] = mhlo.add %[[TMP_5]], %[[TMP_212]]
+  // CHECK: %[[TMP_214:.*]] = mhlo.multiply %[[TMP_211]], %[[TMP_213]]
+  // CHECK: %[[TMP_215:.*]] = mhlo.constant dense<3.3068783068783071E-5>
+  // CHECK: %[[TMP_216:.*]] = mhlo.add %[[TMP_209]], %[[TMP_215]]
+  // CHECK: %[[TMP_217:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_216]]
+  // CHECK: %[[TMP_218:.*]] = mhlo.multiply %[[TMP_214]], %[[TMP_217]]
+  // CHECK: %[[TMP_219:.*]] = mhlo.constant dense<2.000000e+00>
+  // CHECK: %[[TMP_220:.*]] = mhlo.add %[[TMP_5]], %[[TMP_219]]
+  // CHECK: %[[TMP_221:.*]] = mhlo.constant dense<1.000000e+00>
+  // CHECK: %[[TMP_222:.*]] = mhlo.add %[[TMP_5]], %[[TMP_221]]
+  // CHECK: %[[TMP_223:.*]] = mhlo.multiply %[[TMP_220]], %[[TMP_222]]
+  // CHECK: %[[TMP_224:.*]] = mhlo.constant dense<-0.0013888888888888889>
+  // CHECK: %[[TMP_225:.*]] = mhlo.add %[[TMP_218]], %[[TMP_224]]
+  // CHECK: %[[TMP_226:.*]] = mhlo.multiply %[[TMP_128]], %[[TMP_225]]
+  // CHECK: %[[TMP_227:.*]] = mhlo.multiply %[[TMP_223]], %[[TMP_226]]
+  // CHECK: %[[TMP_228:.*]] = mhlo.constant dense<5.000000e-01>
+  // CHECK: %[[TMP_229:.*]] = mhlo.divide %[[TMP_5]], %[[TMP_121]]
+  // CHECK: %[[TMP_230:.*]] = mhlo.constant dense<0.083333333333333329>
+  // CHECK: %[[TMP_231:.*]] = mhlo.add %[[TMP_230]], %[[TMP_227]]
+  // CHECK: %[[TMP_232:.*]] = mhlo.multiply %[[TMP_229]], %[[TMP_231]]
+  // CHECK: %[[TMP_233:.*]] = mhlo.add %[[TMP_228]], %[[TMP_232]]
+  // CHECK: %[[TMP_234:.*]] = mhlo.multiply %[[TMP_122]], %[[TMP_233]]
+  // CHECK: %[[TMP_235:.*]] = mhlo.add %[[TMP_120]], %[[TMP_126]]
+  // CHECK: %[[TMP_236:.*]] = mhlo.add %[[TMP_235]], %[[TMP_234]]
   // CHECK: %[[TMP_237:.*]] = mhlo.abs %[[TMP_122]]
   // CHECK: %[[TMP_238:.*]] = mhlo.abs %[[TMP_120]]
   // CHECK: %[[TMP_239:.*]] = mhlo.constant dense<4.940660e-324>
@@ -1939,7 +1944,7 @@ func.func @polygamma_f64(%lhs : tensor<f64>, %rhs : tensor<f64>) -> tensor<f64>
   // CHECK: %[[TMP_260:.*]] = mhlo.and %[[TMP_257]], %[[TMP_259]]
   // CHECK: %[[TMP_261:.*]] = mhlo.select %[[TMP_260]], %[[TMP_251]], %[[TMP_243]]
   // CHECK: %[[TMP_262:.*]] = mhlo.select %[[TMP_254]], %[[TMP_261]], %[[TMP_250]]
-  // CHECK: %[[TMP_263:.*]] = mhlo.compare EQ, %[[TMP_5]], %[[TMP_93]], NOTYPE
+  // CHECK: %[[TMP_263:.*]] = mhlo.compare EQ, %[[TMP_5]], %[[TMP_91]], NOTYPE
   // CHECK: %[[TMP_264:.*]] = mhlo.select %[[TMP_263]], %[[TMP_251]], %[[TMP_262]]
   // CHECK: %[[TMP_265:.*]] = mhlo.multiply %[[TMP_4]], %[[TMP_89]]
   // CHECK: %[[TMP_266:.*]] = mhlo.multiply %[[TMP_265]], %[[TMP_264]]
@@ -2256,12 +2261,9 @@ func.func @next_after_f32(%x: tensor<2xf32>, %y: tensor<2xf32>) -> tensor<2xf32>
 // CHECK-LABEL: @tan_f16
 // CHECK-SAME: (%[[ARG:.*]]: tensor<f16>)
 func.func @tan_f16(%arg : tensor<f16>) -> tensor<f16> {
-  // %[[TMP_0:.*]] = mhlo.convert [[ARG]] : (tensor<f16>) -> tensor<f32>
-  // %[[TMP_1:.*]] = mhlo.sine %[[TMP_0]]
-  // %[[TMP_2:.*]] = mhlo.cosine %[[TMP_0]]
-  // %[[TMP_3:.*]] = mhlo.divide %[[TMP_1]], %[[TMP_2]]
-  // %[[TMP_4:.*]] = mhlo.convert %[[TMP_3]] : (tensor<f32>) -> tensor<f16>
-  // return %[[TMP_4]] : tensor<f16>
+  // CHECK-HIGH-LEVEL: mhlo.tan
+  // CHECK: %[[RESULT:.*]] = mhlo.tan %[[ARG]] : tensor<f16>
+  // CHECK: return %[[RESULT]]
   %1 = chlo.tan %arg : tensor<f16> -> tensor<f16>
   func.return %1 : tensor<f16>
 }
@@ -2271,10 +2273,9 @@ func.func @tan_f16(%arg : tensor<f16>) -> tensor<f16> {
 // CHECK-LABEL: @tan_f32
 // CHECK-SAME: (%[[ARG:.*]]: tensor<f32>)
 func.func @tan_f32(%arg : tensor<f32>) -> tensor<f32> {
-  // %[[TMP_0:.*]] = mhlo.sine %[[ARG]]
-  // %[[TMP_1:.*]] = mhlo.cosine %[[ARG]]
-  // %[[TMP_2:.*]] = mhlo.divide %[[TMP_0]], %[[TMP_1]]
-  // return %[[TMP_2]] : tensor<f32>
+  // CHECK-HIGH-LEVEL: mhlo.tan
+  // CHECK: %[[RESULT:.*]] = mhlo.tan %[[ARG]] : tensor<f32>
+  // CHECK: return %[[RESULT]]
   %1 = chlo.tan %arg : tensor<f32> -> tensor<f32>
   func.return %1 : tensor<f32>
 }
@@ -2284,6 +2285,7 @@ func.func @tan_f32(%arg : tensor<f32>) -> tensor<f32> {
 // CHECK-LABEL: @top_k
 // CHECK-SAME: (%[[ARG:.*]]: tensor<16x16xf32>)
 func.func @top_k(%arg : tensor<16x16xf32>) -> (tensor<16x8xf32>, tensor<16x8xi32>) {
+  // CHECK-HIGH-LEVEL: mhlo.topk
   // CHECK: %values, %indices = mhlo.topk(%arg0, k = 8, largest = true) : tensor<16x16xf32> -> (tensor<16x8xf32>, tensor<16x8xi32>)
   %1:2 = chlo.top_k(%arg, k=8) : tensor<16x16xf32> -> (tensor<16x8xf32>, tensor<16x8xi32>)
   func.return %1#0, %1#1 : tensor<16x8xf32>, tensor<16x8xi32>
@@ -2295,6 +2297,7 @@ func.func @top_k(%arg : tensor<16x16xf32>) -> (tensor<16x8xf32>, tensor<16x8xi32
 // CHECK-SAME: ([[ARG:%.*]]: tensor<?x5x?xi1>
 // CHECK-SAME: -> (tensor<?x5x2xi1>, tensor<?x5x2xi32>)
 func.func @dyn_top_k(%arg0: tensor<?x5x?xi1>) -> (tensor<?x5x2xi1>, tensor<?x5x2xi32>) {
+  // CHECK-HIGH-LEVEL: mhlo.topk
   // CHECK: %values, %indices = mhlo.topk(%arg0, k = 2, largest = true) : tensor<?x5x?xi1> -> (tensor<?x5x2xi1>, tensor<?x5x2xi32>)
   %values, %indices = chlo.top_k(%arg0, k = 2) : tensor<?x5x?xi1> -> (tensor<?x5x2xi1>, tensor<?x5x2xi32>)
   return %values, %indices : tensor<?x5x2xi1>, tensor<?x5x2xi32>
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/chlo/chlo_legalize_to_mhlo_basis_ops.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/chlo/chlo_legalize_to_mhlo_basis_ops.mlir
deleted file mode 100644
index 0a9b7eef41657e..00000000000000
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/chlo/chlo_legalize_to_mhlo_basis_ops.mlir
+++ /dev/null
@@ -1,276 +0,0 @@
-// RUN: mlir-hlo-opt --chlo-legalize-to-hlo-basis-ops --chlo-legalize-to-hlo --split-input-file -verify-diagnostics %s | FileCheck %s 
-
-// -----
-
-// CHECK-LABEL: @erf_f64
-// CHECK-SAME: %[[ARG:.*]]: tensor<f64>
-func.func @erf_f64(%arg : tensor<f64>) -> tensor<f64> {
-  // CHECK: %[[TMP_0:.*]] = mhlo.multiply %[[ARG]], %[[ARG]]
-  // CHECK: %[[TMP_3:.*]] = mhlo.constant dense<9.6049737398705161>
-  // CHECK: %[[TMP_5:.*]] = mhlo.multiply %[[TMP_3]], %[[TMP_0]]
-  // CHECK: %[[TMP_6:.*]] = mhlo.constant dense<90.026019720384269>
-  // CHECK: %[[TMP_7:.*]] = mhlo.add %[[TMP_5]], %[[TMP_6]]
-  // CHECK: %[[TMP_8:.*]] = mhlo.multiply %[[TMP_7]], %[[TMP_0]]
-  // CHECK: %[[TMP_9:.*]] = mhlo.constant dense<2232.0053459468431>
-  // CHECK: %[[TMP_10:.*]] = mhlo.add %[[TMP_8]], %[[TMP_9]]
-  // CHECK: %[[TMP_11:.*]] = mhlo.multiply %[[TMP_10]], %[[TMP_0]]
-  // CHECK: %[[TMP_12:.*]] = mhlo.constant dense<7003.3251411280507>
-  // CHECK: %[[TMP_13:.*]] = mhlo.add %[[TMP_11]], %[[TMP_12]]
-  // CHECK: %[[TMP_14:.*]] = mhlo.multiply %[[TMP_13]], %[[TMP_0]]
-  // CHECK: %[[TMP_15:.*]] = mhlo.constant dense<55592.301301039493>
-  // CHECK: %[[TMP_16:.*]] = mhlo.add %[[TMP_14]], %[[TMP_15]]
-  // CHECK: %[[TMP_17:.*]] = mhlo.multiply %[[ARG]], %[[TMP_16]]
-  // CHECK: %[[TMP_20:.*]] = mhlo.constant dense<1.000000e+00>
-  // CHECK: %[[TMP_22:.*]] = mhlo.multiply %[[TMP_20]], %[[TMP_0]]
-  // CHECK: %[[TMP_23:.*]] = mhlo.constant dense<33.561714164750313>
-  // CHECK: %[[TMP_24:.*]] = mhlo.add %[[TMP_22]], %[[TMP_23]]
-  // CHECK: %[[TMP_25:.*]] = mhlo.multiply %[[TMP_24]], %[[TMP_0]]
-  // CHECK: %[[TMP_26:.*]] = mhlo.constant dense<521.35794978015269>
-  // CHECK: %[[TMP_27:.*]] = mhlo.add %[[TMP_25]], %[[TMP_26]]
-  // CHECK: %[[TMP_28:.*]] = mhlo.multiply %[[TMP_27]], %[[TMP_0]]
-  // CHECK: %[[TMP_29:.*]] = mhlo.constant dense<4594.3238297098014>
-  // CHECK: %[[TMP_30:.*]] = mhlo.add %[[TMP_28]], %[[TMP_29]]
-  // CHECK: %[[TMP_31:.*]] = mhlo.multiply %[[TMP_30]], %[[TMP_0]]
-  // CHECK: %[[TMP_32:.*]] = mhlo.constant dense<22629.000061389095>
-  // CHECK: %[[TMP_33:.*]] = mhlo.add %[[TMP_31]], %[[TMP_32]]
-  // CHECK: %[[TMP_34:.*]] = mhlo.multiply %[[TMP_33]], %[[TMP_0]]
-  // CHECK: %[[TMP_35:.*]] = mhlo.constant dense<49267.394260863592>
-  // CHECK: %[[TMP_36:.*]] = mhlo.add %[[TMP_34]], %[[TMP_35]]
-  // CHECK: %[[TMP_37:.*]] = mhlo.divide %[[TMP_17]], %[[TMP_36]]
-  // CHECK: %[[TMP_38:.*]] = mhlo.constant dense<1.000000e+00>
-  // CHECK: %[[TMP_39:.*]] = mhlo.multiply %[[ARG]], %[[ARG]]
-  // CHECK: %[[TMP_40:.*]] = mhlo.negate %[[TMP_39]]
-  // CHECK: %[[TMP_41:.*]] = mhlo.exponential %[[TMP_40]]
-  // CHECK: %[[TMP_42:.*]] = mhlo.abs %[[ARG]]
-  // CHECK: %[[TMP_45:.*]] = mhlo.constant dense<2.4619698147353052E-10>
-  // CHECK: %[[TMP_47:.*]] = mhlo.multiply %[[TMP_45]], %[[TMP_42]]
-  // CHECK: %[[TMP_48:.*]] = mhlo.constant dense<0.56418956483106886>
-  // CHECK: %[[TMP_49:.*]] = mhlo.add %[[TMP_47]], %[[TMP_48]]
-  // CHECK: %[[TMP_50:.*]] = mhlo.multiply %[[TMP_49]], %[[TMP_42]]
-  // CHECK: %[[TMP_51:.*]] = mhlo.constant dense<7.4632105644226989>
-  // CHECK: %[[TMP_52:.*]] = mhlo.add %[[TMP_50]], %[[TMP_51]]
-  // CHECK: %[[TMP_53:.*]] = mhlo.multiply %[[TMP_52]], %[[TMP_42]]
-  // CHECK: %[[TMP_54:.*]] = mhlo.constant dense<48.637197098568137>
-  // CHECK: %[[TMP_55:.*]] = mhlo.add %[[TMP_53]], %[[TMP_54]]
-  // CHECK: %[[TMP_56:.*]] = mhlo.multiply %[[TMP_55]], %[[TMP_42]]
-  // CHECK: %[[TMP_57:.*]] = mhlo.constant dense<196.5208329560771>
-  // CHECK: %[[TMP_58:.*]] = mhlo.add %[[TMP_56]], %[[TMP_57]]
-  // CHECK: %[[TMP_59:.*]] = mhlo.multiply %[[TMP_58]], %[[TMP_42]]
-  // CHECK: %[[TMP_60:.*]] = mhlo.constant dense<526.44519499547732>
-  // CHECK: %[[TMP_61:.*]] = mhlo.add %[[TMP_59]], %[[TMP_60]]
-  // CHECK: %[[TMP_62:.*]] = mhlo.multiply %[[TMP_61]], %[[TMP_42]]
-  // CHECK: %[[TMP_63:.*]] = mhlo.constant dense<934.52852717195765>
-  // CHECK: %[[TMP_64:.*]] = mhlo.add %[[TMP_62]], %[[TMP_63]]
-  // CHECK: %[[TMP_65:.*]] = mhlo.multiply %[[TMP_64]], %[[TMP_42]]
-  // CHECK: %[[TMP_66:.*]] = mhlo.constant dense<1027.5518868951572>
-  // CHECK: %[[TMP_67:.*]] = mhlo.add %[[TMP_65]], %[[TMP_66]]
-  // CHECK: %[[TMP_68:.*]] = mhlo.multiply %[[TMP_67]], %[[TMP_42]]
-  // CHECK: %[[TMP_69:.*]] = mhlo.constant dense<557.53533536939938>
-  // CHECK: %[[TMP_70:.*]] = mhlo.add %[[TMP_68]], %[[TMP_69]]
-  // CHECK: %[[TMP_71:.*]] = mhlo.multiply %[[TMP_41]], %[[TMP_70]]
-  // CHECK: %[[TMP_74:.*]] = mhlo.constant dense<1.000000e+00>
-  // CHECK: %[[TMP_76:.*]] = mhlo.multiply %[[TMP_74]], %[[TMP_42]]
-  // CHECK: %[[TMP_77:.*]] = mhlo.constant dense<13.228195115474499>
-  // CHECK: %[[TMP_78:.*]] = mhlo.add %[[TMP_76]], %[[TMP_77]]
-  // CHECK: %[[TMP_79:.*]] = mhlo.multiply %[[TMP_78]], %[[TMP_42]]
-  // CHECK: %[[TMP_80:.*]] = mhlo.constant dense<86.707214088598973>
-  // CHECK: %[[TMP_81:.*]] = mhlo.add %[[TMP_79]], %[[TMP_80]]
-  // CHECK: %[[TMP_82:.*]] = mhlo.multiply %[[TMP_81]], %[[TMP_42]]
-  // CHECK: %[[TMP_83:.*]] = mhlo.constant dense<354.93777888781989>
-  // CHECK: %[[TMP_84:.*]] = mhlo.add %[[TMP_82]], %[[TMP_83]]
-  // CHECK: %[[TMP_85:.*]] = mhlo.multiply %[[TMP_84]], %[[TMP_42]]
-  // CHECK: %[[TMP_86:.*]] = mhlo.constant dense<975.70850174320549>
-  // CHECK: %[[TMP_87:.*]] = mhlo.add %[[TMP_85]], %[[TMP_86]]
-  // CHECK: %[[TMP_88:.*]] = mhlo.multiply %[[TMP_87]], %[[TMP_42]]
-  // CHECK: %[[TMP_89:.*]] = mhlo.constant dense<1823.9091668790973>
-  // CHECK: %[[TMP_90:.*]] = mhlo.add %[[TMP_88]], %[[TMP_89]]
-  // CHECK: %[[TMP_91:.*]] = mhlo.multiply %[[TMP_90]], %[[TMP_42]]
-  // CHECK: %[[TMP_92:.*]] = mhlo.constant dense<2246.3376081871097>
-  // CHECK: %[[TMP_93:.*]] = mhlo.add %[[TMP_91]], %[[TMP_92]]
-  // CHECK: %[[TMP_94:.*]] = mhlo.multiply %[[TMP_93]], %[[TMP_42]]
-  // CHECK: %[[TMP_95:.*]] = mhlo.constant dense<1656.6630919416134>
-  // CHECK: %[[TMP_96:.*]] = mhlo.add %[[TMP_94]], %[[TMP_95]]
-  // CHECK: %[[TMP_97:.*]] = mhlo.multiply %[[TMP_96]], %[[TMP_42]]
-  // CHECK: %[[TMP_98:.*]] = mhlo.constant dense<557.53534081772773>
-  // CHECK: %[[TMP_99:.*]] = mhlo.add %[[TMP_97]], %[[TMP_98]]
-  // CHECK: %[[TMP_100:.*]] = mhlo.divide %[[TMP_71]], %[[TMP_99]]
-  // CHECK: %[[TMP_103:.*]] = mhlo.constant dense<0.56418958354775506>
-  // CHECK: %[[TMP_105:.*]] = mhlo.multiply %[[TMP_103]], %[[TMP_42]]
-  // CHECK: %[[TMP_106:.*]] = mhlo.constant dense<1.275366707599781>
-  // CHECK: %[[TMP_107:.*]] = mhlo.add %[[TMP_105]], %[[TMP_106]]
-  // CHECK: %[[TMP_108:.*]] = mhlo.multiply %[[TMP_107]], %[[TMP_42]]
-  // CHECK: %[[TMP_109:.*]] = mhlo.constant dense<5.0190504225118051>
-  // CHECK: %[[TMP_110:.*]] = mhlo.add %[[TMP_108]], %[[TMP_109]]
-  // CHECK: %[[TMP_111:.*]] = mhlo.multiply %[[TMP_110]], %[[TMP_42]]
-  // CHECK: %[[TMP_112:.*]] = mhlo.constant dense<6.160210979930536>
-  // CHECK: %[[TMP_113:.*]] = mhlo.add %[[TMP_111]], %[[TMP_112]]
-  // CHECK: %[[TMP_114:.*]] = mhlo.multiply %[[TMP_113]], %[[TMP_42]]
-  // CHECK: %[[TMP_115:.*]] = mhlo.constant dense<7.4097426995044895>
-  // CHECK: %[[TMP_116:.*]] = mhlo.add %[[TMP_114]], %[[TMP_115]]
-  // CHECK: %[[TMP_117:.*]] = mhlo.multiply %[[TMP_116]], %[[TMP_42]]
-  // CHECK: %[[TMP_118:.*]] = mhlo.constant dense<2.9788666537210022>
-  // CHECK: %[[TMP_119:.*]] = mhlo.add %[[TMP_117]], %[[TMP_118]]
-  // CHECK: %[[TMP_120:.*]] = mhlo.multiply %[[TMP_41]], %[[TMP_119]]
-  // CHECK: %[[TMP_123:.*]] = mhlo.constant dense<1.000000e+00>
-  // CHECK: %[[TMP_125:.*]] = mhlo.multiply %[[TMP_123]], %[[TMP_42]]
-  // CHECK: %[[TMP_126:.*]] = mhlo.constant dense<2.2605286322011726>
-  // CHECK: %[[TMP_127:.*]] = mhlo.add %[[TMP_125]], %[[TMP_126]]
-  // CHECK: %[[TMP_128:.*]] = mhlo.multiply %[[TMP_127]], %[[TMP_42]]
-  // CHECK: %[[TMP_129:.*]] = mhlo.constant dense<9.3960352493800147>
-  // CHECK: %[[TMP_130:.*]] = mhlo.add %[[TMP_128]], %[[TMP_129]]
-  // CHECK: %[[TMP_131:.*]] = mhlo.multiply %[[TMP_130]], %[[TMP_42]]
-  // CHECK: %[[TMP_132:.*]] = mhlo.constant dense<12.048953980809666>
-  // CHECK: %[[TMP_133:.*]] = mhlo.add %[[TMP_131]], %[[TMP_132]]
-  // CHECK: %[[TMP_134:.*]] = mhlo.multiply %[[TMP_133]], %[[TMP_42]]
-  // CHECK: %[[TMP_135:.*]] = mhlo.constant dense<17.081445074756591>
-  // CHECK: %[[TMP_136:.*]] = mhlo.add %[[TMP_134]], %[[TMP_135]]
-  // CHECK: %[[TMP_137:.*]] = mhlo.multiply %[[TMP_136]], %[[TMP_42]]
-  // CHECK: %[[TMP_138:.*]] = mhlo.constant dense<9.6089680906328585>
-  // CHECK: %[[TMP_139:.*]] = mhlo.add %[[TMP_137]], %[[TMP_138]]
-  // CHECK: %[[TMP_140:.*]] = mhlo.multiply %[[TMP_139]], %[[TMP_42]]
-  // CHECK: %[[TMP_141:.*]] = mhlo.constant dense<3.3690764510008151>
-  // CHECK: %[[TMP_142:.*]] = mhlo.add %[[TMP_140]], %[[TMP_141]]
-  // CHECK: %[[TMP_143:.*]] = mhlo.divide %[[TMP_120]], %[[TMP_142]]
-  // CHECK: %[[TMP_144:.*]] = mhlo.constant dense<8.000000e+00>
-  // CHECK: %[[TMP_145:.*]] = mhlo.compare LT, %[[TMP_42]], %[[TMP_144]], NOTYPE
-  // CHECK: %[[TMP_146:.*]] = mhlo.select %[[TMP_145]], %[[TMP_100]], %[[TMP_143]]
-  // CHECK: %[[TMP_147:.*]] = mhlo.constant dense<-709.78271289338397>
-  // CHECK: %[[TMP_148:.*]] = mhlo.compare LT, %[[TMP_40]], %[[TMP_147]], NOTYPE
-  // CHECK: %[[TMP_149:.*]] = mhlo.constant dense<0.000000e+00>
-  // CHECK: %[[TMP_150:.*]] = mhlo.select %[[TMP_148]], %[[TMP_149]], %[[TMP_146]]
-  // CHECK: %[[TMP_152:.*]] = mhlo.compare LT, %[[ARG]], %[[TMP_149]], NOTYPE
-  // CHECK: %[[TMP_153:.*]] = mhlo.constant dense<2.000000e+00>
-  // CHECK: %[[TMP_154:.*]] = mhlo.subtract %[[TMP_153]], %[[TMP_150]]
-  // CHECK: %[[TMP_155:.*]] = mhlo.select %[[TMP_152]], %[[TMP_154]], %[[TMP_150]]
-  // CHECK: %[[TMP_156:.*]] = mhlo.subtract %[[TMP_38]], %[[TMP_155]]
-  // CHECK: %[[TMP_157:.*]] = mhlo.abs %[[ARG]]
-  // CHECK: %[[TMP_159:.*]] = mhlo.compare LT, %[[TMP_157]], %[[TMP_38]], NOTYPE
-  // CHECK: %[[RESULT:.*]] = mhlo.select %[[TMP_159]], %[[TMP_37]], %[[TMP_156]]
-  // CHECK: return %[[RESULT]]
-  %1 = "chlo.erf"(%arg) : (tensor<f64>) -> tensor<f64>
-  func.return %1 : tensor<f64>
-}
-
-// -----
-
-// CHECK-LABEL: @erf_f32
-// CHECK-SAME: %[[ARG:.*]]: tensor<f32>
-func.func @erf_f32(%arg : tensor<f32>) -> tensor<f32> {
-  // CHECK-DAG: %[[TMP_0:.*]] = mhlo.constant dense<-4.000000e+00>
-  // CHECK-DAG: %[[TMP_1:.*]] = mhlo.constant dense<4.000000e+00>
-  // CHECK: %[[TMP_2:.*]] = mhlo.clamp %[[TMP_0]], %[[ARG]], %[[TMP_1]]
-  // CHECK: %[[TMP_3:.*]] = mhlo.multiply %[[TMP_2]], %[[TMP_2]]
-  // CHECK: %[[TMP_6:.*]] = mhlo.constant dense<-2.72614237E-10>
-  // CHECK: %[[TMP_8:.*]] = mhlo.multiply %[[TMP_6]], %[[TMP_3]]
-  // CHECK: %[[TMP_9:.*]] = mhlo.constant dense<2.77068146E-8>
-  // CHECK: %[[TMP_10:.*]] = mhlo.add %[[TMP_8]], %[[TMP_9]]
-  // CHECK: %[[TMP_11:.*]] = mhlo.multiply %[[TMP_10]], %[[TMP_3]]
-  // CHECK: %[[TMP_12:.*]] = mhlo.constant dense<-2.10102394E-6>
-  // CHECK: %[[TMP_13:.*]] = mhlo.add %[[TMP_11]], %[[TMP_12]]
-  // CHECK: %[[TMP_14:.*]] = mhlo.multiply %[[TMP_13]], %[[TMP_3]]
-  // CHECK: %[[TMP_15:.*]] = mhlo.constant dense<-5.69250624E-5>
-  // CHECK: %[[TMP_16:.*]] = mhlo.add %[[TMP_14]], %[[TMP_15]]
-  // CHECK: %[[TMP_17:.*]] = mhlo.multiply %[[TMP_16]], %[[TMP_3]]
-  // CHECK: %[[TMP_18:.*]] = mhlo.constant dense<-7.34990637E-4>
-  // CHECK: %[[TMP_19:.*]] = mhlo.add %[[TMP_17]], %[[TMP_18]]
-  // CHECK: %[[TMP_20:.*]] = mhlo.multiply %[[TMP_19]], %[[TMP_3]]
-  // CHECK: %[[TMP_21:.*]] = mhlo.constant dense<-2.954600e-03>
-  // CHECK: %[[TMP_22:.*]] = mhlo.add %[[TMP_20]], %[[TMP_21]]
-  // CHECK: %[[TMP_23:.*]] = mhlo.multiply %[[TMP_22]], %[[TMP_3]]
-  // CHECK: %[[TMP_24:.*]] = mhlo.constant dense<-0.0160960332>
-  // CHECK: %[[TMP_25:.*]] = mhlo.add %[[TMP_23]], %[[TMP_24]]
-  // CHECK: %[[TMP_28:.*]] = mhlo.constant dense<-1.45660715E-5>
-  // CHECK: %[[TMP_30:.*]] = mhlo.multiply %[[TMP_28]], %[[TMP_3]]
-  // CHECK: %[[TMP_31:.*]] = mhlo.constant dense<-2.13374049E-4>
-  // CHECK: %[[TMP_32:.*]] = mhlo.add %[[TMP_30]], %[[TMP_31]]
-  // CHECK: %[[TMP_33:.*]] = mhlo.multiply %[[TMP_32]], %[[TMP_3]]
-  // CHECK: %[[TMP_34:.*]] = mhlo.constant dense<-0.00168282702>
-  // CHECK: %[[TMP_35:.*]] = mhlo.add %[[TMP_33]], %[[TMP_34]]
-  // CHECK: %[[TMP_36:.*]] = mhlo.multiply %[[TMP_35]], %[[TMP_3]]
-  // CHECK: %[[TMP_37:.*]] = mhlo.constant dense<-0.00737332925>
-  // CHECK: %[[TMP_38:.*]] = mhlo.add %[[TMP_36]], %[[TMP_37]]
-  // CHECK: %[[TMP_39:.*]] = mhlo.multiply %[[TMP_38]], %[[TMP_3]]
-  // CHECK: %[[TMP_40:.*]] = mhlo.constant dense<-0.0142647391>
-  // CHECK: %[[TMP_41:.*]] = mhlo.add %[[TMP_39]], %[[TMP_40]]
-  // CHECK: %[[TMP_42:.*]] = mhlo.multiply %[[TMP_2]], %[[TMP_25]]
-  // CHECK: %[[TMP_43:.*]] = mhlo.divide %[[TMP_42]], %[[TMP_41]]
-  // CHECK-DAG: %[[TMP_44:.*]] = mhlo.constant dense<-1.000000e+00>
-  // CHECK-DAG: %[[TMP_45:.*]] = mhlo.constant dense<1.000000e+00>
-  // CHECK: %[[RESULT:.*]] = mhlo.clamp %[[TMP_44]], %[[TMP_43]], %[[TMP_45]]
-  // CHECK: return %[[RESULT]]
-  %1 = "chlo.erf"(%arg) : (tensor<f32>) -> tensor<f32>
-  func.return %1 : tensor<f32>
-}
-
-// -----
-
-// CHECK-LABEL: @erf_f16
-// CHECK-SAME: %[[ARG:.*]]: tensor<f16>
-func.func @erf_f16(%arg : tensor<f16>) -> tensor<f16> {
-  // CHECK: mhlo.convert %[[ARG]] : (tensor<f16>) -> tensor<f32>
-  // CHECK: %[[RESULT:.*]] = mhlo.convert %{{.*}} : (tensor<f32>) -> tensor<f16>
-  // CHECK: return %[[RESULT]]
-  %1 = "chlo.erf"(%arg) : (tensor<f16>) -> tensor<f16>
-  func.return %1 : tensor<f16>
-}
-
-// -----
-
-// CHECK-LABEL: @erf_bf16
-// CHECK-SAME: %[[ARG:.*]]: tensor<bf16>
-func.func @erf_bf16(%arg : tensor<bf16>) -> tensor<bf16> {
-  // CHECK: mhlo.convert %[[ARG]] : (tensor<bf16>) -> tensor<f32>
-  // CHECK: %[[RESULT:.*]] = mhlo.convert %{{.*}} : (tensor<f32>) -> tensor<bf16>
-  // CHECK: return %[[RESULT]]
-  %1 = "chlo.erf"(%arg) : (tensor<bf16>) -> tensor<bf16>
-  func.return %1 : tensor<bf16>
-}
-
-
-// CHECK-LABEL: @top_k
-// CHECK-SAME: (%[[ARG:.*]]: tensor<16x16xf32>)
-func.func @top_k(%arg : tensor<16x16xf32>) -> (tensor<16x8xf32>, tensor<16x8xi32>) {
-  // CHECK:      %[[IOTA:.*]] = "mhlo.iota"() {iota_dimension = 1 : i64}
-  // CHECK-NEXT: %[[SORT:.*]]:2 = "mhlo.sort"(%[[ARG]], %[[IOTA]]) ({
-  // CHECK-NEXT: ^{{.*}}(%[[LHS:.*]]: tensor<f32>, %[[RHS:.*]]: tensor<f32>, %{{.*}}: tensor<i32>, %{{.*}}: tensor<i32>):
-  // CHECK-NEXT:   %[[CMP:.*]] = mhlo.compare GT, %[[LHS]], %[[RHS]], TOTALORDER
-  // CHECK-NEXT:   mhlo.return %[[CMP]]
-  // CHECK-NEXT: }) {dimension = 1 : i64, is_stable = true} : (tensor<16x16xf32>, tensor<16x16xi32>) -> (tensor<16x16xf32>, tensor<16x16xi32>)
-  // CHECK-NEXT: %[[VAL:.*]] = "mhlo.slice"(%[[SORT]]#0) {limit_indices = dense<[16, 8]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
-  // CHECK-NEXT: %[[IDX:.*]] = "mhlo.slice"(%[[SORT]]#1) {limit_indices = dense<[16, 8]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
-  // CHECK-NEXT: return %[[VAL]], %[[IDX]]
-  %1:2 = chlo.top_k(%arg, k=8) : tensor<16x16xf32> -> (tensor<16x8xf32>, tensor<16x8xi32>)
-  func.return %1#0, %1#1 : tensor<16x8xf32>, tensor<16x8xi32>
-}
-
-// -----
-
-// CHECK-LABEL: @dyn_top_k
-// CHECK-SAME: ([[ARG:%.*]]: tensor<?x5x?xi1>
-// CHECK-SAME: -> (tensor<?x5x2xi1>, tensor<?x5x2xi32>)
-func.func @dyn_top_k(%arg0: tensor<?x5x?xi1>) -> (tensor<?x5x2xi1>, tensor<?x5x2xi32>) {
-  // CHECK-NEXT: [[DIM_0_I32:%.*]] = "mhlo.get_dimension_size"([[ARG]]) {dimension = 0 : i64} : (tensor<?x5x?xi1>) -> tensor<i32>
-  // CHECK-NEXT: [[DIM_0_I32x1:%.*]] = mhlo.reshape [[DIM_0_I32]] : (tensor<i32>) -> tensor<1xi32>
-  // CHECK-NEXT: [[DIM_1_I32:%.*]] = "mhlo.get_dimension_size"([[ARG]]) {dimension = 1 : i64} : (tensor<?x5x?xi1>) -> tensor<i32>
-  // CHECK-NEXT: [[DIM_1_I32x1:%.*]] = mhlo.reshape [[DIM_1_I32]] : (tensor<i32>) -> tensor<1xi32>
-  // CHECK-NEXT: [[DIM_2_I32:%.*]] = "mhlo.get_dimension_size"([[ARG]]) {dimension = 2 : i64} : (tensor<?x5x?xi1>) -> tensor<i32>
-  // CHECK-NEXT: [[DIM_2_I32x1:%.*]] = mhlo.reshape [[DIM_2_I32]] : (tensor<i32>) -> tensor<1xi32>
-  // CHECK-NEXT: [[IOTA_SHAPE:%.*]] = "mhlo.concatenate"([[DIM_0_I32x1]], [[DIM_1_I32x1]], [[DIM_2_I32x1]]) {dimension = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32>
-  // CHECK-NEXT: [[K_I32:%.*]] = mhlo.constant dense<2> : tensor<i32>
-  // CHECK-NEXT: [[K_I32x1:%.*]] = mhlo.reshape [[K_I32]] : (tensor<i32>) -> tensor<1xi32>
-  // CHECK-NEXT: [[RESULT_SHAPE:%.*]] = "mhlo.concatenate"([[DIM_0_I32x1]], [[DIM_1_I32x1]], [[K_I32x1]]) {dimension = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32>
-  // CHECK-NEXT: [[IOTA:%.*]] = "mhlo.dynamic_iota"([[IOTA_SHAPE]]) {iota_dimension = 2 : i64} : (tensor<3xi32>) -> tensor<?x5x?xi32>
-  // CHECK-NEXT: [[SORT:%.*]]:2 = "mhlo.sort"([[ARG]], [[IOTA]]) ({
-  // CHECK-NEXT: ^bb0([[ARG_1:%.*]]: tensor<i1>, [[ARG_2:%.*]]: tensor<i1>, [[ARG_3:%.*]]: tensor<i32>, [[ARG_4:%.*]]: tensor<i32>):
-  // CHECK-NEXT:   [[CMP:%.*]] = mhlo.compare  GT, [[ARG_1]], [[ARG_2]],  NOTYPE : (tensor<i1>, tensor<i1>) -> tensor<i1>
-  // CHECK-NEXT:   mhlo.return [[CMP]] : tensor<i1>
-  // CHECK-NEXT: }) {dimension = 2 : i64, is_stable = true} : (tensor<?x5x?xi1>, tensor<?x5x?xi32>) -> (tensor<?x5x?xi1>, tensor<?x5x?xi32>)
-  // CHECK-NEXT: [[STARTS:%.*]] = mhlo.constant dense<0> : tensor<3xi64>
-  // CHECK-NEXT: [[LIMITS:%.*]] = mhlo.convert [[RESULT_SHAPE]] : (tensor<3xi32>) -> tensor<3xi64>
-  // CHECK-NEXT: [[STRIDES:%.*]] = mhlo.constant dense<1> : tensor<3xi64>
-  // CHECK-NEXT: [[VAL:%.*]] = mhlo.real_dynamic_slice [[SORT]]#0, [[STARTS]], [[LIMITS]], [[STRIDES]] : (tensor<?x5x?xi1>, tensor<3xi64>, tensor<3xi64>, tensor<3xi64>) -> tensor<?x5x2xi1>
-  // CHECK-NEXT: [[IDX:%.*]] = mhlo.real_dynamic_slice [[SORT]]#1, [[STARTS]], [[LIMITS]], [[STRIDES]] : (tensor<?x5x?xi32>, tensor<3xi64>, tensor<3xi64>, tensor<3xi64>) -> tensor<?x5x2xi32>
-  // CHECK-NEXT: return [[VAL]], [[IDX]] : tensor<?x5x2xi1>, tensor<?x5x2xi32>
-  %values, %indices = chlo.top_k(%arg0, k = 2) : tensor<?x5x?xi1> -> (tensor<?x5x2xi1>, tensor<?x5x2xi32>)
-  return %values, %indices : tensor<?x5x2xi1>, tensor<?x5x2xi32>
-}
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/lower-complex.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/lower-complex.mlir
index 8f59051c95a244..8c2e615baed7fe 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/lower-complex.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/lower-complex.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-hlo-opt %s -chlo-legalize-to-hlo -mhlo-test-lower-complex | FileCheck %s
+// RUN: mlir-hlo-opt %s --mhlo-test-lower-complex | FileCheck %s
 
 // CHECK-LABEL: @add
 func.func @add(%arg0 : tensor<2xf32>, %arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>, %arg3 : tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) {
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/mhlo-quant-legalize-to-int.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/mhlo-quant-legalize-to-int.mlir
index 1d5a45b047442c..01ac1b8d13bc4a 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/mhlo-quant-legalize-to-int.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/mhlo-quant-legalize-to-int.mlir
@@ -1166,6 +1166,57 @@ func.func @dot_general_multiple_dynamic_dims(
 
 // -----
 
+// CHECK-LABEL: func @dot_general_per_channel
+func.func @dot_general_per_channel(
+    %arg0: tensor<?x2x!quant.uniform<i8:f32, 2.0:3>>,
+    %arg1: tensor<2x2x!quant.uniform<i8<-127:127>:f32:1, {3.0,4.0}>>
+  ) -> tensor<?x2x!quant.uniform<i32:f32:1, {6.0,8.0}>> {
+  // CHECK: %[[DOT_RES:.*]] = "mhlo.dot_general"
+  // CHECK-SAME: lhs_contracting_dimensions = [1]
+  // CHECK-SAME: rhs_contracting_dimensions = [0]>}
+
+  // Zero point offset contribution from RHS tensor * LHS ZP.
+
+  // CHECK: %[[RHS_I32:.*]] = mhlo.convert %arg1 : (tensor<2x2xi8>)
+  // CHECK-SAME: -> tensor<2x2xi32>
+  // CHECK: %[[RHS_REDUCE_INIT:.*]] = mhlo.constant dense<0> : tensor<i32>
+  // CHECK: %[[RHS_REDUCE:.*]] = mhlo.reduce(%[[RHS_I32]] init: %[[RHS_REDUCE_INIT]])
+  // CHECK-SAME: applies mhlo.add across dimensions = [0]
+  // CHECK-SAME: (tensor<2x2xi32>, tensor<i32>)
+  // CHECK-SAME: -> tensor<2xi32>
+  // CHECK: %[[RHS_ZP:.*]] = mhlo.constant dense<3> : tensor<i32>
+  // CHECK: %[[RHS_ZP_CONTRIB:.*]] = chlo.broadcast_multiply
+  // CHECK-SAME: %[[RHS_REDUCE]], %[[RHS_ZP]] :
+  // CHECK-SAME: (tensor<2xi32>, tensor<i32>) -> tensor<2xi32>
+
+  // Calculate output dynamic dims.
+  // CHECK: %[[DIM_1_1:.*]] = "mhlo.get_dimension_size"(%[[DOT_RES]])
+  // CHECK-SAME: {dimension = 0 : i64}
+  // CHECK: %[[DIM_1_2:.*]] = mhlo.convert %[[DIM_1_1]] : (tensor<i32>) -> tensor<i64>
+  // CHECK: %[[DIM_1:.*]] = mhlo.reshape %[[DIM_1_2]] : (tensor<i64>) -> tensor<1xi64>
+  // CHECK: %[[DIM_2:.*]] = mhlo.constant dense<2> : tensor<1xi64>
+  // CHECK: %[[OUTPUT_DIMS:.*]] = "mhlo.concatenate"
+  // CHECK-SAME: %[[DIM_1]], %[[DIM_2]]
+
+  // CHECK: %[[RHS_ZP_BCAST:.*]] = "mhlo.dynamic_broadcast_in_dim"
+  // CHECK-SAME: (%[[RHS_ZP_CONTRIB]], %[[OUTPUT_DIMS]])
+  // CHECK-SAME: broadcast_dimensions = dense<1>
+  // CHECK-SAME: (tensor<2xi32>, tensor<2xi64>) -> tensor<?x2xi32>
+  // CHECK: %[[ZPS_INIT:.*]] = mhlo.constant dense<0> : tensor<i32>
+  // CHECK: %[[ZP_TOTAL_2:.*]] = chlo.broadcast_subtract %[[ZPS_INIT]], %[[RHS_ZP_BCAST]]
+  // CHECK-SAME: (tensor<i32>, tensor<?x2xi32>) -> tensor<?x2xi32>
+  // CHECK: chlo.broadcast_add %[[DOT_RES]], %[[ZP_TOTAL_2]]
+  %0 = "mhlo.dot_general"(%arg0, %arg1) {
+      dot_dimension_numbers = #mhlo.dot<lhs_contracting_dimensions = [1],
+      rhs_contracting_dimensions = [0]>} : (
+    tensor<?x2x!quant.uniform<i8:f32, 2.0:3>>,
+    tensor<2x2x!quant.uniform<i8<-127:127>:f32:1, {3.0,4.0}>>
+  ) -> tensor<?x2x!quant.uniform<i32:f32:1, {6.0,8.0}>>
+  return %0 : tensor<?x2x!quant.uniform<i32:f32:1, {6.0,8.0}>>
+}
+
+// -----
+
 // CHECK-LABEL: func @conv2d_dynamic
 func.func @conv2d_dynamic(
     %arg0: tensor<?x?x?x?x!quant.uniform<i8:f32, 2.000000e+00:4>>,
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/ops.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/ops.mlir
index 4886bbe86385cd..9c9d2be9793e1a 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/ops.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/ops.mlir
@@ -1496,7 +1496,7 @@ func.func @concatenate_c4(%arg0: tensor<1xi32>, %arg1: tensor<2xi32>)  -> tensor
 
 func.func @concatenate_c6(%arg0: tensor<1x3xi32>, %arg1: tensor<2x2xi32>)  -> tensor<3x3xi32> {
   // @expected-error@+2 {{'mhlo.concatenate' op failed to infer returned types}}
-  // expected-error@+1 {{shapes of operand (0) and (1) do not match at non-concat index: (1, 3) != (2, 2) at non-concat index 1}}
+  // expected-error@+1 {{shapes of operand (0) and (1) are not compatible at non-concat index 1: (1, 3) != (2, 2)}}
   %0 = "mhlo.concatenate"(%arg0, %arg1) { dimension = 0 : i64 } : (tensor<1x3xi32>, tensor<2x2xi32>) -> tensor<3x3xi32>
   func.return %0 : tensor<3x3xi32>
 }
@@ -6581,4 +6581,4 @@ func.func @composite_c4(%arg0: !mhlo.token) {
     decomposition = @foo
   } : (!mhlo.token) -> tensor<f32>
   func.return
-}
\ No newline at end of file
+}
diff --git a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/verifier_reduce_op.mlir b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/verifier_reduce_op.mlir
index edc680fbf78999..764a93e5fb77ed 100644
--- a/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/verifier_reduce_op.mlir
+++ b/third_party/xla/xla/mlir_hlo/tests/Dialect/mhlo/verifier_reduce_op.mlir
@@ -136,7 +136,7 @@ func.func @reduce_diferent_input_shapes(%arg0: tensor<2x3xf32>, %arg1: tensor<3x
 func.func @reduce_oob_dims(%arg0: tensor<?x?xf32>, %arg1 : tensor<f32>)
     -> (tensor<?xf32>) {
 
-  // expected-error@+1 {{Out-of-bounds dimension 2, expected to be less than the input-tensor rank 2}}
+  // expected-error@+1 {{Out-of-bounds dimension 2, expected to be in range [0, 2)}}
   %0 = "mhlo.reduce"(%arg0, %arg1) ({
 
   ^bb0(%arg2: tensor<f32>, %arg3: tensor<f32> ):
diff --git a/third_party/xla/xla/mlir_hlo/transforms/bufferize_pass.cc b/third_party/xla/xla/mlir_hlo/transforms/bufferize_pass.cc
index b1024fb030eca8..93756798df52b1 100644
--- a/third_party/xla/xla/mlir_hlo/transforms/bufferize_pass.cc
+++ b/third_party/xla/xla/mlir_hlo/transforms/bufferize_pass.cc
@@ -138,6 +138,9 @@ struct ComputeOpAndFuncBufferizePass
         .insert<bufferization::BufferizationDialect, lmhlo::LmhloDialect,
                 linalg::LinalgDialect, memref::MemRefDialect, mhlo::MhloDialect,
                 shape::ShapeDialect, vector::VectorDialect>();
+    arith::registerBufferizableOpInterfaceExternalModels(registry);
+    mlir::bufferization::func_ext::
+        registerBufferizableOpInterfaceExternalModels(registry);
     linalg::registerBufferizableOpInterfaceExternalModels(registry);
     mhlo::registerBufferizableOpInterfaceExternalModels(registry);
     shape::registerBufferizableOpInterfaceExternalModels(registry);
@@ -153,8 +156,7 @@ struct ComputeOpAndFuncBufferizePass
     // will be migrated to BufferizableOpInterface-based bufferization.
     options.opFilter.allowDialect<bufferization::BufferizationDialect,
                                   linalg::LinalgDialect, mhlo::MhloDialect,
-                                  shape::ShapeDialect, tensor::TensorDialect,
-                                  vector::VectorDialect>();
+                                  shape::ShapeDialect, vector::VectorDialect>();
 
     if (failed(bufferization::bufferizeOp(getOperation(), options))) {
       signalPassFailure();
diff --git a/third_party/xla/xla/mlir_hlo/transforms/test_hlo_transform_dialect_interpreter.cc b/third_party/xla/xla/mlir_hlo/transforms/test_hlo_transform_dialect_interpreter.cc
index ba51904e2d5e4f..99bfb28760403a 100644
--- a/third_party/xla/xla/mlir_hlo/transforms/test_hlo_transform_dialect_interpreter.cc
+++ b/third_party/xla/xla/mlir_hlo/transforms/test_hlo_transform_dialect_interpreter.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
 #include "mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
diff --git a/third_party/xla/xla/parse_flags_from_env.cc b/third_party/xla/xla/parse_flags_from_env.cc
index 84ca13de4487cb..0f58671ebff7df 100644
--- a/third_party/xla/xla/parse_flags_from_env.cc
+++ b/third_party/xla/xla/parse_flags_from_env.cc
@@ -32,8 +32,8 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
 #include "absl/types/span.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "tsl/platform/logging.h"
-#include "tsl/util/command_line_flags.h"
 
 namespace xla {
 
diff --git a/third_party/xla/xla/parse_flags_from_env.h b/third_party/xla/xla/parse_flags_from_env.h
index e73a38421faf97..01d476f22fa3dc 100644
--- a/third_party/xla/xla/parse_flags_from_env.h
+++ b/third_party/xla/xla/parse_flags_from_env.h
@@ -51,8 +51,8 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/string_view.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "xla/types.h"
-#include "tsl/util/command_line_flags.h"
 
 namespace xla {
 
diff --git a/third_party/xla/xla/parse_flags_from_env_test.cc b/third_party/xla/xla/parse_flags_from_env_test.cc
index 01e82889130a29..f00cb309c12a96 100644
--- a/third_party/xla/xla/parse_flags_from_env_test.cc
+++ b/third_party/xla/xla/parse_flags_from_env_test.cc
@@ -24,11 +24,11 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_format.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/subprocess.h"
 #include "tsl/platform/test.h"
-#include "tsl/util/command_line_flags.h"
 
 namespace xla {
 
diff --git a/third_party/xla/xla/pjrt/BUILD b/third_party/xla/xla/pjrt/BUILD
index e3f86a3287e59f..fe75981da57781 100644
--- a/third_party/xla/xla/pjrt/BUILD
+++ b/third_party/xla/xla/pjrt/BUILD
@@ -1,5 +1,3 @@
-# Placeholder: load py_proto_library
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load(
     "@local_tsl//tsl/platform:build_config.bzl",
@@ -7,6 +5,9 @@ load(
 )
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 
+# Placeholder: load py_proto_library
+load("//xla:xla.bzl", "xla_cc_test")
+
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
     default_visibility = internal_visibility(["//xla:internal"]),
@@ -236,6 +237,7 @@ cc_library(
         ":executable_metadata_proto_cc",
         ":execute_options_proto_cc",
         ":pjrt_common",
+        ":pjrt_layout",
         "//xla:shape_layout",
         "//xla:shape_util",
         "//xla:status",
@@ -439,6 +441,7 @@ cc_library(
     visibility = internal_visibility(["//xla:friends"]),
     deps = [
         ":event_pool",
+        ":host_callback",
         ":local_device_state",
         ":metrics",
         ":mlir_to_hlo",
@@ -567,6 +570,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:ArithDialect",
+        "@llvm-project//mlir:BytecodeWriter",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FuncExtensions",
         "@llvm-project//mlir:IR",
diff --git a/third_party/xla/xla/pjrt/c/BUILD b/third_party/xla/xla/pjrt/c/BUILD
index a5fb6234887e48..f20f39c11c099e 100644
--- a/third_party/xla/xla/pjrt/c/BUILD
+++ b/third_party/xla/xla/pjrt/c/BUILD
@@ -1,4 +1,3 @@
-load("//xla:xla.bzl", "xla_cc_binary", "xla_cc_test")
 load(
     "@local_tsl//tsl/platform:build_config_root.bzl",
     "tf_cuda_tests_tags",
@@ -8,6 +7,7 @@ load(
     "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
     "if_cuda_is_configured",
 )
+load("//xla:xla.bzl", "xla_cc_binary", "xla_cc_test")
 
 # copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"])
 
@@ -53,6 +53,15 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "pjrt_c_api_custom_partitioner_extension_hdrs",
+    hdrs = ["pjrt_c_api_custom_partitioner_extension.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":pjrt_c_api_hdrs",
+    ],
+)
+
 cc_library(
     name = "pjrt_c_api_wrapper_impl",
     srcs = ["pjrt_c_api_wrapper_impl.cc"],
@@ -166,6 +175,7 @@ cc_library(
     hdrs = ["pjrt_c_api_gpu_internal.h"],
     visibility = ["//visibility:public"],
     deps = [
+        ":pjrt_c_api_custom_partitioner_extension_hdrs",
         ":pjrt_c_api_gpu_extension_hdrs",
         ":pjrt_c_api_hdrs",
         ":pjrt_c_api_helpers",
@@ -186,6 +196,7 @@ cc_library(
         "//xla/pjrt/gpu:gpu_helpers",
         "//xla/pjrt/gpu:se_gpu_pjrt_client",
         "//xla/pjrt/gpu:se_gpu_pjrt_compiler",  # To register GPU AOT compiler
+        "//xla/python:custom_partition_callback",
         "//xla/python:inspect_sharding",  # To register "InspectSharding" custom partitioning handler.
         "//xla/service:compiler",
         "//xla/service:custom_call_target_registry",
diff --git a/third_party/xla/xla/pjrt/c/CHANGELOG.md b/third_party/xla/xla/pjrt/c/CHANGELOG.md
index ca223ce6007b1e..cb9cb750d81940 100644
--- a/third_party/xla/xla/pjrt/c/CHANGELOG.md
+++ b/third_party/xla/xla/pjrt/c/CHANGELOG.md
@@ -1,25 +1,28 @@
 # PJRT C API changelog
 
-## 0.46
+## 0.47
+* Added ``PJRT_Extension_Type::PJRT_Extension_Type_Custom_Partitioner``.
+
+## 0.46 (Feb 29, 2024)
 * Update outdated struct sizes from previous changes to
   ``PJRT_Device_AddressableMemories_Args`` and ``PJRT_ExecuteOptions``.
 
-## 0.45
+## 0.45 (Feb 27, 2024)
 * Breaking changes
   * Added struct_size field to beginning of PJRT_Extension_Base. This is so
     forwards and backwards compatibility logic can be implemented with extension
     structs.
 
-## 0.44
+## 0.44 (Feb 26, 2024)
 * Changed all ``void*`` extension fields to have type ``PJRT_Extension_Base*``
 
-## 0.43
+## 0.43 (Feb 24, 2024)
 * Added some new fields to PJRT_Executable_GetCompiledMemoryStats
 
-## 0.42
+## 0.42 (Feb 13, 2024)
 * Renamed all ``priv`` fields to ``extension_start``
 
-## 0.41
+## 0.41 (Feb 13, 2024)
 * Renamed PJRT_Structure_Base to PJRT_Extension_Base
 * Renamed PJRT_Structure_Type to PJRT_Extension_Type (and similarly for enum fields)
 
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api.h b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
index c39b7636983e39..da1934e64f2e64 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
@@ -41,6 +41,7 @@ extern "C" {
 typedef enum {
   PJRT_Extension_Type_Gpu_Custom_Call = 0,
   PJRT_Extension_Type_Profiler,
+  PJRT_Extension_Type_Custom_Partitioner,
 } PJRT_Extension_Type;
 
 // PJRT_Extension_Base contains a type and a pointer to next
@@ -75,7 +76,7 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Extension_Base, next);
 // Changes include:
 // * Adding a new field to the PJRT_Api or argument structs
 // * Renaming a method or argument (doesn't affect ABI)
-#define PJRT_API_MINOR 46
+#define PJRT_API_MINOR 47
 
 // The plugin should set the major_version and minor_version of
 // PJRT_Api.pjrt_api_version to be the `PJRT_API_MAJOR` and `PJRT_API_MINOR` in
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_custom_partitioner_extension.h b/third_party/xla/xla/pjrt/c/pjrt_c_api_custom_partitioner_extension.h
new file mode 100644
index 00000000000000..825734610b863c
--- /dev/null
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_custom_partitioner_extension.h
@@ -0,0 +1,134 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_PJRT_C_PJRT_C_API_CUSTOM_PARTITIONER_EXTENSION_H_
+#define XLA_PJRT_C_PJRT_C_API_CUSTOM_PARTITIONER_EXTENSION_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "xla/pjrt/c/pjrt_c_api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define PJRT_API_CUSTOM_PARTITIONER_EXTENSION_VERSION 0
+
+struct JAX_CustomCallPartitioner_string {
+  const char* data;
+  size_t size;
+};
+
+struct JAX_CustomCallPartitioner_aval {
+  JAX_CustomCallPartitioner_string shape;
+  bool has_sharding;
+  JAX_CustomCallPartitioner_string sharding;
+};
+
+// General callback information containing api versions, the result error
+// message and the cleanup function to free any temporary memory that is backing
+// the results. Arguments are always owned by the caller, and results are owned
+// by the cleanup_fn. These should never be used directly. Args and results
+// should be serialized via the PopulateArgs, ReadArgs, PopulateResults,
+// ConsumeResults functions defined below.
+struct JAX_CustomCallPartitioner_version_and_error {
+  int64_t api_version;
+  void* data;  // out
+  // cleanup_fn cleans up any returned results. The caller must finish with all
+  // uses by the point the cleanup is called.
+  void (*cleanup_fn)(void* data);  // out
+  bool has_error;
+  PJRT_Error_Code code;                        // out
+  JAX_CustomCallPartitioner_string error_msg;  // out
+};
+
+struct JAX_CustomCallPartitioner_Partition_Args {
+  JAX_CustomCallPartitioner_version_and_error header;
+
+  size_t num_args;
+  JAX_CustomCallPartitioner_aval* op_args;
+  JAX_CustomCallPartitioner_aval op_result;
+  JAX_CustomCallPartitioner_string backend_config;
+
+  // out
+  JAX_CustomCallPartitioner_string mlir_module;
+  JAX_CustomCallPartitioner_string* args_sharding;
+  JAX_CustomCallPartitioner_string result_sharding;
+};
+
+struct JAX_CustomCallPartitioner_InferShardingFromOperands_Args {
+  JAX_CustomCallPartitioner_version_and_error header;
+
+  size_t num_args;
+  JAX_CustomCallPartitioner_aval* op_args;
+  JAX_CustomCallPartitioner_string result_shape;
+  JAX_CustomCallPartitioner_string backend_config;
+
+  bool has_result_sharding;
+  JAX_CustomCallPartitioner_string result_sharding;
+};
+
+struct JAX_CustomCallPartitioner_PropagateUserSharding_Args {
+  JAX_CustomCallPartitioner_version_and_error header;
+
+  JAX_CustomCallPartitioner_string backend_config;
+
+  JAX_CustomCallPartitioner_string result_shape;
+
+  JAX_CustomCallPartitioner_string result_sharding;  // inout
+};
+
+struct JAX_CustomCallPartitioner_Callbacks {
+  int64_t version;
+  void* private_data;
+  void (*dtor)(JAX_CustomCallPartitioner_Callbacks* data);
+  void (*partition)(JAX_CustomCallPartitioner_Callbacks* data,
+                    JAX_CustomCallPartitioner_Partition_Args* args);
+  void (*infer_sharding)(
+      JAX_CustomCallPartitioner_Callbacks* data,
+      JAX_CustomCallPartitioner_InferShardingFromOperands_Args* args);
+  void (*propagate_user_sharding)(
+      JAX_CustomCallPartitioner_Callbacks* data,
+      JAX_CustomCallPartitioner_PropagateUserSharding_Args* args);
+  bool can_side_effecting_have_replicated_sharding;
+};
+
+struct PJRT_Register_Custom_Partitioner_Args {
+  size_t struct_size;
+  const char* name;  // lifetime of the call.
+  size_t name_size;
+  JAX_CustomCallPartitioner_Callbacks* callbacks;
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Register_Custom_Partitioner_Args, callbacks);
+
+// Registers a custom partitioner.
+typedef PJRT_Error* PJRT_Register_Custom_Partitioner(
+    PJRT_Register_Custom_Partitioner_Args* args);
+
+typedef struct PJRT_Custom_Partitioner_Extension {
+  size_t struct_size;
+  PJRT_Extension_Type type;
+  PJRT_Extension_Base* next;
+  PJRT_Register_Custom_Partitioner* register_custom_partitioner;
+} PJRT_Custom_Partitioner_Extension;
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Custom_Partitioner_Extension,
+                          register_custom_partitioner);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // XLA_PJRT_C_PJRT_C_API_CUSTOM_PARTITIONER_EXTENSION_H_
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc
index c687c7760c8752..211fcc5d538c03 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "xla/ffi/ffi.h"
 #include "xla/ffi/ffi_api.h"
 #include "xla/pjrt/c/pjrt_c_api.h"
+#include "xla/pjrt/c/pjrt_c_api_custom_partitioner_extension.h"
 #include "xla/pjrt/c/pjrt_c_api_gpu_extension.h"
 #include "xla/pjrt/c/pjrt_c_api_helpers.h"
 #include "xla/pjrt/c/pjrt_c_api_profiler_extension.h"
@@ -42,6 +43,7 @@ limitations under the License.
 #include "xla/pjrt/pjrt_common.h"
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_device_description.h"
+#include "xla/python/custom_partition_callback.h"
 #include "xla/service/compiler.h"
 #include "xla/service/custom_call_target_registry.h"
 #include "xla/stream_executor/device_description.h"
@@ -198,6 +200,24 @@ PJRT_Profiler_Extension profiler_extension{
     /*profiler_api=*/&profiler_api,
 };
 
+PJRT_Error* PJRT_Register_Custom_Partitioner(
+    PJRT_Register_Custom_Partitioner_Args* args) {
+  PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
+      "PJRT_Register_Custom_Partitioner_Args",
+      PJRT_Register_Custom_Partitioner_Args_STRUCT_SIZE, args->struct_size));
+  std::string name(args->name, args->name_size);
+  RegisterCustomCallPartitioner(
+      name, jax::CreateCApiCustomCallPartitioner(args->callbacks));
+  return nullptr;
+}
+
+PJRT_Custom_Partitioner_Extension custom_partitioner{
+    /*struct_size=*/PJRT_Gpu_Custom_Call_STRUCT_SIZE,
+    /*type=*/PJRT_Extension_Type::PJRT_Extension_Type_Custom_Partitioner,
+    /*next=*/reinterpret_cast<PJRT_Extension_Base*>(&profiler_extension),
+    /*register_custom_partitioner=*/PJRT_Register_Custom_Partitioner,
+};
+
 PJRT_Error* PJRT_Gpu_Register_Custom_Call(
     PJRT_Gpu_Register_Custom_Call_Args* args) {
   PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
@@ -228,7 +248,7 @@ const PJRT_Api* GetGpuPjrtApi() {
   static PJRT_Gpu_Custom_Call custom_call{
       /*struct_size=*/PJRT_Gpu_Custom_Call_STRUCT_SIZE,
       /*type=*/PJRT_Extension_Type::PJRT_Extension_Type_Gpu_Custom_Call,
-      /*next=*/reinterpret_cast<PJRT_Extension_Base*>(&profiler_extension),
+      /*next=*/reinterpret_cast<PJRT_Extension_Base*>(&custom_partitioner),
       /*custom_call=*/PJRT_Gpu_Register_Custom_Call,
   };
   static const PJRT_Api pjrt_api =
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
index 6d753c5dd4c117..f5583b3878dd12 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
@@ -430,8 +430,8 @@ TEST(PjrtCApiGpuExtensionTest, CustomCallTyped) {
       reinterpret_cast<const PJRT_Gpu_Custom_Call*>(next)->custom_call(&args);
 
   CHECK_EQ(error, nullptr);
-  auto* custom_call = xla::ffi::FindHandler(function_name, "CUDA").value();
-  EXPECT_EQ(reinterpret_cast<void*>(custom_call), kNoop);
+  auto registration = xla::ffi::FindHandler(function_name, "CUDA").value();
+  EXPECT_EQ(reinterpret_cast<void*>(registration.handler), kNoop);
 }
 
 }  // namespace
diff --git a/third_party/xla/xla/pjrt/cpu/BUILD b/third_party/xla/xla/pjrt/cpu/BUILD
index f736d66bfeb4bf..603a8ef30e2dcc 100644
--- a/third_party/xla/xla/pjrt/cpu/BUILD
+++ b/third_party/xla/xla/pjrt/cpu/BUILD
@@ -1,6 +1,7 @@
-load("//xla:xla.bzl", "xla_cc_test")
+load("@local_tsl//tsl:tsl.bzl", "if_oss", "internal_visibility")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -129,9 +130,7 @@ cc_library(
     name = "cpu_client",
     srcs = ["cpu_client.cc"],
     hdrs = ["cpu_client.h"],
-    visibility = [
-        "//xla:friends",
-    ],
+    visibility = internal_visibility(["//xla:friends"]),
     deps = [
         ":abstract_tfrt_cpu_buffer",
         ":cpu_topology",
@@ -286,3 +285,34 @@ cc_library(
         "@local_tsl//tsl/platform:logging",
     ],
 )
+
+cc_library(
+    name = "mpi_collectives",
+    srcs = if_oss(["mpi_collectives.cc"]),
+    hdrs = if_oss(["mpi_collectives.h"]),
+    copts = [
+        "-fexceptions",
+        "-fno-strict-aliasing",
+    ],
+    features = ["-use_header_modules"],
+    deps = if_oss([
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        "@com_google_absl//absl/types:span",
+        "//xla:shape_util",
+        "//xla:status_macros",
+        "//xla:types",
+        "//xla:xla_data_proto_cc",
+        "//xla/service:collective_ops_utils",
+        "//xla/service:global_device_id",
+        "//xla/service/cpu:collectives_interface",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:logging",
+        "@mpitrampoline",
+    ]),
+)
diff --git a/third_party/xla/xla/pjrt/cpu/cpu_client.cc b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
index 6c0bcfc141e5a9..63f930a3c037af 100644
--- a/third_party/xla/xla/pjrt/cpu/cpu_client.cc
+++ b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
@@ -363,14 +363,23 @@ absl::StatusOr<std::unique_ptr<PjRtClient>> GetTfrtCpuClient(
       std::move(options.collectives), num_threads));
 }
 
+static tsl::ThreadOptions GetThreadOptions() {
+  tsl::ThreadOptions thread_options;
+  // On Mac OS the default stack size is 512KiB, which is too small for some
+  // BLAS and LAPACK functions (https://github.com/google/jax/issues/20428).
+  thread_options.stack_size = 2 * 1024 * 1024;
+  return thread_options;
+}
+
 TfrtCpuClient::TfrtCpuClient(
     int process_index, std::vector<std::unique_ptr<TfrtCpuDevice>> devices,
     std::shared_ptr<cpu::CollectivesInterface> collectives, size_t num_threads)
     : process_index_(process_index),
       owned_devices_(std::move(devices)),
       computation_placer_(std::make_unique<ComputationPlacer>()),
-      pjrt_client_thread_pool_(new tsl::thread::ThreadPool(
-          tsl::Env::Default(), "XLATfrtCpuClient", num_threads)),
+      pjrt_client_thread_pool_(
+          new tsl::thread::ThreadPool(tsl::Env::Default(), GetThreadOptions(),
+                                      "XLATfrtCpuClient", num_threads)),
       async_work_runner_(std::make_unique<ThreadPoolAsyncWorkRunner>(
           pjrt_client_thread_pool_.get())),
       eigen_intraop_pool_(new tsl::thread::ThreadPool(
diff --git a/third_party/xla/xla/pjrt/cpu/mpi_collectives.cc b/third_party/xla/xla/pjrt/cpu/mpi_collectives.cc
new file mode 100644
index 00000000000000..d2c93fd75450f5
--- /dev/null
+++ b/third_party/xla/xla/pjrt/cpu/mpi_collectives.cc
@@ -0,0 +1,283 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/pjrt/cpu/mpi_collectives.h"
+
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <exception>
+#include <memory>
+#include <optional>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "mpi.h"  // NOLINT
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "absl/types/span.h"
+#include "xla/primitive_util.h"
+#include "xla/service/collective_ops_utils.h"
+#include "xla/service/cpu/collectives_interface.h"
+#include "xla/service/global_device_id.h"
+#include "xla/status_macros.h"
+#include "xla/types.h"
+#include "xla/xla_data.pb.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/logging.h"
+
+namespace xla::cpu {
+
+absl::StatusOr<MPI_Datatype> PrimitiveTypeToMpiType(
+    PrimitiveType element_type) {
+  switch (element_type) {
+    case S8:
+      return MPI_INT8_T;
+    case U8:
+    case PRED:
+      return MPI_UINT8_T;
+    case S16:
+      return MPI_INT16_T;
+    case U16:
+      return MPI_UINT16_T;
+    case S32:
+      return MPI_INT32_T;
+    case U32:
+      return MPI_UINT32_T;
+    case S64:
+      return MPI_INT64_T;
+    case U64:
+      return MPI_UINT64_T;
+    case F32:
+      return MPI_FLOAT;
+    case F64:
+      return MPI_DOUBLE;
+    case C64:
+      return MPI_C_COMPLEX;
+    case C128:
+      return MPI_C_DOUBLE_COMPLEX;
+    default:
+      // For implementing the reduction of unsupported types
+      // see e.g. https://stackoverflow.com/a/29643391
+      return absl::InvalidArgumentError(absl::StrCat(
+          "Unsupported primitive type for reduction: ",
+          primitive_util::LowercasePrimitiveTypeName(element_type)));
+  }
+}
+
+bool MpiTypeIsComplex(MPI_Datatype type) {
+  return type == MPI_C_COMPLEX || type == MPI_C_DOUBLE_COMPLEX;
+}
+
+absl::StatusOr<MPI_Op> ReductionKindToMpiOp(ReductionKind reduction_kind,
+                                            MPI_Datatype type) {
+  switch (reduction_kind) {
+    case ReductionKind::SUM:
+      return MPI_SUM;
+    case ReductionKind::PRODUCT:
+      return MPI_PROD;
+    case ReductionKind::MIN:
+      if (!MpiTypeIsComplex(type)) {
+        return MPI_MIN;
+      } else {
+        return absl::InvalidArgumentError(
+            "MIN reduction not supported for complex types");
+      }
+    case ReductionKind::MAX:
+      if (!MpiTypeIsComplex(type)) {
+        return MPI_MAX;
+      } else {
+        return absl::InvalidArgumentError(
+            "MAX reduction not supported for complex types");
+      }
+    default:
+      return absl::InvalidArgumentError(
+          absl::StrCat("Unknown reduction kind: ", reduction_kind));
+  }
+}
+
+static absl::Status MpiErrorToAbslStatus(int error) {
+  if (error != MPI_SUCCESS) {
+    char error_str[MPI_MAX_ERROR_STRING];
+    int len;
+    MPI_Error_string(error, error_str, &len);
+    return absl::UnknownError(absl::StrCat("MPI error: ", error_str));
+  }
+  return absl::OkStatus();
+}
+
+MpiCollectivesCommunicator::MpiCollectivesCommunicator(int color, int key) {
+  MPI_Comm_split(MPI_COMM_WORLD, color, key, &comm_);
+  MPI_Comm_rank(comm_, &mpi_rank_);
+  MPI_Comm_size(comm_, &mpi_size_);
+}
+
+MpiCollectivesCommunicator::~MpiCollectivesCommunicator() {
+  MPI_Comm_free(&comm_);
+};
+
+absl::Status MpiCollectivesCommunicator::AllReduce(
+    const RendezvousKey& key, ReductionKind reduction_kind,
+    PrimitiveType element_type, size_t num_elements, const void* input_buffer,
+    void* output_buffer, absl::Duration timeout) {
+  TF_ASSIGN_OR_RETURN(MPI_Datatype type, PrimitiveTypeToMpiType(element_type));
+  TF_ASSIGN_OR_RETURN(MPI_Op op, ReductionKindToMpiOp(reduction_kind, type));
+  return MpiErrorToAbslStatus(MPI_Allreduce(input_buffer, output_buffer,
+                                            num_elements, type, op, comm_));
+}
+
+absl::Status MpiCollectivesCommunicator::CollectivePermute(
+    const RendezvousKey& key, size_t num_bytes, std::optional<int> source_rank,
+    absl::Span<int const> target_ranks, const void* input_buffer,
+    void* output_buffer, absl::Duration timeout) {
+  int tag = 0;  // TODO come up with better tags.
+
+  const int rank = mpi_rank_;
+
+  std::vector<MPI_Request> requests;
+
+  if (source_rank) {
+    if (*source_rank == rank) {
+      std::memcpy(output_buffer, input_buffer, num_bytes);
+    } else {
+      VLOG(1) << "recv at " << rank << " from " << *source_rank;
+      requests.emplace_back();
+      TF_RETURN_IF_ERROR(MpiErrorToAbslStatus(
+          MPI_Irecv(output_buffer, num_bytes, MPI_BYTE, *source_rank, tag,
+                    comm_, &requests.back())));
+    }
+  } else {
+    std::memset(output_buffer, 0, num_bytes);
+  }
+
+  for (int target : target_ranks) {
+    if (target != rank) {
+      VLOG(1) << "send from " << rank << " to " << target;
+      requests.emplace_back();
+      TF_RETURN_IF_ERROR(MpiErrorToAbslStatus(
+          MPI_Isend(input_buffer, num_bytes, MPI_BYTE, target, tag, comm_,
+                    &requests.back())));
+    }
+  }
+
+  for (auto& request : requests) {
+    TF_RETURN_IF_ERROR(
+        MpiErrorToAbslStatus(MPI_Wait(&request, MPI_STATUS_IGNORE)));
+  }
+
+  return absl::OkStatus();
+}
+
+absl::Status MpiCollectivesCommunicator::AllToAll(
+    const RendezvousKey& key, size_t chunk_bytes,
+    absl::Span<const void* const> input_buffers,
+    absl::Span<void* const> output_buffers, absl::Duration timeout) {
+  // We can't use MPI_Alltoall directly because it assumes that the inputs and
+  // outputs are contiguous. Therefore here we implement it using MPI_Sendrecv.
+
+  int tag = 0;  // TODO use better tags.
+  const int rank = mpi_rank_;
+  const int size = mpi_size_;
+  TF_RET_CHECK(size == input_buffers.size());
+  TF_RET_CHECK(size == output_buffers.size());
+
+  std::memcpy(output_buffers[rank], input_buffers[rank], chunk_bytes);
+
+  for (int i = 1; i < size; i++) {
+    int send_rank = (rank + i) % size;
+    int recv_rank = (rank + size - i) % size;
+    TF_RETURN_IF_ERROR(MpiErrorToAbslStatus(
+        MPI_Sendrecv(input_buffers[send_rank], chunk_bytes, MPI_BYTE, send_rank,
+                     tag, output_buffers[recv_rank], chunk_bytes, MPI_BYTE,
+                     recv_rank, tag, comm_, MPI_STATUS_IGNORE)));
+  }
+
+  return absl::OkStatus();
+}
+
+absl::Status MpiCollectivesCommunicator::AllGather(const RendezvousKey& key,
+                                                   size_t chunk_bytes,
+                                                   const void* input_buffer,
+                                                   void* output_buffer,
+                                                   absl::Duration timeout) {
+  return MpiErrorToAbslStatus(MPI_Allgather(input_buffer, chunk_bytes, MPI_BYTE,
+                                            output_buffer, chunk_bytes,
+                                            MPI_BYTE, comm_));
+}
+
+absl::Status MpiCollectivesCommunicator::ReduceScatter(
+    const RendezvousKey& key, ReductionKind reduction_kind,
+    PrimitiveType element_type, size_t chunk_elems, const void* input_buffer,
+    void* output_buffer, absl::Duration timeout) {
+  const int size = mpi_size_;
+  std::vector<int> recvcounts(size, chunk_elems);
+  TF_ASSIGN_OR_RETURN(MPI_Datatype type, PrimitiveTypeToMpiType(element_type));
+  TF_ASSIGN_OR_RETURN(MPI_Op op, ReductionKindToMpiOp(reduction_kind, type));
+  return MpiErrorToAbslStatus(MPI_Reduce_scatter(
+      input_buffer, output_buffer, recvcounts.data(), type, op, comm_));
+}
+
+void MpiCollectives::Init() {
+  int provided;
+  MPI_Init_thread(NULL, NULL, MPI_THREAD_FUNNELED, &provided);
+  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_world_rank_);
+  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size_);
+  VLOG(1) << "MPI rank=" << mpi_world_rank_ << " size=" << mpi_world_size_;
+}
+
+void MpiCollectives::Finalize() {
+  contexts_.clear();
+  MPI_Finalize();
+}
+
+absl::StatusOr<std::shared_ptr<CollectivesCommunicator>>
+MpiCollectives::GetCommunicator(absl::Span<GlobalDeviceId const> global_devices,
+                                int rank) {
+  int flag;
+  MPI_Is_thread_main(&flag);
+  if (!flag) {
+    return absl::UnknownError(
+        absl::StrCat("MPI: Communicator requested from a thread that is not "
+                     "the one MPI was initialized from. Multiple "
+                     "threads/devices per process are not yet supported."));
+  }
+
+  auto& context = contexts_[std::make_tuple(
+      std::vector<GlobalDeviceId>(global_devices.begin(), global_devices.end()),
+      rank)];
+  if (context) {
+    return context;
+  }
+
+  int color;
+  int key = 0;
+  if (global_devices.size() > 0) {
+    color = static_cast<int>(global_devices.at(0).value());
+    key = rank;
+  } else {
+    color = MPI_UNDEFINED;
+  }
+  context = std::make_shared<MpiCollectivesCommunicator>(color, key);
+  return context;
+}
+
+}  // namespace xla::cpu
diff --git a/third_party/xla/xla/pjrt/cpu/mpi_collectives.h b/third_party/xla/xla/pjrt/cpu/mpi_collectives.h
new file mode 100644
index 00000000000000..fdf6ec81b6dc6b
--- /dev/null
+++ b/third_party/xla/xla/pjrt/cpu/mpi_collectives.h
@@ -0,0 +1,102 @@
+/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_PJRT_CPU_MPI_COLLECTIVES_H_
+#define XLA_PJRT_CPU_MPI_COLLECTIVES_H_
+
+#include <cstddef>
+#include <memory>
+#include <optional>
+#include <tuple>
+#include <vector>
+
+#include "mpi.h"  // NOLINT
+#include "absl/base/thread_annotations.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/time/time.h"
+#include "absl/types/span.h"
+#include "xla/service/collective_ops_utils.h"
+#include "xla/service/cpu/collectives_interface.h"
+#include "xla/service/global_device_id.h"
+#include "xla/xla_data.pb.h"
+
+namespace xla::cpu {
+
+class MpiCollectivesCommunicator : public CollectivesCommunicator {
+ public:
+  explicit MpiCollectivesCommunicator(int color, int key);
+  ~MpiCollectivesCommunicator() override;
+
+  absl::Status AllReduce(const RendezvousKey& key, ReductionKind reduction_kind,
+                         PrimitiveType element_type, size_t num_elements,
+                         const void* input_buffer, void* output_buffer,
+                         absl::Duration timeout) override;
+  absl::Status CollectivePermute(const RendezvousKey& key, size_t num_bytes,
+                                 std::optional<int> source_rank,
+                                 absl::Span<int const> target_ranks,
+                                 const void* input_buffer, void* output_buffer,
+                                 absl::Duration timeout) override;
+  absl::Status AllToAll(const RendezvousKey& key, size_t chunk_bytes,
+                        absl::Span<const void* const> input_buffers,
+                        absl::Span<void* const> output_buffers,
+                        absl::Duration timeout) override;
+  absl::Status AllGather(const RendezvousKey& key, size_t chunk_bytes,
+                         const void* input_buffer, void* output_buffer,
+                         absl::Duration timeout) override;
+  absl::Status ReduceScatter(const RendezvousKey& key,
+                             ReductionKind reduction_kind,
+                             PrimitiveType element_type, size_t chunk_elems,
+                             const void* input_buffer, void* output_buffer,
+                             absl::Duration timeout) override;
+
+ private:
+  MPI_Comm comm_;
+  int mpi_rank_;
+  int mpi_size_;
+};
+
+class MpiCollectives : public CollectivesInterface {
+ public:
+  /*
+  The user has to explicitly call Init() and Finalize() before and
+  after use.
+  For example, using the Python client, this can be achieved with:
+
+  collectives = xla_client._xla.make_mpi_collectives()
+  collectives.Init()
+  atexit.register(collectives.Finalize)
+  */
+  void Init();
+  void Finalize();
+
+  absl::StatusOr<std::shared_ptr<CollectivesCommunicator>> GetCommunicator(
+      absl::Span<GlobalDeviceId const> global_devices, int rank) override;
+
+ private:
+  absl::Status ExchangeGlobalDeviceIds(
+      absl::Span<GlobalDeviceId const> global_devices, int rank);
+
+  int mpi_world_rank_;
+  int mpi_world_size_;
+  absl::flat_hash_map<std::tuple<std::vector<GlobalDeviceId>, int>,
+                      std::shared_ptr<MpiCollectivesCommunicator>>
+      contexts_;
+};
+
+}  // namespace xla::cpu
+
+#endif  // XLA_PJRT_CPU_MPI_COLLECTIVES_H_
diff --git a/third_party/xla/xla/pjrt/distributed/BUILD b/third_party/xla/xla/pjrt/distributed/BUILD
index 2e20fc7b2e8f70..1085bb236703f4 100644
--- a/third_party/xla/xla/pjrt/distributed/BUILD
+++ b/third_party/xla/xla/pjrt/distributed/BUILD
@@ -1,7 +1,7 @@
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl:tsl.default.bzl", "tsl_grpc_cc_dependencies")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 licenses(["notice"])
 
diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index a132b76c8548bb..468db43349e461 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -1,10 +1,10 @@
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
-load("//xla:xla.bzl", "xla_cc_test")
-load("//xla/stream_executor:build_defs.bzl", "if_cuda_or_rocm", "if_gpu_is_configured")
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
+load("//xla/stream_executor:build_defs.bzl", "if_cuda_or_rocm", "if_gpu_is_configured")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -26,10 +26,10 @@ cc_library(
         "//xla/service:platform_util",
         "//xla/stream_executor",
         "//xla/stream_executor/integrations:device_mem_allocator",
+        "//xla/tsl/util:env_var",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/framework:bfc_allocator",
         "@local_tsl//tsl/framework:device_id_impl",
-        "@local_tsl//tsl/util:env_var",
     ],
 )
 
@@ -86,6 +86,7 @@ cc_library(
         "//xla/stream_executor:platform",
         "//xla/stream_executor/integrations:device_mem_allocator",
         "//xla/stream_executor/integrations:tf_allocator_adapter",
+        "//xla/tsl/util:env_var",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
@@ -111,7 +112,6 @@ cc_library(
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/profiler/lib:connected_traceme",
         "@local_tsl//tsl/profiler/lib:traceme",
-        "@local_tsl//tsl/util:env_var",
     ] + if_cuda_or_rocm([
         ":nccl_id_store",
         "//xla/service/gpu:gpu_compiler",
@@ -139,6 +139,7 @@ xla_cc_test(
         "//xla:literal_util",
         "//xla:statusor",
         "//xla:test",
+        "//xla/pjrt:pjrt_future",
         "//xla/pjrt:utils",
         "//xla/pjrt/distributed:in_memory_key_value_store",
         "//xla/service:gpu_plugin",
diff --git a/third_party/xla/xla/pjrt/gpu/gpu_helpers.cc b/third_party/xla/xla/pjrt/gpu/gpu_helpers.cc
index efd5fbf8c17f60..c9c6fe4da4dcef 100644
--- a/third_party/xla/xla/pjrt/gpu/gpu_helpers.cc
+++ b/third_party/xla/xla/pjrt/gpu/gpu_helpers.cc
@@ -27,9 +27,9 @@ limitations under the License.
 #include "xla/statusor.h"
 #include "xla/stream_executor/integrations/device_host_allocator.h"
 #include "xla/stream_executor/integrations/device_mem_allocator.h"
+#include "xla/tsl/util/env_var.h"
 #include "xla/util.h"
 #include "tsl/framework/device_id.h"
-#include "tsl/util/env_var.h"
 
 namespace xla {
 
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index 14696f6b23703d..9dec918f993274 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include <string>
 #include <string_view>
 #include <utility>
+#include <variant>
 #include <vector>
 
 #include "absl/base/thread_annotations.h"
@@ -38,8 +39,8 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/numbers.h"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
-#include "absl/strings/str_split.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/time/time.h"
@@ -902,6 +903,8 @@ GetStreamExecutorGpuDeviceAllocator(
                                                   std::move(allocators));
 }
 
+}  // namespace
+
 Status BuildDistributedDevices(
     std::string_view platform_name,
     std::map<int, std::unique_ptr<LocalDeviceState>> local_device_states,
@@ -909,8 +912,8 @@ Status BuildDistributedDevices(
     std::vector<std::unique_ptr<PjRtStreamExecutorDevice>>* devices,
     gpu::GpuExecutableRunOptions* gpu_executable_run_options,
     std::shared_ptr<KeyValueStoreInterface> kv_store, bool enable_mock_nccl,
-    absl::Duration get_local_topology_timeout = absl::Minutes(2),
-    absl::Duration get_global_topology_timeout = absl::Minutes(5)) {
+    absl::Duration get_local_topology_timeout,
+    absl::Duration get_global_topology_timeout) {
   LocalTopologyProto local_topology;
   local_topology.set_node_id(node_id);
   std::string boot_id_str;
@@ -991,18 +994,17 @@ Status BuildDistributedDevices(
   return OkStatus();
 }
 
-}  // namespace
-
 std::string MakeComputeCapabilityString(const se::DeviceDescription* desc) {
-  std::string compute_capability;
-#if GOOGLE_CUDA
-  se::CudaComputeCapability cc = desc->cuda_compute_capability();
-  compute_capability =
-      std::to_string(cc.major) + "." + std::to_string(cc.minor);
-#else   // GOOGLE_CUDA
-  compute_capability = desc->rocm_compute_capability().gfx_version();
-#endif  // GOOGLE_CUDA
-  return compute_capability;
+  se::GpuComputeCapability cc = desc->gpu_compute_capability();
+  if (std::holds_alternative<se::CudaComputeCapability>(cc)) {
+    auto nvcc = std::get<se::CudaComputeCapability>(cc);
+    return absl::StrCat(nvcc.major, ".", nvcc.minor);
+  } else if (std::holds_alternative<se::RocmComputeCapability>(cc)) {
+    auto rocmcc = std::get<se::RocmComputeCapability>(cc);
+    return rocmcc.gfx_version();
+  } else {
+    return "unknown";
+  }
 }
 
 StreamExecutorGpuDevice::StreamExecutorGpuDevice(
@@ -1078,6 +1080,8 @@ absl::StatusOr<std::unique_ptr<PjRtClient>> GetStreamExecutorGpuClient(
     const GpuClientOptions& options) {
 #if TENSORFLOW_USE_ROCM
   auto pjrt_platform_name = xla::RocmName();
+#elif TENSORFLOW_USE_SYCL
+  auto pjrt_platform_name = xla::SyclName();
 #else   // TENSORFLOW_USE_ROCM
   auto pjrt_platform_name = xla::CudaName();
 #endif  // TENSORFLOW_USE_ROCM
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
index 99b98cabbea8fc..529258e2a90cc0 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
@@ -237,6 +237,16 @@ std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> BuildLocalDevices(
 
 std::string MakeComputeCapabilityString(const se::DeviceDescription* desc);
 
+Status BuildDistributedDevices(
+    std::string_view platform_name,
+    std::map<int, std::unique_ptr<LocalDeviceState>> local_device_states,
+    int node_id, int num_nodes,
+    std::vector<std::unique_ptr<PjRtStreamExecutorDevice>>* devices,
+    gpu::GpuExecutableRunOptions* gpu_executable_run_options,
+    std::shared_ptr<KeyValueStoreInterface> kv_store, bool enable_mock_nccl,
+    absl::Duration get_local_topology_timeout = absl::Minutes(2),
+    absl::Duration get_global_topology_timeout = absl::Minutes(5));
+
 struct GpuClientOptions {
   GpuAllocatorConfig allocator_config;
 
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
index ecf04945364deb..e7aac5c63b3549 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "xla/literal.h"
 #include "xla/literal_util.h"
 #include "xla/pjrt/distributed/in_memory_key_value_store.h"
+#include "xla/pjrt/pjrt_future.h"
 #include "xla/pjrt/utils.h"
 #include "xla/service/hlo_parser.h"
 #include "xla/statusor.h"
@@ -427,6 +428,32 @@ TEST(StreamExecutorGpuClientTest, CopyRawToHostOutOfRange) {
   free(dst);
 }
 
+TEST(StreamExecutorGpuClientTest, CopyRawToHostFuture) {
+  TF_ASSERT_OK_AND_ASSIGN(auto client,
+                          GetStreamExecutorGpuClient(GpuClientOptions()));
+  auto literal = xla::LiteralUtil::CreateR1<float>({41.0f, 42.0f});
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<PjRtBuffer> buffer,
+      client->BufferFromHostLiteral(literal, client->addressable_devices()[0]));
+
+  auto dst_promise = xla::PjRtFuture<absl::StatusOr<void*>>::CreatePromise();
+  xla::PjRtFuture<absl::StatusOr<void*>> dst_future(dst_promise);
+
+  TF_ASSERT_OK_AND_ASSIGN(int64_t size, buffer->GetOnDeviceSizeInBytes());
+  buffer->GetReadyFuture().OnReady([dst_promise = std::move(dst_promise),
+                                    size](absl::Status status) mutable {
+    dst_promise.Set(aligned_alloc(size, 0));
+  });
+
+  auto result = buffer->CopyRawToHostFuture(dst_future, 0, size);
+  TF_EXPECT_OK(result.Await());
+  TF_ASSERT_OK_AND_ASSIGN(auto* dst, dst_future.Await());
+  EXPECT_EQ(*(static_cast<float*>(dst)), 41.0f);
+  EXPECT_EQ(*(static_cast<float*>(dst) + 1), 42.0f);
+
+  free(dst);
+}
+
 TEST(StreamExecutorGpuClientTest, AsyncCopyToDevice) {
   TF_ASSERT_OK_AND_ASSIGN(auto client,
                           GetStreamExecutorGpuClient(GpuClientOptions()));
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
index 79eea0d3ebbd3a..18ad76139a73e8 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
@@ -56,7 +56,8 @@ namespace xla {
 namespace {
 
 bool IsGpuClient(const PjRtClient& client) {
-  return client.platform_id() == CudaId() || client.platform_id() == RocmId();
+  return client.platform_id() == CudaId() || client.platform_id() == RocmId() ||
+         client.platform_id() == SyclId();
 }
 
 bool IsSameTopology(const PjRtTopologyDescription& topology1,
diff --git a/third_party/xla/xla/pjrt/host_callback.cc b/third_party/xla/xla/pjrt/host_callback.cc
index 231830991509dc..e59eeb6f1f8153 100644
--- a/third_party/xla/xla/pjrt/host_callback.cc
+++ b/third_party/xla/xla/pjrt/host_callback.cc
@@ -21,6 +21,13 @@ limitations under the License.
 
 namespace xla {
 
+static thread_local int on_send_guard = 0;
+
+void EnterHostCallback() { ++on_send_guard; }
+void LeaveHostCallback() { --on_send_guard; }
+
+bool ThisThreadIsInsideHostCallback() { return on_send_guard > 0; }
+
 Status HostCallbackContext::OnSend(int arg_num,
                                    const PjRtTransferMetadata& metadata,
                                    PjRtChunk data) {
@@ -72,7 +79,10 @@ Status HostCallbackContext::OnSend(int arg_num,
     result_ptrs.push_back(results.back().data());
   }
 
+  EnterHostCallback();
   auto status = host_callback_.callback(result_ptrs.data(), arg_ptrs.data());
+  LeaveHostCallback();
+
   // TODO(chky): Consider populating garbage data in results upon errors.
 
   // Clear the arguments for this invocation. This won't race with next
diff --git a/third_party/xla/xla/pjrt/host_callback.h b/third_party/xla/xla/pjrt/host_callback.h
index a4323a92b9a911..58788c0ee87954 100644
--- a/third_party/xla/xla/pjrt/host_callback.h
+++ b/third_party/xla/xla/pjrt/host_callback.h
@@ -34,6 +34,12 @@ limitations under the License.
 
 namespace xla {
 
+bool ThisThreadIsInsideHostCallback();
+
+void EnterHostCallback();
+
+void LeaveHostCallback();
+
 // A thread-safe queue for passing PjRtChunk objects for e.g. from Send ops to
 // Recv ops.
 class ThreadSafePjRtChunkQueue {
diff --git a/third_party/xla/xla/pjrt/mlir_to_hlo.cc b/third_party/xla/xla/pjrt/mlir_to_hlo.cc
index d9019c9fd3bc62..664012f46b9780 100644
--- a/third_party/xla/xla/pjrt/mlir_to_hlo.cc
+++ b/third_party/xla/xla/pjrt/mlir_to_hlo.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "xla/pjrt/mlir_to_hlo.h"
 
 #include <cstdint>
+#include <optional>
 #include <string>
 #include <utility>
 
@@ -98,11 +99,9 @@ static void ConvertAttr(
 // explicitly after parsing.
 void ConvertStablehloDenseAttributes(
     mlir::Operation* root_op,
-    llvm::function_ref<mlir::Attribute(mlir::Attribute)> convert) {
+    llvm::function_ref<mlir::Attribute(mlir::Attribute)> convert,
+    std::optional<int64_t> plugin_version) {
   llvm::TypeSwitch<mlir::Operation*>(root_op)
-      .Case([&](mlir::stablehlo::BroadcastOp op) {
-        ConvertAttr(op, "broadcast_sizes", convert);
-      })
       .Case([&](mlir::stablehlo::BroadcastInDimOp op) {
         ConvertAttr(op, "broadcast_dimensions", convert);
       })
@@ -123,23 +122,12 @@ void ConvertStablehloDenseAttributes(
         ConvertAttr(op, "rhs_dilation", convert);
         ConvertAttr(op, "window_reversal", convert);
       })
-      .Case([&](mlir::stablehlo::DynamicSliceOp op) {
-        ConvertAttr(op, "slice_sizes", convert);
-      })
-      .Case([&](mlir::stablehlo::FftOp op) {
-        ConvertAttr(op, "fft_length", convert);
-      })
       .Case([&](mlir::stablehlo::GatherOp op) {
         ConvertAttr(op, "slice_sizes", convert);
       })
       .Case([&](mlir::stablehlo::MapOp op) {
         ConvertAttr(op, "dimensions", convert);
       })
-      .Case([&](mlir::stablehlo::PadOp op) {
-        ConvertAttr(op, "edge_padding_low", convert);
-        ConvertAttr(op, "edge_padding_high", convert);
-        ConvertAttr(op, "interior_padding", convert);
-      })
       .Case([&](mlir::stablehlo::ReduceOp op) {
         ConvertAttr(op, "dimensions", convert);
       })
@@ -149,31 +137,56 @@ void ConvertStablehloDenseAttributes(
         ConvertAttr(op, "base_dilations", convert);
         ConvertAttr(op, "window_dilations", convert);
       })
-      .Case([&](mlir::stablehlo::ReverseOp op) {
-        ConvertAttr(op, "dimensions", convert);
-      })
+
       .Case([&](mlir::stablehlo::SelectAndScatterOp op) {
         ConvertAttr(op, "window_dimensions", convert);
         ConvertAttr(op, "window_strides", convert);
-      })
-      .Case([&](mlir::stablehlo::SliceOp op) {
-        ConvertAttr(op, "start_indices", convert);
-        ConvertAttr(op, "limit_indices", convert);
-        ConvertAttr(op, "strides", convert);
-      })
-      .Case([&](mlir::stablehlo::TransposeOp op) {
-        ConvertAttr(op, "permutation", convert);
       });
+
+  // Use PJRT_API_MINOR 40 from Nov 27, 2023 for Dec 9, 2023 StableHLO changes.
+  // Always run when plugin_value is unset (used for deserialization upgrades)
+  // and only run when plugin version is less than 40 otherwise.
+  if (!plugin_version.has_value() || plugin_version.value() < 40) {
+    // Downgrade slice, dynamic_slice, pad, broadcast, transpose, fft, reverse
+    llvm::TypeSwitch<mlir::Operation*>(root_op)
+        .Case([&](mlir::stablehlo::BroadcastOp op) {
+          ConvertAttr(op, "broadcast_sizes", convert);
+        })
+        .Case([&](mlir::stablehlo::DynamicSliceOp op) {
+          ConvertAttr(op, "slice_sizes", convert);
+        })
+        .Case([&](mlir::stablehlo::FftOp op) {
+          ConvertAttr(op, "fft_length", convert);
+        })
+        .Case([&](mlir::stablehlo::PadOp op) {
+          ConvertAttr(op, "edge_padding_low", convert);
+          ConvertAttr(op, "edge_padding_high", convert);
+          ConvertAttr(op, "interior_padding", convert);
+        })
+        .Case([&](mlir::stablehlo::ReverseOp op) {
+          ConvertAttr(op, "dimensions", convert);
+        })
+        .Case([&](mlir::stablehlo::SliceOp op) {
+          ConvertAttr(op, "start_indices", convert);
+          ConvertAttr(op, "limit_indices", convert);
+          ConvertAttr(op, "strides", convert);
+        })
+        .Case([&](mlir::stablehlo::TransposeOp op) {
+          ConvertAttr(op, "permutation", convert);
+        });
+  }
 }
 
-void DowngradeStablehlo(mlir::ModuleOp module) {
-  module->walk([](mlir::Operation* op) {
-    ConvertStablehloDenseAttributes(op, ArrayToElements);
+void DowngradeStablehlo(mlir::ModuleOp module,
+                        std::optional<int64_t> plugin_version) {
+  module->walk([&](mlir::Operation* op) {
+    ConvertStablehloDenseAttributes(op, ArrayToElements, plugin_version);
   });
 }
 void UpgradeStablehlo(mlir::ModuleOp module) {
   module->walk([](mlir::Operation* op) {
-    ConvertStablehloDenseAttributes(op, ElementsToArray);
+    ConvertStablehloDenseAttributes(op, ElementsToArray,
+                                    /*plugin_version=*/std::nullopt);
   });
 }
 
@@ -187,8 +200,7 @@ Status MlirToXlaComputation(mlir::ModuleOp module,
     mlir::PassManager pm(module->getContext());
     pm.addPass(mlir::mhlo::createStablehloLegalizeToHloPass());
     pm.addNestedPass<mlir::func::FuncOp>(
-        mlir::mhlo::createChloLegalizeToHloPass(
-            /*legalizeBroadcasts=*/true, /*expandCompositions=*/true));
+        mlir::mhlo::createChloLegalizeToHloPass());
     pm.addNestedPass<mlir::func::FuncOp>(mlir::createCanonicalizerPass());
     // In order to export to XLA, we must sink constants to control flow
     // regions, since XLA uses functional control flow.
@@ -264,7 +276,8 @@ Status ParseMlirModuleStringAndConvertToXlaComputation(
                                    return_tuple);
 }
 
-StatusOr<std::string> SerializeUsingNativeBytecode(mlir::ModuleOp module) {
+absl::StatusOr<std::string> SerializeUsingNativeBytecode(
+    mlir::ModuleOp module, std::optional<int64_t> plugin_version) {
   std::string bytecode;
   llvm::raw_string_ostream os(bytecode);
   mlir::BytecodeWriterConfig config;
@@ -280,21 +293,24 @@ StatusOr<std::string> SerializeUsingNativeBytecode(mlir::ModuleOp module) {
   // deserializing.
   // TODO: b/320507168 - Remove this conversion code.
   mlir::OwningOpRef<mlir::ModuleOp> cloned = module.clone();
-  DowngradeStablehlo(*cloned);
+  DowngradeStablehlo(*cloned, plugin_version);
   if (mlir::failed(mlir::writeBytecodeToFile(*cloned, os, config))) {
     return absl::InvalidArgumentError("mlir::writeBytecodeToFile failed");
   }
   return bytecode;
 }
 
-StatusOr<std::string> SerializeUsingVersionedStablehlo(
+absl::StatusOr<std::string> SerializeUsingVersionedStablehlo(
     mlir::ModuleOp mlir_module, absl::string_view target, bool inplace) {
-  // Legalize CHLO -> [MHLO+Shape] -> StableHLO
+  // Legalize CHLO -> [StableHLO+Shape] -> StableHLO
+  // Preserve higher-level ops with XLA support. To be replaced by composites.
   mlir::PassManager pm(mlir_module->getContext());
   pm.addNestedPass<mlir::func::FuncOp>(
-      mlir::mhlo::createChloLegalizeToHloPass());
+      mlir::mhlo::createChloLegalizeToHighLevelMhloPass());
+  pm.addNestedPass<mlir::func::FuncOp>(
+      mlir::stablehlo::createChloLegalizeToStablehloPass());
   pm.addNestedPass<mlir::func::FuncOp>(
-      mlir::mhlo::createShapeLegalizeToHloPass());
+      mlir::stablehlo::createShapeLegalizeToStablehloPass());
   pm.addPass(mlir::createReconcileUnrealizedCastsPass());
   pm.addPass(mlir::mhlo::createHloLegalizeToStablehloPass());
   if (!mlir::succeeded(pm.run(mlir_module))) {
@@ -302,7 +318,7 @@ StatusOr<std::string> SerializeUsingVersionedStablehlo(
   }
 
   // Avoid mutating the original module if it will be reused elsewhere
-  mlir::OwningOpRef<mlir::ModuleOp> cloned = mlir_module.clone();
+  mlir::OwningOpRef<mlir::ModuleOp> cloned;
   if (!inplace) {
     cloned = mlir_module.clone();
     mlir_module = *cloned;
diff --git a/third_party/xla/xla/pjrt/mlir_to_hlo.h b/third_party/xla/xla/pjrt/mlir_to_hlo.h
index aacaa1f7b87e4e..9bad5399717422 100644
--- a/third_party/xla/xla/pjrt/mlir_to_hlo.h
+++ b/third_party/xla/xla/pjrt/mlir_to_hlo.h
@@ -40,7 +40,8 @@ Status ParseMlirModuleStringAndConvertToXlaComputation(
 // Serialize using MLIR Bytecode Format which does not guarantee forward or
 // backward compatiblity of the dialects used. If passing StableHLO with forward
 // or backward compatibility requirements, use SerializeUsingVersionedStablehlo.
-StatusOr<std::string> SerializeUsingNativeBytecode(mlir::ModuleOp mlir_module);
+absl::StatusOr<std::string> SerializeUsingNativeBytecode(
+    mlir::ModuleOp mlir_module, std::optional<int64_t> plugin_version);
 
 // Serializes an MLIR module to a portable artifact with forward and backward
 // compatibility. Supports modules using StableHLO/MHLO/CHLO/Func dialects.
@@ -52,7 +53,7 @@ StatusOr<std::string> SerializeUsingNativeBytecode(mlir::ModuleOp mlir_module);
 // Ideally should be the `mlir::stablehlo::getCurrentVersion()` of the plugin.
 // If program contains dialects that aren't supposed in StableHLO portable
 // artifacts, use SerializeUsingNativeBytecode.
-StatusOr<std::string> SerializeUsingVersionedStablehlo(
+absl::StatusOr<std::string> SerializeUsingVersionedStablehlo(
     mlir::ModuleOp mlir_module, absl::string_view target, bool inplace = false);
 
 // Given a module that might be a portable artifact, deserialize and upgrade it
diff --git a/third_party/xla/xla/pjrt/pjrt_c_api_client.cc b/third_party/xla/xla/pjrt/pjrt_c_api_client.cc
index 9f892640dd2bd4..8cea8ef39858f1 100644
--- a/third_party/xla/xla/pjrt/pjrt_c_api_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_c_api_client.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"  // from @llvm-project
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
@@ -393,8 +394,11 @@ StatusOr<std::unique_ptr<PjRtLoadedExecutable>> PjRtCApiClient::Compile(
 StatusOr<std::unique_ptr<PjRtLoadedExecutable>> PjRtCApiClient::Compile(
     mlir::ModuleOp module, CompileOptions options) {
   // TODO: Once plugins are ready, use SerializeUsingVersionedStablehlo.
-  TF_ASSIGN_OR_RETURN(std::string serialized,
-                      xla::SerializeUsingNativeBytecode(module));
+  if (!pjrt_c_api()) llvm::report_fatal_error("pjrt_c_api is null");
+  TF_ASSIGN_OR_RETURN(
+      std::string serialized,
+      xla::SerializeUsingNativeBytecode(
+          module, plugin_attributes()->pjrt_c_api_minor_version));
   std::string format(pjrt::kMlirFormat);
   return InitializeArgsAndCompile(this, c_api_, c_client_.get(), options,
                                   serialized, format);
@@ -2191,8 +2195,12 @@ StatusOr<std::unique_ptr<PjRtExecutable>> PjRtCApiCompiler::Compile(
     CompileOptions options, mlir::ModuleOp module,
     const PjRtTopologyDescription& topology, PjRtClient* client) {
   // TODO: Once plugins are ready, use SerializeUsingVersionedStablehlo.
-  TF_ASSIGN_OR_RETURN(std::string serialized,
-                      xla::SerializeUsingNativeBytecode(module));
+  std::optional<int64_t> plugin_version;
+  if (client) {
+    plugin_version = client->plugin_attributes()->pjrt_c_api_minor_version;
+  }
+  TF_ASSIGN_OR_RETURN(std::string serialized, xla::SerializeUsingNativeBytecode(
+                                                  module, plugin_version));
   std::string format(pjrt::kMlirFormat);
   return InitializeArgsAndCompileAot(c_api_, client, options, topology,
                                      serialized, format);
diff --git a/third_party/xla/xla/pjrt/pjrt_compiler.h b/third_party/xla/xla/pjrt/pjrt_compiler.h
index f94769093692aa..d624fd0cf99cd0 100644
--- a/third_party/xla/xla/pjrt/pjrt_compiler.h
+++ b/third_party/xla/xla/pjrt/pjrt_compiler.h
@@ -44,6 +44,10 @@ inline const char* RocmName() {
   static constexpr char kRocmName[] = "rocm";
   return kRocmName;
 }
+inline const char* SyclName() {
+  static constexpr char kSyclName[] = "sycl";
+  return kSyclName;
+}
 inline const char* TpuName() {
   static constexpr char kTpuName[] = "tpu";
   return kTpuName;
@@ -60,6 +64,10 @@ inline PjRtPlatformId RocmId() {
   static const PjRtPlatformId kRocmId = tsl::Fingerprint64(RocmName());
   return kRocmId;
 }
+inline PjRtPlatformId SyclId() {
+  static const PjRtPlatformId kSyclId = tsl::Fingerprint64(SyclName());
+  return kSyclId;
+}
 inline PjRtPlatformId TpuId() {
   static const PjRtPlatformId kTpuId = tsl::Fingerprint64(TpuName());
   return kTpuId;
diff --git a/third_party/xla/xla/pjrt/pjrt_executable.cc b/third_party/xla/xla/pjrt/pjrt_executable.cc
index 9eeb5a8f39842d..4c4eae38f0cbc9 100644
--- a/third_party/xla/xla/pjrt/pjrt_executable.cc
+++ b/third_party/xla/xla/pjrt/pjrt_executable.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "xla/pjrt/compile_options.pb.h"
 #include "xla/pjrt/execute_options.pb.h"
 #include "xla/pjrt/pjrt_common.h"
+#include "xla/pjrt/pjrt_layout.h"
 #include "xla/service/computation_layout.h"
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/shape.h"
@@ -321,7 +322,8 @@ PjRtExecutable::GetOutputDimensions() const {
   return output_dimensions;
 }
 
-StatusOr<std::vector<Layout>> PjRtExecutable::GetParameterLayouts() const {
+absl::StatusOr<std::vector<std::unique_ptr<PjRtLayout>>>
+PjRtExecutable::GetParameterLayouts() const {
   TF_ASSIGN_OR_RETURN(std::vector<std::shared_ptr<HloModule>> hlo_modules,
                       GetHloModules());
   if (hlo_modules.size() > 1) {
@@ -335,10 +337,18 @@ StatusOr<std::vector<Layout>> PjRtExecutable::GetParameterLayouts() const {
         "from executable.");
   }
   ComputationLayout comp_layout = hlo_modules[0]->entry_computation_layout();
-  return comp_layout.FlattenedParameterLayouts();
+  TF_ASSIGN_OR_RETURN(std::vector<Layout> layouts,
+                      comp_layout.FlattenedParameterLayouts());
+  std::vector<std::unique_ptr<PjRtLayout>> result;
+  result.reserve(layouts.size());
+  for (const Layout& layout : layouts) {
+    result.push_back(std::make_unique<PjRtXlaLayout>(layout));
+  }
+  return result;
 }
 
-StatusOr<std::vector<Layout>> PjRtExecutable::GetOutputLayouts() const {
+absl::StatusOr<std::vector<std::unique_ptr<PjRtLayout>>>
+PjRtExecutable::GetOutputLayouts() const {
   TF_ASSIGN_OR_RETURN(std::vector<std::shared_ptr<HloModule>> hlo_modules,
                       GetHloModules());
   if (hlo_modules.size() > 1) {
@@ -352,7 +362,14 @@ StatusOr<std::vector<Layout>> PjRtExecutable::GetOutputLayouts() const {
         "from executable.");
   }
   ComputationLayout comp_layout = hlo_modules[0]->entry_computation_layout();
-  return comp_layout.FlattenedResultLayouts();
+  TF_ASSIGN_OR_RETURN(std::vector<Layout> layouts,
+                      comp_layout.FlattenedResultLayouts());
+  std::vector<std::unique_ptr<PjRtLayout>> result;
+  result.reserve(layouts.size());
+  for (const Layout& layout : layouts) {
+    result.push_back(std::make_unique<PjRtXlaLayout>(layout));
+  }
+  return result;
 }
 
 StatusOr<absl::flat_hash_map<std::string, PjRtValueType>>
diff --git a/third_party/xla/xla/pjrt/pjrt_executable.h b/third_party/xla/xla/pjrt/pjrt_executable.h
index 1631f595e315d5..e1a6c6f22af96a 100644
--- a/third_party/xla/xla/pjrt/pjrt_executable.h
+++ b/third_party/xla/xla/pjrt/pjrt_executable.h
@@ -38,6 +38,7 @@ limitations under the License.
 #include "xla/pjrt/executable_metadata.pb.h"
 #include "xla/pjrt/execute_options.pb.h"
 #include "xla/pjrt/pjrt_common.h"
+#include "xla/pjrt/pjrt_layout.h"
 #include "xla/service/compiler.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/service/hlo_cost_analysis.h"
@@ -352,10 +353,12 @@ class PjRtExecutable {
   GetOutputDimensions() const;
 
   // Returns the layout of each input parameter.
-  virtual StatusOr<std::vector<Layout>> GetParameterLayouts() const;
+  virtual absl::StatusOr<std::vector<std::unique_ptr<PjRtLayout>>>
+  GetParameterLayouts() const;
 
   // Returns the layout of each output.
-  virtual StatusOr<std::vector<Layout>> GetOutputLayouts() const;
+  virtual absl::StatusOr<std::vector<std::unique_ptr<PjRtLayout>>>
+  GetOutputLayouts() const;
 
   // Returns a list of lists of memory kind strings for output. The returned
   // value is `[num_programs, num_output]`. The size of the outer list should be
diff --git a/third_party/xla/xla/pjrt/pjrt_layout.h b/third_party/xla/xla/pjrt/pjrt_layout.h
index a3c080797922f1..0fbf205de8b6e7 100644
--- a/third_party/xla/xla/pjrt/pjrt_layout.h
+++ b/third_party/xla/xla/pjrt/pjrt_layout.h
@@ -62,7 +62,11 @@ class PjRtLayout {
 // have access to full xla::Layouts.
 class PjRtXlaLayout : public PjRtLayout {
  public:
-  explicit PjRtXlaLayout(Layout layout) : xla_layout_(std::move(layout)) {}
+  explicit PjRtXlaLayout(Layout layout) : xla_layout_(std::move(layout)) {
+    // Strip memory space and set it to the default. PJRT tracks memory space
+    // separately from layout.
+    xla_layout_.set_memory_space(xla::Layout::kDefaultMemorySpace);
+  }
 
   std::string Serialize() const override { return xla_layout_.ToString(); }
 
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
index 9275f94492133d..83261bcd7f98a6 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
@@ -103,6 +103,7 @@ limitations under the License.
 #include "xla/literal.h"
 #include "xla/pjrt/distributed/protocol.pb.h"
 #include "xla/pjrt/event_pool.h"
+#include "xla/pjrt/host_callback.h"
 #include "xla/pjrt/local_device_state.h"
 #include "xla/pjrt/metrics.h"
 #include "xla/pjrt/mlir_to_hlo.h"
@@ -616,9 +617,7 @@ void PjRtStreamExecutorBuffer::ScopedHold::AddToInput(
   }
 }
 
-bool PjRtStreamExecutorBuffer::IsOnCpu() const {
-  return client()->platform_id() == CpuId();
-}
+bool PjRtStreamExecutorBuffer::IsOnCpu() const { return false; }
 
 StatusOr<Shape> PjRtStreamExecutorBuffer::logical_on_device_shape() {
   if (on_device_shape_.is_static()) {
@@ -827,59 +826,6 @@ PjRtStreamExecutorClient::BufferFromHostBuffer(
       ShapeUtil::ByteStrides(device_shape, absl::MakeSpan(shape_strides)));
   bool host_and_device_strides_equal =
       (size == 0 || *byte_strides == shape_strides);
-  // The CPU platform is special because the "host" and the "device" are in the
-  // same memory space. If the input shape is in the correct layout and we don't
-  // want to defer the copy onto a thread, we can use the following fast
-  // path.
-  bool is_cpu_platform =
-      local_device->executor()->platform()->id() == se::host::kHostPlatformId;
-  if (is_cpu_platform) {
-    // If we are on the host platform and the input buffer is sufficiently
-    // aligned, we can simply point to the input array's data without any
-    // further copies. At the time of writing we require a 16-byte alignment
-    // because XLA may generate code which requires it.
-    bool can_use_zero_copy =
-        host_buffer_semantics == HostBufferSemantics::kZeroCopy &&
-        ((absl::bit_cast<std::uintptr_t>(data) &
-          (cpu_function_runtime::MinAlign() - 1)) == 0);
-    if (host_and_device_strides_equal &&
-        (host_buffer_semantics ==
-             HostBufferSemantics::kImmutableOnlyDuringCall ||
-         can_use_zero_copy)) {
-      absl::AnyInvocable<void() &&> on_delete_callback;
-      se::DeviceMemoryBase buffer;
-      // If we are on the host platform and the input buffer is sufficiently
-      // aligned, we can simply point to the input array's data without any
-      // further copies. At the time of writing we require a 16-byte alignment
-      // because XLA may generate code which requires it.
-      if (can_use_zero_copy) {
-        on_delete_callback = std::move(on_done_with_host_buffer);
-        buffer = se::DeviceMemoryBase(
-            const_cast<void*>(static_cast<const void*>(data)), size);
-      } else {
-        void* staging_buffer = host_memory_allocator()->AllocateRaw(
-            cpu_function_runtime::MinAlign(), size);
-        buffer = se::DeviceMemoryBase(staging_buffer, size);
-        std::memcpy(staging_buffer, data, size);
-        if (on_done_with_host_buffer) {
-          std::move(on_done_with_host_buffer)();
-        }
-        on_delete_callback = [staging_buffer, host_memory_allocator =
-                                                  host_memory_allocator()]() {
-          host_memory_allocator->DeallocateRaw(staging_buffer);
-        };
-      }
-      absl::Span<const std::shared_ptr<BufferSequencingEvent>>
-          definition_events;
-      auto device_buffer = std::make_shared<TrackedDeviceBuffer>(
-          /*allocator=*/nullptr, local_device->local_device_id().value(),
-          std::initializer_list<se::DeviceMemoryBase>{buffer},
-          definition_events, std::move(on_delete_callback));
-      return std::unique_ptr<PjRtBuffer>(
-          std::make_unique<PjRtStreamExecutorBuffer>(
-              device_shape, std::move(device_buffer), this, device));
-    }
-  }
 
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<PjRtStreamExecutorBuffer> py_buffer,
@@ -905,8 +851,10 @@ PjRtStreamExecutorClient::BufferFromHostBuffer(
     TF_ASSIGN_OR_RETURN(transpose, transpose_cache_.GetOrCreate(options));
   }
 
+  bool should_pack =
+      primitive_util::Is4BitType(type) && transfer_manager->PackSubbyteTypes();
   int64_t packed_size;
-  if (primitive_util::Is4BitType(type)) {
+  if (should_pack) {
     packed_size = CeilOfRatio<int64_t>(size, 2);
   } else {
     packed_size = size;
@@ -938,14 +886,14 @@ PjRtStreamExecutorClient::BufferFromHostBuffer(
   if (host_buffer_semantics == HostBufferSemantics::kImmutableOnlyDuringCall) {
     if (transpose) {
       transpose->Execute(data, staging_buffer.get());
-      if (primitive_util::Is4BitType(type)) {
+      if (should_pack) {
         PackInt4(absl::MakeConstSpan(
                      static_cast<const char*>(staging_buffer.get()), size),
                  absl::MakeSpan(static_cast<char*>(staging_buffer.get()),
                                 packed_size));
       }
     } else {
-      if (primitive_util::Is4BitType(type)) {
+      if (should_pack) {
         PackInt4(absl::MakeConstSpan(static_cast<const char*>(data), size),
                  absl::MakeSpan(static_cast<char*>(staging_buffer.get()),
                                 packed_size));
@@ -968,7 +916,7 @@ PjRtStreamExecutorClient::BufferFromHostBuffer(
   auto transfer_h2d =
       [local_client = client(), transfer_manager, local_device, data, size,
        type, packed_size, movable_device_buffer{device_buffer.ToClosure()},
-       device_shape, py_buffer{py_buffer.get()},
+       device_shape, should_pack, py_buffer{py_buffer.get()},
        on_device_shape{py_buffer->on_device_shape()},
        staging_buffer{std::move(staging_buffer)},
        on_done_with_host_buffer =
@@ -997,7 +945,7 @@ PjRtStreamExecutorClient::BufferFromHostBuffer(
               HostBufferSemantics::kImmutableOnlyDuringCall) {
             if (transpose) {
               transpose->Execute(data, staging_buffer.get());
-              if (primitive_util::Is4BitType(type)) {
+              if (should_pack) {
                 PackInt4(
                     absl::MakeConstSpan(
                         static_cast<const char*>(staging_buffer.get()), size),
@@ -1005,7 +953,7 @@ PjRtStreamExecutorClient::BufferFromHostBuffer(
                                    packed_size));
               }
             } else {
-              if (primitive_util::Is4BitType(type)) {
+              if (should_pack) {
                 PackInt4(
                     absl::MakeConstSpan(static_cast<const char*>(data), size),
                     absl::MakeSpan(static_cast<char*>(staging_buffer.get()),
@@ -1038,13 +986,7 @@ PjRtStreamExecutorClient::BufferFromHostBuffer(
               }
             }));
       };
-  if (is_cpu_platform) {
-    // Using the thread_pool would be a double thread hop; the code
-    // already defers its work onto a stream (= thread on CPU).
-    transfer_h2d();
-  } else {
-    thread_pool()->Schedule(transfer_h2d);
-  }
+  thread_pool()->Schedule(transfer_h2d);
   return std::unique_ptr<PjRtBuffer>(std::move(py_buffer));
 }
 
@@ -1686,6 +1628,29 @@ PjRtFuture<Status> PjRtStreamExecutorBuffer::CopyRawToHost(
   return client_->CopyRawSubBufferToHost(this, dst, offset, transfer_size);
 }
 
+PjRtFuture<Status> PjRtStreamExecutorBuffer::CopyRawToHostFuture(
+    PjRtFuture<StatusOr<void*>> dst, int64_t offset, int64_t transfer_size) {
+  auto promise = PjRtFuture<Status>::CreatePromise();
+  dst.OnReady([this, promise, offset,
+               transfer_size](absl::StatusOr<void*> dst) mutable {
+    if (dst.ok()) {
+      // Trampoline through a thread pool since some device types (e.g., GPUs)
+      // do not allow calling D2H inside the callback's context.
+      client_->thread_pool()->Schedule(
+          [this, dst = *dst, offset, transfer_size,
+           promise = std::move(promise)]() mutable {
+            CopyRawToHost(dst, offset, transfer_size)
+                .OnReady([promise = std::move(promise)](Status status) mutable {
+                  promise.Set(status);
+                });
+          });
+    } else {
+      promise.Set(dst.status());
+    }
+  });
+  return PjRtFuture<Status>(std::move(promise));
+}
+
 StatusOr<ShapedBuffer> PjRtStreamExecutorBuffer::AsShapedBuffer() const {
   absl::MutexLock lock(&mu_);
   if (device_buffer_ == nullptr) {
@@ -2934,7 +2899,7 @@ PjRtStreamExecutorLoadedExecutable::Execute(
           << " num_partitions=" << num_partitions()
           << " num_addressable_devices=" << num_addressable_devices;
   std::vector<StatusOr<Result>> results(num_addressable_devices);
-  if (num_addressable_devices == 1) {
+  if (num_addressable_devices == 1 && !ThisThreadIsInsideHostCallback()) {
     // Fast-path if there is only one device — run the computation on the
     // current thread.
     const int replica = addressable_device_logical_ids_[0].replica;
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
index dbac8c4c2cb87b..a94d3e28bf4345 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
@@ -676,6 +676,10 @@ class PjRtStreamExecutorBuffer : public PjRtBuffer {
   PjRtFuture<Status> CopyRawToHost(void* dst, int64_t offset,
                                    int64_t transfer_size) override;
 
+  PjRtFuture<Status> CopyRawToHostFuture(PjRtFuture<StatusOr<void*>> dst,
+                                         int64_t offset,
+                                         int64_t transfer_size) override;
+
   // Drops the buffer's reference to its associated device memory, leaving the
   // buffer in an invalid state. The memory will be freed lazily when all async
   // operations using the buffer have completed, according to the allocation
diff --git a/third_party/xla/xla/pjrt/status_casters.h b/third_party/xla/xla/pjrt/status_casters.h
index 72f89e4cbb9924..915153f9c372c9 100644
--- a/third_party/xla/xla/pjrt/status_casters.h
+++ b/third_party/xla/xla/pjrt/status_casters.h
@@ -103,7 +103,7 @@ struct ThrowIfErrorWrapper<xla::Status(Args...), xla::Status (&)(Args...)> {
 template <typename C, typename... Args, typename F>
 struct ThrowIfErrorWrapper<xla::Status (C::*)(Args...), F> {
   explicit ThrowIfErrorWrapper(F&& f) : func(std::move(f)) {}
-  void operator()(Args... args) {
+  void operator()(Args... args) const {
     xla::ThrowIfError(func(std::forward<Args>(args)...));
   }
   F func;
diff --git a/third_party/xla/xla/python/BUILD b/third_party/xla/xla/python/BUILD
index 508a1a82233fe7..660ef5ecb87adb 100644
--- a/third_party/xla/xla/python/BUILD
+++ b/third_party/xla/xla/python/BUILD
@@ -1,12 +1,5 @@
-load("//xla:pytype.default.bzl", "pytype_strict_library")
-load("//xla:strict.default.bzl", "py_strict_library", "py_strict_test")
 load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
-load(
-    "//xla:xla.bzl",
-    "xla_cc_test",
-    "xla_py_test_deps",
-)
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load(
     "@local_tsl//tsl:tsl.bzl",
@@ -20,6 +13,13 @@ load(
     "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
     "if_cuda_is_configured",
 )
+load("//xla:pytype.default.bzl", "pytype_strict_library")
+load("//xla:strict.default.bzl", "py_strict_library", "py_strict_test")
+load(
+    "//xla:xla.bzl",
+    "xla_cc_test",
+    "xla_py_test_deps",
+)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -40,6 +40,7 @@ package_group(
 pytype_strict_library(
     name = "xla_client",
     srcs = ["xla_client.py"],
+    pytype_srcs = ["xla_client.pyi"],
     srcs_version = "PY3",
     visibility = ["//visibility:public"],
     deps = [
@@ -196,6 +197,7 @@ cc_library(
         "//xla/pjrt:exceptions",
         "//xla/python/ifrt",
         "//xla/python/pjrt_ifrt",
+        "//xla/tsl/python/lib/core:numpy",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/status:statusor",
@@ -204,9 +206,6 @@ cc_library(
         "@local_config_python//:python_headers",  # buildcleaner: keep
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/python/lib/core:numpy",
-        "@pybind11",
-        "@pybind11_abseil//pybind11_abseil:absl_casters",
     ],
 )
 
@@ -226,23 +225,7 @@ cc_library(
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:span",
-        "@pybind11",
-    ],
-)
-
-cc_library(
-    name = "python_utils",
-    hdrs = ["python_utils.h"],
-    copts = [
-        "-fexceptions",
-        "-fno-strict-aliasing",
-    ],
-    features = ["-use_header_modules"],
-    deps = [
-        "//xla:status_macros",
-        "//xla:util",
         "@local_config_python//:python_headers",  # buildcleaner: keep
-        "@pybind11",
     ],
 )
 
@@ -301,9 +284,11 @@ cc_library(
         "py_array.cc",
         "py_client.cc",
         "py_compile_only_client.cc",
+        "py_device.cc",
         "py_device_list.cc",
         "py_executable.cc",
         "py_host_callback.cc",
+        "py_memory_space.cc",
         "py_values.cc",
         "sharding.cc",
     ],
@@ -311,9 +296,11 @@ cc_library(
         "py_array.h",
         "py_client.h",
         "py_compile_only_client.h",
+        "py_device.h",
         "py_device_list.h",
         "py_executable.h",
         "py_host_callback.h",
+        "py_memory_space.h",
         "py_values.h",
         "sharded_device_array.h",
         "sharding.h",
@@ -329,6 +316,7 @@ cc_library(
     features = ["-use_header_modules"],
     deps = [
         ":callback",
+        ":nb_absl_span",
         ":nb_class_ptr",
         ":nb_helpers",
         ":nb_numpy",
@@ -336,7 +324,6 @@ cc_library(
         ":py_client_gpu",
         ":py_host_callback_proto_cc",
         ":python_ref_manager",
-        ":python_utils",
         ":traceback",
         ":transfer_guard_lib",
         ":types",
@@ -375,6 +362,7 @@ cc_library(
         "//xla/pjrt:pjrt_client",
         "//xla/pjrt:pjrt_common",
         "//xla/pjrt:pjrt_compiler",
+        "//xla/pjrt:pjrt_device_description",
         "//xla/pjrt:pjrt_executable",
         "//xla/pjrt:pjrt_future",
         "//xla/pjrt:pjrt_layout",
@@ -385,10 +373,13 @@ cc_library(
         "//xla/python/pjrt_ifrt",
         "//xla/python/pjrt_ifrt:xla_host_callback_proto_cc",
         "//xla/python/pjrt_ifrt:xla_ifrt",
+        "//xla/service:computation_placer_hdr",
         "//xla/service:custom_call_status",
         "//xla/service:custom_call_target_registry",
         "//xla/service:platform_util",
+        "//xla/tsl/python/lib/core:numpy",
         "@local_tsl//tsl/concurrency:ref_count",
+        "@local_tsl//tsl/framework:allocator",
         "@local_tsl//tsl/platform:casts",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:fingerprint",
@@ -397,12 +388,9 @@ cc_library(
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
         "@local_tsl//tsl/profiler/lib:traceme",
-        "@local_tsl//tsl/python/lib/core:numpy",
         "@com_google_protobuf//:protobuf",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
-        "@pybind11",
-        "@pybind11_abseil//pybind11_abseil:absl_casters",
     ] + if_cuda([
         "@local_config_cuda//cuda:cuda_headers",
         "//xla/stream_executor/cuda:cuda_driver",
@@ -431,6 +419,7 @@ cc_library(
         "//third_party/nanobind",
         "//xla:comparison_util",
         "//xla:xla_data_proto_cc",
+        "//xla/pjrt:host_callback",
         "//xla/pjrt:transpose",
         "//xla/service:custom_call_status",
         "@com_google_absl//absl/base",
@@ -468,6 +457,7 @@ cc_library(
         "//third_party/nanobind",
         "//xla:comparison_util",
         "//xla/pjrt:exceptions",
+        "//xla/pjrt:host_callback",
         "//xla/service:custom_call_status",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/strings",
@@ -490,6 +480,7 @@ cc_library(
     ],
     features = ["-use_header_modules"],
     deps = [
+        ":nb_class_ptr",
         ":py_client",
         ":python_ref_manager",
         ":traceback",
@@ -515,7 +506,6 @@ cc_library(
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:statusor",
-        "@pybind11",
     ],
 )
 
@@ -548,8 +538,6 @@ cc_library(
         "@local_config_python//:python_headers",  # build_cleaner: keep
         "//xla/pjrt:pjrt_client",
         "//xla/pjrt:status_casters",
-        "//xla/python/ifrt",
-        "@local_tsl//tsl/concurrency:ref_count",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/profiler/lib:traceme",
     ],
@@ -578,6 +566,7 @@ cc_library(
         "//xla/client:xla_computation",
         "//xla/hlo/ir:hlo",
         "//xla/pjrt:mlir_to_hlo",
+        "//xla/pjrt/c:pjrt_c_api_custom_partitioner_extension_hdrs",
         "//xla/pjrt/c:pjrt_c_api_hdrs",
         "//xla/pjrt/c:pjrt_c_api_helpers",
         "//xla/service:call_inliner",
@@ -611,11 +600,14 @@ cc_library(
         "@com_google_absl//absl/status:statusor",
         "//third_party/nanobind",
         "//xla:shape_util",
+        "//xla:status",
         "//xla:util",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_sharding_util",
         "//xla/pjrt:status_casters",
-        "@local_tsl//tsl/platform:errors",
+        "//xla/pjrt/c:pjrt_c_api_custom_partitioner_extension_hdrs",
+        "//xla/pjrt/c:pjrt_c_api_hdrs",
+        "//xla/pjrt/c:pjrt_c_api_helpers",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:statusor",
     ],
@@ -734,6 +726,7 @@ cc_library(
     visibility = ["//visibility:private"],
     deps = [
         ":jax_jit",
+        ":nb_class_ptr",
         ":nb_helpers",
         ":nb_numpy",
         ":py_client",
@@ -743,6 +736,7 @@ cc_library(
         ":types",
         # placeholder for index annotation deps
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/hash",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
@@ -757,15 +751,13 @@ cc_library(
         "//xla:xla_data_proto_cc",
         "//xla/pjrt:exceptions",
         "//xla/pjrt:pjrt_client",
-        "//xla/pjrt:pjrt_future",
         "//xla/pjrt:status_casters",
         "//xla/python/ifrt",
+        "//xla/tsl/python/lib/core:numpy",
         "@local_tsl//tsl/concurrency:ref_count",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:statusor",
         "@local_tsl//tsl/profiler/lib:traceme",
-        "@local_tsl//tsl/python/lib/core:numpy",
-        "@pybind11",
     ],
 )
 
@@ -799,6 +791,7 @@ cc_library(
     ],
     features = ["-use_header_modules"],
     deps = [
+        ":nb_class_ptr",
         ":outfeed_receiver",
         ":py_client",
         ":types",
@@ -814,7 +807,6 @@ cc_library(
         "//xla/pjrt:pjrt_client",
         "//xla/pjrt:status_casters",
         "@local_tsl//tsl/platform:logging",
-        "@pybind11",
     ],
 )
 
@@ -863,6 +855,7 @@ cc_library(
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
         "//third_party/nanobind",
+        "@local_config_python//:python_headers",  # buildcleaner: keep
         "//xla/pjrt:exceptions",
         "@local_tsl//tsl/platform:logging",
     ],
@@ -918,6 +911,7 @@ cc_library(
         "//xla/mlir/utils:error_util",
         "@com_google_absl//absl/status",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:BytecodeWriter",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FuncExtensions",
         "@llvm-project//mlir:IR",
@@ -959,6 +953,7 @@ cc_library(
         "//xla/pjrt/c:pjrt_c_api_hdrs",
         "//xla/pjrt/c:pjrt_c_api_profiler_extension_hdrs",
         "@local_tsl//tsl/platform:macros",
+        "@local_tsl//tsl/platform:protobuf",
         "@local_tsl//tsl/profiler/lib:profiler_factory",
         "@local_tsl//tsl/profiler/lib:profiler_interface",
         "@local_tsl//tsl/profiler/lib:profiler_session",
@@ -1012,7 +1007,6 @@ cc_library(
         "//xla/python/ifrt",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
-        "@pybind11",
     ],
 )
 
@@ -1097,7 +1091,6 @@ cc_library(
         "//xla/service:tuple_simplifier",
         "@local_tsl//tsl/lib/strings:proto_serialization",
         "@local_tsl//tsl/platform:logging",
-        "@pybind11",
     ],
 )
 
@@ -1152,9 +1145,27 @@ cc_library(
     ]),
 )
 
+cc_library(
+    name = "logging",
+    srcs = ["logging.cc"],
+    hdrs = ["logging.h"],
+    deps = [
+        "@com_google_absl//absl/log:initialize",
+    ],
+)
+
 tsl_pybind_extension(
     name = "xla_extension",
-    srcs = ["xla_extension.cc"],
+    srcs = ["xla.cc"],
+    copts = [
+        "-fexceptions",
+        "-fno-strict-aliasing",
+    ],
+    defines = select({
+        ":gpu_enabled": ["XLA_PYTHON_ENABLE_GPU=1"],
+        "//conditions:default": [],
+    }),
+    features = ["-use_header_modules"],
     linkopts = select({
         ":use_jax_cuda_pip_rpaths": [
             "-Wl,-rpath,$$ORIGIN/../nvidia/cuda_cupti/lib",
@@ -1172,39 +1183,15 @@ tsl_pybind_extension(
     ],
     pytype_srcs = glob(["xla_extension/*.pyi"]),
     visibility = ["//visibility:public"],
-    deps = [
-        ":xla_extension_library",
-        "@pybind11",
-    ],
-)
-
-cc_library(
-    name = "xla_extension_library",
-    srcs = [
-        "logging.cc",
-        "logging.h",
-        "xla.cc",
-    ],
-    hdrs = [
-        "xla.h",
-    ],
-    compatible_with = [],
-    copts = [
-        "-fexceptions",
-        "-fno-strict-aliasing",
-    ],
-    defines = select({
-        ":gpu_enabled": ["XLA_PYTHON_ENABLE_GPU=1"],
-        "//conditions:default": [],
-    }),
-    features = ["-use_header_modules"],
     deps = [
         ":custom_call_sharding",
         ":dlpack",
         ":jax_jit",
+        ":logging",
         ":mlir",
         ":nb_absl_flat_hash_map",
         ":nb_absl_span",
+        ":nb_class_ptr",
         ":ops",
         ":outfeed_receiver_py",
         ":pjit",
@@ -1226,6 +1213,7 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/log:initialize",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/synchronization",
@@ -1257,16 +1245,17 @@ cc_library(
         "//xla/pjrt/distributed:key_value_store_interface",
         "//xla/pjrt/distributed:protocol_proto_cc",
         "//xla/pjrt/distributed:service",
+        "//xla/pjrt/gpu:gpu_helpers",
         "//xla/python/ifrt",
+        "//xla/python/ifrt_proxy/client:py_module",
         "//xla/python/pjrt_ifrt",
         "//xla/service/cpu:collectives_interface",
+        "//xla/tsl/python/lib/core:numpy",
         "@local_tsl//tsl/distributed_runtime/preemption:preemption_sync_manager",
         "@local_tsl//tsl/platform",
         "@local_tsl//tsl/platform:logging",
+        "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform/cloud:gcs_file_system",
-        "@local_tsl//tsl/python/lib/core:numpy",
-        "@pybind11",
-        "@pybind11_abseil//pybind11_abseil:absl_casters",
     ] + select({
         # gloo transport only builds on linux
         "@local_tsl//tsl:macos": [],
@@ -1276,6 +1265,12 @@ cc_library(
             "//xla/pjrt/cpu:gloo_collectives",
             "//xla/pjrt/cpu:gloo_kv_store",
         ],
+    }) + select({
+        # mpitrampoline does not build on windows
+        "@local_tsl//tsl:windows": [],
+        "//conditions:default": [
+            "//xla/pjrt/cpu:mpi_collectives",
+        ],
     }) + select({
         ":gpu_enabled": [
             "//xla/pjrt/gpu:se_gpu_pjrt_client",
@@ -1351,6 +1346,7 @@ cc_library(
     features = ["-use_header_modules"],
     deps = [
         "//third_party/nanobind",
+        "@com_google_absl//absl/strings:str_format",
         "@local_config_python//:python_headers",
     ],
 )
@@ -1364,9 +1360,9 @@ cc_library(
     features = ["-use_header_modules"],
     deps = [
         "//third_party/nanobind",
+        "//xla/tsl/python/lib/core:numpy",
         "@com_google_absl//absl/types:span",
         "@local_config_python//:python_headers",
-        "@local_tsl//tsl/python/lib/core:numpy",
     ],
 )
 
@@ -1393,3 +1389,15 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
+
+cc_library(
+    name = "nb_absl_flat_hash_set",
+    hdrs = ["nb_absl_flat_hash_set.h"],
+    compatible_with = [],
+    copts = ["-fexceptions"],
+    features = ["-use_header_modules"],
+    deps = [
+        "//third_party/nanobind",
+        "@com_google_absl//absl/container:flat_hash_set",
+    ],
+)
diff --git a/third_party/xla/xla/python/callback.cc b/third_party/xla/xla/python/callback.cc
index e37ca0cf3c7b57..1696af9058bd39 100644
--- a/third_party/xla/xla/python/callback.cc
+++ b/third_party/xla/xla/python/callback.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "third_party/nanobind/include/nanobind/nanobind.h"
 #include "third_party/nanobind/include/nanobind/stl/string_view.h"  // IWYU pragma: keep
+#include "xla/pjrt/host_callback.h"
 #include "xla/pjrt/transpose.h"
 #include "xla/primitive_util.h"
 #include "xla/python/nb_numpy.h"
@@ -127,14 +128,16 @@ absl::Status CpuCallback::PrepareAndCall(void* result, void** arg_ptrs) {
 }
 
 absl::StatusOr<nb::tuple> CpuCallback::CallInternal(nb::tuple args) {
+  auto py_error_to_status = [](nb::python_error& e) {
+    std::string error_message = e.what();
+    return absl::InternalError(
+        absl::StrFormat("CpuCallback error: %s", error_message));
+  };
   nb::object result_object;
   try {
     result_object = callable_(*nb::borrow<nb::args>(args));
   } catch (nb::python_error& e) {
-    PyErr_Clear();
-    std::string error_message = e.what();
-    return absl::InternalError(
-        absl::StrFormat("CpuCallback error: %s", error_message));
+    return py_error_to_status(e);
   }
   if (!PyTuple_Check(result_object.ptr())) {
     return absl::InternalError(
@@ -158,7 +161,12 @@ absl::StatusOr<nb::tuple> CpuCallback::CallInternal(nb::tuple args) {
       }
       continue;
     }
-    nb_numpy_ndarray array = nb_numpy_ndarray::ensure(output);
+    nb_numpy_ndarray array;
+    try {
+      array = nb_numpy_ndarray::from_any(output, NPY_ARRAY_ENSUREARRAY);
+    } catch (nb::python_error& e) {
+      return py_error_to_status(e);
+    }
     static_assert(sizeof(ssize_t) == sizeof(int64_t),
                   "Expected ssize_t to be of equal size to int64_t");
     absl::Span<int64_t const> dims(
diff --git a/third_party/xla/xla/python/custom_call_sharding.cc b/third_party/xla/xla/python/custom_call_sharding.cc
index c9056f22465f05..599cb160a9d94d 100644
--- a/third_party/xla/xla/python/custom_call_sharding.cc
+++ b/third_party/xla/xla/python/custom_call_sharding.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include <cstdint>
 #include <cstring>
+#include <memory>
 #include <optional>
 #include <string>
 #include <string_view>
@@ -33,9 +34,14 @@ limitations under the License.
 #include "third_party/nanobind/include/nanobind/stl/vector.h"  // IWYU pragma: keep
 #include "xla/hlo/ir/hlo_sharding.h"
 #include "xla/hlo/utils/hlo_sharding_util.h"
+#include "xla/pjrt/c/pjrt_c_api.h"
+#include "xla/pjrt/c/pjrt_c_api_custom_partitioner_extension.h"
+#include "xla/pjrt/c/pjrt_c_api_helpers.h"
+#include "xla/pjrt/status_casters.h"
 #include "xla/python/custom_partition_callback.h"
 #include "xla/python/inspect_sharding.h"
 #include "xla/shape.h"
+#include "xla/status.h"
 #include "xla/util.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/statusor.h"
@@ -210,15 +216,45 @@ void BuildCustomCallShardingPybindAPI(nb::module_& m) {
       "register_custom_call_partitioner",
       [](std::string name, nb::object prop_user_sharding, nb::object partition,
          nb::object infer_sharding_from_operands,
-         bool can_side_effecting_have_replicated_sharding) {
+         bool can_side_effecting_have_replicated_sharding,
+         std::optional<nb::capsule> c_api) {
         auto* c_fns =
             (new PyCustomCallPartitionerCallbacks(prop_user_sharding, partition,
                                                   infer_sharding_from_operands))
                 ->callbacks();
         c_fns->can_side_effecting_have_replicated_sharding =
             can_side_effecting_have_replicated_sharding;
-        RegisterCustomCallPartitioner(
-            name, jax::CreateCApiCustomCallPartitioner(c_fns));
+        if (!c_api.has_value()) {
+          RegisterCustomCallPartitioner(
+              name, jax::CreateCApiCustomCallPartitioner(c_fns));
+          return;
+        }
+
+        if (std::string_view(c_api->name()) != "pjrt_c_api") {
+          throw absl::InvalidArgumentError(
+              "Argument to register_custom_call_partitioner was not a "
+              "pjrt_c_api capsule.");
+        }
+        auto* c_api_value = static_cast<const PJRT_Api*>(c_api->data());
+        PJRT_Custom_Partitioner_Extension* extension =
+            pjrt::FindExtension<PJRT_Custom_Partitioner_Extension>(
+                c_api_value,
+                PJRT_Extension_Type::PJRT_Extension_Type_Custom_Partitioner);
+        if (extension == nullptr) {
+          return;
+        }
+        PJRT_Register_Custom_Partitioner_Args args;
+        args.struct_size = PJRT_Register_Custom_Partitioner_Args_STRUCT_SIZE;
+        args.name = name.c_str();
+        args.name_size = name.size();
+        args.callbacks = c_fns;
+        PJRT_Error* error =
+            reinterpret_cast<const PJRT_Custom_Partitioner_Extension*>(
+                extension)
+                ->register_custom_partitioner(&args);
+        std::unique_ptr<PJRT_Error, pjrt::PJRT_ErrorDeleter> error_ptr(
+            error, pjrt::MakeErrorDeleter(c_api_value));
+        ThrowIfError(pjrt::PjrtErrorToStatus(error_ptr.get(), c_api_value));
       },
       R"(Registers a partitioner for a custom-call operation.
 
@@ -233,10 +269,13 @@ void BuildCustomCallShardingPybindAPI(nb::module_& m) {
      Takes operand sharding and returns the instruction sharding.
   can_side_effecting_have_replicated_sharding: Side effecting ops are not
      allowed to have replicated sharding. Pass true to disable this check.
+  c_api: Optional `PJRT_Api*` if it is called with a plugin. This is safe to
+     call on plugins that do not implement the custom partitioner extension
 )",
       nb::arg("name"), nb::arg("prop_user_sharding"), nb::arg("partition"),
       nb::arg("infer_sharding_from_operands"),
-      nb::arg("can_side_effecting_have_replicated_sharding") = false);
+      nb::arg("can_side_effecting_have_replicated_sharding") = false,
+      nb::arg("c_api").none() = std::nullopt);
   m.def("encode_inspect_sharding_callback",
         [](nb::object handler) -> nb::bytes {
           JAX_InspectSharding_Callback cb;
diff --git a/third_party/xla/xla/python/custom_partition_callback.cc b/third_party/xla/xla/python/custom_partition_callback.cc
index b37c6d03b4c729..d9bcb596bf5999 100644
--- a/third_party/xla/xla/python/custom_partition_callback.cc
+++ b/third_party/xla/xla/python/custom_partition_callback.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/ir/hlo_sharding.h"
+#include "xla/pjrt/c/pjrt_c_api_custom_partitioner_extension.h"
 #include "xla/pjrt/c/pjrt_c_api_helpers.h"
 #include "xla/pjrt/mlir_to_hlo.h"
 #include "xla/service/call_inliner.h"
diff --git a/third_party/xla/xla/python/custom_partition_callback.h b/third_party/xla/xla/python/custom_partition_callback.h
index d026f1f5cbb59b..33cc31e75fc9bf 100644
--- a/third_party/xla/xla/python/custom_partition_callback.h
+++ b/third_party/xla/xla/python/custom_partition_callback.h
@@ -24,91 +24,9 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_sharding.h"
 #include "xla/pjrt/c/pjrt_c_api.h"
+#include "xla/pjrt/c/pjrt_c_api_custom_partitioner_extension.h"
 #include "xla/service/custom_call_sharding_helper.h"
 
-extern "C" {
-
-struct JAX_CustomCallPartitioner_string {
-  const char* data;
-  size_t size;
-};
-
-struct JAX_CustomCallPartitioner_aval {
-  JAX_CustomCallPartitioner_string shape;
-  bool has_sharding;
-  JAX_CustomCallPartitioner_string sharding;
-};
-
-// General callback information containing api versions, the result error
-// message and the cleanup function to free any temporary memory that is backing
-// the results. Arguments are always owned by the caller, and results are owned
-// by the cleanup_fn. These should never be used directly. Args and results
-// should be serialized via the PopulateArgs, ReadArgs, PopulateResults,
-// ConsumeResults functions defined below.
-struct JAX_CustomCallPartitioner_version_and_error {
-  int64_t api_version;
-  void* data;  // out
-  // cleanup_fn cleans up any returned results. The caller must finish with all
-  // uses by the point the cleanup is called.
-  void (*cleanup_fn)(void* data);  // out
-  bool has_error;
-  PJRT_Error_Code code;                        // out
-  JAX_CustomCallPartitioner_string error_msg;  // out
-};
-
-struct JAX_CustomCallPartitioner_Partition_Args {
-  JAX_CustomCallPartitioner_version_and_error header;
-
-  size_t num_args;
-  JAX_CustomCallPartitioner_aval* op_args;
-  JAX_CustomCallPartitioner_aval op_result;
-  JAX_CustomCallPartitioner_string backend_config;
-
-  // out
-  JAX_CustomCallPartitioner_string mlir_module;
-  JAX_CustomCallPartitioner_string* args_sharding;
-  JAX_CustomCallPartitioner_string result_sharding;
-};
-
-struct JAX_CustomCallPartitioner_InferShardingFromOperands_Args {
-  JAX_CustomCallPartitioner_version_and_error header;
-
-  size_t num_args;
-  JAX_CustomCallPartitioner_aval* op_args;
-  JAX_CustomCallPartitioner_string result_shape;
-  JAX_CustomCallPartitioner_string backend_config;
-
-  bool has_result_sharding;
-  JAX_CustomCallPartitioner_string result_sharding;
-};
-
-struct JAX_CustomCallPartitioner_PropagateUserSharding_Args {
-  JAX_CustomCallPartitioner_version_and_error header;
-
-  JAX_CustomCallPartitioner_string backend_config;
-
-  JAX_CustomCallPartitioner_string result_shape;
-
-  JAX_CustomCallPartitioner_string result_sharding;  // inout
-};
-
-struct JAX_CustomCallPartitioner_Callbacks {
-  int64_t version;
-  void* private_data;
-  void (*dtor)(JAX_CustomCallPartitioner_Callbacks* data);
-  void (*partition)(JAX_CustomCallPartitioner_Callbacks* data,
-                    JAX_CustomCallPartitioner_Partition_Args* args);
-  void (*infer_sharding)(
-      JAX_CustomCallPartitioner_Callbacks* data,
-      JAX_CustomCallPartitioner_InferShardingFromOperands_Args* args);
-  void (*propagate_user_sharding)(
-      JAX_CustomCallPartitioner_Callbacks* data,
-      JAX_CustomCallPartitioner_PropagateUserSharding_Args* args);
-  bool can_side_effecting_have_replicated_sharding;
-};
-
-}  // extern "C"
-
 namespace jax {
 
 struct PartitionScratch {
diff --git a/third_party/xla/xla/python/dlpack.cc b/third_party/xla/xla/python/dlpack.cc
index 52e02d21c31a1a..0cb187ba27a76a 100644
--- a/third_party/xla/xla/python/dlpack.cc
+++ b/third_party/xla/xla/python/dlpack.cc
@@ -33,14 +33,13 @@ limitations under the License.
 #include "include/dlpack/dlpack.h"  // from @dlpack
 #include "llvm/Support/Casting.h"
 #include "third_party/nanobind/include/nanobind/nanobind.h"
-#include "pybind11/gil.h"  // from @pybind11
-#include "pybind11/pytypes.h"  // from @pybind11
 #include "xla/layout.h"
 #include "xla/pjrt/exceptions.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_layout.h"
 #include "xla/python/ifrt/array.h"
+#include "xla/python/nb_class_ptr.h"
 #include "xla/python/pjrt_ifrt/pjrt_array.h"
 #include "xla/python/pjrt_ifrt/pjrt_client.h"
 #include "xla/python/py_array.h"
@@ -57,7 +56,6 @@ limitations under the License.
 #include "tsl/platform/statusor.h"
 
 namespace nb = nanobind;
-namespace py = pybind11;
 
 namespace xla {
 namespace {
@@ -68,7 +66,7 @@ struct DLPackTensor {
   ~DLPackTensor();
 
   // `buffer_reference` is populated if we have shared (read-only) access.
-  py::object buffer_reference;
+  nb::object buffer_reference;
 
   // `external_reference` is always populated.
   std::unique_ptr<PjRtBuffer::ExternalReference> external_reference;
@@ -293,11 +291,9 @@ absl::StatusOr<PjRtDevice*> DeviceForDLDevice(const PjRtClient* cpu_client,
 
 }  // namespace
 
-absl::StatusOr<py::capsule> BufferToDLPackManagedTensor(
-    py::handle py_buffer, std::optional<std::intptr_t> stream) {
-  // TODO(phawkins): remove .ptr() when nanobind transition is complete.
-  ifrt::Array* ifrt_array =
-      nb::cast<xla::PyArray>(nb::handle(py_buffer.ptr())).ifrt_array();
+absl::StatusOr<nb::capsule> BufferToDLPackManagedTensor(
+    nb::handle py_buffer, std::optional<std::intptr_t> stream) {
+  ifrt::Array* ifrt_array = nb::cast<xla::PyArray>(py_buffer).ifrt_array();
   if (ifrt_array == nullptr) {
     return Unimplemented(
         "BufferToDLPackManagedTensor called on deleted array.");
@@ -323,7 +319,7 @@ absl::StatusOr<py::capsule> BufferToDLPackManagedTensor(
   {
     // AcquireExternalReference may block; there are no API guarantees.
     GlobalPyRefManager()->CollectGarbage();
-    py::gil_scoped_release gil_release;
+    nb::gil_scoped_release gil_release;
     TF_ASSIGN_OR_RETURN(pack->external_reference,
                         pjrt_buffer->AcquireExternalReference());
     if (stream) {
@@ -334,7 +330,7 @@ absl::StatusOr<py::capsule> BufferToDLPackManagedTensor(
           AwaitBuffersReady(absl::MakeConstSpan(&ifrt_array, 1)));
     }
   }
-  pack->buffer_reference = py::reinterpret_borrow<py::object>(py_buffer);
+  pack->buffer_reference = nb::borrow<nb::object>(py_buffer);
 
   dt.data = pack->external_reference->OpaqueDeviceMemoryDataPointer();
   pack->tensor.manager_ctx = pack.get();
@@ -357,29 +353,36 @@ absl::StatusOr<py::capsule> BufferToDLPackManagedTensor(
   dt.strides = reinterpret_cast<std::int64_t*>(pack->strides.data());
   dt.byte_offset = 0;
 
-  py::capsule capsule(&pack.release()->tensor, kDlTensorCapsuleName,
-                      [](PyObject* obj) {
-                        DLManagedTensor* dlmt = static_cast<DLManagedTensor*>(
-                            PyCapsule_GetPointer(obj, kDlTensorCapsuleName));
-                        if (dlmt) {
-                          DLPackTensorDeleter(dlmt);
-                        } else {
-                          // The tensor has been deleted. Clear any error from
-                          // PyCapsule_GetPointer.
-                          PyErr_Clear();
-                        }
-                      });
+  // We cannot use nanobind's capsule object constructor because we need to
+  // detect if the capsule name has been changed in the deleter, but nanobind
+  // hides the underlying Python object from the deleter.
+  nb::capsule capsule = nb::steal<nb::capsule>(
+      PyCapsule_New(&pack.release()->tensor, kDlTensorCapsuleName,
+                    [](PyObject* obj) noexcept {
+                      DLManagedTensor* dlmt = static_cast<DLManagedTensor*>(
+                          PyCapsule_GetPointer(obj, kDlTensorCapsuleName));
+                      if (dlmt) {
+                        DLPackTensorDeleter(dlmt);
+                      } else {
+                        // The tensor has been deleted. Clear any error from
+                        // PyCapsule_GetPointer.
+                        PyErr_Clear();
+                      }
+                    }));
+  if (!capsule.ptr()) {
+    throw nb::python_error();
+  }
   return capsule;
 }
 
-absl::StatusOr<pybind11::object> DLPackManagedTensorToBuffer(
-    const pybind11::capsule& tensor, std::shared_ptr<PyClient> cpu_client,
-    std::shared_ptr<PyClient> gpu_client) {
+absl::StatusOr<nb::object> DLPackManagedTensorToBuffer(
+    const nb::capsule& tensor, std::optional<nb_class_ptr<PyClient>> cpu_client,
+    std::optional<nb_class_ptr<PyClient>> gpu_client) {
   // TODO(hyeontaek): This is a potential target for an IFRT client to multiplex
   // multiple PjRt clients. Devices from these PjRt clients could be expressed
   // as a unified set of IFRT devices.
-  auto* cpu_pjrt_client = cpu_client ? cpu_client->pjrt_client() : nullptr;
-  auto* gpu_pjrt_client = gpu_client ? gpu_client->pjrt_client() : nullptr;
+  auto* cpu_pjrt_client = cpu_client ? (*cpu_client)->pjrt_client() : nullptr;
+  auto* gpu_pjrt_client = gpu_client ? (*gpu_client)->pjrt_client() : nullptr;
 
   if (std::string_view(tensor.name()) != kDlTensorCapsuleName) {
     return InvalidArgument(
@@ -387,7 +390,7 @@ absl::StatusOr<pybind11::object> DLPackManagedTensorToBuffer(
         "Note that a DLPack tensor may be consumed at most once.",
         std::string_view(tensor.name()));
   }
-  DLManagedTensor* dlmt = static_cast<DLManagedTensor*>(tensor);
+  DLManagedTensor* dlmt = static_cast<DLManagedTensor*>(tensor.data());
   if (dlmt->dl_tensor.ndim < 0) {
     return InvalidArgument(
         "Number of dimensions in DLManagedTensor must be nonnegative, got %d",
@@ -447,8 +450,8 @@ absl::StatusOr<pybind11::object> DLPackManagedTensorToBuffer(
   // TODO(phawkins): simplify the expression below once we know cpu_client is
   // always non-null.
   auto client = (cpu_client && device->client() == cpu_pjrt_client)
-                    ? std::move(cpu_client)
-                    : std::move(gpu_client);
+                    ? std::move(*cpu_client)
+                    : std::move(*gpu_client);
   auto* ifrt_client =
       llvm::dyn_cast_or_null<ifrt::PjRtCompatibleClient>(client->ifrt_client());
   if (ifrt_client == nullptr) {
@@ -457,22 +460,20 @@ absl::StatusOr<pybind11::object> DLPackManagedTensorToBuffer(
   }
   TF_ASSIGN_OR_RETURN(auto ifrt_array,
                       ifrt_client->CreatePjRtArray(std::move(pjrt_buffer)));
-  auto out = PyArray::MakeFromSingleDeviceArray(
-      std::move(client), Traceback::Get(), std::move(ifrt_array), false, true);
-  // TODO(phawkins): remove after nanobind transition is complete.
-  return py::reinterpret_steal<py::object>(out.release().ptr());
+  return PyArray::MakeFromSingleDeviceArray(std::move(client), Traceback::Get(),
+                                            std::move(ifrt_array), false, true);
 }
 
-absl::StatusOr<pybind11::object> DLPackManagedTensorToBuffer(
-    const pybind11::capsule& tensor, PjRtDevice* device,
-    std::shared_ptr<PyClient> client, std::optional<std::intptr_t> stream) {
+absl::StatusOr<nb::object> DLPackManagedTensorToBuffer(
+    const nb::capsule& tensor, PjRtDevice* device,
+    nb_class_ptr<PyClient> client, std::optional<std::intptr_t> stream) {
   if (std::string_view(tensor.name()) != kDlTensorCapsuleName) {
     return InvalidArgument(
         "DLPack tensor must be a capsule with name \"dltensor\", got \"%s\". "
         "Note that a DLPack tensor may be consumed at most once.",
         std::string_view(tensor.name()));
   }
-  DLManagedTensor* dlmt = static_cast<DLManagedTensor*>(tensor);
+  DLManagedTensor* dlmt = static_cast<DLManagedTensor*>(tensor.data());
   if (dlmt->dl_tensor.ndim < 0) {
     return InvalidArgument(
         "Number of dimensions in DLManagedTensor must be nonnegative, got %d",
@@ -519,10 +520,8 @@ absl::StatusOr<pybind11::object> DLPackManagedTensorToBuffer(
   }
   TF_ASSIGN_OR_RETURN(auto ifrt_array,
                       ifrt_client->CreatePjRtArray(std::move(pjrt_buffer)));
-  auto out = PyArray::MakeFromSingleDeviceArray(
-      std::move(client), Traceback::Get(), std::move(ifrt_array), false, true);
-  // TODO(phawkins): remove after nanobind transition is complete.
-  return py::reinterpret_steal<py::object>(out.release().ptr());
+  return PyArray::MakeFromSingleDeviceArray(std::move(client), Traceback::Get(),
+                                            std::move(ifrt_array), false, true);
 }
 
 }  // namespace xla
diff --git a/third_party/xla/xla/python/dlpack.h b/third_party/xla/xla/python/dlpack.h
index 11272e0f0032ed..069fdf1de13bb1 100644
--- a/third_party/xla/xla/python/dlpack.h
+++ b/third_party/xla/xla/python/dlpack.h
@@ -16,10 +16,13 @@ limitations under the License.
 #ifndef XLA_PYTHON_DLPACK_H_
 #define XLA_PYTHON_DLPACK_H_
 
-#include <memory>
+#include <cstdint>
 #include <optional>
 
-#include "pybind11/pybind11.h"  // from @pybind11
+#include "absl/status/statusor.h"
+#include "third_party/nanobind/include/nanobind/nanobind.h"
+#include "xla/python/ifrt/device.h"
+#include "xla/python/nb_class_ptr.h"
 #include "xla/python/py_client.h"
 
 namespace xla {
@@ -31,16 +34,17 @@ namespace xla {
 // stream, if set, is a GPU stream, e.g. cudaStream_t for CUDA GPUs, that should
 // be synchronized to the buffer as per
 // https://dmlc.github.io/dlpack/latest/python_spec.html#python-specification-for-dlpack.
-absl::StatusOr<pybind11::capsule> BufferToDLPackManagedTensor(
-    pybind11::handle buffer, std::optional<std::intptr_t> stream);
+absl::StatusOr<nanobind::capsule> BufferToDLPackManagedTensor(
+    nanobind::handle buffer, std::optional<std::intptr_t> stream);
 
-absl::StatusOr<pybind11::object> DLPackManagedTensorToBuffer(
-    const pybind11::capsule& tensor, std::shared_ptr<PyClient> cpu_client,
-    std::shared_ptr<PyClient> gpu_client);
+absl::StatusOr<nanobind::object> DLPackManagedTensorToBuffer(
+    const nanobind::capsule& tensor,
+    std::optional<nb_class_ptr<PyClient>> cpu_client,
+    std::optional<nb_class_ptr<PyClient>> gpu_client);
 
-absl::StatusOr<pybind11::object> DLPackManagedTensorToBuffer(
-    const pybind11::capsule& tensor, PjRtDevice* device,
-    std::shared_ptr<PyClient> client, std::optional<std::intptr_t> stream);
+absl::StatusOr<nanobind::object> DLPackManagedTensorToBuffer(
+    const nanobind::capsule& tensor, ifrt::Device* device,
+    nb_class_ptr<PyClient> client, std::optional<std::intptr_t> stream);
 
 }  // namespace xla
 
diff --git a/third_party/xla/xla/python/ifrt/BUILD b/third_party/xla/xla/python/ifrt/BUILD
index d6c7a913e60d59..c2f96ff92d35a0 100644
--- a/third_party/xla/xla/python/ifrt/BUILD
+++ b/third_party/xla/xla/python/ifrt/BUILD
@@ -1,7 +1,7 @@
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load("@local_tsl//tsl:tsl.default.bzl", "get_compatible_with_portable")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package_group(
     name = "friends",
@@ -77,6 +77,7 @@ cc_library(
         "//xla:util",
         "//xla/pjrt:pjrt_client",
         "//xla/pjrt:pjrt_common",
+        "//xla/pjrt:pjrt_layout",
         "//xla/python/ifrt/ir",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
diff --git a/third_party/xla/xla/python/ifrt/array.h b/third_party/xla/xla/python/ifrt/array.h
index 84730430a028de..b63d9f4c90096f 100644
--- a/third_party/xla/xla/python/ifrt/array.h
+++ b/third_party/xla/xla/python/ifrt/array.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "llvm/Support/ExtensibleRTTI.h"
+#include "xla/pjrt/pjrt_layout.h"
 #include "xla/python/ifrt/dtype.h"
 #include "xla/python/ifrt/future.h"
 #include "xla/python/ifrt/shape.h"
@@ -36,6 +37,8 @@ namespace ifrt {
 
 class Client;
 
+using Layout = ::xla::PjRtLayout;
+
 // Semantics for operations that may copy or move sharded buffers in an array.
 enum class ArrayCopySemantics : int {
   // Always creates new buffers to construct an output array. Mutation of the
@@ -69,6 +72,10 @@ class Array : public llvm::RTTIExtends<Array, Value> {
   virtual const Shape& shape() const = 0;
   virtual const Sharding& sharding() const = 0;
   virtual std::shared_ptr<const Sharding> shared_ptr_sharding() const = 0;
+  // The device memory layout for each shard of the Array. All shards are
+  // assumed to have the same layout. Cannot be nullptr; implementations should
+  // return UNIMPLEMENTED instead.
+  virtual absl::StatusOr<std::unique_ptr<PjRtLayout>> layout() const = 0;
 
   // Breaks an array up into per-device arrays. This is the elimination
   // counterpart of `Client::AssembleArrayFromSingleDeviceArrays()`.
diff --git a/third_party/xla/xla/python/ifrt/executable.h b/third_party/xla/xla/python/ifrt/executable.h
index 94b2784d325994..612827023d4da0 100644
--- a/third_party/xla/xla/python/ifrt/executable.h
+++ b/third_party/xla/xla/python/ifrt/executable.h
@@ -63,10 +63,12 @@ class Executable : public llvm::RTTIExtends<Executable, llvm::RTTIRoot> {
       const = 0;
   // Returns a list of output `OpSharding`.
   virtual std::optional<std::vector<OpSharding>> GetOutputShardings() const = 0;
-  // Returns a list of parameter `xla::Layout`s.
-  virtual absl::StatusOr<std::vector<Layout>> GetParameterLayouts() const = 0;
-  // Returns a list of output/result `xla::Layout`s.
-  virtual absl::StatusOr<std::vector<Layout>> GetOutputLayouts() const = 0;
+  // Returns a list of parameter layouts.
+  virtual absl::StatusOr<std::vector<std::unique_ptr<Layout>>>
+  GetParameterLayouts() const = 0;
+  // Returns a list of output/result layouts.
+  virtual absl::StatusOr<std::vector<std::unique_ptr<Layout>>>
+  GetOutputLayouts() const = 0;
   // Returns an `HloModule` (optimized) per partition.
   virtual absl::StatusOr<std::vector<std::shared_ptr<HloModule>>>
   GetHloModules() const = 0;
@@ -131,10 +133,12 @@ class LoadedExecutable
       const = 0;
   // Returns a list of output OpSharding.
   virtual std::optional<std::vector<OpSharding>> GetOutputShardings() const = 0;
-  // Returns a list of parameter `xla::Layout`s.
-  virtual absl::StatusOr<std::vector<Layout>> GetParameterLayouts() const = 0;
-  // Returns a list of output/result `xla::Layout`s.
-  virtual absl::StatusOr<std::vector<Layout>> GetOutputLayouts() const = 0;
+  // Returns a list of parameter layouts.
+  virtual absl::StatusOr<std::vector<std::unique_ptr<Layout>>>
+  GetParameterLayouts() const = 0;
+  // Returns a list of output/result layouts.
+  virtual absl::StatusOr<std::vector<std::unique_ptr<Layout>>>
+  GetOutputLayouts() const = 0;
   // Return an HloModule (optimized) per partition.
   virtual absl::StatusOr<std::vector<std::shared_ptr<HloModule>>>
   GetHloModules() const = 0;
diff --git a/third_party/xla/xla/python/ifrt/ir/BUILD b/third_party/xla/xla/python/ifrt/ir/BUILD
index e81f592e48baff..b40e3def5c8edc 100644
--- a/third_party/xla/xla/python/ifrt/ir/BUILD
+++ b/third_party/xla/xla/python/ifrt/ir/BUILD
@@ -134,10 +134,14 @@ cc_library(
         ":ifrt_dialect_inc_gen",
         ":ifrt_interfaces_inc_gen",
         ":ifrt_ops_inc_gen",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:CallOpInterfaces",  # buildcleaner: keep
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Support",
+        "@local_tsl//tsl/platform:errors",
     ],
 )
 
diff --git a/third_party/xla/xla/python/ifrt/ir/sharding_param.cc b/third_party/xla/xla/python/ifrt/ir/sharding_param.cc
index 68c9e472901385..618cec5a5cf56b 100644
--- a/third_party/xla/xla/python/ifrt/ir/sharding_param.cc
+++ b/third_party/xla/xla/python/ifrt/ir/sharding_param.cc
@@ -18,14 +18,20 @@ limitations under the License.
 #include <cstdint>
 #include <string>
 
+#include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/raw_ostream.h"
 #include "mlir/IR/Diagnostics.h"  // from @llvm-project
 #include "mlir/IR/OpImplementation.h"  // from @llvm-project
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "tsl/platform/errors.h"
 
 namespace xla {
 namespace ifrt {
@@ -66,14 +72,37 @@ void PopulateDevices(llvm::ArrayRef<int> permutation,
 
 }  // namespace
 
+absl::Status ShardingParam::MinorToMajor::verify() const {
+  if (permutation.size() != axis_sizes.size() || axis_sizes.empty()) {
+    return absl::InvalidArgumentError(absl::StrCat(
+        "Expect same non-zero size for `permutation` and `axis_sizes`. Actual ",
+        permutation.size(), " vs ", axis_sizes.size()));
+  }
+  llvm::DenseSet<int> permutation_set(permutation.begin(), permutation.end());
+  if (permutation_set.size() != permutation.size()) {
+    return absl::InvalidArgumentError(
+        absl::StrCat("`permutation` [", absl::StrJoin(permutation, ","),
+                     "] has duplicate values"));
+  }
+  for (const int index : permutation) {
+    if (index < 0 || index >= axis_sizes.size()) {
+      return absl::InvalidArgumentError(
+          absl::StrCat("Out of range axis ", index, " to the mesh of [",
+                       absl::StrJoin(permutation, ","), "] on ",
+                       absl::StrJoin(axis_sizes, "x")));
+    }
+  }
+  return absl::OkStatus();
+}
+
 mlir::LogicalResult ShardingParam::MinorToMajor::verify(
     llvm::function_ref<mlir::InFlightDiagnostic()> emit_error) const {
-  if (permutation.size() != axis_sizes.size() || axis_sizes.empty()) {
-    return emit_error() << "Expect same non-zero size for `permutation` and "
-                           "`axis_sizes`. Actual "
-                        << permutation.size() << " vs " << axis_sizes.size();
+  auto status = verify();
+  if (status.ok()) {
+    return mlir::success();
+  } else {
+    return emit_error() << status.message();
   }
-  return mlir::success();
 }
 
 void ShardingParam::MinorToMajor::ToDeviceList(
@@ -120,12 +149,8 @@ mlir::FailureOr<ShardingParam> ShardingParam::Parse(
   return ShardingParam(dim_shards, minor_to_major);
 }
 
-mlir::LogicalResult ShardingParam::verify(
-    llvm::function_ref<mlir::InFlightDiagnostic()> emit_error) const {
-  if (mlir::failed(minor_to_major().verify(emit_error))) {
-    return mlir::failure();
-  }
-
+absl::Status ShardingParam::verify() const {
+  TF_RETURN_IF_ERROR(minor_to_major().verify());
   int dim_index = 0;
   int cum_size = 1;
   for (const int index : minor_to_major().permutation) {
@@ -135,19 +160,10 @@ mlir::LogicalResult ShardingParam::verify(
     if (dim_index == dim_shards().size()) {
       break;
     }
-    if (index < 0 || index >= minor_to_major().axis_sizes.size()) {
-      return emit_error() << "Out of range axis " << index << " to the mesh of "
-                          << minor_to_major().permutation << " on "
-                          << minor_to_major().axis_sizes;
-    }
-
     cum_size *= minor_to_major().axis_sizes[index];
-    if (cum_size > dim_shards()[dim_index]) {
-      return emit_error() << "Dimension #" << dim_index << " of "
-                          << dim_shards()[dim_index]
-                          << " shards can't be assigned to the axes";
-    } else if (cum_size == dim_shards()[dim_index]) {
-      cum_size = 1;
+    while (dim_index < dim_shards().size() &&
+           cum_size % dim_shards()[dim_index] == 0) {
+      cum_size /= dim_shards()[dim_index];
       dim_index++;
     }
   }
@@ -155,12 +171,22 @@ mlir::LogicalResult ShardingParam::verify(
     dim_index++;
   }
   if (dim_index != dim_shards().size()) {
-    return emit_error() << "Can't shard the dims " << dim_shards()
-                        << " to the mesh of " << minor_to_major().permutation
-                        << " on " << minor_to_major().axis_sizes;
+    return absl::InvalidArgumentError(absl::StrCat(
+        "Can't shard the dims ", absl::StrJoin(dim_shards(), "x"),
+        " to the mesh of [", absl::StrJoin(minor_to_major().permutation, ","),
+        "] on ", absl::StrJoin(minor_to_major().axis_sizes, "x")));
   }
+  return absl::OkStatus();
+}
 
-  return mlir::success();
+mlir::LogicalResult ShardingParam::verify(
+    llvm::function_ref<mlir::InFlightDiagnostic()> emit_error) const {
+  auto status = verify();
+  if (status.ok()) {
+    return mlir::success();
+  } else {
+    return emit_error() << status.message();
+  }
 }
 
 std::string ShardingParam::DebugString() const {
diff --git a/third_party/xla/xla/python/ifrt/ir/sharding_param.h b/third_party/xla/xla/python/ifrt/ir/sharding_param.h
index 5388f860b53ee7..13de6a96e9dcb5 100644
--- a/third_party/xla/xla/python/ifrt/ir/sharding_param.h
+++ b/third_party/xla/xla/python/ifrt/ir/sharding_param.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <cstdint>
 #include <string>
 
+#include "absl/status/status.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/STLFunctionalExtras.h"
@@ -79,6 +80,7 @@ class ShardingParam {
     // The size of mesh dimensions before the permutation.
     llvm::SmallVector<int, 4> axis_sizes;
 
+    absl::Status verify() const;
     mlir::LogicalResult verify(
         llvm::function_ref<mlir::InFlightDiagnostic()> emit_error) const;
 
@@ -94,6 +96,7 @@ class ShardingParam {
       : dim_shards_(dim_shards), minor_to_major_(minor_to_major) {}
 
   static mlir::FailureOr<ShardingParam> Parse(mlir::AsmParser& ods_parser);
+  absl::Status verify() const;
   mlir::LogicalResult verify(
       llvm::function_ref<mlir::InFlightDiagnostic()> emit_error) const;
 
diff --git a/third_party/xla/xla/python/ifrt/ir/tests/verify_array.mlir b/third_party/xla/xla/python/ifrt/ir/tests/verify_array.mlir
index 339d351958d3e2..c2bf8634f1f5f1 100644
--- a/third_party/xla/xla/python/ifrt/ir/tests/verify_array.mlir
+++ b/third_party/xla/xla/python/ifrt/ir/tests/verify_array.mlir
@@ -64,7 +64,7 @@ func.func @array_requires_same_permutation_and_axis_sizes() {
 // -----
 
 func.func @array_requires_enough_devices() {
-  // expected-error@+2 {{Can't shard the dims 2, 2 to the mesh of 0 on 2}}
+  // expected-error@+2 {{Can't shard the dims 2x2 to the mesh of [0] on 2}}
   %0 = builtin.unrealized_conversion_cast to
       !ifrt.array<tensor<4x4xi32>, 2x2 to [0] on 2, [0,1]>
   return
@@ -73,7 +73,7 @@ func.func @array_requires_enough_devices() {
 // -----
 
 func.func @array_requires_shard_distributable_to_axes() {
-  // expected-error@+2 {{Dimension #1 of 2 shards can't be assigned to the axes}}
+  // expected-error@+2 {{Can't shard the dims 1x2 to the mesh of [0] on 3}}
   %0 = builtin.unrealized_conversion_cast to
       !ifrt.array<tensor<4x4xi32>, 1x2 to [0] on 3, [0,1,2]>
   return
diff --git a/third_party/xla/xla/python/ifrt/ir/tests/verify_reshard.mlir b/third_party/xla/xla/python/ifrt/ir/tests/verify_reshard.mlir
index cc8370e81f9ad1..a34af467efe6a7 100644
--- a/third_party/xla/xla/python/ifrt/ir/tests/verify_reshard.mlir
+++ b/third_party/xla/xla/python/ifrt/ir/tests/verify_reshard.mlir
@@ -47,7 +47,7 @@ func.func @reshard_requires_same_global_shape(
 func.func @reshard_requires_non_negative_axis_index(
     %arg0: !ifrt.array<tensor<2x2xi32>, 1x1 to [0] on 2, [0,1]>)
     attributes {ifrt.function} {
-  // expected-error@+3 {{Out of range axis -1 to the mesh of -1 on 2}}
+  // expected-error@+3 {{Out of range axis -1 to the mesh of [-1] on 2}}
   %0 = ifrt.Reshard(%arg0)
       : (!ifrt.array<tensor<2x2xi32>, 1x1 to [0] on 2, [0,1]>)
       -> !ifrt.array<tensor<2x1xi32>, 1x2 to [-1] on 2, [2,3]>
@@ -59,7 +59,7 @@ func.func @reshard_requires_non_negative_axis_index(
 func.func @reshard_requires_valid_axis_index(
     %arg0: !ifrt.array<tensor<2x2xi32>, 1x1 to [0] on 2, [0,1]>)
     attributes {ifrt.function} {
-  // expected-error@+3 {{Out of range axis 1234567890 to the mesh of 1234567890 on 2}}
+  // expected-error@+3 {{Out of range axis 1234567890 to the mesh of [1234567890] on 2}}
   %0 = ifrt.Reshard(%arg0)
       : (!ifrt.array<tensor<2x2xi32>, 1x1 to [0] on 2, [0,1]>)
       -> !ifrt.array<tensor<2x1xi32>, 1x2 to [1234567890] on 2, [2,3]>
diff --git a/third_party/xla/xla/python/ifrt/mock.cc b/third_party/xla/xla/python/ifrt/mock.cc
index da63d7d1543dee..0684702b988647 100644
--- a/third_party/xla/xla/python/ifrt/mock.cc
+++ b/third_party/xla/xla/python/ifrt/mock.cc
@@ -74,6 +74,10 @@ MockArray::MockArray(tsl::RCReference<xla::ifrt::Array> delegated)
   ON_CALL(*this, shared_ptr_sharding).WillByDefault([this]() {
     return delegated_->shared_ptr_sharding();
   });
+  ON_CALL(*this, layout)
+      .WillByDefault([this]() -> absl::StatusOr<std::unique_ptr<PjRtLayout>> {
+        return delegated_->layout();
+      });
   ON_CALL(*this, DisassembleIntoSingleDeviceArrays)
       .WillByDefault([this](ArrayCopySemantics semantics) {
         return delegated_->DisassembleIntoSingleDeviceArrays(semantics);
diff --git a/third_party/xla/xla/python/ifrt/mock.h b/third_party/xla/xla/python/ifrt/mock.h
index cc93f4f95c9982..13ae0e1483f95d 100644
--- a/third_party/xla/xla/python/ifrt/mock.h
+++ b/third_party/xla/xla/python/ifrt/mock.h
@@ -64,6 +64,8 @@ class MockArray final : public llvm::RTTIExtends<MockArray, Array> {
   MOCK_METHOD(const Sharding&, sharding, (), (const, final));
   MOCK_METHOD(std::shared_ptr<const Sharding>, shared_ptr_sharding, (),
               (const, final));
+  MOCK_METHOD(absl::StatusOr<std::unique_ptr<PjRtLayout>>, layout, (),
+              (const, final));
   MOCK_METHOD(absl::StatusOr<std::vector<tsl::RCReference<Array>>>,
               DisassembleIntoSingleDeviceArrays, (ArrayCopySemantics semantics),
               (final));
@@ -234,10 +236,10 @@ class MockExecutable final
               (const, final));
   MOCK_METHOD(std::optional<std::vector<OpSharding>>, GetOutputShardings, (),
               (const, final));
-  MOCK_METHOD(absl::StatusOr<std::vector<Layout>>, GetParameterLayouts, (),
-              (const, final));
-  MOCK_METHOD(absl::StatusOr<std::vector<Layout>>, GetOutputLayouts, (),
-              (const, final));
+  MOCK_METHOD(absl::StatusOr<std::vector<std::unique_ptr<Layout>>>,
+              GetParameterLayouts, (), (const, final));
+  MOCK_METHOD(absl::StatusOr<std::vector<std::unique_ptr<Layout>>>,
+              GetOutputLayouts, (), (const, final));
   MOCK_METHOD(absl::StatusOr<std::vector<std::shared_ptr<HloModule>>>,
               GetHloModules, (), (const, final));
   MOCK_METHOD(
@@ -264,10 +266,10 @@ class MockLoadedExecutable final
               (const, final));
   MOCK_METHOD(std::optional<std::vector<OpSharding>>, GetOutputShardings, (),
               (const, final));
-  MOCK_METHOD(absl::StatusOr<std::vector<Layout>>, GetParameterLayouts, (),
-              (const, final));
-  MOCK_METHOD(absl::StatusOr<std::vector<Layout>>, GetOutputLayouts, (),
-              (const, final));
+  MOCK_METHOD(absl::StatusOr<std::vector<std::unique_ptr<Layout>>>,
+              GetParameterLayouts, (), (const, final));
+  MOCK_METHOD(absl::StatusOr<std::vector<std::unique_ptr<Layout>>>,
+              GetOutputLayouts, (), (const, final));
   MOCK_METHOD(absl::StatusOr<std::vector<std::vector<absl::string_view>>>,
               GetOutputMemoryKinds, (), (const, final));
   MOCK_METHOD(absl::StatusOr<std::vector<std::shared_ptr<HloModule>>>,
diff --git a/third_party/xla/xla/python/ifrt/support/BUILD b/third_party/xla/xla/python/ifrt/support/BUILD
index f0405ba8ca8783..33907fd34f3d13 100644
--- a/third_party/xla/xla/python/ifrt/support/BUILD
+++ b/third_party/xla/xla/python/ifrt/support/BUILD
@@ -40,6 +40,7 @@ xla_cc_test(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest_main",
+        "@local_tsl//tsl/lib/core:status_test_util",
         "@local_tsl//tsl/platform:status_matchers",
         "@local_tsl//tsl/platform:statusor",
     ],
diff --git a/third_party/xla/xla/python/ifrt/support/sharding_conversions.cc b/third_party/xla/xla/python/ifrt/support/sharding_conversions.cc
index fc315091c8c18e..1c6a2b3f6f5a73 100644
--- a/third_party/xla/xla/python/ifrt/support/sharding_conversions.cc
+++ b/third_party/xla/xla/python/ifrt/support/sharding_conversions.cc
@@ -106,14 +106,21 @@ absl::StatusOr<HloSharding> ToHloSharding(const ShardingParam& sharding_param) {
     cum_size *= dim_shard;
     dims.push_back(dim_shard);
   }
+  // Applies the inverse of the transposes from `ToShardingParam`.
+  llvm::SmallVector<int, 4> permutation;
+  int num_axis = sharding_param.minor_to_major().permutation.size();
+  permutation.reserve(num_axis);
+  for (const int axis_id :
+       llvm::reverse(sharding_param.minor_to_major().permutation)) {
+    permutation.push_back(num_axis - axis_id - 1);
+  }
   if (device_count != cum_size) {
     // Add the replicated dimension.
     dims.push_back(device_count / cum_size);
-    return HloSharding::PartialTile(TileAssignment(
-        dims, reshape_dims, sharding_param.minor_to_major().permutation));
+    return HloSharding::PartialTile(
+        TileAssignment(dims, reshape_dims, permutation));
   } else {
-    return HloSharding::IotaTile(dims, reshape_dims,
-                                 sharding_param.minor_to_major().permutation);
+    return HloSharding::IotaTile(dims, reshape_dims, permutation);
   }
 }
 
@@ -175,8 +182,16 @@ absl::StatusOr<ShardingParam> ToShardingParam(const HloSharding& hlo_sharding,
            llvm::reverse(tile_assignment.iota()->reshape_dims())) {
         minor_to_major.axis_sizes.push_back(reshape_dim);
       }
-      for (int axis_id : tile_assignment.iota()->transpose_perm()) {
-        minor_to_major.permutation.push_back(axis_id);
+      // The devices generated by HloSharding
+      // np.arange(ndevices).reshape(reshape_dims).transpose(transpose_perm)
+      // must be equal to the devices ShardingParam
+      // np.arange(ndevices).reshape(reverse(axis_size)).T.transpose(perm).T
+      // Step 1: Compute transpose(transpose_perm).T.
+      // Step 2: Compute T.transpose(transpose_perm).T.
+      int num_axis = tile_assignment.iota()->transpose_perm().size();
+      for (int axis_id :
+           llvm::reverse(tile_assignment.iota()->transpose_perm())) {
+        minor_to_major.permutation.push_back(num_axis - axis_id - 1);
       }
     }
     return ShardingParam(dim_shards, std::move(minor_to_major));
diff --git a/third_party/xla/xla/python/ifrt/support/sharding_conversions_test.cc b/third_party/xla/xla/python/ifrt/support/sharding_conversions_test.cc
index 4f9cdd2f6ffe7b..d6619f4693d0eb 100644
--- a/third_party/xla/xla/python/ifrt/support/sharding_conversions_test.cc
+++ b/third_party/xla/xla/python/ifrt/support/sharding_conversions_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "xla/python/ifrt/support/sharding_conversions.h"
 
 #include <memory>
+#include <numeric>
 #include <vector>
 
 #include <gmock/gmock.h>
@@ -35,6 +36,7 @@ limitations under the License.
 #include "xla/python/ifrt/sharding_test_util.h"
 #include "xla/shape.h"
 #include "xla/xla_data.pb.h"
+#include "tsl/lib/core/status_test_util.h"
 #include "tsl/platform/status_matchers.h"
 #include "tsl/platform/statusor.h"
 
@@ -44,22 +46,24 @@ namespace support {
 namespace {
 
 using ::tsl::testing::StatusIs;
+using xla::HloSharding;
 
-absl::StatusOr<xla::HloSharding> ToHloShardingViaOpSharding(
+absl::StatusOr<HloSharding> ToHloShardingViaOpSharding(
     const ShardingParam& sharding_param, absl::Span<const int> device_list) {
   TF_ASSIGN_OR_RETURN(xla::OpSharding op_sharding,
                       ToOpSharding(sharding_param, device_list));
-  return xla::HloSharding::FromProto(op_sharding);
+  return HloSharding::FromProto(op_sharding);
 }
 
 TEST(ShardingConversionsTest, Replicated) {
   ShardingParam expected_sharding_param{
       /*dim_shards=*/{1, 1, 1},
       {/*permutation=*/{0, 1}, /*axis_sizes=*/{2, 3}}};
-  TF_ASSERT_OK_AND_ASSIGN(const xla::HloSharding hlo_iota_sharding,
+  TF_EXPECT_OK(expected_sharding_param.verify());
+  TF_ASSERT_OK_AND_ASSIGN(const HloSharding hlo_iota_sharding,
                           ToHloSharding(expected_sharding_param));
   TF_ASSERT_OK_AND_ASSIGN(
-      const xla::HloSharding hlo_sharding,
+      const HloSharding hlo_sharding,
       ToHloShardingViaOpSharding(expected_sharding_param, {0, 1, 2, 3, 4, 5}));
   EXPECT_EQ(hlo_sharding.ToString(), "{replicated}");
   EXPECT_EQ(hlo_sharding, hlo_iota_sharding);
@@ -67,7 +71,7 @@ TEST(ShardingConversionsTest, Replicated) {
                           ToShardingParam(hlo_iota_sharding, 3, 6));
   // We do not compare expected_sharding_param and sharding_param because they
   // haven't been canonicalized (1x1x1 to [0, 1] on 2x3 vs. 1x1x1 to [0] on 6).
-  TF_ASSERT_OK_AND_ASSIGN(const xla::HloSharding actual_hlo_sharding,
+  TF_ASSERT_OK_AND_ASSIGN(const HloSharding actual_hlo_sharding,
                           ToHloSharding(sharding_param));
   EXPECT_EQ(hlo_iota_sharding, actual_hlo_sharding);
 }
@@ -75,10 +79,11 @@ TEST(ShardingConversionsTest, Replicated) {
 TEST(ShardingConversionsTest, SingleDeviceReplicated) {
   ShardingParam expected_sharding_param{
       /*dim_shards=*/{1, 1}, {/*permutation=*/{0}, /*axis_sizes=*/{1}}};
-  TF_ASSERT_OK_AND_ASSIGN(const xla::HloSharding hlo_iota_sharding,
+  TF_EXPECT_OK(expected_sharding_param.verify());
+  TF_ASSERT_OK_AND_ASSIGN(const HloSharding hlo_iota_sharding,
                           ToHloSharding(expected_sharding_param));
   TF_ASSERT_OK_AND_ASSIGN(
-      const xla::HloSharding hlo_sharding,
+      const HloSharding hlo_sharding,
       ToHloShardingViaOpSharding(expected_sharding_param, {0}));
   EXPECT_EQ(hlo_sharding.ToString(), "{replicated}");
   EXPECT_EQ(hlo_sharding, hlo_iota_sharding);
@@ -91,10 +96,11 @@ TEST(ShardingConversionsTest, Permutation) {
   ShardingParam expected_sharding_param{
       /*dim_shards=*/{2, 1, 3},
       {/*permutation=*/{1, 0}, /*axis_sizes=*/{3, 2}}};
-  TF_ASSERT_OK_AND_ASSIGN(const xla::HloSharding hlo_iota_sharding,
+  TF_EXPECT_OK(expected_sharding_param.verify());
+  TF_ASSERT_OK_AND_ASSIGN(const HloSharding hlo_iota_sharding,
                           ToHloSharding(expected_sharding_param));
   TF_ASSERT_OK_AND_ASSIGN(
-      const xla::HloSharding hlo_sharding,
+      const HloSharding hlo_sharding,
       ToHloShardingViaOpSharding(expected_sharding_param, {0, 1, 2, 3, 4, 5}));
   EXPECT_EQ(hlo_sharding.ToString(), "{devices=[2,1,3]0,3,1,4,2,5}");
   EXPECT_EQ(hlo_sharding, hlo_iota_sharding);
@@ -106,10 +112,11 @@ TEST(ShardingConversionsTest, Permutation) {
 TEST(ShardingConversionsTest, Partial) {
   ShardingParam expected_sharding_param{
       /*dim_shards=*/{2, 1}, {/*permutation=*/{0, 1}, /*axis_sizes=*/{2, 3}}};
-  TF_ASSERT_OK_AND_ASSIGN(const xla::HloSharding hlo_iota_sharding,
+  TF_EXPECT_OK(expected_sharding_param.verify());
+  TF_ASSERT_OK_AND_ASSIGN(const HloSharding hlo_iota_sharding,
                           ToHloSharding(expected_sharding_param));
   TF_ASSERT_OK_AND_ASSIGN(
-      const xla::HloSharding hlo_sharding,
+      const HloSharding hlo_sharding,
       ToHloShardingViaOpSharding(expected_sharding_param, {0, 1, 2, 3, 4, 5}));
   EXPECT_EQ(hlo_sharding.ToString(),
             "{devices=[2,1,3]0,1,2,3,4,5 last_tile_dim_replicate}");
@@ -118,7 +125,7 @@ TEST(ShardingConversionsTest, Partial) {
                           ToShardingParam(hlo_iota_sharding, 2, 6));
   // We do not compare expected_sharding_param and sharding_param because they
   // haven't been canonicalized (2x1 to [0, 1] on 2x3 vs. 2x1 to [0] on 6).
-  TF_ASSERT_OK_AND_ASSIGN(const xla::HloSharding actual_hlo_sharding,
+  TF_ASSERT_OK_AND_ASSIGN(const HloSharding actual_hlo_sharding,
                           ToHloSharding(sharding_param));
   EXPECT_EQ(hlo_iota_sharding, actual_hlo_sharding);
 }
@@ -126,10 +133,11 @@ TEST(ShardingConversionsTest, Partial) {
 TEST(ShardingConversionsTest, OneDimToTwoAxes) {
   ShardingParam expected_sharding_param{
       /*dim_shards=*/{4}, {/*permutation=*/{1, 0}, /*axis_sizes=*/{2, 2}}};
-  TF_ASSERT_OK_AND_ASSIGN(const xla::HloSharding hlo_iota_sharding,
+  TF_EXPECT_OK(expected_sharding_param.verify());
+  TF_ASSERT_OK_AND_ASSIGN(const HloSharding hlo_iota_sharding,
                           ToHloSharding(expected_sharding_param));
   TF_ASSERT_OK_AND_ASSIGN(
-      const xla::HloSharding hlo_sharding,
+      const HloSharding hlo_sharding,
       ToHloShardingViaOpSharding(expected_sharding_param, {0, 1, 2, 3}));
   EXPECT_EQ(hlo_sharding.ToString(), "{devices=[4]0,2,1,3}");
   EXPECT_EQ(hlo_sharding, hlo_iota_sharding);
@@ -142,21 +150,110 @@ TEST(ShardingConversionsTest, NonTrivialDeviceAssignment) {
   ShardingParam expected_sharding_param{
       /*dim_shards=*/{2, 1, 3},
       {/*permutation=*/{1, 0}, /*axis_sizes=*/{3, 2}}};
+  TF_EXPECT_OK(expected_sharding_param.verify());
   TF_ASSERT_OK_AND_ASSIGN(
-      const xla::HloSharding hlo_sharding,
+      const HloSharding hlo_sharding,
       ToHloShardingViaOpSharding(expected_sharding_param, {6, 5, 4, 3, 2, 1}));
   EXPECT_EQ(hlo_sharding.ToString(), "{devices=[2,1,3]6,3,5,2,4,1}");
 }
 
+TEST(ShardingConversionsTest, VerifyIncorrectShardings) {
+  ShardingParam different_permutation_and_axis{
+      /*dim_shards=*/{1, 1}, {/*permutation=*/{0, 1}, /*axis_sizes=*/{2}}};
+  EXPECT_FALSE(different_permutation_and_axis.verify().ok());
+  ShardingParam too_many_slices{/*dim_shards=*/{2, 2},
+                                {/*permutation=*/{0}, /*axis_sizes=*/{2}}};
+  EXPECT_FALSE(too_many_slices.verify().ok());
+  ShardingParam incorrect_permutation{
+      /*dim_shards=*/{4, 1},
+      {/*permutation=*/{0, 1, 1}, /*axis_sizes=*/{2, 2, 2}}};
+  EXPECT_FALSE(incorrect_permutation.verify().ok());
+}
+
 TEST(ShardingConversionsTest, ErrorOnDeviceAssignment) {
   ShardingParam sharding_param{/*dim_shards=*/{2, 1, 3},
                                {/*permutation=*/{1, 0}, /*axis_sizes=*/{3, 2}}};
+  TF_EXPECT_OK(sharding_param.verify());
   EXPECT_THAT(
       ToHloShardingViaOpSharding(sharding_param, {6, 5, 4, 3, 2}),
       StatusIs(absl::StatusCode::kOutOfRange,
                ::testing::HasSubstr("Can't map device with logical id 5")));
 }
 
+struct HloShardingTestStruct {
+  HloSharding hlo_sharding;
+  int rank;
+  int num_devices;
+};
+
+using HloShardingToShardingParamTest =
+    ::testing::TestWithParam<HloShardingTestStruct>;
+
+TEST_P(HloShardingToShardingParamTest, HloShardingToShardingParam) {
+  const auto& param = GetParam();
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto sharding_param,
+      ToShardingParam(param.hlo_sharding, param.rank, param.num_devices));
+  EXPECT_TRUE(sharding_param.verify().ok());
+  TF_ASSERT_OK_AND_ASSIGN(auto actual_hlo_sharding,
+                          ToHloSharding(sharding_param));
+  EXPECT_EQ(param.hlo_sharding, actual_hlo_sharding);
+  // Verify that the conversion to OpSharding is also correct.
+  std::vector<int> device_ids(param.num_devices);
+  std::iota(device_ids.begin(), device_ids.end(), 0);
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto hlo_via_op_sharding,
+      ToHloShardingViaOpSharding(sharding_param, device_ids));
+  EXPECT_EQ(param.hlo_sharding, hlo_via_op_sharding);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    HloShardingConversionTests, HloShardingToShardingParamTest,
+    testing::ValuesIn<HloShardingTestStruct>({
+        {HloSharding::IotaTile({4, 2}), 2, 8},
+        {HloSharding::IotaTile({2, 4}, {4, 2}, {1, 0}), 2, 8},
+        {HloSharding::IotaTile({8, 1}), 2, 8},
+        {HloSharding::IotaTile({8, 1}, {4, 2}, {1, 0}), 2, 8},
+        {HloSharding::PartialTile(TileAssignment({4, 1, 2}, {8}, {0})), 2, 8},
+        {HloSharding::PartialTile(TileAssignment({2, 1, 4}, {4, 2}, {1, 0})), 2,
+         8},
+        {HloSharding::PartialTile(TileAssignment({1, 4, 2}, {8}, {0})), 2, 8},
+        {HloSharding::PartialTile(TileAssignment({1, 2, 4}, {4, 2}, {1, 0})), 2,
+         8},
+        {HloSharding::PartialTile(TileAssignment({4, 3, 2}, {2, 3, 4},
+                                                 {2, 1, 0})),
+         2, 24},
+        {HloSharding::PartialTile(TileAssignment({4, 2, 3}, {6, 4}, {1, 0})), 2,
+         24},
+        {HloSharding::PartialTile(TileAssignment({6, 1, 4}, {24}, {0})), 2, 24},
+        {HloSharding::PartialTile(TileAssignment({12, 1, 2}, {2, 12}, {1, 0})),
+         2, 24},
+        {HloSharding::PartialTile(TileAssignment({8, 1, 3}, {6, 4}, {1, 0})), 2,
+         24},
+        {HloSharding::PartialTile(TileAssignment({2, 1, 12}, {24}, {0})), 2,
+         24},
+        {HloSharding::PartialTile(TileAssignment({3, 1, 8}, {2, 3, 4},
+                                                 {1, 0, 2})),
+         2, 24},
+        {HloSharding::PartialTile(TileAssignment({1, 4, 6}, {6, 4}, {1, 0})), 2,
+         24},
+        {HloSharding::PartialTile(TileAssignment({1, 12, 2}, {2, 12}, {1, 0})),
+         2, 24},
+
+        {HloSharding::PartialTile(TileAssignment({3, 2, 1, 4}, {2, 3, 4},
+                                                 {1, 0, 2})),
+         3, 24},
+        {HloSharding::PartialTile(TileAssignment({2, 4, 1, 3}, {2, 3, 4},
+                                                 {0, 2, 1})),
+         3, 24},
+        {HloSharding::PartialTile(TileAssignment({4, 3, 1, 2}, {2, 3, 4},
+                                                 {2, 1, 0})),
+         3, 24},
+        {HloSharding::PartialTile(TileAssignment({12, 1, 1, 2}, {2, 12},
+                                                 {1, 0})),
+         3, 24},
+    }));
+
 class ShardingConversionsEquivalentTest : public test_util::ShardingTest {
  public:
   void AssertSameTiling(const ShardingParam& sharding_param,
@@ -187,8 +284,9 @@ class ShardingConversionsEquivalentTest : public test_util::ShardingTest {
 TEST_P(ShardingConversionsEquivalentTest, ShardingParamFullySharded) {
   ShardingParam sharding_param{/*dim_shards=*/{2, 3},
                                {/*permutation=*/{0, 1}, /*axis_sizes=*/{2, 3}}};
+  TF_EXPECT_OK(sharding_param.verify());
   TF_ASSERT_OK_AND_ASSIGN(
-      const xla::HloSharding hlo_sharding,
+      const HloSharding hlo_sharding,
       ToHloShardingViaOpSharding(sharding_param, {0, 1, 2, 3, 4, 5}));
   AssertSameTiling(sharding_param, hlo_sharding, Shape({6, 6}));
 }
@@ -196,8 +294,9 @@ TEST_P(ShardingConversionsEquivalentTest, ShardingParamFullySharded) {
 TEST_P(ShardingConversionsEquivalentTest, ShardingParamWithPermutation) {
   ShardingParam sharding_param{/*dim_shards=*/{2, 3},
                                {/*permutation=*/{1, 0}, /*axis_sizes=*/{3, 2}}};
+  TF_EXPECT_OK(sharding_param.verify());
   TF_ASSERT_OK_AND_ASSIGN(
-      const xla::HloSharding hlo_sharding,
+      const HloSharding hlo_sharding,
       ToHloShardingViaOpSharding(sharding_param, {0, 1, 2, 3, 4, 5}));
   AssertSameTiling(sharding_param, hlo_sharding, Shape({6, 6}));
 }
@@ -205,8 +304,9 @@ TEST_P(ShardingConversionsEquivalentTest, ShardingParamWithPermutation) {
 TEST_P(ShardingConversionsEquivalentTest, ShardingParamWithReplication) {
   ShardingParam sharding_param{/*dim_shards=*/{2, 1},
                                {/*permutation=*/{0, 1}, /*axis_sizes=*/{2, 3}}};
+  TF_EXPECT_OK(sharding_param.verify());
   TF_ASSERT_OK_AND_ASSIGN(
-      const xla::HloSharding hlo_sharding,
+      const HloSharding hlo_sharding,
       ToHloShardingViaOpSharding(sharding_param, {0, 1, 2, 3, 4, 5}));
   AssertSameTiling(sharding_param, hlo_sharding, Shape({6, 6}));
 }
@@ -215,10 +315,11 @@ TEST_P(ShardingConversionsEquivalentTest, OpShardingReplicated) {
   OpSharding op_sharding;
   op_sharding.set_type(OpSharding::REPLICATED);
   TF_ASSERT_OK_AND_ASSIGN(auto hlo_sharding,
-                          xla::HloSharding::FromProto(op_sharding));
+                          HloSharding::FromProto(op_sharding));
   TF_ASSERT_OK_AND_ASSIGN(auto actual, ToShardingParam(hlo_sharding, 2, 6));
   ShardingParam expected{/*dim_shards=*/{1, 1},
                          {/*permutation=*/{0}, /*axis_sizes=*/{6}}};
+  TF_EXPECT_OK(expected.verify());
   EXPECT_EQ(actual, expected);
 }
 
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/BUILD b/third_party/xla/xla/python/ifrt_proxy/client/BUILD
index 7a6071ad0b2aee..1f8b8fe5df28f4 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/BUILD
+++ b/third_party/xla/xla/python/ifrt_proxy/client/BUILD
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("//xla/python/ifrt_proxy/common:ifrt_proxy.bzl", "default_ifrt_proxy_visibility", "ifrt_proxy_cc_test")
 load("@local_tsl//tsl:tsl.bzl", "if_google")
-load("@local_tsl//tsl:tsl.default.bzl", "tsl_pybind_extension")
+load("//xla/python/ifrt_proxy/common:ifrt_proxy.bzl", "default_ifrt_proxy_visibility", "ifrt_proxy_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -327,6 +326,7 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/pjrt:host_callback",
         "//xla/pjrt:pjrt_executable",
+        "//xla/pjrt:pjrt_layout",
         "//xla/python/ifrt",
         "//xla/python/ifrt_proxy/common:ifrt_service_proto_cc",
         "//xla/python/ifrt_proxy/common:types",
@@ -478,6 +478,7 @@ ifrt_proxy_cc_test(
         ":version",
         "//xla:shape_util",
         "//xla/pjrt:pjrt_common",
+        "//xla/pjrt:pjrt_layout",
         "//xla/python/ifrt",
         "//xla/python/ifrt:mock",
         "//xla/python/ifrt_proxy/common:ifrt_service_proto_cc",
@@ -487,6 +488,7 @@ ifrt_proxy_cc_test(
         "@com_google_googletest//:gtest_main",
         "@llvm-project//llvm:Support",
         "@local_tsl//tsl/concurrency:ref_count",
+        "@local_tsl//tsl/platform:casts",
         "@local_tsl//tsl/platform:protobuf",
         "@local_tsl//tsl/platform:status_matchers",
         "@local_tsl//tsl/platform:statusor",
@@ -494,13 +496,23 @@ ifrt_proxy_cc_test(
     ],
 )
 
-tsl_pybind_extension(
+cc_library(
     name = "py_module",
     srcs = ["py_module.cc"],
+    hdrs = ["py_module.h"],
+    compatible_with = [],
+    copts = [
+        "-fexceptions",
+        "-fno-strict-aliasing",
+    ],
+    features = ["-use_header_modules"],
+    visibility = ["//xla/python:__pkg__"],
     deps = [
         ":grpc_client",
         ":registry",
+        "//third_party/nanobind",
         "//xla/pjrt:status_casters",
+        "//xla/python:nb_class_ptr",
         "//xla/python:py_client",
         "//xla/python/ifrt",
         "@com_google_absl//absl/log",
@@ -512,8 +524,5 @@ tsl_pybind_extension(
         "@com_google_absl//absl/strings:string_view",
         "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:statusor",
-        "@pybind11",
-        "@pybind11_abseil//pybind11_abseil:absl_casters",
-        "@pybind11_protobuf//pybind11_protobuf:native_proto_caster",
     ],
 )
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/array.cc b/third_party/xla/xla/python/ifrt_proxy/client/array.cc
index 58c01c1d062906..05b9147cd7b830 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/array.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/client/array.cc
@@ -88,7 +88,7 @@ Array::MakeArrayFromHostBuffer(
   TF_ASSIGN_OR_RETURN(
       auto response,
       rpc_helper->MakeArrayFromHostBuffer(std::move(req)).Await());
-  const ArrayHandle handle{.handle = response->array_handle()};
+  const ArrayHandle handle{response->array_handle()};
 
   if (on_done_with_host_buffer != nullptr) {
     std::move(on_done_with_host_buffer)();
@@ -185,7 +185,7 @@ Array::AssembleArrayFromSingleDeviceArrays(
   TF_ASSIGN_OR_RETURN(
       std::shared_ptr<AssembleArrayFromSingleDeviceArraysResponse> response,
       rpc_helper->AssembleArrayFromSingleDeviceArrays(std::move(req)).Await());
-  ArrayHandle handle{.handle = response->array_handle()};
+  ArrayHandle handle{response->array_handle()};
 
   return tsl::RCReference<xla::ifrt::Array>(
       tsl::MakeRef<Array>(client, std::move(rpc_helper), arrays[0]->dtype(),
@@ -203,7 +203,7 @@ Array::DisassembleIntoSingleDeviceArrays(ArrayCopySemantics semantics) {
       rpc_helper_->DisassembleIntoSingleDeviceArrays(std::move(req)).Await());
   std::vector<ArrayHandle> handles;
   for (auto& handle : response->single_device_array_handles()) {
-    handles.push_back(ArrayHandle{.handle = handle});
+    handles.push_back(ArrayHandle{handle});
   }
 
   TF_ASSIGN_OR_RETURN(auto shape_and_shardings, sharding_->Disassemble(shape_));
@@ -232,7 +232,7 @@ absl::StatusOr<tsl::RCReference<xla::ifrt::Array>> Array::FullyReplicatedShard(
       std::shared_ptr<FullyReplicatedShardResponse> response,
       rpc_helper_->FullyReplicatedShard(std::move(req)).Await());
 
-  ArrayHandle handle{.handle = response->array_handle()};
+  ArrayHandle handle{response->array_handle()};
 
   // We are making the assumption the Array returned by the server corresponds
   // to the first device. Revisit this when IFRT supports: (1) an inexpensive
@@ -258,7 +258,7 @@ absl::StatusOr<tsl::RCReference<xla::ifrt::Array>> Array::Reshard(
 
   TF_ASSIGN_OR_RETURN(std::shared_ptr<ReshardResponse> response,
                       rpc_helper_->Reshard(std::move(req)).Await());
-  ArrayHandle handle{.handle = response->array_handle()};
+  ArrayHandle handle{response->array_handle()};
 
   return tsl::RCReference<xla::ifrt::Array>(tsl::MakeRef<Array>(
       client_, rpc_helper_, dtype_, shape_, std::move(new_sharding), handle));
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/array.h b/third_party/xla/xla/python/ifrt_proxy/client/array.h
index c17f497b1ce81d..3b5e8d9d5e1149 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/array.h
+++ b/third_party/xla/xla/python/ifrt_proxy/client/array.h
@@ -102,6 +102,10 @@ class Array final : public llvm::RTTIExtends<Array, xla::ifrt::Array> {
   std::shared_ptr<const Sharding> shared_ptr_sharding() const override {
     return sharding_;
   }
+  absl::StatusOr<std::unique_ptr<PjRtLayout>> layout() const override {
+    return absl::UnimplementedError(
+        "Array::layout() not implemented for IFRT proxy");
+  };
 
   absl::StatusOr<std::vector<tsl::RCReference<xla::ifrt::Array>>>
   DisassembleIntoSingleDeviceArrays(ArrayCopySemantics semantics) override;
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/array_test.cc b/third_party/xla/xla/python/ifrt_proxy/client/array_test.cc
index 0d80bba73cd9dc..686f533387bde3 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/array_test.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/client/array_test.cc
@@ -98,8 +98,7 @@ TEST_F(ArrayTest, Destruction) {
 
   MockClient client;
   tsl::MakeRef<Array>(&client, rpc_helper_, DType(DType::Kind::kBF16),
-                      Shape({}), /*sharding=*/nullptr,
-                      ArrayHandle{.handle = 1234});
+                      Shape({}), /*sharding=*/nullptr, ArrayHandle{1234});
 }
 #endif
 
@@ -124,9 +123,9 @@ TEST_F(ArrayTest, FullyReplicatedShard) {
   auto sharding = xla::ifrt::SingleDeviceSharding::Create(
       &mock_device, xla::ifrt::MemoryKind());
 
-  auto array = tsl::MakeRef<Array>(
-      &client, rpc_helper_, DType(DType::Kind::kBF16), Shape({}),
-      std::move(sharding), ArrayHandle{.handle = 1234});
+  auto array =
+      tsl::MakeRef<Array>(&client, rpc_helper_, DType(DType::Kind::kBF16),
+                          Shape({}), std::move(sharding), ArrayHandle{1234});
 
   ASSERT_THAT(array->FullyReplicatedShard(ArrayCopySemantics::kAlwaysCopy),
               IsOk());
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/compiler.cc b/third_party/xla/xla/python/ifrt_proxy/client/compiler.cc
index 9d8c386fd39e74..55132de4cec640 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/compiler.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/client/compiler.cc
@@ -101,10 +101,9 @@ absl::StatusOr<std::unique_ptr<xla::ifrt::LoadedExecutable>> Compiler::Compile(
       response->addressable_device_logical_ids_size());
   for (const auto& logical_device_id :
        response->addressable_device_logical_ids()) {
-    addressable_device_logical_device_ids.push_back({
-        .replica = logical_device_id.replica(),
-        .partition = logical_device_id.partition(),
-    });
+    xla::ifrt::LoadedExecutable::LogicalDeviceIds id{
+        logical_device_id.replica(), logical_device_id.partition()};
+    addressable_device_logical_device_ids.push_back(id);
   }
 
   std::vector<xla::ifrt::Device*> addressable_devices;
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/device.h b/third_party/xla/xla/python/ifrt_proxy/client/device.h
index 6922488d7fa006..6cb461865818d9 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/device.h
+++ b/third_party/xla/xla/python/ifrt_proxy/client/device.h
@@ -38,6 +38,8 @@ namespace xla {
 namespace ifrt {
 namespace proxy {
 
+class Client;
+
 class DeviceDescription final : public xla::PjRtDeviceDescription {
  public:
   DeviceDescription(
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/executable.cc b/third_party/xla/xla/python/ifrt_proxy/client/executable.cc
index 40d540c954e8de..ea110d6ed824c9 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/executable.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/client/executable.cc
@@ -38,6 +38,7 @@
 #include "xla/layout.h"
 #include "xla/pjrt/host_callback.h"
 #include "xla/pjrt/pjrt_executable.h"
+#include "xla/pjrt/pjrt_layout.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/client.h"
 #include "xla/python/ifrt/device.h"
@@ -356,15 +357,30 @@ std::optional<std::vector<OpSharding>> LoadedExecutable::GetOutputShardings()
   return (*info)->output_shardings;
 }
 
-absl::StatusOr<std::vector<Layout>> LoadedExecutable::GetParameterLayouts()
-    const {
+absl::StatusOr<std::vector<std::unique_ptr<Layout>>>
+LoadedExecutable::GetParameterLayouts() const {
   TF_ASSIGN_OR_RETURN(auto info, metadata_future_.Await());
-  return info->parameter_layouts;
+  TF_RETURN_IF_ERROR(info->parameter_layouts.status());
+
+  std::vector<std::unique_ptr<Layout>> result;
+  result.reserve(info->parameter_layouts->size());
+  for (const xla::Layout& layout : *info->parameter_layouts) {
+    result.push_back(std::make_unique<xla::PjRtXlaLayout>(layout));
+  }
+  return result;
 }
 
-absl::StatusOr<std::vector<Layout>> LoadedExecutable::GetOutputLayouts() const {
+absl::StatusOr<std::vector<std::unique_ptr<Layout>>>
+LoadedExecutable::GetOutputLayouts() const {
   TF_ASSIGN_OR_RETURN(auto info, metadata_future_.Await());
-  return info->output_layouts;
+  TF_RETURN_IF_ERROR(info->output_layouts.status());
+
+  std::vector<std::unique_ptr<Layout>> result;
+  result.reserve(info->output_layouts->size());
+  for (const xla::Layout& layout : *info->output_layouts) {
+    result.push_back(std::make_unique<xla::PjRtXlaLayout>(layout));
+  }
+  return result;
 }
 
 absl::StatusOr<std::vector<std::vector<absl::string_view>>>
@@ -427,9 +443,8 @@ LoadedExecutable::Execute(absl::Span<tsl::RCReference<xla::ifrt::Array>> args,
     result.outputs.clear();  // Cleaned up by `~Array()`.
 
     for (; index < response->outputs_size(); ++index) {
-      Array::Destruct(
-          rpc_helper_.get(),
-          ArrayHandle{.handle = response->outputs(index).array_handle()});
+      Array::Destruct(rpc_helper_.get(),
+                      ArrayHandle{response->outputs(index).array_handle()});
     }
   };
   const auto lookup_device = absl::bind_front(&Client::LookupDevice, client());
@@ -440,7 +455,7 @@ LoadedExecutable::Execute(absl::Span<tsl::RCReference<xla::ifrt::Array>> args,
                         FromShardingProto(lookup_device, output.sharding()));
     result.outputs.push_back(tsl::MakeRef<Array>(
         client(), rpc_helper_, dtype, std::move(shape), std::move(sharding),
-        ArrayHandle{.handle = output.array_handle()}));
+        ArrayHandle{output.array_handle()}));
   }
   std::move(cleanup).Cancel();
 
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/executable.h b/third_party/xla/xla/python/ifrt_proxy/client/executable.h
index 3727e87ba18967..d6b12d97211911 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/executable.h
+++ b/third_party/xla/xla/python/ifrt_proxy/client/executable.h
@@ -76,8 +76,10 @@ class LoadedExecutable final
 
   std::optional<std::vector<OpSharding>> GetParameterShardings() const override;
   std::optional<std::vector<OpSharding>> GetOutputShardings() const override;
-  absl::StatusOr<std::vector<Layout>> GetParameterLayouts() const override;
-  absl::StatusOr<std::vector<Layout>> GetOutputLayouts() const override;
+  absl::StatusOr<std::vector<std::unique_ptr<Layout>>> GetParameterLayouts()
+      const override;
+  absl::StatusOr<std::vector<std::unique_ptr<Layout>>> GetOutputLayouts()
+      const override;
   absl::StatusOr<std::vector<std::vector<absl::string_view>>>
   GetOutputMemoryKinds() const override;
   absl::StatusOr<std::vector<std::shared_ptr<HloModule>>> GetHloModules()
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/executable_test.cc b/third_party/xla/xla/python/ifrt_proxy/client/executable_test.cc
index 8d8e71b38c8fe7..04b466d7529062 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/executable_test.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/client/executable_test.cc
@@ -25,6 +25,7 @@
 #include "llvm/Support/Casting.h"
 #include "xla/layout_util.h"
 #include "xla/pjrt/pjrt_common.h"
+#include "xla/pjrt/pjrt_layout.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/device.h"
 #include "xla/python/ifrt/dtype.h"
@@ -44,6 +45,7 @@
 #include "xla/python/ifrt_proxy/common/ifrt_service.pb.h"
 #include "xla/python/ifrt_proxy/common/types.h"
 #include "tsl/concurrency/ref_count.h"
+#include "tsl/platform/casts.h"
 #include "tsl/platform/protobuf.h"  // IWYU pragma: keep
 #include "tsl/platform/status_matchers.h"
 #include "tsl/platform/statusor.h"
@@ -151,13 +153,22 @@ TEST_F(LoadedExecutableTest, Metadata) {
                             tile_assignment_dimensions: [ 0, 1 ])pb"))));
   EXPECT_THAT(executable.GetOutputShardings(),
               Optional(ElementsAre(EquivToProto(R"pb(type: REPLICATED)pb"))));
-  EXPECT_THAT(executable.GetParameterLayouts(),
-              IsOkAndHolds(ElementsAre(
-                  xla::LayoutUtil::MakeDescendingLayout(/*rank=*/1),
-                  xla::LayoutUtil::MakeDescendingLayout(/*rank=*/2))));
-  EXPECT_THAT(executable.GetOutputLayouts(),
-              IsOkAndHolds(ElementsAre(
-                  xla::LayoutUtil::MakeDescendingLayout(/*rank=*/2))));
+  ASSERT_OK_AND_ASSIGN(auto parameter_layouts,
+                       executable.GetParameterLayouts());
+  EXPECT_EQ(parameter_layouts.size(), 2);
+  EXPECT_EQ(
+      tensorflow::down_cast<xla::PjRtXlaLayout*>(parameter_layouts[0].get())
+          ->xla_layout(),
+      xla::LayoutUtil::MakeDescendingLayout(/*rank=*/1));
+  EXPECT_EQ(
+      tensorflow::down_cast<xla::PjRtXlaLayout*>(parameter_layouts[1].get())
+          ->xla_layout(),
+      xla::LayoutUtil::MakeDescendingLayout(/*rank=*/2));
+  ASSERT_OK_AND_ASSIGN(auto output_layouts, executable.GetOutputLayouts());
+  EXPECT_EQ(output_layouts.size(), 1);
+  EXPECT_EQ(tensorflow::down_cast<xla::PjRtXlaLayout*>(output_layouts[0].get())
+                ->xla_layout(),
+            xla::LayoutUtil::MakeDescendingLayout(/*rank=*/2));
   EXPECT_THAT(executable.GetOutputMemoryKinds(),
               IsOkAndHolds(ElementsAre(ElementsAre("foo"))));
 }
@@ -236,8 +247,7 @@ TEST_F(LoadedExecutableTest, Execute) {
   for (const uint64_t handle : {1000, 1001}) {
     args.push_back(tsl::MakeRef<Array>(
         &client, rpc_helper_, DType(DType::kF32), Shape({2, 2}),
-        OpaqueSharding::Create(devices, MemoryKind()),
-        ArrayHandle{.handle = handle}));
+        OpaqueSharding::Create(devices, MemoryKind()), ArrayHandle{handle}));
   }
 
   TF_ASSERT_OK_AND_ASSIGN(
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/memory.h b/third_party/xla/xla/python/ifrt_proxy/client/memory.h
index 1bf8c584cd3dd3..e33c3a1a30ac83 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/memory.h
+++ b/third_party/xla/xla/python/ifrt_proxy/client/memory.h
@@ -31,6 +31,8 @@ namespace xla {
 namespace ifrt {
 namespace proxy {
 
+class Client;
+
 class Memory : public xla::ifrt::Memory {
  public:
   Memory(int id, std::string memory_space_kind, std::string debug_string,
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/py_module.cc b/third_party/xla/xla/python/ifrt_proxy/client/py_module.cc
index 4b407bb438bb71..dca0b74cf1ee72 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/py_module.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/client/py_module.cc
@@ -11,6 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include "xla/python/ifrt_proxy/client/py_module.h"
 
 #include <functional>
 #include <memory>
@@ -25,32 +26,31 @@
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
-#include "pybind11/cast.h"  // from @pybind11
-#include "pybind11/detail/common.h"  // from @pybind11
-#include "pybind11/functional.h"  // from @pybind11  // NOLINT  // IWYU pragma: keep
-#include "pybind11/gil.h"  // from @pybind11
-#include "pybind11/pybind11.h"  // from @pybind11
-#include "pybind11/pytypes.h"  // from @pybind11
-#include "pybind11_abseil/absl_casters.h"  // from @pybind11_abseil  // NOLINT  // IWYU pragma: keep
-#include "pybind11_protobuf/native_proto_caster.h"  // from @pybind11_protobuf
+#include "third_party/nanobind/include/nanobind/nanobind.h"
+#include "third_party/nanobind/include/nanobind/stl/function.h"  // IWYU pragma: keep
+#include "third_party/nanobind/include/nanobind/stl/optional.h"  // IWYU pragma: keep
+#include "third_party/nanobind/include/nanobind/stl/string.h"  // IWYU pragma: keep
 #include "xla/pjrt/status_casters.h"
 #include "xla/python/ifrt/client.h"
 #include "xla/python/ifrt_proxy/client/registry.h"
+#include "xla/python/nb_class_ptr.h"
 #include "xla/python/py_client.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/statusor.h"
 
+namespace nb = ::nanobind;
+
 namespace xla {
 namespace ifrt {
 namespace proxy {
 namespace {
 
 struct PyClientConnectionOptions {
-  std::function<void(absl::Status)> on_disconnect;
-  std::function<void(std::string)> on_connection_update;
+  std::optional<std::function<void(std::string)>> on_disconnect;
+  std::optional<std::function<void(std::string)>> on_connection_update;
 };
 
-absl::StatusOr<std::shared_ptr<xla::PyClient>> GetClient(
+absl::StatusOr<nb_class_ptr<PyClient>> GetClient(
     std::string proxy_server_address,
     const PyClientConnectionOptions& py_options) {
   DCHECK(PyGILState_Check());
@@ -64,16 +64,16 @@ absl::StatusOr<std::shared_ptr<xla::PyClient>> GetClient(
     // or even deadlock. A unique_ptr or `absl::AnyInvocable` is not sufficient
     // because downstream code can make copies. Reference:
     // https://pybind11.readthedocs.io/en/stable/advanced/misc.html#common-sources-of-global-interpreter-lock-errors
-    auto py_on_disconnect = std::make_shared<std::function<void(absl::Status)>>(
-        std::move(py_options.on_disconnect));
+    auto py_on_disconnect = std::make_shared<std::function<void(std::string)>>(
+        std::move(*py_options.on_disconnect));
 
     options.on_disconnect =
         [on_disconnect = std::move(py_on_disconnect)](absl::Status s) mutable {
           LOG(WARNING) << "Connection to server failed, calling supplied "
                        << "`on_disconnect` function: " << s;
           tsl::Env::Default()->SchedClosure([s, on_disconnect]() mutable {
-            pybind11::gil_scoped_acquire gil_acquire;
-            (*on_disconnect)(s);
+            nb::gil_scoped_acquire gil_acquire;
+            (*on_disconnect)(s.ToString());
             on_disconnect = nullptr;
           });
         };
@@ -81,39 +81,41 @@ absl::StatusOr<std::shared_ptr<xla::PyClient>> GetClient(
 
   if (py_options.on_connection_update) {
     auto fn = std::make_shared<std::function<void(std::string)>>(
-        std::move(py_options.on_connection_update));
+        std::move(*py_options.on_connection_update));
     options.on_connection_update = [fn](absl::string_view log_line) -> void {
       tsl::Env::Default()->SchedClosure([fn, str = std::string(log_line)] {
-        pybind11::gil_scoped_acquire gil_acquire;
+        nb::gil_scoped_acquire gil_acquire;
         (*fn)(std::string(str));
       });
     };
   }
 
   {
-    pybind11::gil_scoped_release gil_release;
+    nb::gil_scoped_release gil_release;
     TF_ASSIGN_OR_RETURN(client, CreateClient(proxy_server_address, options));
   }
 
   // Constructing `xla::PyClient` requires GIL as it may dec-ref Python objects.
-  return std::make_shared<xla::PyClient>(std::move(client));
+  return xla::PyClient::Make(std::move(client));
 }
 
 }  // namespace
-}  // namespace proxy
-}  // namespace ifrt
-}  // namespace xla
 
-PYBIND11_MODULE(py_module, m) {
-  pybind11_protobuf::ImportNativeProtoCasters();
+void BuildIfrtProxySubmodule(nb::module_& m) {
+  nb::module_ sub_module = m.def_submodule("ifrt_proxy", "IFRT proxy");
 
-  using ::xla::ifrt::proxy::PyClientConnectionOptions;
-  pybind11::class_<PyClientConnectionOptions>(m, "ClientConnectionOptions")
-      .def(pybind11::init<>())
-      .def_readwrite("on_disconnect", &PyClientConnectionOptions::on_disconnect)
-      .def_readwrite("on_connection_update",
-                     &PyClientConnectionOptions::on_connection_update);
+  nb::class_<PyClientConnectionOptions>(sub_module, "ClientConnectionOptions")
+      .def(nb::init<>())
+      .def_rw("on_disconnect", &PyClientConnectionOptions::on_disconnect,
+              nb::arg().none())
+      .def_rw("on_connection_update",
+              &PyClientConnectionOptions::on_connection_update,
+              nb::arg().none());
 
-  m.def("get_client", xla::ValueOrThrowWrapper(xla::ifrt::proxy::GetClient),
-        pybind11::arg("proxy_server_address"), pybind11::arg("options"));
+  sub_module.def("get_client", xla::ValueOrThrowWrapper(GetClient),
+                 nb::arg("proxy_server_address"), nb::arg("options"));
 }
+
+}  // namespace proxy
+}  // namespace ifrt
+}  // namespace xla
diff --git a/third_party/xla/xla/python/xla.h b/third_party/xla/xla/python/ifrt_proxy/client/py_module.h
similarity index 62%
rename from third_party/xla/xla/python/xla.h
rename to third_party/xla/xla/python/ifrt_proxy/client/py_module.h
index a7de1a11c76cda..3e104aba93ad70 100644
--- a/third_party/xla/xla/python/xla.h
+++ b/third_party/xla/xla/python/ifrt_proxy/client/py_module.h
@@ -1,4 +1,4 @@
-/* Copyright 2023 The OpenXLA Authors.
+/* Copyright 2024 The OpenXLA Authors.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,16 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_PYTHON_XLA_H_
-#define XLA_PYTHON_XLA_H_
+#ifndef XLA_PYTHON_IFRT_PROXY_CLIENT_PY_MODULE_H_
+#define XLA_PYTHON_IFRT_PROXY_CLIENT_PY_MODULE_H_
 
-// placeholder for index annotation headers
-#include "pybind11/pybind11.h"  // from @pybind11
+#include "third_party/nanobind/include/nanobind/nanobind.h"
 
 namespace xla {
+namespace ifrt {
+namespace proxy {
 
-PyObject *InitializeXlaExtension();
+void BuildIfrtProxySubmodule(nanobind::module_& m);
 
+}  // namespace proxy
+}  // namespace ifrt
 }  // namespace xla
 
-#endif  // XLA_PYTHON_XLA_H_
+#endif  // XLA_PYTHON_IFRT_PROXY_CLIENT_PY_MODULE_H_
diff --git a/third_party/xla/xla/python/ifrt_proxy/common/BUILD b/third_party/xla/xla/python/ifrt_proxy/common/BUILD
index 6d8d32b9e0fc5f..fff6b664a95013 100644
--- a/third_party/xla/xla/python/ifrt_proxy/common/BUILD
+++ b/third_party/xla/xla/python/ifrt_proxy/common/BUILD
@@ -12,11 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-load("//xla/python/ifrt_proxy/common:ifrt_proxy.bzl", "default_ifrt_proxy_visibility", "ifrt_proxy_cc_test")
 load("@local_tsl//tsl:tsl.bzl", "if_google")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
 # copybara:uncomment load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 
+load("//xla/python/ifrt_proxy/common:ifrt_proxy.bzl", "default_ifrt_proxy_visibility", "ifrt_proxy_cc_test")
+
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
     default_visibility = default_ifrt_proxy_visibility,
diff --git a/third_party/xla/xla/python/ifrt_proxy/common/types.proto b/third_party/xla/xla/python/ifrt_proxy/common/types.proto
index 7ef48aed10c54b..2de88772abe906 100644
--- a/third_party/xla/xla/python/ifrt_proxy/common/types.proto
+++ b/third_party/xla/xla/python/ifrt_proxy/common/types.proto
@@ -85,7 +85,7 @@ message Variant {
   }
 
   oneof value {
-    string string_value = 1;
+    bytes string_value = 1;
     sfixed64 int64_value = 2;
     Int64List int64_list = 3;
     float float_value = 4;
diff --git a/third_party/xla/xla/python/ifrt_proxy/jax/BUILD b/third_party/xla/xla/python/ifrt_proxy/jax/BUILD
index b86f65e9c3596a..b05846d91e0d21 100644
--- a/third_party/xla/xla/python/ifrt_proxy/jax/BUILD
+++ b/third_party/xla/xla/python/ifrt_proxy/jax/BUILD
@@ -31,8 +31,6 @@ pytype_strict_library(
     # copybara:uncomment_end
     deps = [
         "//xla/python:xla_client",
-        "//xla/python/ifrt_proxy/client:py_module",
-        "@pybind11_abseil//pybind11_abseil:status",
     ],
 )
 
diff --git a/third_party/xla/xla/python/ifrt_proxy/jax/ifrt_proxy_internal.py b/third_party/xla/xla/python/ifrt_proxy/jax/ifrt_proxy_internal.py
index 746575cdd61135..4b46a2ea0317bf 100644
--- a/third_party/xla/xla/python/ifrt_proxy/jax/ifrt_proxy_internal.py
+++ b/third_party/xla/xla/python/ifrt_proxy/jax/ifrt_proxy_internal.py
@@ -12,14 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Library to help create a IFRT proxy client."""
+"""Library to help create a IFRT proxy client.
+
+This library is no longer recommended nor used in OSS; it is used internally
+within google code. TODO(madthanu): Remove library.
+"""
 
 import dataclasses
 from typing import Callable, Optional
 
-from pybind11_abseil import status
 from xla.python import xla_client
-from xla.python.ifrt_proxy.client import py_module
 
 
 @dataclasses.dataclass
@@ -35,7 +37,7 @@ class ConnectionOptions:
       provided as human-readable strings, and an end-user may find them helpful.
   """
 
-  on_disconnect: Optional[Callable[[status.Status], None]] = None
+  on_disconnect: Optional[Callable[[str], None]] = None
   on_connection_update: Optional[Callable[[str], None]] = None
 
 
@@ -47,6 +49,7 @@ def get_client(proxy_server_address: str) -> xla_client.Client:
   """Creates an IFRT Proxy client for the given server address."""
   global _backend_created
   _backend_created = True
+  py_module = xla_client._xla.ifrt_proxy  # pylint: disable=protected-access
   cpp_options = py_module.ClientConnectionOptions()
   cpp_options.on_disconnect = _connection_options.on_disconnect
   cpp_options.on_connection_update = _connection_options.on_connection_update
diff --git a/third_party/xla/xla/python/ifrt_proxy/server/BUILD b/third_party/xla/xla/python/ifrt_proxy/server/BUILD
index 70dfde6ecc0210..3b072a9744b976 100644
--- a/third_party/xla/xla/python/ifrt_proxy/server/BUILD
+++ b/third_party/xla/xla/python/ifrt_proxy/server/BUILD
@@ -152,8 +152,10 @@ cc_library(
         ":host_callback",
         ":version",
         "//xla:shape_util",
+        "//xla:status_macros",
         "//xla:xla_data_proto_cc",
         "//xla/pjrt:pjrt_client",
+        "//xla/pjrt:pjrt_layout",
         "//xla/python/ifrt",
         "//xla/python/ifrt:serdes",
         "//xla/python/ifrt_proxy/common:array_util",
@@ -201,6 +203,7 @@ ifrt_proxy_cc_test(
         "//xla/pjrt:host_callback",
         "//xla/pjrt:pjrt_common",
         "//xla/pjrt:pjrt_device_description",
+        "//xla/pjrt:pjrt_layout",
         "//xla/python/ifrt",
         "//xla/python/ifrt:mock",
         "//xla/python/ifrt:serdes",
diff --git a/third_party/xla/xla/python/ifrt_proxy/server/host_callback.cc b/third_party/xla/xla/python/ifrt_proxy/server/host_callback.cc
index a675ab40c66ef0..43e13700c293bd 100644
--- a/third_party/xla/xla/python/ifrt_proxy/server/host_callback.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/server/host_callback.cc
@@ -154,8 +154,8 @@ absl::Status RemoteLoadedHostCallback::Execute(void** result_ptrs,
         buffers.reserve(args.size());
         for (int i = 0; i < args.size(); ++i) {
           const int64_t size = xla::ShapeUtil::ByteSizeOf(args[i].shape);
-          buffers.push_back(RemoteLoadedHostCallbackQueue::Buffer{
-              .data = ptrs[i], .size = size});
+          buffers.push_back(
+              RemoteLoadedHostCallbackQueue::Buffer{ptrs[i], size});
         }
       };
   to_buffer(host_callback().operands, operand_ptrs, request.operands);
diff --git a/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.cc b/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.cc
index 592bac91e718f5..91ec171deae7ff 100644
--- a/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.cc
@@ -40,6 +40,7 @@
 #include "llvm/Support/Casting.h"
 #include "xla/layout.h"
 #include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/pjrt_layout.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/compiler.h"
 #include "xla/python/ifrt/device.h"
@@ -58,6 +59,7 @@
 #include "xla/python/ifrt_proxy/server/host_callback.h"
 #include "xla/python/ifrt_proxy/server/version.h"
 #include "xla/python/pjrt_ifrt/xla_compiler.h"
+#include "xla/status_macros.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/concurrency/ref_count.h"
 #include "tsl/platform/env.h"
@@ -821,8 +823,15 @@ IfrtBackend::HandleLoadedExecutableMetadataRequest(
         parameter_layouts.ok()) {
       auto* const layouts =
           metadata_resp->mutable_parameter_layouts_list()->mutable_layouts();
-      for (const xla::Layout& layout : *parameter_layouts) {
-        layouts->Add(layout.ToProto());
+      for (const std::unique_ptr<xla::PjRtLayout>& parameter_layout :
+           *parameter_layouts) {
+        // TODO(b/329165105): use PjRtLayout::Serialize instead
+        const xla::PjRtXlaLayout* layout =
+            dynamic_cast<const xla::PjRtXlaLayout*>(parameter_layout.get());
+        TF_RET_CHECK(layout != nullptr)
+            << "IFRT proxy only supports PjRtXlaLayout, got a different "
+               "subclass";
+        layouts->Add(layout->xla_layout().ToProto());
       }
     } else {
       *metadata_resp->mutable_parameter_layouts_error() =
@@ -832,8 +841,15 @@ IfrtBackend::HandleLoadedExecutableMetadataRequest(
         output_layouts.ok()) {
       auto* const layouts =
           metadata_resp->mutable_output_layouts_list()->mutable_layouts();
-      for (const xla::Layout& layout : *output_layouts) {
-        layouts->Add(layout.ToProto());
+      for (const std::unique_ptr<xla::PjRtLayout>& output_layout :
+           *output_layouts) {
+        // TODO(b/329165105): use PjRtLayout::Serialize instead
+        const xla::PjRtXlaLayout* layout =
+            dynamic_cast<const xla::PjRtXlaLayout*>(output_layout.get());
+        TF_RET_CHECK(layout != nullptr)
+            << "IFRT proxy only supports PjRtXlaLayout, got a different "
+               "subclass";
+        layouts->Add(layout->xla_layout().ToProto());
       }
     } else {
       *metadata_resp->mutable_output_layouts_error() =
diff --git a/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend_test.cc b/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend_test.cc
index 9ee2d1b88155fe..1da5d337863233 100644
--- a/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend_test.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend_test.cc
@@ -44,6 +44,7 @@
 #include "xla/pjrt/host_callback.h"
 #include "xla/pjrt/pjrt_common.h"
 #include "xla/pjrt/pjrt_device_description.h"
+#include "xla/pjrt/pjrt_layout.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/compiler.h"
 #include "xla/python/ifrt/device.h"
@@ -890,8 +891,8 @@ TEST_F(IfrtBackendHandlerTest, CompileSuccess) {
       addressable_device_logical_ids;
   std::vector<xla::ifrt::Device*> addressable_devices;
   for (int i = 0; i < 4; ++i) {
-    addressable_device_logical_ids.push_back(
-        {.replica = i / 2, .partition = i % 2});
+    xla::ifrt::LoadedExecutable::LogicalDeviceIds id{i / 2, i % 2};
+    addressable_device_logical_ids.push_back(id);
     addressable_devices.push_back(&devices[i]);
   }
 
@@ -962,15 +963,19 @@ TEST_F(IfrtBackendHandlerTest, LoadedExecutableMetadata) {
     EXPECT_CALL(*executable, GetOutputShardings())
         .WillOnce(Return(std::vector<OpSharding>{op_sharding1}));
 
+    std::vector<std::unique_ptr<Layout>> parameter_layouts;
+    parameter_layouts.push_back(std::make_unique<xla::PjRtXlaLayout>(
+        xla::LayoutUtil::MakeDescendingLayout(/*rank=*/1)));
+    parameter_layouts.push_back(std::make_unique<xla::PjRtXlaLayout>(
+        xla::LayoutUtil::MakeDescendingLayout(/*rank=*/2)));
     EXPECT_CALL(*executable, GetParameterLayouts())
-        .WillOnce(Return(std::vector<xla::Layout>{
-            xla::LayoutUtil::MakeDescendingLayout(/*rank=*/1),
-            xla::LayoutUtil::MakeDescendingLayout(/*rank=*/2),
-        }));
+        .WillOnce(Return(std::move(parameter_layouts)));
+
+    std::vector<std::unique_ptr<Layout>> output_layouts;
+    output_layouts.push_back(std::make_unique<xla::PjRtXlaLayout>(
+        xla::LayoutUtil::MakeDescendingLayout(/*rank=*/2)));
     EXPECT_CALL(*executable, GetOutputLayouts())
-        .WillOnce(Return(std::vector<xla::Layout>{
-            xla::LayoutUtil::MakeDescendingLayout(/*rank=*/2),
-        }));
+        .WillOnce(Return(std::move(output_layouts)));
     EXPECT_CALL(*executable, GetOutputMemoryKinds())
         .WillOnce(Return(std::vector<std::vector<absl::string_view>>{{"foo"}}));
 
diff --git a/third_party/xla/xla/python/jax_jit.cc b/third_party/xla/xla/python/jax_jit.cc
index 726972f0d7fc6d..20ff684cd38ca4 100644
--- a/third_party/xla/xla/python/jax_jit.cc
+++ b/third_party/xla/xla/python/jax_jit.cc
@@ -38,6 +38,7 @@ limitations under the License.
 
 #include "absl/algorithm/container.h"
 #include "absl/base/attributes.h"
+#include "absl/container/inlined_vector.h"
 #include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
@@ -138,13 +139,60 @@ bool FetchMemoriesFlag() {
       *global_state.enable_memories);
 }
 
-std::string CallSignature::DebugString() const {
+std::string ArgumentSignature::DebugString() const {
   auto py_object_formatter = [](std::string* out, const nb::object& o) {
     out->append(nb::cast<std::string_view>(nb::str(o)));
   };
   auto treedef_formatter = [](std::string* out, const xla::PyTreeDef& d) {
     out->append(d.ToString());
   };
+  return absl::StrFormat(
+      "static args (positional + keyword): [%s], "
+      "static arg keyword names: [%s], "
+      "dynamic arg signatures (positional + keyword): [%s]"
+      "dynamic arg shardings: [%s]",
+      absl::StrJoin(static_args, ",", py_object_formatter),
+      absl::StrJoin(static_arg_names, ",", py_object_formatter),
+      absl::StrJoin(dynamic_arg_names, ",", py_object_formatter),
+      absl::StrJoin(dynamic_arg_treedefs, "| ", treedef_formatter));
+}
+
+bool ArgumentSignature::operator==(const ArgumentSignature& other) const {
+  if (dynamic_arg_treedefs != other.dynamic_arg_treedefs) {
+    return false;
+  }
+  auto object_ptr_equality = [](nb::handle a, nb::handle b) {
+    return a.ptr() == b.ptr();
+  };
+  if (!absl::c_equal(dynamic_arg_names, other.dynamic_arg_names,
+                     object_ptr_equality)) {
+    return false;
+  }
+  if (!absl::c_equal(static_arg_names, other.static_arg_names,
+                     object_ptr_equality)) {
+    return false;
+  }
+  return absl::c_equal(
+      static_args, other.static_args,
+      [](const nb::object& a, const nb::object& b) {
+        try {
+          return a.type().ptr() == b.type().ptr() && a.equal(b);
+        } catch (const nb::python_error& e) {
+          throw std::invalid_argument(absl::StrCat(
+              "static arguments should be comparable using __eq__."
+              "The following error was raised when comparing two objects of "
+              "types ",
+              nb::cast<std::string_view>(nb::str(a.type())), " and ",
+              nb::cast<std::string_view>(nb::str(b.type())),
+              ". The error was:\n", e.what()));
+        }
+      });
+}
+
+std::string CallSignature::DebugString() const {
+  auto py_object_formatter = [](std::string* out, const nb::object& o) {
+    out->append(nb::cast<std::string_view>(nb::str(o)));
+  };
   auto signature_formatter = [](std::string* out,
                                 const xla::PyArgSignature& s) {
     out->append(s.DebugString());
@@ -153,25 +201,20 @@ std::string CallSignature::DebugString() const {
     out->append(o ? "true" : "false");
   };
   return absl::StrFormat(
-      "static args (positional + keyword): %s\nstatic arg keyword names: %s\n"
+      "arg signature: %s\n"
       "dynamic arg signatures (positional + keyword): %s\n"
       "dynamic arg shardings: %s\n"
       "committed args: %s\n"
-      "dynamic arg keyword names: %s\n"
-      "dynamic arg treedefs: %s\n"
       "device: %s\n"
       "default_device: %s\n"
       "jax_enable_x64: %d\n"
       "jax_enable_memories: %d\n"
       "global_extra_jit_context: %s\n"
       "thread_local_extra_jit_context: %s\n",
-      absl::StrJoin(static_args, ",", py_object_formatter),
-      absl::StrJoin(static_arg_names, ",", py_object_formatter),
+      arg_signature.DebugString(),
       absl::StrJoin(dynamic_arg_signatures, ", ", signature_formatter),
       absl::StrJoin(dynamic_arg_shardings, ", ", py_object_formatter),
       absl::StrJoin(committed_args, ",", bool_formatter),
-      absl::StrJoin(dynamic_arg_names, ",", py_object_formatter),
-      absl::StrJoin(dynamic_arg_treedefs, "| ", treedef_formatter),  // new line
       device != nullptr ? device->DebugString() : "nullptr",
       OptionalDebugString(default_device), jax_enable_x64, jax_enable_memories,
       OptionalDebugString(global_extra_jit_context),
@@ -179,14 +222,7 @@ std::string CallSignature::DebugString() const {
 }
 
 bool CallSignature::operator==(const CallSignature& other) const {
-  if (dynamic_arg_treedefs != other.dynamic_arg_treedefs) {
-    return false;
-  }
-  auto object_ptr_equality = [](nb::handle a, nb::handle b) {
-    return a.ptr() == b.ptr();
-  };
-  if (!absl::c_equal(dynamic_arg_names, other.dynamic_arg_names,
-                     object_ptr_equality)) {
+  if (arg_signature != other.arg_signature) {
     return false;
   }
   if (dynamic_arg_signatures != other.dynamic_arg_signatures) {
@@ -201,10 +237,6 @@ bool CallSignature::operator==(const CallSignature& other) const {
   if (jax_enable_memories != other.jax_enable_memories) {
     return false;
   }
-  if (!absl::c_equal(static_arg_names, other.static_arg_names,
-                     object_ptr_equality)) {
-    return false;
-  }
   if (committed_args != other.committed_args) {
     return false;
   }
@@ -212,21 +244,6 @@ bool CallSignature::operator==(const CallSignature& other) const {
       // `==` on py:objects is the Python `is`. We need equal.
       absl::c_equal(dynamic_arg_shardings, other.dynamic_arg_shardings,
                     ShardingEqual) &&
-      absl::c_equal(
-          static_args, other.static_args,
-          [this](const nb::object& a, const nb::object& b) {
-            try {
-              return a.type().ptr() == b.type().ptr() && a.equal(b);
-            } catch (const nb::python_error& e) {
-              throw std::invalid_argument(absl::StrCat(
-                  "static arguments should be comparable using __eq__."
-                  "The following error was raised during a call to '",
-                  function_name, "' when comparing two objects of types ",
-                  nb::cast<std::string_view>(nb::str(a.type())), " and ",
-                  nb::cast<std::string_view>(nb::str(b.type())),
-                  ". The error was:\n", e.what()));
-            }
-          }) &&
       (global_extra_jit_context.has_value() ==
        other.global_extra_jit_context.has_value()) &&
       (!global_extra_jit_context.has_value() ||
@@ -243,41 +260,37 @@ bool CallSignature::operator==(const CallSignature& other) const {
 
 // Filter out static arguments, flatten and concatenate other arguments (i.e.
 // dynamic positional and keyword arguments), filling `arguments` in place.
-absl::Status ParseArguments(absl::Span<PyObject* const> positional_args,
-                            absl::Span<PyObject* const> keyword_args,
-                            nb::handle kwnames,
-                            absl::Span<int const> static_argnums,
-                            absl::Span<nb::str const> static_argnames,
-                            xla::PyTreeRegistry* pytree_registry,
-                            ParsedArgumentsAsBuffers& arguments) {
+absl::Status ParseArguments(
+    absl::Span<PyObject* const> positional_args,
+    absl::Span<PyObject* const> keyword_args, nb::handle kwnames,
+    absl::Span<int const> static_argnums,
+    absl::Span<nb::str const> static_argnames,
+    xla::PyTreeRegistry* pytree_registry, ArgumentSignature& signature,
+    absl::InlinedVector<nanobind::object, 2>& flat_dynamic_args) {
   tsl::profiler::TraceMe traceme("ParseArguments");
 
-  arguments.flat_dynamic_args.reserve(positional_args.size() +
-                                      keyword_args.size());
+  flat_dynamic_args.reserve(positional_args.size() + keyword_args.size());
   if (static_argnums.empty()) {
-    arguments.signature.dynamic_arg_treedefs.reserve(positional_args.size());
+    signature.dynamic_arg_treedefs.reserve(positional_args.size());
 
     // Positional arguments.
     for (int i = 0; i < positional_args.size(); ++i) {
-      arguments.signature.dynamic_arg_treedefs.emplace_back(pytree_registry);
-      xla::PyTreeDef& pytree_def =
-          arguments.signature.dynamic_arg_treedefs.back();
-      pytree_def.Flatten(nb::handle(positional_args[i]),
-                         arguments.flat_dynamic_args);
+      signature.dynamic_arg_treedefs.emplace_back(pytree_registry);
+      xla::PyTreeDef& pytree_def = signature.dynamic_arg_treedefs.back();
+      pytree_def.Flatten(nb::handle(positional_args[i]), flat_dynamic_args);
     }
   } else {
-    arguments.signature.dynamic_arg_treedefs.reserve(positional_args.size());
+    signature.dynamic_arg_treedefs.reserve(positional_args.size());
 
     // Positional arguments.
     for (int i = 0; i < positional_args.size(); ++i) {
       if (std::find(static_argnums.begin(), static_argnums.end(), i) ==
           static_argnums.end()) {
-        arguments.signature.dynamic_arg_treedefs.emplace_back(pytree_registry);
-        xla::PyTreeDef& pytree_def =
-            arguments.signature.dynamic_arg_treedefs.back();
-        pytree_def.Flatten(positional_args[i], arguments.flat_dynamic_args);
+        signature.dynamic_arg_treedefs.emplace_back(pytree_registry);
+        xla::PyTreeDef& pytree_def = signature.dynamic_arg_treedefs.back();
+        pytree_def.Flatten(positional_args[i], flat_dynamic_args);
       } else {
-        arguments.signature.static_args.emplace_back(
+        signature.static_args.emplace_back(
             nb::borrow<nb::object>(positional_args[i]));
       }
     }
@@ -313,21 +326,20 @@ absl::Status ParseArguments(absl::Span<PyObject* const> positional_args,
       return false;
     };
 
-    arguments.signature.dynamic_arg_names.reserve(keyword_args.size());
+    signature.dynamic_arg_names.reserve(keyword_args.size());
     for (int i = 0; i < keyword_args.size(); ++i) {
       if (kwarg_is_static(kwargs[i].first)) {
-        arguments.signature.static_arg_names.push_back(
+        signature.static_arg_names.push_back(
             nb::steal<nb::object>(kwargs[i].first));
-        arguments.signature.static_args.push_back(
+        signature.static_args.push_back(
             nb::borrow<nb::object>(kwargs[i].second));
       } else {
-        arguments.signature.dynamic_arg_names.push_back(
+        signature.dynamic_arg_names.push_back(
             nb::steal<nb::object>(kwargs[i].first));
-        arguments.signature.dynamic_arg_treedefs.emplace_back(pytree_registry);
-        xla::PyTreeDef& pytree_def =
-            arguments.signature.dynamic_arg_treedefs.back();
+        signature.dynamic_arg_treedefs.emplace_back(pytree_registry);
+        xla::PyTreeDef& pytree_def = signature.dynamic_arg_treedefs.back();
         pytree_def.Flatten(nb::handle(kwargs[i].second.ptr()),
-                           arguments.flat_dynamic_args);
+                           flat_dynamic_args);
       }
     }
   }
diff --git a/third_party/xla/xla/python/jax_jit.h b/third_party/xla/xla/python/jax_jit.h
index b16b111c4a5b70..c076ef9cbeabe1 100644
--- a/third_party/xla/xla/python/jax_jit.h
+++ b/third_party/xla/xla/python/jax_jit.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <Python.h>
 
+#include <cstddef>
 #include <optional>
 #include <stdexcept>
 #include <string>
@@ -32,13 +33,11 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "third_party/nanobind/include/nanobind/nanobind.h"
 #include "xla/pjrt/pjrt_client.h"
-#include "xla/python/ifrt/array.h"
 #include "xla/python/nb_helpers.h"
 #include "xla/python/py_values.h"
 #include "xla/python/python_ref_manager.h"
 #include "xla/python/pytree.h"
 #include "xla/python/sharding.h"
-#include "tsl/concurrency/ref_count.h"
 #include "tsl/platform/logging.h"
 
 namespace jax {
@@ -93,6 +92,88 @@ bool GetEnableX64();
 std::optional<nanobind::object> GetDefaultDevice();
 std::optional<nanobind::callable> GetPostHook();
 
+// An ArgumentSignature describes the static arguments to a function call, and
+// how the dynamic arguments are related to the arguments. Together with the
+// values of the dynamic arguments, this fully describes the arguments.
+struct ArgumentSignature {
+  // A PyTreeDef for each dynamic argument, positional arguments first
+  // followed by keyword arguments. Keyword arguments are in the order given
+  // by dynamic_arg_names.
+  absl::InlinedVector<xla::PyTreeDef, 2> dynamic_arg_treedefs;
+
+  // Dynamic keyword argument names. Interned, and sorted by the keyword
+  // name. Interned values are safe to compare by pointer.
+  std::vector<nanobind::object> dynamic_arg_names;
+
+  // Static arguments. Contains the positional arguments sorted in argument
+  // order, followed by static keyword arguments in the order given by
+  // `static_arg_names`.
+  std::vector<nanobind::object> static_args;
+
+  // Static keyword argument names. Interned, and sorted by keyword name.
+  std::vector<nanobind::object> static_arg_names;
+
+  bool operator==(const ArgumentSignature& other) const;
+  bool operator!=(const ArgumentSignature& other) const {
+    return !(*this == other);
+  }
+
+  std::string DebugString() const;
+};
+
+template <typename H>
+H AbslHashValue(H h, const ArgumentSignature& s) {
+  h = H::combine(std::move(h), s.dynamic_arg_treedefs,
+                 s.dynamic_arg_names.size(), s.static_args.size(),
+                 s.static_arg_names.size());
+
+  for (const auto& name : s.dynamic_arg_names) {
+    h = H::combine(std::move(h), name.ptr());
+  }
+  for (size_t i = 0; i < s.static_args.size(); ++i) {
+    const auto& static_arg = s.static_args[i];
+    Py_hash_t hash;
+    try {
+      hash = xla::nb_hash(static_arg);
+    } catch (const nanobind::python_error& e) {
+      if (!e.matches(PyExc_TypeError)) throw;
+      throw std::invalid_argument(absl::StrCat(
+          "Non-hashable static arguments are not supported. An error occurred "
+          "while trying to hash an object of type ",
+          nanobind::cast<std::string_view>(nanobind::str(static_arg.type())),
+          ", ", nanobind::cast<std::string_view>(nanobind::str(static_arg)),
+          ". The error was:\n", e.what(), "\n"));
+    }
+    h = H::combine(std::move(h), hash);
+  }
+  for (const auto& name : s.static_arg_names) {
+    h = H::combine(std::move(h), name.ptr());
+  }
+  return h;
+}
+
+// Filter out static arguments, flatten and concatenate other arguments (i.e.
+// dynamic positional and keyword arguments), filling `arguments` in place.
+// Args:
+// positional_args: positional arguments
+// keyword_args: the values of the keyword arguments
+// kwnames: either None or a tuple containing the keyword argument names
+// static_argnums: the indices of the static arguments in the positional
+//   arguments
+// static_argnames: the names of the static arguments
+// pytree_registry: the registry to use to convert the arguments to pytrees
+// arguments: output; describes the static arguments and the identities of the
+//  dynamic arguments.
+// flat_dynamic_args: output; the concatenation of the dynamic positional
+//  arguments and sorted keyword arguments.
+absl::Status ParseArguments(
+    absl::Span<PyObject* const> positional_args,
+    absl::Span<PyObject* const> keyword_args, nanobind::handle kwnames,
+    absl::Span<int const> static_argnums,
+    absl::Span<nanobind::str const> static_argnames,
+    xla::PyTreeRegistry* pytree_registry, ArgumentSignature& signature,
+    absl::InlinedVector<nanobind::object, 2>& flat_dynamic_args);
+
 // The signature of Python jitted function call, partitioned into:
 // - dynamic positional arguments (i.e. positional args which are not static)
 // - static positional arguments (i.e. the args associated to static_argnums)
@@ -106,13 +187,8 @@ struct CallSignature {
   // Not part of the signature, but we need it for error messages.
   std::string_view function_name;
 
-  // A PyTreeDef for each dynamic argument, positional arguments first
-  // followed by keyword arguments. Keyword arguments are in the order given
-  // by dynamic_arg_names.
-  absl::InlinedVector<xla::PyTreeDef, 2> dynamic_arg_treedefs;
-  // Dynamic keyword argument names. Interned, and sorted by the keyword
-  // name.
-  std::vector<nanobind::object> dynamic_arg_names;
+  ArgumentSignature arg_signature;
+
   // Shape and dtype for both the dynamic positional arguments and the keyword
   // arguments (sorted by keyword name).
   absl::InlinedVector<xla::PyArgSignature, 2> dynamic_arg_signatures;
@@ -121,13 +197,6 @@ struct CallSignature {
   // jax.Array enabled.
   std::vector<nanobind::object> dynamic_arg_shardings;
 
-  // Static arguments. Contains the positional arguments sorted in argument
-  // order, followed by static keyword arguments in the order given by
-  // `static_arg_names`.
-  std::vector<nanobind::object> static_args;
-  // Static keyword argument names. Interned, and sorted by keyword name.
-  std::vector<nanobind::object> static_arg_names;
-
   absl::InlinedVector<bool, 2> committed_args;
 
   // For JIT, we need this in the key because computation follows the data, so
@@ -155,8 +224,7 @@ struct CallSignature {
 
 template <typename H>
 H AbslHashValue(H h, const CallSignature& s) {
-  h = H::combine(std::move(h), s.dynamic_arg_treedefs,
-                 s.dynamic_arg_signatures);
+  h = H::combine(std::move(h), s.arg_signature, s.dynamic_arg_signatures);
 
   DCHECK(s.dynamic_arg_shardings.empty() ||
          s.dynamic_arg_shardings.size() == s.dynamic_arg_signatures.size());
@@ -169,35 +237,7 @@ H AbslHashValue(H h, const CallSignature& s) {
     h = H::combine(std::move(h), ShardingHash(sharding.ptr()));
   }
 
-  for (const auto& name : s.dynamic_arg_names) {
-    h = H::combine(std::move(h), name.ptr());
-  }
-
-  h = H::combine(std::move(h), s.committed_args);
-
-  h = H::combine(std::move(h), s.dynamic_arg_names.size());
-  for (const auto& static_arg : s.static_args) {
-    ssize_t hash;
-    try {
-      hash = xla::nb_hash(static_arg);
-    } catch (const nanobind::python_error& e) {
-      if (!e.matches(PyExc_TypeError)) throw;
-      throw std::invalid_argument(absl::StrCat(
-          "Non-hashable static arguments are not supported. An error occurred "
-          "during a call to '",
-          s.function_name, "' while trying to hash an object of type ",
-          nanobind::cast<std::string_view>(nanobind::str(static_arg.type())),
-          ", ", nanobind::cast<std::string_view>(nanobind::str(static_arg)),
-          ". The error was:\n", e.what(), "\n"));
-    }
-    h = H::combine(std::move(h), hash);
-  }
-  h = H::combine(std::move(h), s.static_args.size());
-  for (const auto& name : s.static_arg_names) {
-    h = H::combine(std::move(h), name.ptr());
-  }
-  h = H::combine(std::move(h), s.static_arg_names.size());
-  h = H::combine(std::move(h), s.device, s.jax_enable_x64);
+  h = H::combine(std::move(h), s.committed_args, s.device, s.jax_enable_x64);
 
   // We do not hash the extra_jit_context fields since calling Python hash
   // functions is expensive (~300ns) and we don't expect a large number of
@@ -205,33 +245,6 @@ H AbslHashValue(H h, const CallSignature& s) {
   return h;
 }
 
-// The resulting information of the parsing and conversion of the arguments.
-struct ParsedArgumentsAsBuffers {
-  // The call signature will be filled during 2 steps:
-  // - `ParseArguments` will fill the static arguments and the pytree
-  //    structures
-  // - the shapes and dtypes are filled later, by `ParseAndTransferArguments`.
-  CallSignature signature;
-  // The concatenation of the dynamic positional arguments and the sorted
-  // keyword arguments.
-  absl::InlinedVector<nanobind::object, 2> flat_dynamic_args;
-  std::vector<nanobind::object> keep_alive_objects;
-
-  xla::ifrt::Client* ifrt_client;
-  // The following is only valid if the parsing succeeds.
-  std::vector<tsl::RCReference<xla::ifrt::Array>> ifrt_arg_arrays;
-};
-
-// Filter out static arguments, flatten and concatenate other arguments (i.e.
-// dynamic positional and keyword arguments), filling `arguments` in place.
-absl::Status ParseArguments(absl::Span<PyObject* const> positional_args,
-                            absl::Span<PyObject* const> keyword_args,
-                            nanobind::handle kwnames,
-                            absl::Span<int const> static_argnums,
-                            absl::Span<nanobind::str const> static_argnames,
-                            xla::PyTreeRegistry* pytree_registry,
-                            ParsedArgumentsAsBuffers& arguments);
-
 // The function to call in `xla.cc` to add the bindings for this module.
 void BuildJaxjitSubmodule(nanobind::module_& m);
 
diff --git a/third_party/xla/xla/python/nb_absl_flat_hash_set.h b/third_party/xla/xla/python/nb_absl_flat_hash_set.h
new file mode 100644
index 00000000000000..9927f2356765e9
--- /dev/null
+++ b/third_party/xla/xla/python/nb_absl_flat_hash_set.h
@@ -0,0 +1,33 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_PYTHON_NB_ABSL_FLAT_HASH_SET_H_
+#define XLA_PYTHON_NB_ABSL_FLAT_HASH_SET_H_
+
+#include "absl/container/flat_hash_set.h"
+#include "third_party/nanobind/include/nanobind/nanobind.h"
+#include "third_party/nanobind/include/nanobind/stl/detail/nb_set.h"
+
+namespace nanobind {
+namespace detail {
+
+template <typename Key, typename Hash, typename Eq, typename Alloc>
+struct type_caster<absl::flat_hash_set<Key, Hash, Eq, Alloc>>
+    : set_caster<absl::flat_hash_set<Key, Hash, Eq, Alloc>, Key> {};
+
+}  // namespace detail
+}  // namespace nanobind
+
+#endif  // XLA_PYTHON_NB_ABSL_FLAT_HASH_SET_H_
diff --git a/third_party/xla/xla/python/nb_class_ptr.h b/third_party/xla/xla/python/nb_class_ptr.h
index 68283d4ce875fe..c2be986c896934 100644
--- a/third_party/xla/xla/python/nb_class_ptr.h
+++ b/third_party/xla/xla/python/nb_class_ptr.h
@@ -39,7 +39,7 @@ class nb_class_ptr : public nanobind::object {
 
   T* operator->() const { return nanobind::inst_ptr<T>(ptr()); }
   T& operator*() const { return *nanobind::inst_ptr<T>(ptr()); }
-  T* get() const { return nanobind::inst_ptr<T>(ptr()); }
+  T* get() const { return ptr() ? nanobind::inst_ptr<T>(ptr()) : nullptr; }
 };
 
 // This function is analogous to std::make_unique<T>(...), but instead it
diff --git a/third_party/xla/xla/python/nb_helpers.h b/third_party/xla/xla/python/nb_helpers.h
index 9db7f89248c8e0..b0bbab98439c73 100644
--- a/third_party/xla/xla/python/nb_helpers.h
+++ b/third_party/xla/xla/python/nb_helpers.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <Python.h>
 
+#include "absl/strings/str_format.h"
 #include "third_party/nanobind/include/nanobind/nanobind.h"
 
 namespace xla {
@@ -30,6 +31,17 @@ Py_hash_t nb_hash(nanobind::handle o);
 // TODO(phawkins): consider upstreaming this to nanobind.
 bool nb_isinstance(nanobind::handle inst, nanobind::handle cls);
 
+// Issues a Python deprecation warning. Throws a C++ exception if issuing the
+// Python warning causes a Python exception to be raised.
+template <typename... Args>
+void PythonDeprecationWarning(const absl::FormatSpec<Args...>& format,
+                              const Args&... args) {
+  if (PyErr_WarnEx(PyExc_DeprecationWarning,
+                   absl::StrFormat(format, args...).c_str(), 1) < 0) {
+    throw nanobind::python_error();
+  }
+}
+
 // Variant of NB_TYPE_CASTER that doesn't define from_cpp()
 #define NB_TYPE_CASTER_FROM_PYTHON_ONLY(Value_, descr)   \
   using Value = Value_;                                  \
diff --git a/third_party/xla/xla/python/nb_numpy.cc b/third_party/xla/xla/python/nb_numpy.cc
index 5dde00d75a35e5..2210f67569a283 100644
--- a/third_party/xla/xla/python/nb_numpy.cc
+++ b/third_party/xla/xla/python/nb_numpy.cc
@@ -23,7 +23,7 @@ limitations under the License.
 
 #include "absl/types/span.h"
 #include "third_party/nanobind/include/nanobind/nanobind.h"
-#include "tsl/python/lib/core/numpy.h"
+#include "xla/tsl/python/lib/core/numpy.h"
 
 namespace nb = nanobind;
 
@@ -81,6 +81,19 @@ nb_numpy_ndarray::nb_numpy_ndarray(
   m_ptr = array.release().ptr();
 }
 
+/*static*/ nb_numpy_ndarray nb_numpy_ndarray::from_any(nanobind::handle h,
+                                                       int extra_requirements) {
+  nb::handle out = PyArray_FromAny(
+      h.ptr(), /*dtype=*/nullptr, /*min_depth=*/0,
+      /*max_depth=*/0,
+      /*requirements=*/NPY_ARRAY_ENSUREARRAY | extra_requirements,
+      /*context=*/nullptr);
+  if (PyErr_Occurred()) {
+    throw nb::python_error();
+  }
+  return nb::steal<nb_numpy_ndarray>(out);
+}
+
 /*static*/ nb_numpy_ndarray nb_numpy_ndarray::ensure(nanobind::handle h,
                                                      int extra_requirements) {
   nb::handle out = PyArray_FromAny(
diff --git a/third_party/xla/xla/python/nb_numpy.h b/third_party/xla/xla/python/nb_numpy.h
index 9b1cd6de603b4c..23dc85f7ce900c 100644
--- a/third_party/xla/xla/python/nb_numpy.h
+++ b/third_party/xla/xla/python/nb_numpy.h
@@ -30,7 +30,7 @@ limitations under the License.
 
 #include "absl/types/span.h"
 #include "third_party/nanobind/include/nanobind/nanobind.h"
-#include "tsl/python/lib/core/numpy.h"
+#include "xla/tsl/python/lib/core/numpy.h"
 
 #if NPY_ABI_VERSION < 0x02000000
 #define PyDataType_ELSIZE(descr) ((descr)->elsize)
@@ -85,6 +85,10 @@ class nb_numpy_ndarray : public nanobind::object {
   static nb_numpy_ndarray ensure(nanobind::handle h,
                                  int extra_requirements = 0);
 
+  // Constructs a numpy ndarray via the PyArray_From Any API. This throws an
+  // error if an exception occurs.
+  static nb_numpy_ndarray from_any(nanobind::handle h, int extra_requirements);
+
   nb_dtype dtype() const;
   npy_intp ndim() const;
   const npy_intp* shape() const;
diff --git a/third_party/xla/xla/python/ops.cc b/third_party/xla/xla/python/ops.cc
index 9f4298fa47bda4..87f7242a5940d1 100644
--- a/third_party/xla/xla/python/ops.cc
+++ b/third_party/xla/xla/python/ops.cc
@@ -312,7 +312,9 @@ void BuildOpsSubmodule(nb::module_& m) {
       .value("API_VERSION_STATUS_RETURNING",
              CustomCallApiVersion::API_VERSION_STATUS_RETURNING)
       .value("API_VERSION_STATUS_RETURNING_UNIFIED",
-             CustomCallApiVersion::API_VERSION_STATUS_RETURNING_UNIFIED);
+             CustomCallApiVersion::API_VERSION_STATUS_RETURNING_UNIFIED)
+      .value("API_VERSION_TYPED_FFI",
+             CustomCallApiVersion::API_VERSION_TYPED_FFI);
 
   ops.def("AfterAll", &AfterAll, nb::arg("builder"), nb::arg("tokens"));
   ops.def("AllGather", &AllGather, nb::arg("operand"),
diff --git a/third_party/xla/xla/python/outfeed_receiver_py.cc b/third_party/xla/xla/python/outfeed_receiver_py.cc
index 5563e53720b0a0..e802550840277c 100644
--- a/third_party/xla/xla/python/outfeed_receiver_py.cc
+++ b/third_party/xla/xla/python/outfeed_receiver_py.cc
@@ -27,17 +27,16 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/synchronization/mutex.h"
 #include "third_party/nanobind/include/nanobind/nanobind.h"
+#include "third_party/nanobind/include/nanobind/stl/function.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/optional.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/unique_ptr.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/vector.h"  // IWYU pragma: keep
-#include "pybind11/cast.h"  // from @pybind11
-#include "pybind11/functional.h"  // from @pybind11
-#include "pybind11/pybind11.h"  // from @pybind11
 #include "xla/client/executable_build_options.h"
 #include "xla/client/xla_builder.h"
 #include "xla/literal.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/status_casters.h"
+#include "xla/python/nb_class_ptr.h"
 #include "xla/python/outfeed_receiver.h"
 #include "xla/python/py_client.h"
 #include "xla/python/types.h"
@@ -46,7 +45,6 @@ limitations under the License.
 namespace xla {
 
 namespace nb = nanobind;
-namespace py = pybind11;
 
 namespace {
 
@@ -56,11 +54,11 @@ class OutfeedReceiverForPython {
  public:
   // A callback to Python takes: consumer id, received literal.
   using CallbackToPython =
-      std::function<void(ClientAndPtr<PjRtDevice>, uint32_t, py::object)>;
+      std::function<void(nb_class_ptr<PyDevice>, uint32_t, nb::object)>;
 
   OutfeedReceiverForPython(
       CallbackToPython callback_python,
-      std::vector<std::shared_ptr<PyClient>> clients,
+      std::vector<nb_class_ptr<PyClient>> clients,
       ssize_t max_callback_queue_size_bytes,
       const std::optional<ExecutableBuildOptions>& executable_build_options)
       : callback_python_(std::move(callback_python)),
@@ -72,7 +70,7 @@ class OutfeedReceiverForPython {
         };
     std::vector<PjRtClient*> client_ptrs(clients_.size());
     absl::c_transform(clients_, client_ptrs.begin(),
-                      [](const std::shared_ptr<PyClient>& client) {
+                      [](const nb_class_ptr<PyClient>& client) {
                         return client->pjrt_client();
                       });
     outfeed_receiver_ = std::make_unique<OutfeedReceiver>(
@@ -118,24 +116,24 @@ class OutfeedReceiverForPython {
     }
     // We expect the number of clients to be small, so an O(n) search is fine.
     auto it = absl::c_find_if(
-        clients_, [device](const std::shared_ptr<PyClient>& client) {
+        clients_, [device](const nb_class_ptr<PyClient>& client) {
           return client->pjrt_client() == device->client();
         });
     CHECK(it != clients_.end());
+    PyClient* client = it->get();
     nb::gil_scoped_acquire gil_acquire;  // Need GIL also for LiteralToPython
     nb::object literal_python = LiteralToPython(std::move(literal)).value();
     // The callback_ should handle all exceptions in user-code. If we get
     // an exception here, it is a bug in the callback and we should stop.
-    callback_python_(
-        WrapWithClient<PjRtDevice>(*it, device), consumer_id,
-        py::reinterpret_steal<py::object>(literal_python.release().ptr()));
+    callback_python_(client->GetPyDevice(device), consumer_id,
+                     std::move(literal_python));
   }
 
  private:
   CallbackToPython callback_python_;
   absl::Mutex mu_;
   bool outfeed_receiver_shutting_down_ ABSL_GUARDED_BY(mu_) = false;
-  std::vector<std::shared_ptr<PyClient>> clients_;
+  std::vector<nb_class_ptr<PyClient>> clients_;
   std::unique_ptr<OutfeedReceiver> outfeed_receiver_;
 };
 
@@ -146,17 +144,13 @@ void BuildOutfeedReceiverSubmodule(nb::module_& m) {
       m.def_submodule("outfeed_receiver", "Outfeed receiver");
   outfeed_receiver.def(
       "start",
-      [](nb::object callback_to_python, nb::object clients,
-         ssize_t max_callback_queue_size_bytes,
+      [](OutfeedReceiverForPython::CallbackToPython callback_to_python,
+         nb::sequence clients, ssize_t max_callback_queue_size_bytes,
          std::optional<ExecutableBuildOptions> executable_build_options)
           -> std::unique_ptr<OutfeedReceiverForPython> {
-        // TODO(phawkins): after the nanobind transition, pass
-        // clients as a std::vector<std::shared_ptr<PyClient>>.
         auto server = std::make_unique<OutfeedReceiverForPython>(
-            py::cast<OutfeedReceiverForPython::CallbackToPython>(
-                py::handle(callback_to_python.ptr())),
-            py::cast<std::vector<std::shared_ptr<PyClient>>>(
-                py::handle(clients.ptr())),
+            std::move(callback_to_python),
+            SequenceToVector<nb_class_ptr<PyClient>>(clients),
             max_callback_queue_size_bytes, executable_build_options);
         nb::gil_scoped_release gil_release;
         server->Start();
diff --git a/third_party/xla/xla/python/pjit.cc b/third_party/xla/xla/python/pjit.cc
index 09f285fd10794a..c562eaa2eadd65 100644
--- a/third_party/xla/xla/python/pjit.cc
+++ b/third_party/xla/xla/python/pjit.cc
@@ -228,7 +228,7 @@ class PjitFunction {
       return PjitFunction::AsPjitFunctionUnchecked(*this);
     }
   };
-  // Alias as ::object; outside the scope above we won't confuse pybind11's
+  // Alias as ::object; outside the scope above we won't confuse nanobind's
   // macros.
   using object = pyobject;
 
@@ -275,10 +275,11 @@ class PjitFunction {
   }
 
  private:
-  absl::Status UpdateArgsSignature(ParsedArgumentsAsBuffers& arguments);
+  absl::Status ComputeCallSignature(
+      absl::Span<nb::object const> flat_dynamic_args,
+      CallSignature& call_signature);
 
   void PopulateCacheEntry(PjitCacheEntry& cache_entry,
-                          const CallSignature& signature,
                           const nb::tuple& out_and_fastpath_data);
 
   std::string function_name_;
@@ -352,30 +353,32 @@ PjitFunction::~PjitFunction() { GetGlobalPjitFunctionStore().Erase(this); }
 void CallShardArgFallback(
     nb::handle arg, nb::handle sharding, const nb::callable& fallback,
     std::vector<tsl::RCReference<xla::ifrt::Array>>& num_args_arrays,
-    ParsedArgumentsAsBuffers& arguments) {
+    std::vector<nb::object>& keep_alive_objects) {
   tsl::profiler::TraceMe traceme("cpp_pjit_shard_arg_fallback");
   auto py_array_or_bufs = fallback(arg, sharding);
   auto py_array = nb::cast<xla::PyArray>(py_array_or_bufs);
   num_args_arrays.push_back(tsl::FormRef(py_array.ifrt_array()));
-  arguments.keep_alive_objects.push_back(std::move(py_array_or_bufs));
+  keep_alive_objects.push_back(std::move(py_array_or_bufs));
 }
 
 // Prepares the input PjRtBuffers from the python arguments. This is equivalent
 // to shard_args() in pxla.py but for only a few supported cases.
 absl::StatusOr<std::vector<tsl::RCReference<xla::ifrt::Array>>>
 PrepareIfrtInputs(const xla::PyLoadedExecutable& executable,
-                  ParsedArgumentsAsBuffers& arguments,
-                  const std::vector<bool>& kept_args,
+                  absl::Span<nb::object const> flat_dynamic_args,
+                  bool enable_x64, const std::vector<bool>& kept_args,
                   const std::vector<nb::object>& in_shardings,
-                  const nb::callable& shard_arg_fallback) {
-  const auto& addressable_devices = executable.AddressableDevices();
-  int num_args = arguments.flat_dynamic_args.size();
+                  const nb::callable& shard_arg_fallback,
+                  std::vector<nb::object>& keep_alive_objects) {
+  const auto& addressable_devices =
+      executable.ifrt_loaded_executable()->addressable_devices();
+  int num_args = flat_dynamic_args.size();
 
   std::vector<tsl::RCReference<xla::ifrt::Array>> num_args_arrays;
   num_args_arrays.reserve(num_args);
 
   xla::DevicePutOptions options;
-  options.squash_64bit_types = !arguments.signature.jax_enable_x64;
+  options.squash_64bit_types = !enable_x64;
   options.allow_zero_copy = true;
   xla::PjRtDevice* data_device = nullptr;
   if (executable.ifrt_loaded_executable()->num_devices() == 1) {
@@ -389,7 +392,7 @@ PrepareIfrtInputs(const xla::PyLoadedExecutable& executable,
     int dce_index = dce_i;
     ++dce_i;
 
-    const nb::object& arg = arguments.flat_dynamic_args[i];
+    const nb::object& arg = flat_dynamic_args[i];
 
     auto transfer_guard_formatter = [] { return std::string(""); };
 
@@ -404,13 +407,13 @@ PrepareIfrtInputs(const xla::PyLoadedExecutable& executable,
 
         num_args_arrays.push_back(std::move(on_device.ifrt_array));
         if (on_device.owning_pybuffer) {
-          arguments.keep_alive_objects.push_back(
-              std::move(on_device.owning_pybuffer));
+          keep_alive_objects.push_back(std::move(on_device.owning_pybuffer));
         }
         continue;
       } else {
         CallShardArgFallback(arg.ptr(), in_shardings[dce_index],
-                             shard_arg_fallback, num_args_arrays, arguments);
+                             shard_arg_fallback, num_args_arrays,
+                             keep_alive_objects);
         continue;
       }
     }
@@ -427,26 +430,28 @@ PrepareIfrtInputs(const xla::PyLoadedExecutable& executable,
 
     if (sharding.type().ptr() == jax::PmapSharding::type().ptr()) {
       CallShardArgFallback(arg.ptr(), in_shardings[dce_index],
-                           shard_arg_fallback, num_args_arrays, arguments);
+                           shard_arg_fallback, num_args_arrays,
+                           keep_alive_objects);
       continue;
     }
 
     if (py_array.num_shards() != addressable_devices.size()) {
       CallShardArgFallback(arg.ptr(), in_shardings[dce_index],
-                           shard_arg_fallback, num_args_arrays, arguments);
+                           shard_arg_fallback, num_args_arrays,
+                           keep_alive_objects);
       continue;
     }
 
     xla::ifrt::Array* ifrt_array = py_array.ifrt_array();
     // PyArray inputs should have already been checked in
     // `xla::PyArgSignatureOfValue()` called by
-    // `PjitFunction::UpdateArgsSignature()`.
+    // `PjitFunction::ComputeCallSignature()`.
     DCHECK(ifrt_array != nullptr) << "PyArray has been unexpectedly deleted.";
 
-    if (sharding_num_devices == 1 && ifrt_array->sharding().devices().front() !=
-                                         addressable_devices[0].get()) {
+    if (sharding_num_devices == 1 &&
+        ifrt_array->sharding().devices().front() != addressable_devices[0]) {
       xla::ifrt::DeviceList::Devices ifrt_devices;
-      ifrt_devices.push_back(addressable_devices[0].get());
+      ifrt_devices.push_back(addressable_devices[0]);
       auto sharding = xla::ifrt::OpaqueSharding::Create(
           xla::ifrt::DeviceList(std::move(ifrt_devices)),
           ifrt_array->sharding().memory_kind());
@@ -459,7 +464,7 @@ PrepareIfrtInputs(const xla::PyLoadedExecutable& executable,
       num_args_arrays.push_back(tsl::FormRef(ifrt_array));
     }
 
-    arguments.keep_alive_objects.push_back(arg);
+    keep_alive_objects.push_back(arg);
   }
 
   return num_args_arrays;
@@ -470,7 +475,6 @@ absl::StatusOr<nb::object> PjitFunction::Call(nb::handle callable,
                                               size_t nargs, PyObject* kwnames) {
   tsl::profiler::TraceMe traceme(
       [&] { return absl::StrCat("PjitFunction(", function_name_, ")"); });
-  ParsedArgumentsAsBuffers arguments;
 
   // Make sure we trigger a garbage collection on JIT function calls. Otherwise
   // code like
@@ -515,9 +519,13 @@ absl::StatusOr<nb::object> PjitFunction::Call(nb::handle callable,
   absl::Span<PyObject* const> positional_args(args, num_positional_args);
   absl::Span<PyObject* const> keyword_args(args + num_positional_args,
                                            num_keyword_args);
-  auto status =
-      ParseArguments(positional_args, keyword_args, kwnames, static_argnums_,
-                     static_argnames_, pytree_registry_.get(), arguments);
+
+  CallSignature call_signature;
+  std::vector<nb::object> keep_alive_objects;
+  absl::InlinedVector<nb::object, 2> flat_dynamic_args;
+  auto status = ParseArguments(
+      positional_args, keyword_args, kwnames, static_argnums_, static_argnames_,
+      pytree_registry_.get(), call_signature.arg_signature, flat_dynamic_args);
   if (!status.ok()) {
     VLOG(2) << "ParseArguments failed: " << status;
     return fallback_to_cache_miss();
@@ -527,7 +535,7 @@ absl::StatusOr<nb::object> PjitFunction::Call(nb::handle callable,
   // committed PyArray inputs. For other cases, e.g. Tracers or ShapedArray, it
   // will fallback to python. For jit, numpy arrays and scalars are also
   // allowed, which we will check later.
-  for (const auto& arg : arguments.flat_dynamic_args) {
+  for (const auto& arg : flat_dynamic_args) {
     if (arg.type().ptr() != xla::PyArray::type().ptr()) {
       continue;
     }
@@ -551,17 +559,17 @@ absl::StatusOr<nb::object> PjitFunction::Call(nb::handle callable,
     }
   }
 
-  status = UpdateArgsSignature(arguments);
+  status = ComputeCallSignature(flat_dynamic_args, call_signature);
   if (!status.ok()) {
-    VLOG(2) << "UpdateArgsSignature failed: " << status;
+    VLOG(2) << "ComputeCallSignature failed: " << status;
     return fallback_to_cache_miss();
   }
 
-  VLOG(2) << "CallSignature:\n" << arguments.signature.DebugString();
+  VLOG(2) << "CallSignature:\n" << call_signature.DebugString();
   bool inserted = false;
   std::shared_ptr<PjitCacheEntry> cache_entry =
       executables_->GetOrCreateIfAbsent(
-          arguments.signature, [this, &inserted](const CallSignature& unused) {
+          call_signature, [this, &inserted](const CallSignature& unused) {
             inserted = true;
             return std::make_shared<PjitCacheEntry>(pytree_registry_.get());
           });
@@ -572,7 +580,7 @@ absl::StatusOr<nb::object> PjitFunction::Call(nb::handle callable,
     if (inserted) {
       nb::object out_and_fastpath_data;
       nb::tuple out_tuple;
-      VLOG(2) << "Cache miss for " << arguments.signature.DebugString();
+      VLOG(2) << "Cache miss for " << call_signature.DebugString();
       try {
         // Calls Python and may release the GIL. May also throw if
         // compilation/tracing fails.
@@ -582,7 +590,7 @@ absl::StatusOr<nb::object> PjitFunction::Call(nb::handle callable,
         }
         out_tuple = nb::cast<nb::tuple>(out_and_fastpath_data);
 
-        PopulateCacheEntry(*cache_entry, arguments.signature, out_tuple);
+        PopulateCacheEntry(*cache_entry, out_tuple);
       } catch (const std::exception& e) {
         VLOG(2) << "cache miss fail: " << e.what();
         cache_entry->fall_back_to_python = true;
@@ -598,7 +606,7 @@ absl::StatusOr<nb::object> PjitFunction::Call(nb::handle callable,
     } else {
       if (cache_entry->thread_id == std::this_thread::get_id()) {
         auto error_string = absl::StrCat("Recursively calling jit: ",
-                                         arguments.signature.DebugString());
+                                         call_signature.DebugString());
         PyErr_SetString(PyExc_RecursionError, error_string.c_str());
         throw nb::python_error();
       }
@@ -616,8 +624,9 @@ absl::StatusOr<nb::object> PjitFunction::Call(nb::handle callable,
 
   // A vector of [num_inputs].
   auto num_args_arrays = PrepareIfrtInputs(
-      *cache_entry->executable, arguments, cache_entry->kept_var_bitvec,
-      cache_entry->in_shardings, shard_arg_fallback_);
+      *cache_entry->executable, flat_dynamic_args,
+      call_signature.jax_enable_x64, cache_entry->kept_var_bitvec,
+      cache_entry->in_shardings, shard_arg_fallback_, keep_alive_objects);
 
   if (!num_args_arrays.ok()) {
     VLOG(2) << "Failed to prepare IFRT inputs: " << num_args_arrays.status();
@@ -683,49 +692,48 @@ absl::StatusOr<nb::object> PjitFunction::Call(nb::handle callable,
   return out;
 }
 
-absl::Status PjitFunction::UpdateArgsSignature(
-    ParsedArgumentsAsBuffers& arguments) {
-  arguments.signature.function_name = function_name_;
+absl::Status PjitFunction::ComputeCallSignature(
+    absl::Span<nb::object const> flat_dynamic_args, CallSignature& signature) {
+  signature.function_name = function_name_;
 
   // Get dynamic argument signatures.
   JitState& global_state = jax::GlobalJitState();
   JitState& tls = jax::ThreadLocalJitState();
   bool jax_enable_x64 = GetEnableX64();
 
-  arguments.signature.default_device = GetDefaultDevice();
-  arguments.signature.jax_enable_x64 = jax_enable_x64;
-  arguments.signature.jax_enable_memories = GetEnableMemories();
+  signature.default_device = GetDefaultDevice();
+  signature.jax_enable_x64 = jax_enable_x64;
+  signature.jax_enable_memories = GetEnableMemories();
 
-  auto& dynamic_arg_signatures = arguments.signature.dynamic_arg_signatures;
-  dynamic_arg_signatures.reserve(arguments.flat_dynamic_args.size());
-  auto& dynamic_arg_shardings = arguments.signature.dynamic_arg_shardings;
-  dynamic_arg_shardings.reserve(arguments.flat_dynamic_args.size());
+  auto& dynamic_arg_signatures = signature.dynamic_arg_signatures;
+  dynamic_arg_signatures.reserve(flat_dynamic_args.size());
+  auto& dynamic_arg_shardings = signature.dynamic_arg_shardings;
+  dynamic_arg_shardings.reserve(flat_dynamic_args.size());
 
-  for (nb::handle arg : arguments.flat_dynamic_args) {
-    TF_ASSIGN_OR_RETURN(auto signature,
+  for (nb::handle arg : flat_dynamic_args) {
+    TF_ASSIGN_OR_RETURN(auto arg_signature,
                         xla::PyArgSignatureOfValue(arg, jax_enable_x64));
-    arguments.signature.dynamic_arg_signatures.push_back(std::move(signature));
+    signature.dynamic_arg_signatures.push_back(std::move(arg_signature));
 
     // It should be already checked previously in the entry point of
     // PjitFunction::Call().
     if (arg.type().ptr() == xla::PyArray::type().ptr()) {
       auto py_array = nb::borrow<xla::PyArray>(arg);
-      arguments.signature.dynamic_arg_shardings.push_back(py_array.sharding());
-      arguments.signature.committed_args.push_back(py_array.committed());
+      signature.dynamic_arg_shardings.push_back(py_array.sharding());
+      signature.committed_args.push_back(py_array.committed());
     } else {
-      arguments.signature.dynamic_arg_shardings.push_back(nb::none());
-      arguments.signature.committed_args.push_back(false);
+      signature.dynamic_arg_shardings.push_back(nb::none());
+      signature.committed_args.push_back(false);
     }
   }
 
-  arguments.signature.thread_local_extra_jit_context = tls.extra_jit_context;
-  arguments.signature.global_extra_jit_context = global_state.extra_jit_context;
+  signature.thread_local_extra_jit_context = tls.extra_jit_context;
+  signature.global_extra_jit_context = global_state.extra_jit_context;
 
   return absl::OkStatus();
 }
 
 void PjitFunction::PopulateCacheEntry(PjitCacheEntry& cache_entry,
-                                      const CallSignature& signature,
                                       const nb::tuple& out_and_fastpath_data) {
   DCHECK_EQ(out_and_fastpath_data.size(), 2);
 
@@ -799,8 +807,10 @@ void PjitFunction::ClearPythonReferences() {
 
 struct PjitFunctionObject {
   PyObject_HEAD;
+#if PY_VERSION_HEX < 0x030C0000
   PyObject* dict;      // Dictionary for __dict__
   PyObject* weakrefs;  // Weak references; for use by the Python interpreter.
+#endif                 // PY_VERSION_HEX < 0x030C0000
   vectorcallfunc vectorcall;
   PjitFunction fun;
 };
@@ -858,8 +868,10 @@ PyObject* PjitFunction_tp_new(PyTypeObject* subtype, PyObject* args,
   PjitFunctionObject* self =
       reinterpret_cast<PjitFunctionObject*>(subtype->tp_alloc(subtype, 0));
   if (!self) return nullptr;
+#if PY_VERSION_HEX < 0x030C0000
   self->dict = nullptr;
   self->weakrefs = nullptr;
+#endif  // PY_VERSION_HEX < 0x030C0000
   self->vectorcall = PjitFunction_tp_vectorcall;
   return reinterpret_cast<PyObject*>(self);
 }
@@ -868,10 +880,12 @@ void PjitFunction_tp_dealloc(PyObject* self) {
   PyObject_GC_UnTrack(self);
   PyTypeObject* tp = Py_TYPE(self);
   PjitFunctionObject* o = reinterpret_cast<PjitFunctionObject*>(self);
-  if (o->weakrefs) {
-    PyObject_ClearWeakRefs(self);
-  }
+  PyObject_ClearWeakRefs(self);
+#if PY_VERSION_HEX < 0x030C0000
   Py_CLEAR(o->dict);
+#else
+  _PyObject_ClearManagedDict(self);
+#endif  // PY_VERSION_HEX < 0x030C0000
   o->fun.~PjitFunction();
   tp->tp_free(self);
   Py_DECREF(tp);
@@ -882,11 +896,13 @@ int PjitFunction_tp_traverse(PyObject* self, visitproc visit, void* arg) {
   // pytree_registry_ attribute of PjitFunction could in principle also have
   // python references to visit
   PjitFunctionObject* o = reinterpret_cast<PjitFunctionObject*>(self);
-#if PY_VERSION_HEX >= 0x03090000
   // https://docs.python.org/3/c-api/typeobj.html#c.PyTypeObject.tp_traverse
   Py_VISIT(Py_TYPE(self));
-#endif
+#if PY_VERSION_HEX < 0x030C0000
   Py_VISIT(o->dict);
+#else
+  _PyObject_VisitManagedDict(self, visit, arg);
+#endif  // PY_VERSION_HEX < 0x030C0000
   Py_VISIT(o->fun.cache_miss().ptr());
   Py_VISIT(o->fun.shard_arg_fallback().ptr());
   if (o->fun.fun()) {
@@ -897,7 +913,11 @@ int PjitFunction_tp_traverse(PyObject* self, visitproc visit, void* arg) {
 
 int PjitFunction_tp_clear(PyObject* self) {
   PjitFunctionObject* o = reinterpret_cast<PjitFunctionObject*>(self);
+#if PY_VERSION_HEX < 0x030C0000
   Py_CLEAR(o->dict);
+#else
+  _PyObject_ClearManagedDict(self);
+#endif  // PY_VERSION_HEX < 0x030C0000
   o->fun.ClearPythonReferences();
   return 0;
 }
@@ -914,35 +934,11 @@ PyObject* PjitFunction_tp_descr_get(PyObject* self, PyObject* obj,
   return PyMethod_New(self, obj);
 }
 
-// Support d = instance.__dict__.
-PyObject* PjitFunction_get_dict(PyObject* self, void*) {
-  PjitFunctionObject* o = reinterpret_cast<PjitFunctionObject*>(self);
-  if (!o->dict) {
-    o->dict = PyDict_New();
-  }
-  Py_XINCREF(o->dict);
-  return o->dict;
-}
-
-int PjitFunction_set_dict(PyObject* self, PyObject* new_dict, void*) {
-  PjitFunctionObject* o = reinterpret_cast<PjitFunctionObject*>(self);
-  if (!PyDict_Check(new_dict)) {
-    PyErr_Format(PyExc_TypeError,
-                 "__dict__ must be set to a dictionary, not a '%s'",
-                 Py_TYPE(new_dict)->tp_name);
-    return -1;
-  }
-  Py_INCREF(new_dict);
-  Py_CLEAR(o->dict);
-  o->dict = new_dict;
-  return 0;
-}
-
 static PyGetSetDef PjitFunction_tp_getset[] = {
     // Having a __dict__ seems necessary to allow !functool.wraps to override
     // __doc__.
-    {const_cast<char*>("__dict__"), PjitFunction_get_dict,
-     PjitFunction_set_dict, nullptr, nullptr},
+    {const_cast<char*>("__dict__"), PyObject_GenericGetDict,
+     PyObject_GenericSetDict, nullptr, nullptr},
     {nullptr, nullptr, nullptr, nullptr, nullptr}};
 
 PyObject* PjitFunction_tp_repr(PyObject* self) {
@@ -999,6 +995,34 @@ nb::object MakePjitFunction(
 // PjitFunction. Increment these if changing them.
 const int kPjitFunctionPickleVersion = 1;
 
+PyMemberDef PjitFunction_members[] = {
+    {"__vectorcalloffset__", T_PYSSIZET,
+     static_cast<Py_ssize_t>(offsetof(PjitFunctionObject, vectorcall)),
+     READONLY, nullptr},
+#if PY_VERSION_HEX < 0x030C0000
+    {"__dictoffset__", T_PYSSIZET,
+     static_cast<Py_ssize_t>(offsetof(PjitFunctionObject, dict)), READONLY,
+     nullptr},
+    {"__weaklistoffset__", T_PYSSIZET,
+     static_cast<Py_ssize_t>(offsetof(PjitFunctionObject, weakrefs)), READONLY,
+     nullptr},
+#endif  // PY_VERSION_HEX < 0x030C0000
+    {nullptr, 0, 0, 0, nullptr},
+};
+
+PyType_Slot PjitFunction_slots[] = {
+    {Py_tp_new, reinterpret_cast<void*>(PjitFunction_tp_new)},
+    {Py_tp_dealloc, reinterpret_cast<void*>(PjitFunction_tp_dealloc)},
+    {Py_tp_traverse, reinterpret_cast<void*>(PjitFunction_tp_traverse)},
+    {Py_tp_clear, reinterpret_cast<void*>(PjitFunction_tp_clear)},
+    {Py_tp_getset, reinterpret_cast<void*>(PjitFunction_tp_getset)},
+    {Py_tp_descr_get, reinterpret_cast<void*>(PjitFunction_tp_descr_get)},
+    {Py_tp_call, reinterpret_cast<void*>(PyVectorcall_Call)},
+    {Py_tp_repr, reinterpret_cast<void*>(PjitFunction_tp_repr)},
+    {Py_tp_members, reinterpret_cast<void*>(PjitFunction_members)},
+    {0, nullptr},
+};
+
 }  // namespace
 
 void BuildPjitSubmodule(nb::module_& m) {
@@ -1032,44 +1056,30 @@ void BuildPjitSubmodule(nb::module_& m) {
 
   // We need to use heap-allocated type objects because we want to add
   // additional methods dynamically.
-  nb::object cfun;
-  {
-    nb::str name = nb::str("PjitFunction");
-    nb::str qualname = nb::str("PjitFunction");
-    PyHeapTypeObject* heap_type = reinterpret_cast<PyHeapTypeObject*>(
-        PyType_Type.tp_alloc(&PyType_Type, 0));
-    // Caution: we must not call any functions that might invoke the GC until
-    // PyType_Ready() is called. Otherwise the GC might see a half-constructed
-    // type object.
-    CHECK(heap_type) << "Unable to create heap type object";
-    heap_type->ht_name = name.release().ptr();
-    heap_type->ht_qualname = qualname.release().ptr();
-    PyTypeObject* type = &heap_type->ht_type;
-    type->tp_name = "PjitFunction";
-    type->tp_basicsize = sizeof(PjitFunctionObject);
-    type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HEAPTYPE |
-                     Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_HAVE_VECTORCALL;
-    type->tp_new = PjitFunction_tp_new;
-    type->tp_dealloc = PjitFunction_tp_dealloc;
-    type->tp_dictoffset = offsetof(PjitFunctionObject, dict);
-    type->tp_traverse = PjitFunction_tp_traverse;
-    type->tp_clear = PjitFunction_tp_clear;
-    type->tp_weaklistoffset = offsetof(PjitFunctionObject, weakrefs);
-    type->tp_getset = PjitFunction_tp_getset;
-    type->tp_descr_get = PjitFunction_tp_descr_get;
-    type->tp_call = PyVectorcall_Call;
-    type->tp_vectorcall_offset = offsetof(PjitFunctionObject, vectorcall);
-    type->tp_repr = PjitFunction_tp_repr;
-    CHECK_EQ(PyType_Ready(type), 0);
-    PjitFunction_Type = reinterpret_cast<PyObject*>(type);
-    cfun = nb::borrow<nb::object>(PjitFunction_Type);
+  std::string name =
+      absl::StrCat(nb::cast<std::string>(m.attr("__name__")), ".PjitFunction");
+  PyType_Spec PjitFunction_spec = {
+      /*.name=*/name.c_str(),
+      /*.basicsize=*/static_cast<int>(sizeof(PjitFunctionObject)),
+      /*.itemsize=*/0,
+#if PY_VERSION_HEX < 0x030C0000
+      /*.flags=*/Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
+          Py_TPFLAGS_HAVE_VECTORCALL,
+#else   // PY_VERSION_HEX < 0x030C0000
+      /*.flags=*/Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
+          Py_TPFLAGS_HAVE_VECTORCALL | Py_TPFLAGS_MANAGED_DICT |
+          Py_TPFLAGS_MANAGED_WEAKREF,
+#endif  // PY_VERSION_HEX < 0x030C0000
+      /*.slots=*/PjitFunction_slots,
+  };
+  PjitFunction_Type = PyType_FromSpec(&PjitFunction_spec);
+  if (!PjitFunction_Type) {
+    throw nb::python_error();
   }
-  nb::object cfun_type = nb::borrow<nb::object>(PjitFunction_Type);
+  nb::object cfun = nb::borrow<nb::object>(PjitFunction_Type);
 
   // Add PjitFunction to the xla_extension module so it can be pickled.
-  m.attr("PjitFunction") = cfun_type;
-  cfun.attr("__module__") = m.attr("__name__");
-
+  m.attr("PjitFunction") = cfun;
   cfun.attr("__getstate__") = nb::cpp_function(
       [](const PjitFunction::object& self) {
         PjitFunction* fn = self.func();
diff --git a/third_party/xla/xla/python/pjrt_ifrt/BUILD b/third_party/xla/xla/python/pjrt_ifrt/BUILD
index c635ded4633be5..a0e3a4297a8c15 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/BUILD
+++ b/third_party/xla/xla/python/pjrt_ifrt/BUILD
@@ -1,7 +1,7 @@
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load("@local_tsl//tsl:tsl.default.bzl", "get_compatible_with_portable")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package_group(
     name = "friends",
@@ -231,6 +231,7 @@ cc_library(
         "//xla/pjrt:pjrt_client",
         "//xla/pjrt:pjrt_executable",
         "//xla/pjrt:pjrt_future",
+        "//xla/pjrt:pjrt_layout",
         "//xla/pjrt:utils",
         "//xla/python/ifrt",
         "//xla/service:hlo_proto_cc",
diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc b/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc
index fb84eabe13ece7..1f676a1bfe423a 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "xla/python/pjrt_ifrt/pjrt_array.h"
 
+#include <cstdint>
 #include <memory>
 #include <optional>
 #include <string>
@@ -24,24 +25,29 @@ limitations under the License.
 
 #include "absl/log/check.h"
 #include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
+#include "absl/types/span.h"
 #include "xla/literal.h"
 #include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/pjrt_layout.h"
 #include "xla/pjrt/utils.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/device.h"
 #include "xla/python/ifrt/dtype.h"
+#include "xla/python/ifrt/future.h"
 #include "xla/python/ifrt/memory.h"
 #include "xla/python/ifrt/shape.h"
 #include "xla/python/ifrt/sharding.h"
 #include "xla/python/pjrt_ifrt/pjrt_client.h"
 #include "xla/shape_util.h"
 #include "xla/status.h"
-#include "xla/statusor.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
+#include "tsl/concurrency/ref_count.h"
 #include "tsl/platform/errors.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla {
 namespace ifrt {
@@ -546,10 +552,32 @@ bool PjRtArray::IsDeleted() const {
 
 std::string PjRtArray::DebugString() const {
   DCHECK(this);
+  absl::StatusOr<std::unique_ptr<PjRtLayout>> layout_ptr = layout();
+  std::string layout_str =
+      layout_ptr.ok() ? (*layout_ptr)->ToString() : "<unknown>";
+
   return absl::StrFormat(
-      "PjRtArray(dtype=%s; shape=%s; sharding=%s)", dtype_.DebugString(),
+      "PjRtArray(dtype=%s; shape=%s; sharding=%s; layout=%s)",
+      dtype_.DebugString(),
       std::visit([](const auto& shape) { return shape.DebugString(); }, shape_),
-      sharding_->DebugString());
+      sharding_->DebugString(), layout_str);
+}
+
+// TODO(b/330198879): populate layout at construction instead of accessing PJRT
+// buffer directly for consistency with Pathways.
+absl::StatusOr<std::unique_ptr<PjRtLayout>> PjRtArray::layout() const {
+  CHECK(!pjrt_buffers_.empty());
+  std::unique_ptr<PjRtLayout> layout = pjrt_buffers_[0]->layout();
+#ifndef NDEBUG
+  for (int i = 1; i < pjrt_buffers_.size(); ++i) {
+    std::unique_ptr<PjRtLayout> layout_i = pjrt_buffers_[i]->layout();
+    DCHECK(*layout == *layout_i)
+        << "PjRtArray has mismatched layouts across shards! "
+        << "shard 0: " << layout->ToString() << ", shard " << i << ": "
+        << layout_i->ToString();
+  }
+#endif
+  return layout;
 }
 
 }  // namespace ifrt
diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.h b/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.h
index 457a890e8adb03..5542ae9e54e11d 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.h
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.h
@@ -146,6 +146,8 @@ class PjRtArray final
     return sharding_;
   }
 
+  absl::StatusOr<std::unique_ptr<PjRtLayout>> layout() const override;
+
   absl::StatusOr<std::vector<tsl::RCReference<Array>>>
   DisassembleIntoSingleDeviceArrays(ArrayCopySemantics semantics) override;
 
diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.h b/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.h
index 4195244eda139c..f667508a287be8 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.h
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.h
@@ -114,12 +114,14 @@ class PjRtExecutable final
     return pjrt_executable_->GetOutputShardings();
   }
 
-  absl::StatusOr<std::vector<Layout>> GetParameterLayouts() const override {
+  absl::StatusOr<std::vector<std::unique_ptr<Layout>>> GetParameterLayouts()
+      const override {
     DCHECK(this);
     return pjrt_executable_->GetParameterLayouts();
   }
 
-  absl::StatusOr<std::vector<Layout>> GetOutputLayouts() const override {
+  absl::StatusOr<std::vector<std::unique_ptr<Layout>>> GetOutputLayouts()
+      const override {
     DCHECK(this);
     return pjrt_executable_->GetOutputLayouts();
   }
@@ -231,12 +233,14 @@ class PjRtLoadedExecutable final
     return pjrt_loaded_executable_->GetOutputShardings();
   }
 
-  absl::StatusOr<std::vector<Layout>> GetParameterLayouts() const override {
+  absl::StatusOr<std::vector<std::unique_ptr<Layout>>> GetParameterLayouts()
+      const override {
     DCHECK(this);
     return pjrt_loaded_executable_->GetParameterLayouts();
   }
 
-  absl::StatusOr<std::vector<Layout>> GetOutputLayouts() const override {
+  absl::StatusOr<std::vector<std::unique_ptr<Layout>>> GetOutputLayouts()
+      const override {
     DCHECK(this);
     return pjrt_loaded_executable_->GetOutputLayouts();
   }
diff --git a/third_party/xla/xla/python/pmap_lib.cc b/third_party/xla/xla/python/pmap_lib.cc
index 14bc9ed9234930..1dbeb30d01ef8b 100644
--- a/third_party/xla/xla/python/pmap_lib.cc
+++ b/third_party/xla/xla/python/pmap_lib.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/inlined_vector.h"
 #include "absl/hash/hash.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
@@ -43,7 +44,6 @@ limitations under the License.
 #include "third_party/nanobind/include/nanobind/stl/string.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/variant.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/vector.h"  // IWYU pragma: keep
-#include "pybind11/pybind11.h"  // from @pybind11
 #include "xla/pjrt/exceptions.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/status_casters.h"
@@ -53,10 +53,12 @@ limitations under the License.
 #include "xla/python/ifrt/shape.h"
 #include "xla/python/ifrt/sharding.h"
 #include "xla/python/jax_jit.h"
+#include "xla/python/nb_class_ptr.h"
 #include "xla/python/nb_helpers.h"
 #include "xla/python/nb_numpy.h"
 #include "xla/python/py_array.h"
 #include "xla/python/py_client.h"
+#include "xla/python/py_device.h"
 #include "xla/python/py_executable.h"
 #include "xla/python/py_values.h"
 #include "xla/python/python_ref_manager.h"
@@ -66,18 +68,17 @@ limitations under the License.
 #include "xla/python/traceback.h"
 #include "xla/python/types.h"
 #include "xla/status_macros.h"
+#include "xla/tsl/python/lib/core/numpy.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/concurrency/ref_count.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/statusor.h"
 #include "tsl/profiler/lib/traceme.h"
-#include "tsl/python/lib/core/numpy.h"
 
 namespace jax {
 
 namespace nb = nanobind;
-namespace py = pybind11;
 
 namespace {
 
@@ -203,16 +204,15 @@ absl::StatusOr<ShardArgResult> ShardArg(
     options.squash_64bit_types = !jax_enable_x64;
     options.allow_zero_copy = true;
     for (size_t i = 0; i < n_devices; ++i) {
-      auto to_device = py::cast<xla::ClientAndPtr<xla::PjRtDevice>>(
-          py::handle(py_devices_list[i].ptr()));
-      if (to_device.get_client() == nullptr) {
+      auto to_device = nb::cast<xla::PyDevice*>(py_devices_list[i]);
+      if (to_device->client().get() == nullptr) {
         return xla::InvalidArgument("Cannot copy to unattached devices.");
       }
 
       TF_ASSIGN_OR_RETURN(
           xla::DevicePutResult on_device,
-          DevicePut(arg[indices[i]], to_device.get_client()->ifrt_client(),
-                    to_device.get(), options, xla::ifrt::MemoryKind()));
+          DevicePut(arg[indices[i]], to_device->client()->ifrt_client(),
+                    to_device->device(), options, xla::ifrt::MemoryKind()));
 
       per_device_arrays.push_back(std::move(on_device.ifrt_array));
       devices.push_back(per_device_arrays.back()->sharding().devices().front());
@@ -331,7 +331,7 @@ class PmapFunction {
   }
   const std::vector<int>& static_argnums() const { return static_argnums_; }
 
-  // nanobind::object typed subclass for PmapFunction objects.
+  // nb::object typed subclass for PmapFunction objects.
   class pyobject : public nb::object {
    public:
     NB_OBJECT(pyobject, nb::object, "PmapFunction",
@@ -341,7 +341,7 @@ class PmapFunction {
       return PmapFunction::AsPmapFunctionUnchecked(*this);
     }
   };
-  // Alias as ::object; outside the scope above we won't confuse pybind11's
+  // Alias as ::object; outside the scope above we won't confuse nanobind's
   // macros.
   using object = pyobject;
 
@@ -364,27 +364,28 @@ class PmapFunction {
   //
   // It deals with the arguments signatures and also of the global and
   // thread-local jit context.
-  absl::Status UpdateArgsSignature(ParsedArgumentsAsBuffers& arguments) {
-    arguments.signature.function_name = function_name_;
+  absl::Status ComputeCallSignature(
+      absl::Span<nb::object const> flat_dynamic_args,
+      CallSignature& signature) {
+    signature.function_name = function_name_;
 
     // Get dynamic argument signatures.
     JitState& global_state = jax::GlobalJitState();
     JitState& tls = jax::ThreadLocalJitState();
     const bool jax_enable_x64 = GetEnableX64();
-    arguments.signature.jax_enable_x64 = jax_enable_x64;
-    for (nb::handle arg : arguments.flat_dynamic_args) {
+    signature.jax_enable_x64 = jax_enable_x64;
+    for (nb::handle arg : flat_dynamic_args) {
       auto signature_or_error = xla::PyArgSignatureOfValue(arg, jax_enable_x64);
       if (!signature_or_error.ok()) {
         VLOG(2) << "PyArgSignatureOfValue failed: "
                 << signature_or_error.status();
         return signature_or_error.status();
       }
-      arguments.signature.dynamic_arg_signatures.push_back(
+      signature.dynamic_arg_signatures.push_back(
           std::move(signature_or_error).value());
     }
-    arguments.signature.thread_local_extra_jit_context = tls.extra_jit_context;
-    arguments.signature.global_extra_jit_context =
-        global_state.extra_jit_context;
+    signature.thread_local_extra_jit_context = tls.extra_jit_context;
+    signature.global_extra_jit_context = global_state.extra_jit_context;
     return absl::Status();
   }
 
@@ -405,7 +406,6 @@ class PmapFunction {
  private:
   // Mutates `cache_entry` in place.
   void PopulateCacheEntry(PmapCacheEntry& cache_entry,
-                          const CallSignature& signature,
                           const nb::tuple& out_and_fastpath_data);
 
   bool always_fallback_to_python_ = false;
@@ -429,7 +429,6 @@ class PmapFunction {
 };
 
 void PmapFunction::PopulateCacheEntry(PmapCacheEntry& cache_entry,
-                                      const CallSignature& signature,
                                       const nb::tuple& out_and_fastpath_data) {
   CHECK_EQ(out_and_fastpath_data.size(), 2);
   if (out_and_fastpath_data[1].is_none()) {
@@ -459,19 +458,19 @@ void PmapFunction::PopulateCacheEntry(PmapCacheEntry& cache_entry,
     return;
   }
   cache_entry.executable = std::move(executable);
-  const std::vector<xla::ClientAndPtr<xla::PjRtDevice>>& client_and_devices =
+  const std::vector<xla::nb_class_ptr<xla::PyDevice>>& devices =
       cache_entry.executable->AddressableDevices();
-  cache_entry.devices.reserve(client_and_devices.size());
-  for (auto& client_and_device : client_and_devices) {
-    cache_entry.devices.push_back(client_and_device.get());
+  cache_entry.devices.reserve(devices.size());
+  for (auto& device : devices) {
+    cache_entry.devices.push_back(device->device());
   }
 
   // Inputs shard args details.
   nb::list input_indices = pmap_data.attr("input_indices");
 
   cache_entry.py_devices = pmap_data.attr("input_devices");
-  auto input_devices = py::cast<std::vector<xla::PjRtDevice*>>(
-      py::handle(pmap_data.attr("input_devices").ptr()));
+  auto input_devices = nb::cast<std::vector<xla::nb_class_ptr<xla::PyDevice>>>(
+      pmap_data.attr("input_devices"));
 
   nb::list input_array_shardings = pmap_data.attr("input_array_shardings");
 
@@ -552,16 +551,19 @@ absl::StatusOr<nb::object> PmapFunction::Call(nb::handle callable,
   absl::Span<PyObject* const> positional_args(args, num_positional_args);
   absl::Span<PyObject* const> keyword_args(args + num_positional_args,
                                            num_keyword_args);
-  ParsedArgumentsAsBuffers arguments;
+  CallSignature call_signature;
+  absl::InlinedVector<nb::object, 2> flat_dynamic_args;
+  std::vector<nb::object> keep_alive_objects;
   absl::Status status =
       ParseArguments(positional_args, keyword_args, kwnames, static_argnums_,
-                     /*static_argnames=*/{}, pytree_registry_.get(), arguments);
+                     /*static_argnames=*/{}, pytree_registry_.get(),
+                     call_signature.arg_signature, flat_dynamic_args);
   if (!status.ok()) {
     VLOG(2) << "ParseArguments failed: " << status;
     return fallback_to_cache_miss();
   }
 
-  status = UpdateArgsSignature(arguments);
+  status = ComputeCallSignature(flat_dynamic_args, call_signature);
   if (!status.ok()) {
     return fallback_to_cache_miss();
   }
@@ -571,7 +573,7 @@ absl::StatusOr<nb::object> PmapFunction::Call(nb::handle callable,
       it;
   bool inserted;
   std::tie(it, inserted) = executables_.try_emplace(
-      arguments.signature, std::unique_ptr<PmapCacheEntry>());
+      call_signature, std::unique_ptr<PmapCacheEntry>());
   if (inserted) {
     it->second = std::make_unique<PmapCacheEntry>(pytree_registry_.get());
   }
@@ -583,7 +585,7 @@ absl::StatusOr<nb::object> PmapFunction::Call(nb::handle callable,
     if (inserted) {
       nb::object out_and_fastpath_data;
       nb::tuple out_tuple;
-      VLOG(2) << "Cache miss for " << arguments.signature.DebugString();
+      VLOG(2) << "Cache miss for " << call_signature.DebugString();
       try {
         // Calls Python and may release the GIL. May also throw if
         // compilation/tracing fails.
@@ -593,7 +595,7 @@ absl::StatusOr<nb::object> PmapFunction::Call(nb::handle callable,
         }
         out_tuple = nb::cast<nb::tuple>(out_and_fastpath_data);
 
-        PopulateCacheEntry(cache_entry, arguments.signature, out_tuple);
+        PopulateCacheEntry(cache_entry, out_tuple);
       } catch (const std::exception& e) {
         cache_entry.fall_back_to_python = true;
         cache_entry.compilation_complete.Notify();
@@ -619,20 +621,19 @@ absl::StatusOr<nb::object> PmapFunction::Call(nb::handle callable,
   // 1. Parse arguments.
   std::vector<xla::PjRtDevice*>& input_devices = cache_entry.devices;
   std::vector<InputSpec>& input_specs = cache_entry.input_specs;
-  const int num_args = arguments.flat_dynamic_args.size();
+  const int num_args = flat_dynamic_args.size();
 
   // We need [num_args] for the `Execute` call below.
   std::vector<tsl::RCReference<xla::ifrt::Array>> num_args_arrays(num_args);
   for (int i = 0; i < num_args; ++i) {
     TF_ASSIGN_OR_RETURN(
         ShardArgResult sharded_arg,
-        ShardArg(arguments.flat_dynamic_args[i].ptr(), input_devices,
-                 input_specs[i], cache_entry.py_devices,
-                 python_shard_arg_fallback_));
+        ShardArg(flat_dynamic_args[i].ptr(), input_devices, input_specs[i],
+                 cache_entry.py_devices, python_shard_arg_fallback_));
 
     num_args_arrays[i] = std::move(sharded_arg.ifrt_array);
     if (sharded_arg.owning_sda) {
-      arguments.keep_alive_objects.push_back(std::move(sharded_arg.owning_sda));
+      keep_alive_objects.push_back(std::move(sharded_arg.owning_sda));
     }
   }
 
@@ -654,7 +655,7 @@ absl::StatusOr<nb::object> PmapFunction::Call(nb::handle callable,
   // we access them from Python.
   auto traceback = xla::Traceback::Get();
   // TODO(jblespiau): Change the `client` function to return a reference.
-  std::shared_ptr<xla::PyClient> client = cache_entry.executable->client();
+  xla::nb_class_ptr<xla::PyClient> client = cache_entry.executable->client();
 
   // Convert the PjRtBuffer objects to PyBuffer, and invert the order from
   // [num_devices, num_args] to [num_args, num_devices].
@@ -747,12 +748,6 @@ PyObject* JaxPmapFunction_tp_vectorcall(PyObject* callable,
       return nullptr;
     }
     return out.value().release().ptr();
-  } catch (py::error_already_set& e) {
-    e.restore();
-    return nullptr;
-  } catch (py::cast_error& e) {
-    PyErr_SetString(PyExc_ValueError, e.what());
-    return nullptr;
   } catch (nb::python_error& e) {
     e.restore();
     return nullptr;
diff --git a/third_party/xla/xla/python/profiler.cc b/third_party/xla/xla/python/profiler.cc
index 7f180bf20e4f79..204e3d942b0a25 100644
--- a/third_party/xla/xla/python/profiler.cc
+++ b/third_party/xla/xla/python/profiler.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "xla/pjrt/status_casters.h"
 #include "xla/python/xplane_to_profile_instructions.h"
 #include "tsl/platform/macros.h"
+#include "tsl/platform/protobuf.h"  // IWYU pragma: keep
 #include "tsl/profiler/lib/profiler_factory.h"
 #include "tsl/profiler/lib/profiler_interface.h"
 #include "tsl/profiler/lib/profiler_session.h"
@@ -132,6 +133,25 @@ struct ProfilerSessionWrapper {
   std::unique_ptr<tsl::ProfilerSession> session;
 };
 
+static std::string GetFdoProfile(const std::string& xspace,
+                                 bool as_textproto = false) {
+  tensorflow::profiler::XSpace xspace_proto;
+  // TODO(phawkins): change to std::string_view when protobuf is
+  // updated in XLA.
+  xspace_proto.ParseFromString(std::string(xspace.c_str(), xspace.size()));
+  tensorflow::profiler::ProfiledInstructionsProto fdo_profile;
+  xla::ThrowIfError(xla::ConvertXplaneToProfiledInstructionsProto(
+      {xspace_proto}, &fdo_profile));
+  if (as_textproto) {
+    std::string textproto;
+    if (tsl::protobuf::TextFormat::PrintToString(fdo_profile, &textproto)) {
+      return textproto;
+    }
+    throw xla::XlaRuntimeError("Unable to serialize format to textproto");
+  }
+  return fdo_profile.SerializeAsString();
+}
+
 void BuildProfilerSubmodule(nb::module_& m) {
   nb::module_ profiler =
       m.def_submodule("profiler", "TensorFlow profiler integration");
@@ -265,16 +285,16 @@ void BuildProfilerSubmodule(nb::module_& m) {
       },
       nb::arg("tensorboard_dir"));
 
-  profiler.def("get_fdo_profile", [](nb::bytes xspace) -> nb::bytes {
-    tensorflow::profiler::XSpace xspace_proto;
-    // TODO(phawkins): change to std::string_view when protobuf is
-    // updated in XLA.
-    xspace_proto.ParseFromString(std::string(xspace.c_str(), xspace.size()));
-    tensorflow::profiler::ProfiledInstructionsProto fdo_profile;
-    xla::ThrowIfError(xla::ConvertXplaneToProfiledInstructionsProto(
-        {xspace_proto}, &fdo_profile));
-    std::string fdo_profile_str = fdo_profile.SerializeAsString();
-    return nb::bytes(fdo_profile_str.data(), fdo_profile_str.size());
+  profiler.def("get_fdo_profile",
+               [](nb::bytes xspace, bool as_textproto = false) -> nb::object {
+                 std::string out = GetFdoProfile(
+                     std::string(xspace.c_str(), xspace.size()), as_textproto);
+                 return nb::bytes(out.data(), out.size());
+               });
+
+  profiler.def("get_fdo_profile", [](nb::bytes xspace) -> nb::object {
+    std::string out = GetFdoProfile(std::string(xspace.c_str(), xspace.size()));
+    return nb::bytes(out.data(), out.size());
   });
 }
 
diff --git a/third_party/xla/xla/python/profiler/internal/BUILD b/third_party/xla/xla/python/profiler/internal/BUILD
index 0d9ed5cad4a3ec..127a5b54638013 100644
--- a/third_party/xla/xla/python/profiler/internal/BUILD
+++ b/third_party/xla/xla/python/profiler/internal/BUILD
@@ -1,7 +1,7 @@
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load("@local_tsl//tsl:tsl.default.bzl", "get_compatible_with_portable")
-load("@local_tsl//tsl/profiler/builds:build_config.bzl", "tf_profiler_copts")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("@local_tsl//tsl/profiler/builds:build_config.bzl", "tf_profiler_copts")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/python/py_array.cc b/third_party/xla/xla/python/py_array.cc
index 1d3121484085b5..28b5242cca6b7f 100644
--- a/third_party/xla/xla/python/py_array.cc
+++ b/third_party/xla/xla/python/py_array.cc
@@ -42,9 +42,8 @@ limitations under the License.
 #include "third_party/nanobind/include/nanobind/stl/optional.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/string.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/string_view.h"  // IWYU pragma: keep
+#include "third_party/nanobind/include/nanobind/stl/unique_ptr.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/vector.h"  // IWYU pragma: keep
-#include "pybind11/pytypes.h"  // from @pybind11
-#include "pybind11_abseil/absl_casters.h"  // from @pybind11_abseil
 #include "xla/layout.h"
 #include "xla/layout_util.h"
 #include "xla/pjrt/exceptions.h"
@@ -62,12 +61,14 @@ limitations under the License.
 #include "xla/python/ifrt/memory.h"
 #include "xla/python/ifrt/shape.h"
 #include "xla/python/ifrt/sharding.h"
+#include "xla/python/nb_absl_span.h"  // IWYU pragma: keep
 #include "xla/python/nb_class_ptr.h"
 #include "xla/python/nb_helpers.h"
 #include "xla/python/nb_numpy.h"
 #include "xla/python/pjrt_ifrt/pjrt_array.h"
 #include "xla/python/pjrt_ifrt/xla_sharding.h"
 #include "xla/python/py_client.h"
+#include "xla/python/py_device.h"
 #include "xla/python/py_values.h"
 #include "xla/python/python_ref_manager.h"
 #include "xla/python/sharding.h"
@@ -93,7 +94,6 @@ namespace xla {
 namespace {
 
 namespace nb = nanobind;
-namespace py = pybind11;
 
 PjRtBuffer* GetPjrtBuffer(ifrt::Array* ifrt_array) {
   auto* arr = llvm::dyn_cast_or_null<ifrt::PjRtCompatibleArray>(ifrt_array);
@@ -220,7 +220,10 @@ ifrt::MemoryKind CreateIfRtMemoryKindFromSharding(const nb::object& sharding) {
 
 struct PyArrayObject {
   PyObject_HEAD;
+#if PY_VERSION_HEX < 0x030C0000
   PyObject* weakrefs;
+  PyObject* dict;
+#endif  // PY_VERSION_HEX < 0x030B0000
   alignas(PyArray::Storage) char array_storage[sizeof(PyArray::Storage)];
 };
 static_assert(std::is_standard_layout<PyArrayObject>::value);
@@ -240,14 +243,15 @@ extern "C" void PyArray_tp_dealloc(PyObject* self) {
   PyTypeObject* tp = Py_TYPE(self);
   auto* obj = reinterpret_cast<PyArrayObject*>(self);
 
-  if (obj->weakrefs) {
-    PyObject_ClearWeakRefs(self);
-  }
-
   GetPyArrayStorageFromObject(obj)->~PyArray_Storage();
 
+  PyObject_ClearWeakRefs(self);
+#if PY_VERSION_HEX < 0x030C0000
   PyObject*& dict = *_PyObject_GetDictPtr(self);
   Py_CLEAR(dict);
+#else
+  _PyObject_ClearManagedDict(self);
+#endif  // PY_VERSION_HEX < 0x030C0000
 
   tp->tp_free(self);
   Py_DECREF(tp);
@@ -256,40 +260,26 @@ extern "C" void PyArray_tp_dealloc(PyObject* self) {
 // dynamic_attr: Allow the garbage collector to traverse the internal instance
 // `__dict__`.
 extern "C" int PyArray_tp_traverse(PyObject* self, visitproc visit, void* arg) {
+#if PY_VERSION_HEX < 0x030C0000
   PyObject*& dict = *_PyObject_GetDictPtr(self);
   Py_VISIT(dict);
-// https://docs.python.org/3/c-api/typeobj.html#c.PyTypeObject.tp_traverse
-#if PY_VERSION_HEX >= 0x03090000
+#else
+  _PyObject_VisitManagedDict(self, visit, arg);
+#endif  // PY_VERSION_HEX < 0x030C0000
+  // https://docs.python.org/3/c-api/typeobj.html#c.PyTypeObject.tp_traverse
   Py_VISIT(Py_TYPE(self));
-#endif
   return 0;
 }
 
 // dynamic_attr: Allow the GC to clear the dictionary.
 extern "C" int PyArray_tp_clear(PyObject* self) {
+#if PY_VERSION_HEX < 0x030C0000
   PyObject*& dict = *_PyObject_GetDictPtr(self);
   Py_CLEAR(dict);
-  return 0;
-}
-
-// Give instances of this type a `__dict__` and opt into garbage collection.
-void EnableDynamicAttribute(PyHeapTypeObject* heap_type) {
-  auto* type = &heap_type->ht_type;
-  type->tp_flags |= Py_TPFLAGS_HAVE_GC;
-#if PY_VERSION_HEX < 0x030B0000
-  type->tp_dictoffset = type->tp_basicsize;  // place dict at the end
-  type->tp_basicsize +=
-      (ssize_t)sizeof(PyObject*);  // and allocate enough space for it
 #else
-  type->tp_flags |= Py_TPFLAGS_MANAGED_DICT;
-#endif
-  type->tp_traverse = PyArray_tp_traverse;
-  type->tp_clear = PyArray_tp_clear;
-
-  static PyGetSetDef getset[] = {{"__dict__", PyObject_GenericGetDict,
-                                  PyObject_GenericSetDict, nullptr, nullptr},
-                                 {nullptr, nullptr, nullptr, nullptr, nullptr}};
-  type->tp_getset = getset;
+  _PyObject_ClearManagedDict(self);
+#endif  // PY_VERSION_HEX < 0x030C0000
+  return 0;
 }
 
 template <typename... Args>
@@ -355,7 +345,7 @@ PyArray_Storage::PyArray_Storage(nb::object aval, bool weak_type,
                                  xla::nb_dtype dtype,
                                  std::vector<int64_t> shape,
                                  nb::object sharding, bool committed,
-                                 std::shared_ptr<PyClient> py_client,
+                                 nb_class_ptr<PyClient> py_client,
                                  std::optional<nb_traceback> traceback,
                                  tsl::RCReference<ifrt::Array> ifrt_array,
                                  xla::PjRtFuture<absl::Status> result_status)
@@ -404,7 +394,7 @@ void PyArray::PyInit(nb::object self, DisableFastpath) {
 }
 
 PyArray PyArray::MakeFromSingleDeviceArray(
-    std::shared_ptr<PyClient> py_client, std::optional<nb_traceback> traceback,
+    nb_class_ptr<PyClient> py_client, std::optional<nb_traceback> traceback,
     tsl::RCReference<ifrt::Array> ifrt_array, bool weak_type, bool committed,
     xla::PjRtFuture<absl::Status> result_status) {
   if (!llvm::isa<ifrt::SingleDeviceSharding>(ifrt_array->sharding())) {
@@ -434,7 +424,7 @@ PyArray PyArray::MakeFromSingleDeviceArray(
 }
 
 PyArray PyArray::MakeFromIfrtArrayAndSharding(
-    std::shared_ptr<PyClient> py_client, std::optional<nb_traceback> traceback,
+    nb_class_ptr<PyClient> py_client, std::optional<nb_traceback> traceback,
     tsl::RCReference<ifrt::Array> ifrt_array, nb::object sharding,
     bool weak_type, bool committed, bool skip_checks) {
   auto shape_span = ifrt_array->shape().dims();
@@ -469,8 +459,7 @@ PyArray PyArrayResultHandler::Call(absl::Span<const PyArray> py_arrays) const {
 }
 
 PyArray PyArrayResultHandler::Call(
-    std::shared_ptr<PyClient> py_client,
-    tsl::RCReference<ifrt::Array> ifrt_array,
+    nb_class_ptr<PyClient> py_client, tsl::RCReference<ifrt::Array> ifrt_array,
     xla::PjRtFuture<absl::Status> result_status) const {
   return PyArray(aval_, weak_type_, dtype_, shape_, sharding_,
                  std::move(py_client), Traceback::Get(), std::move(ifrt_array),
@@ -484,7 +473,7 @@ PyArray PyArrayResultHandler::Call(PyArray py_array) const {
 
 PyArray::PyArray(nb::object aval, bool weak_type, nb_dtype dtype,
                  std::vector<int64_t> shape, nb::object sharding,
-                 std::shared_ptr<PyClient> py_client,
+                 nb_class_ptr<PyClient> py_client,
                  std::optional<nb_traceback> traceback,
                  tsl::RCReference<ifrt::Array> ifrt_array, bool committed,
                  bool skip_checks,
@@ -583,7 +572,7 @@ Status PyArray::set_arrays(nb::object obj) {
   for (nb::handle obj : list) {
     if (obj.type().is(PyArray::type())) {
       auto py_array = nb::borrow<PyArray>(obj);
-      if (py_array.py_client() != py_client()) {
+      if (py_array.py_client().get() != py_client().get()) {
         return InvalidArgument("Client mismatch when assigning to _arrays.");
       }
       if (py_array.num_shards() != 1) {
@@ -813,8 +802,8 @@ nb::dict PyArray::CudaArrayInterface() {
   return result;
 }
 
-StatusOr<nb::object> CudaArrayInterfaceToBuffer(
-    const nb::dict& cai, std::shared_ptr<PyClient> client) {
+StatusOr<nb::object> CudaArrayInterfaceToBuffer(const nb::dict& cai,
+                                                nb_class_ptr<PyClient> client) {
 #ifndef GOOGLE_CUDA
   throw XlaRuntimeError("This operation requires CUDA support.");
 #else
@@ -912,8 +901,9 @@ StatusOr<nb::object> CudaArrayInterfaceToBuffer(
   std::function<void()> on_delete_callback = []() {};
   TF_ASSIGN_OR_RETURN(
       auto pjrt_buffer,
-      device->client()->CreateViewOfDeviceBuffer(
-          static_cast<char*>(data_ptr), shape, device.get(), on_delete_callback,
+      device->client()->pjrt_client()->CreateViewOfDeviceBuffer(
+          static_cast<char*>(data_ptr), shape, device->device(),
+          on_delete_callback,
           stream <= 2 ? std::nullopt : std::make_optional(stream)));
   auto* ifrt_client =
       llvm::dyn_cast_or_null<ifrt::PjRtCompatibleClient>(client->ifrt_client());
@@ -1065,7 +1055,7 @@ StatusOr<PyArray> PyArray::CopyToDeviceWithSharding(ifrt::DeviceList devices,
 
 StatusOr<PyArray> PyArray::BatchedDevicePut(
     nb::object aval, nb::object sharding, std::vector<nb::object> xs,
-    std::vector<ClientAndPtr<PjRtDevice>> dst_devices, bool committed,
+    absl::Span<const PyDevice* const> dst_devices, bool committed,
     bool force_copy, PjRtClient::HostBufferSemantics host_buffer_semantics,
     bool jax_enable_x64) {
   if (dst_devices.size() != xs.size() || xs.empty()) {
@@ -1075,8 +1065,8 @@ StatusOr<PyArray> PyArray::BatchedDevicePut(
                      dst_devices.size(), xs.size())
             .c_str());
   }
-  for (ClientAndPtr<PjRtDevice>& device : dst_devices) {
-    if (device.get_client() == nullptr) {
+  for (const PyDevice* device : dst_devices) {
+    if (device->client().get() == nullptr) {
       return InvalidArgument("Cannot copy to unattached devices.");
     }
   }
@@ -1117,8 +1107,8 @@ StatusOr<PyArray> PyArray::BatchedDevicePut(
     }
     TF_ASSIGN_OR_RETURN(
         DevicePutResult on_device,
-        DevicePut(x, dst_devices[i].get_client()->ifrt_client(),
-                  dst_devices[i].get(), options, dst_memory_kind));
+        DevicePut(x, dst_devices[i]->client()->ifrt_client(),
+                  dst_devices[i]->device(), options, dst_memory_kind));
     ifrt_arrays.push_back(std::move(on_device.ifrt_array));
     devices.push_back(ifrt_arrays.back()->sharding().devices().front());
     shapes.push_back(ifrt_arrays.back()->shape());
@@ -1146,7 +1136,7 @@ StatusOr<PyArray> PyArray::BatchedDevicePut(
           xla::ifrt::ArrayCopySemantics::kReuseInput));
 
   return PyArray(aval, weak_type, dtype, std::move(shape), sharding,
-                 dst_devices[0].client(), Traceback::Get(),
+                 dst_devices[0]->client(), Traceback::Get(),
                  std::move(ifrt_array), committed, /*skip_checks=*/true);
 }
 
@@ -1210,6 +1200,12 @@ struct ExtraBufferInfo {
   std::unique_ptr<PjRtBuffer::ExternalReference> external_reference_hold;
 };
 
+// The default layout of a non-tuple array should have major-to-minor layout
+// and no tiles.
+bool HasDefaultLayout(const Layout& layout) {
+  return LayoutUtil::IsMonotonicWithDim0Major(layout) && layout.tiles().empty();
+}
+
 int PyArray_bf_getbuffer(PyObject* exporter, Py_buffer* view, int flags) {
   Status status = [&]() {
     PyArray py_array = nb::borrow<PyArray>(exporter);
@@ -1287,6 +1283,12 @@ int PyArray_bf_getbuffer(PyObject* exporter, Py_buffer* view, int flags) {
                !LayoutUtil::IsMonotonicWithDim0Major(xla_layout) &&
                !LayoutUtil::IsMonotonicWithDim0Minor(xla_layout)) {
       return InvalidArgument("Buffer is not in contiguous layout.");
+    } else if (!HasDefaultLayout(xla_layout)) {
+      // Fail and fall back to using __array__ if the CPU buffer has a device
+      // specific layout. For instance, this happens for host buffers in pinned
+      // memories of the TPU device.
+      return InvalidArgument(
+          "Buffer is potentially a device buffer with non default layout.");
     }
     std::memset(view, 0, sizeof(Py_buffer));
     const void* root_ptr =
@@ -1336,13 +1338,6 @@ void PyArray_bf_releasebuffer(PyObject*, Py_buffer* buffer) {
   delete extra;
 }
 
-PyBufferProcs PyArray_tp_as_buffer = []() {
-  PyBufferProcs procs;
-  procs.bf_getbuffer = &PyArray_bf_getbuffer;
-  procs.bf_releasebuffer = &PyArray_bf_releasebuffer;
-  return procs;
-}();
-
 // Returns if shape has a major-to-minor layout.
 bool HasMajorToMinorLayout(const xla::Shape& shape) {
   if (shape.has_layout()) {
@@ -1365,6 +1360,18 @@ std::optional<std::vector<int64_t>> ByteStridesOrDefaultForShapeInt64(
   return ByteStridesForShape(shape);
 }
 
+bool IsZeroCopyableCpuBuffer(const PjRtBuffer* buf) {
+  // For CPU buffers with device-specific layouts, we must delinearize
+  // to unpack the array. This could happen for the host buffer
+  // pre-mapped to the TPU device, a.k.a., pinned host buffers for the
+  // device.
+  bool has_default_layout = buf->layout() == nullptr ||
+                            HasDefaultLayout(GetXlaLayoutUnsafe(buf->layout()));
+  // On CPU for non-int4 values, we can return the value in a zero-copy way.
+  // For int4 values, we must copy in order to unpack the array.
+  return buf->IsOnCpu() && !primitive_util::Is4BitType(buf->element_type()) &&
+         has_default_layout;
+}
 }  // namespace
 
 PyHostValue::PyHostValue() = default;
@@ -1381,8 +1388,7 @@ StatusOr<nb::object> PyHostValue::AsNumPyArray(
     TF_RET_CHECK(!pjrt_buffer->IsTuple());
     // On CPU for non-int4 values, we can return the value in a zero-copy way.
     // For int4 values, we must copy in order to unpack the array.
-    if (pjrt_buffer->IsOnCpu() &&
-        !primitive_util::Is4BitType(pjrt_buffer->element_type())) {
+    if (IsZeroCopyableCpuBuffer(pjrt_buffer)) {
       TF_ASSIGN_OR_RETURN(const auto* shape,
                           XlaDynamicShape(ifrt_array, dynamic_shape_holder));
       TF_ASSIGN_OR_RETURN(nb_dtype dtype,
@@ -1429,12 +1435,9 @@ Status PyHostValue::CopyToHostAsync(std::optional<Shape>& dynamic_shape_holder,
     return OkStatus();
   }
   auto* arr = llvm::dyn_cast_or_null<ifrt::PjRtCompatibleArray>(ifrt_array);
-  if (arr != nullptr) {
-    auto* pjrt_buffer = arr->pjrt_buffers().front().get();
-    if (pjrt_buffer->IsOnCpu() &&
-        !primitive_util::Is4BitType(pjrt_buffer->element_type())) {
-      return OkStatus();
-    }
+  if (arr != nullptr && !arr->pjrt_buffers().front()->IsTuple() &&
+      IsZeroCopyableCpuBuffer(arr->pjrt_buffers().front().get())) {
+    return OkStatus();
   }
   auto transfer_guard_formatter = [ifrt_array] {
     return absl::StrCat(
@@ -1486,50 +1489,59 @@ Status PyHostValue::CopyToHostAsync(std::optional<Shape>& dynamic_shape_holder,
   return OkStatus();
 }
 
-Status PyArray::SetUpType() {
-  static constexpr char kName[] = "ArrayImpl";
-
-  nb::str name(kName);
-  nb::str qualname(kName);
-
-  auto* heap_type = reinterpret_cast<PyHeapTypeObject*>(
-      PyType_Type.tp_alloc(&PyType_Type, 0));
-  // Caution: we must not call any functions that might invoke the GC until
-  // PyType_Ready() is called below. Otherwise the GC might see a
-  // half-constructed type object.
-  if (!heap_type) {
-    return Internal("Unable to create heap type object");
-  }
-  heap_type->ht_name = name.release().ptr();
-  heap_type->ht_qualname = qualname.release().ptr();
-  PyTypeObject* type = &heap_type->ht_type;
-  type->tp_name = kName;
-  type->tp_basicsize = sizeof(PyArrayObject);
-  type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HEAPTYPE;
-  type->tp_new = PyArray_tp_new;
-  type->tp_dealloc = PyArray_tp_dealloc;
-
-  // Supported protocols
-  type->tp_as_number = &heap_type->as_number;
-  type->tp_as_sequence = &heap_type->as_sequence;
-  type->tp_as_mapping = &heap_type->as_mapping;
-  type->tp_as_buffer = &PyArray_tp_as_buffer;
-
-  // Allow dynamic attributes.
-  EnableDynamicAttribute(heap_type);
-
-  // Allow weak references to DeviceArray objects.
-  type->tp_weaklistoffset = offsetof(PyArrayObject, weakrefs);
-
-  TF_RET_CHECK(PyType_Ready(type) == 0);
+namespace {
+PyGetSetDef PyArray_tp_getset[] = {
+    {"__dict__", PyObject_GenericGetDict, PyObject_GenericSetDict, nullptr,
+     nullptr},
+    {nullptr, nullptr, nullptr, nullptr, nullptr},
+};
 
-  PyArray::type_ = reinterpret_cast<PyObject*>(type);
+PyMemberDef PyArray_members[] = {
+#if PY_VERSION_HEX < 0x030C0000
+    {"__weaklistoffset__", T_PYSSIZET,
+     static_cast<Py_ssize_t>(offsetof(PyArrayObject, weakrefs)), READONLY,
+     nullptr},
+    {"__dictoffset__", T_PYSSIZET,
+     static_cast<Py_ssize_t>(offsetof(PyArrayObject, dict)), READONLY, nullptr},
+#endif  // PY_VERSION_HEX < 0x030C0000
+    {nullptr, 0, 0, 0, nullptr},
+};  // namespace xla
+
+PyType_Slot PyArray_slots[] = {
+    {Py_tp_new, reinterpret_cast<void*>(PyArray_tp_new)},
+    {Py_tp_dealloc, reinterpret_cast<void*>(PyArray_tp_dealloc)},
+    {Py_tp_members, reinterpret_cast<void*>(PyArray_members)},
+    {Py_tp_traverse, reinterpret_cast<void*>(PyArray_tp_traverse)},
+    {Py_tp_clear, reinterpret_cast<void*>(PyArray_tp_clear)},
+    {Py_tp_getset, reinterpret_cast<void*>(PyArray_tp_getset)},
+    {Py_bf_getbuffer, reinterpret_cast<void*>(PyArray_bf_getbuffer)},
+    {Py_bf_releasebuffer, reinterpret_cast<void*>(PyArray_bf_releasebuffer)},
+    {0, nullptr},
+};
 
-  return OkStatus();
-}
+}  // namespace
 
 Status PyArray::RegisterTypes(nb::module_& m) {
-  TF_RETURN_IF_ERROR(PyArray::SetUpType());
+  std::string name =
+      absl::StrCat(nb::cast<std::string>(m.attr("__name__")), ".ArrayImpl");
+
+  PyType_Spec PyArray_spec = {
+      /*.name=*/name.c_str(),
+      /*.basicsize=*/static_cast<int>(sizeof(PyArrayObject)),
+      /*.itemsize=*/0,
+#if PY_VERSION_HEX < 0x030C0000
+      /*.flags=*/Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
+#else   // PY_VERSION_HEX >= 0x030C0000
+      /*.flags=*/Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC |
+          Py_TPFLAGS_MANAGED_DICT | Py_TPFLAGS_MANAGED_WEAKREF,
+#endif  // PY_VERSION_HEX >= 0x030C0000
+      /*.slots=*/PyArray_slots,
+  };
+
+  type_ = PyType_FromSpec(&PyArray_spec);
+  if (!type_) {
+    throw nb::python_error();
+  }
   auto type = nb::borrow<nb::object>(type_);
   m.attr("ArrayImpl") = type;
 
@@ -1579,6 +1591,8 @@ Status PyArray::RegisterTypes(nb::module_& m) {
       nb::is_method());
   type.attr("__cuda_array_interface__") = nb_property_readonly(
       [](PyArray self) { return self.CudaArrayInterface(); });
+  type.attr("_pjrt_layout") =
+      nb_property_readonly(xla::ValueOrThrowWrapper(&PyArray::layout));
   type.attr("on_device_size_in_bytes") = nb::cpp_function(
       xla::ValueOrThrowWrapper(&PyArray::GetOnDeviceSizeInBytes),
       nb::is_method());
@@ -1616,13 +1630,12 @@ Status PyArray::RegisterTypes(nb::module_& m) {
   type.attr("__module__") = m.attr("__name__");
 
   m.attr("copy_array_to_devices_with_sharding") = nb::cpp_function(
-      [](PyArray self, nb::object dst_devices_py, nb::object sharding) {
-        auto dst_devices = py::cast<std::vector<ClientAndPtr<PjRtDevice>>>(
-            py::handle(dst_devices_py.ptr()));
+      [](PyArray self, absl::Span<const PyDevice* const> dst_devices,
+         nb::object sharding) {
         ifrt::DeviceList::Devices devices;
         devices.reserve(dst_devices.size());
         for (auto& d : dst_devices) {
-          devices.push_back(d.get());
+          devices.push_back(d->device());
         }
         return xla::ValueOrThrow(self.CopyToDeviceWithSharding(
             ifrt::DeviceList(devices), std::move(sharding)));
diff --git a/third_party/xla/xla/python/py_array.h b/third_party/xla/xla/python/py_array.h
index b2883f0292e7c9..f5b39e86e85853 100644
--- a/third_party/xla/xla/python/py_array.h
+++ b/third_party/xla/xla/python/py_array.h
@@ -31,15 +31,12 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "llvm/Support/Casting.h"
 #include "third_party/nanobind/include/nanobind/nanobind.h"
-#include "pybind11/numpy.h"  // from @pybind11
-#include "pybind11/pybind11.h"  // from @pybind11
-#include "pybind11/pytypes.h"  // from @pybind11
-#include "pybind11/stl.h"  // from @pybind11
 #include "xla/pjrt/exceptions.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/device.h"
 #include "xla/python/ifrt/future.h"
+#include "xla/python/nb_class_ptr.h"
 #include "xla/python/nb_numpy.h"
 #include "xla/python/pjrt_ifrt/pjrt_array.h"
 #include "xla/python/py_client.h"
@@ -67,7 +64,7 @@ class PyHostValue {
   Status CopyToHostAsync(std::optional<Shape>& dynamic_shape_holder,
                          ifrt::Array* ifrt_array);
 
-  StatusOr<nanobind::object> AsNumPyArray(
+  absl::StatusOr<nanobind::object> AsNumPyArray(
       std::optional<Shape>& dynamic_shape_holder, ifrt::Array* ifrt_array);
 
  private:
@@ -79,7 +76,7 @@ class PyHostValue {
 struct PyArray_Storage {
   PyArray_Storage(nanobind::object aval, bool weak_type, nb_dtype dtype,
                   std::vector<int64_t> shape, nanobind::object sharding,
-                  bool committed, std::shared_ptr<PyClient> py_client,
+                  bool committed, nb_class_ptr<PyClient> py_client,
                   std::optional<nb_traceback> traceback,
                   tsl::RCReference<ifrt::Array> ifrt_array,
                   xla::PjRtFuture<absl::Status> result_status);
@@ -103,7 +100,7 @@ struct PyArray_Storage {
   nanobind::object npy_value = nanobind::none();
   bool committed = false;
 
-  std::shared_ptr<PyClient> py_client;
+  nb_class_ptr<PyClient> py_client;
   std::optional<nb_traceback> traceback;
   tsl::RCReference<ifrt::Array> ifrt_array;
 
@@ -147,7 +144,7 @@ class PyArray : public nanobind::object {
   // checked.
   PyArray(nanobind::object aval, bool weak_type, nb_dtype dtype,
           std::vector<int64_t> shape, nanobind::object sharding,
-          std::shared_ptr<PyClient> py_client,
+          nb_class_ptr<PyClient> py_client,
           std::optional<nb_traceback> traceback,
           tsl::RCReference<ifrt::Array> ifrt_array, bool committed,
           bool skip_checks,
@@ -155,15 +152,13 @@ class PyArray : public nanobind::object {
               xla::PjRtFuture<absl::Status>());
 
   static PyArray MakeFromSingleDeviceArray(
-      std::shared_ptr<PyClient> py_client,
-      std::optional<nb_traceback> traceback,
+      nb_class_ptr<PyClient> py_client, std::optional<nb_traceback> traceback,
       tsl::RCReference<ifrt::Array> ifrt_array, bool weak_type, bool committed,
       xla::PjRtFuture<absl::Status> result_status =
           xla::PjRtFuture<absl::Status>());
 
   static PyArray MakeFromIfrtArrayAndSharding(
-      std::shared_ptr<PyClient> py_client,
-      std::optional<nb_traceback> traceback,
+      nb_class_ptr<PyClient> py_client, std::optional<nb_traceback> traceback,
       tsl::RCReference<ifrt::Array> ifrt_array, nanobind::object sharding,
       bool weak_type, bool committed, bool skip_checks);
 
@@ -181,6 +176,10 @@ class PyArray : public nanobind::object {
 
   const nanobind::object& sharding() const { return GetStorage().sharding; }
 
+  absl::StatusOr<std::unique_ptr<PjRtLayout>> layout() {
+    return ifrt_array()->layout();
+  }
+
   bool committed() const { return GetStorage().committed; }
 
   const nanobind::object& npy_value() const { return GetStorage().npy_value; }
@@ -188,7 +187,7 @@ class PyArray : public nanobind::object {
     GetStorage().npy_value = std::move(v);
   }
 
-  const std::shared_ptr<PyClient>& py_client() const {
+  const nb_class_ptr<PyClient>& py_client() const {
     return GetStorage().py_client;
   }
 
@@ -198,7 +197,7 @@ class PyArray : public nanobind::object {
 
   // Returns xla::InvalidArgument if the buffer has been deleted.
   // See `PjRtFuture` for the semantics of `IsReady` and `IsKnownReady`.
-  StatusOr<bool> IsReady() {
+  absl::StatusOr<bool> IsReady() {
     ifrt::Array* ifrt_array_ptr = ifrt_array();
     if (ifrt_array_ptr->IsDeleted()) {
       return InvalidArgument("Array has been deleted.");
@@ -250,7 +249,7 @@ class PyArray : public nanobind::object {
 
   nanobind::object arrays();
   Status set_arrays(nanobind::object obj);
-  StatusOr<PyArray> FullyReplicatedShard();
+  absl::StatusOr<PyArray> FullyReplicatedShard();
 
   int num_shards() const {
     ifrt::Array* ifrt_array_ptr = ifrt_array();
@@ -276,11 +275,11 @@ class PyArray : public nanobind::object {
 
   absl::Status BlockUntilResultStatusIsReady();
 
-  StatusOr<size_t> GetOnDeviceSizeInBytes();
-  StatusOr<nanobind::object> SingleDeviceArrayToNumpyArray();
+  absl::StatusOr<size_t> GetOnDeviceSizeInBytes();
+  absl::StatusOr<nanobind::object> SingleDeviceArrayToNumpyArray();
   Status CopySingleDeviceArrayToHostAsync();
   nanobind::dict CudaArrayInterface();
-  StatusOr<std::uintptr_t> UnsafeBufferPointer();
+  absl::StatusOr<std::uintptr_t> UnsafeBufferPointer();
 
   Status Delete();
 
@@ -288,13 +287,13 @@ class PyArray : public nanobind::object {
 
   PyArray Clone() const;
 
-  StatusOr<PyArray> CopyToDeviceWithSharding(ifrt::DeviceList devices,
-                                             nanobind::object dst_sharding);
+  absl::StatusOr<PyArray> CopyToDeviceWithSharding(
+      ifrt::DeviceList devices, nanobind::object dst_sharding);
 
-  static StatusOr<PyArray> BatchedDevicePut(
+  static absl::StatusOr<PyArray> BatchedDevicePut(
       nanobind::object aval, nanobind::object sharding,
       std::vector<nanobind::object> xs,
-      std::vector<ClientAndPtr<PjRtDevice>> dst_devices, bool committed,
+      absl::Span<const PyDevice* const> dst_devices, bool committed,
       bool force_copy, PjRtClient::HostBufferSemantics host_buffer_semantics,
       bool jax_enable_x64);
 
@@ -302,8 +301,8 @@ class PyArray : public nanobind::object {
       std::vector<nanobind::object> objs);
 
  private:
-  StatusOr<PyArray> FetchSingleShard(std::string_view api);
-  StatusOr<PyArray> AssertUnsharded(std::string_view api);
+  absl::StatusOr<PyArray> FetchSingleShard(std::string_view api);
+  absl::StatusOr<PyArray> AssertUnsharded(std::string_view api);
 
   void CheckAndRearrange();
 
@@ -312,8 +311,6 @@ class PyArray : public nanobind::object {
   Storage& GetStorage();
   const Storage& GetStorage() const;
 
-  static Status SetUpType();
-
   inline static PyObject* type_ = nullptr;
 };
 
@@ -325,7 +322,7 @@ class PyArrayResultHandler {
   PyArray Call(absl::Span<const PyArray> py_arrays) const;
   PyArray Call(PyArray py_array) const;
 
-  PyArray Call(std::shared_ptr<PyClient> py_client,
+  PyArray Call(nb_class_ptr<PyClient> py_client,
                tsl::RCReference<ifrt::Array> ifrt_array,
                xla::PjRtFuture<absl::Status> result_status =
                    xla::PjRtFuture<absl::Status>()) const;
@@ -341,8 +338,8 @@ class PyArrayResultHandler {
   std::vector<int64_t> shape_;
 };
 
-StatusOr<nanobind::object> CudaArrayInterfaceToBuffer(
-    const nanobind::dict& cai, std::shared_ptr<PyClient> cuda_client);
+absl::StatusOr<nanobind::object> CudaArrayInterfaceToBuffer(
+    const nanobind::dict& cai, nb_class_ptr<PyClient> cuda_client);
 
 }  // namespace xla
 
diff --git a/third_party/xla/xla/python/py_client.cc b/third_party/xla/xla/python/py_client.cc
index 298ba93be5d381..d0389018940891 100644
--- a/third_party/xla/xla/python/py_client.cc
+++ b/third_party/xla/xla/python/py_client.cc
@@ -23,7 +23,9 @@ limitations under the License.
 #include <memory>
 #include <optional>
 #include <string>
+#include <string_view>
 #include <utility>
+#include <variant>
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
@@ -38,6 +40,12 @@ limitations under the License.
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/IR/OwningOpRef.h"  // from @llvm-project
 #include "third_party/nanobind/include/nanobind/nanobind.h"
+#include "third_party/nanobind/include/nanobind/stl/optional.h"  // IWYU pragma: keep
+#include "third_party/nanobind/include/nanobind/stl/pair.h"  // IWYU pragma: keep
+#include "third_party/nanobind/include/nanobind/stl/shared_ptr.h"  // IWYU pragma: keep
+#include "third_party/nanobind/include/nanobind/stl/string.h"  // IWYU pragma: keep
+#include "third_party/nanobind/include/nanobind/stl/string_view.h"  // IWYU pragma: keep
+#include "third_party/nanobind/include/nanobind/stl/variant.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/vector.h"  // IWYU pragma: keep
 #include "xla/literal.h"
 #include "xla/pjrt/exceptions.h"
@@ -46,6 +54,7 @@ limitations under the License.
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/pjrt_stream_executor_client.h"
+#include "xla/pjrt/status_casters.h"
 #include "xla/python/callback.h"
 #include "xla/python/ifrt/client.h"
 #include "xla/python/ifrt/compiler.h"
@@ -53,13 +62,17 @@ limitations under the License.
 #include "xla/python/ifrt/executable.h"
 #include "xla/python/ifrt/host_callback.h"
 #include "xla/python/ifrt/memory.h"
+#include "xla/python/nb_absl_span.h"  // IWYU pragma: keep
+#include "xla/python/nb_class_ptr.h"
 #include "xla/python/pjrt_ifrt/pjrt_array.h"
 #include "xla/python/pjrt_ifrt/pjrt_client.h"
 #include "xla/python/pjrt_ifrt/xla_compiler.h"
 #include "xla/python/pprof_profile_builder.h"
 #include "xla/python/py_array.h"
+#include "xla/python/py_device.h"
 #include "xla/python/py_executable.h"
 #include "xla/python/py_host_callback.h"
+#include "xla/python/py_memory_space.h"
 #include "xla/python/py_values.h"
 #include "xla/python/python_ref_manager.h"
 #include "xla/python/traceback.h"
@@ -83,7 +96,13 @@ limitations under the License.
 namespace xla {
 
 namespace nb = nanobind;
-namespace py = pybind11;
+
+/*static*/ nb_class_ptr<PyClient> PyClient::Make(
+    std::shared_ptr<ifrt::Client> ifrt_client) {
+  auto client = make_nb_class<PyClient>(std::move(ifrt_client));
+  Initialize(client);
+  return client;
+}
 
 PyClient::PyClient(std::shared_ptr<ifrt::Client> ifrt_client)
     : ifrt_client_(std::move(ifrt_client)),
@@ -91,43 +110,75 @@ PyClient::PyClient(std::shared_ptr<ifrt::Client> ifrt_client)
   CHECK(ifrt_client_);
 }
 
+/* static */ void PyClient::Initialize(nb_class_ptr<PyClient> client) {
+  for (ifrt::Device* device : client->ifrt_client()->devices()) {
+    client->devices_[device] = make_nb_class<PyDevice>(client, device);
+
+    for (PjRtMemorySpace* memory : device->memory_spaces()) {
+      auto& py_memory = client->memory_spaces_[memory];
+      if (py_memory.get() == nullptr) {
+        py_memory = make_nb_class<PyMemorySpace>(client, memory);
+      }
+    }
+  }
+}
+
 PyClient::~PyClient() {
-  py::gil_scoped_release gil;
+  nb::gil_scoped_release gil;
   ifrt_client_ = nullptr;
 }
 
-std::vector<ClientAndPtr<PjRtDevice>> PyClient::Devices() {
-  std::vector<ClientAndPtr<PjRtDevice>> devices;
+nb_class_ptr<PyDevice> PyClient::GetPyDevice(ifrt::Device* device) {
+  auto& py_device = devices_[device];
+  if (py_device.get() == nullptr) {
+    py_device = make_nb_class<PyDevice>(
+        nb::borrow<nb_class_ptr<PyClient>>(nb::find(this)), device);
+  }
+  return py_device;
+}
+
+nb_class_ptr<PyMemorySpace> PyClient::GetPyMemorySpace(
+    PjRtMemorySpace* memory_space) {
+  auto& py_memory = memory_spaces_[memory_space];
+  if (py_memory.get() == nullptr) {
+    py_memory = make_nb_class<PyMemorySpace>(
+        nb::borrow<nb_class_ptr<PyClient>>(nb::find(this)), memory_space);
+  }
+  return py_memory;
+}
+
+std::vector<nb_class_ptr<PyDevice>> PyClient::Devices() {
+  std::vector<nb_class_ptr<PyDevice>> devices;
   auto span = ifrt_client_->devices();
   devices.reserve(span.size());
   for (PjRtDevice* device : span) {
-    devices.push_back(WrapWithClient(shared_from_this(), device));
+    devices.push_back(GetPyDevice(device));
   }
   return devices;
 }
 
-std::vector<ClientAndPtr<PjRtDevice>> PyClient::LocalDevices() {
-  std::vector<ClientAndPtr<PjRtDevice>> devices;
+std::vector<nb_class_ptr<PyDevice>> PyClient::LocalDevices() {
+  std::vector<nb_class_ptr<PyDevice>> devices;
   devices.reserve(ifrt_client_->addressable_devices().size());
   for (ifrt::Device* device : ifrt_client_->addressable_devices()) {
-    devices.push_back(WrapWithClient(shared_from_this(), device));
+    devices.push_back(GetPyDevice(device));
   }
   return devices;
 }
 
-absl::StatusOr<ClientAndPtr<PjRtDevice>> PyClient::DeviceFromLocalHardwareId(
+absl::StatusOr<nb_class_ptr<PyDevice>> PyClient::DeviceFromLocalHardwareId(
     int local_hardware_id) {
   TF_ASSIGN_OR_RETURN(PjRtDevice * device,
                       ifrt_client_->LookupAddressableDevice(local_hardware_id));
-  return WrapWithClient(shared_from_this(), device);
+  return GetPyDevice(device);
 }
 
-std::vector<std::shared_ptr<PyLoadedExecutable>> PyClient::LiveExecutables() {
+nb::list PyClient::LiveExecutables() {
   CHECK(PyGILState_Check());
-  std::vector<std::shared_ptr<PyLoadedExecutable>> executables;
+  nb::list executables;
   for (PyLoadedExecutable* exec = executables_; exec; exec = exec->next_) {
     if (!exec->is_deleted()) {
-      executables.push_back(exec->shared_from_this());
+      executables.append(nb::find(exec));
     }
   }
   return executables;
@@ -218,28 +269,30 @@ absl::Status PyClient::Defragment() {
   return absl::OkStatus();
 }
 
-absl::StatusOr<py::object> PyClient::BufferFromPyval(
-    pybind11::handle argument, PjRtDevice* device, bool force_copy,
-    ifrt::Client::HostBufferSemantics host_buffer_semantics) {
+/* static */ absl::StatusOr<nb::object> PyClient::BufferFromPyval(
+    nb_class_ptr<PyClient> client, nb::handle argument, PjRtDevice* device,
+    bool force_copy, ifrt::Client::HostBufferSemantics host_buffer_semantics) {
   if (device == nullptr) {
-    TF_RET_CHECK(!ifrt_client_->addressable_devices().empty());
-    device = ifrt_client_->addressable_devices().front();
+    TF_RET_CHECK(!client->ifrt_client_->addressable_devices().empty());
+    device = client->ifrt_client_->addressable_devices().front();
   }
   CHECK(device != nullptr);
 
   auto transfer_guard_formatter = [&argument, dst_device = device] {
-    auto type = py::cast<std::string>(py::str(argument.get_type()));
+    auto type = nb::cast<std::string>(nb::str(argument.type()));
     // Catch exceptions because shape and dtype properties convertible to str
     // are not guaranteed to present in an arbitrary argument.
     std::string shape;
     std::string dtype;
     try {
-      shape = py::cast<std::string>(py::str(argument.attr("shape")));
+      shape =
+          nb::cast<std::string>(nb::str(nb::object(argument.attr("shape"))));
     } catch (const std::exception& e) {
       shape = "<unknown>";
     }
     try {
-      dtype = py::cast<std::string>(py::str(argument.attr("dtype")));
+      dtype =
+          nb::cast<std::string>(nb::str(nb::object(argument.attr("dtype"))));
     } catch (const std::exception& e) {
       dtype = "<unknown>";
     }
@@ -250,11 +303,11 @@ absl::StatusOr<py::object> PyClient::BufferFromPyval(
       jax::ApplyTransferGuardToHostToDevice(transfer_guard_formatter));
 
   TF_ASSIGN_OR_RETURN(PjRtDevice * found_device,
-                      ifrt_client_->LookupDevice(device->id()));
+                      client->ifrt_client_->LookupDevice(device->id()));
   if (found_device != device) {
     return InvalidArgument("Cannot copy value to device '%s' with '%s' backend",
                            device->DebugString(),
-                           ifrt_client_->platform_name());
+                           client->ifrt_client_->platform_name());
   }
   GlobalPyRefManager()->CollectGarbage();
 
@@ -265,26 +318,23 @@ absl::StatusOr<py::object> PyClient::BufferFromPyval(
        (host_buffer_semantics == ifrt::Client::HostBufferSemantics::kZeroCopy));
   // TODO(phawkins): remove .ptr() after nanobind transition is complete.
   TF_ASSIGN_OR_RETURN(DevicePutResult put,
-                      DevicePut(argument.ptr(), ifrt_client_.get(), device,
-                                options, ifrt::MemoryKind()));
+                      DevicePut(argument.ptr(), client->ifrt_client_.get(),
+                                device, options, ifrt::MemoryKind()));
 
   if (put.ifrt_array) {
     auto traceback = Traceback::Get();
-    auto out = PyArray::MakeFromSingleDeviceArray(
-        shared_from_this(), std::move(traceback), std::move(put.ifrt_array),
+    return PyArray::MakeFromSingleDeviceArray(
+        std::move(client), std::move(traceback), std::move(put.ifrt_array),
         /*weak_type=*/false,
         /*committed=*/false);
-    // TODO(phawkins): remove after nanobind transition is complete.
-    return py::reinterpret_steal<py::object>(out.release().ptr());
   } else {
-    // TODO(phawkins): remove .ptr() after nanobind transition is complete.
-    return py::reinterpret_borrow<py::object>(put.owning_pybuffer.ptr());
+    return put.owning_pybuffer;
   }
 }
 
-absl::StatusOr<std::vector<std::pair<pybind11::bytes, pybind11::object>>>
-PyClient::MakeCrossHostReceiveBuffers(absl::Span<const Shape> shapes,
-                                      PjRtDevice* device) {
+/* static */ absl::StatusOr<nb::list> PyClient::MakeCrossHostReceiveBuffers(
+    nb_class_ptr<PyClient> client, absl::Span<const Shape> shapes,
+    PjRtDevice* device) {
   CHECK(device != nullptr);
   absl::Mutex mu;
   absl::StatusOr<std::vector<PjRtCrossHostRecvDescriptors>> recv_descriptors_or;
@@ -292,13 +342,13 @@ PyClient::MakeCrossHostReceiveBuffers(absl::Span<const Shape> shapes,
 
   TF_ASSIGN_OR_RETURN(
       auto buffers,
-      pjrt_client()->MakeCrossHostReceiveBuffers(
+      client->pjrt_client()->MakeCrossHostReceiveBuffers(
           shapes, device,
           [&done, &recv_descriptors_or,
            &mu](absl::StatusOr<PjRtCrossHostRecvState> recv_state_or) {
             absl::MutexLock l(&mu);
             if (recv_state_or.ok()) {
-              py::gil_scoped_acquire gil;
+              nb::gil_scoped_acquire gil;
               recv_descriptors_or = std::move(recv_state_or->descriptors);
             } else {
               recv_descriptors_or = recv_state_or.status();
@@ -307,36 +357,32 @@ PyClient::MakeCrossHostReceiveBuffers(absl::Span<const Shape> shapes,
           }));
 
   {
-    py::gil_scoped_release gil_release;
+    nb::gil_scoped_release gil_release;
     absl::MutexLock l(&mu);
     mu.Await(absl::Condition(&done));
   }
 
   TF_RETURN_IF_ERROR(recv_descriptors_or.status());
   CHECK_EQ(buffers.size(), recv_descriptors_or->size());
-  std::vector<std::pair<pybind11::bytes, pybind11::object>> result;
-  result.reserve(buffers.size());
+  nb::list result;
   for (int i = 0; i < buffers.size(); ++i) {
     auto& descriptors = recv_descriptors_or->at(i);
     CHECK_EQ(descriptors.serialized_descriptors.size(), 1);
     const std::string& desc = descriptors.serialized_descriptors[0];
-    pybind11::bytes py_desc = pybind11::bytes(desc);
-    auto* client =
-        llvm::dyn_cast_or_null<ifrt::PjRtCompatibleClient>(ifrt_client());
-    if (client == nullptr) {
+    nb::bytes py_desc = nb::bytes(desc.data(), desc.size());
+    auto* ifrt_client = llvm::dyn_cast_or_null<ifrt::PjRtCompatibleClient>(
+        client->ifrt_client());
+    if (ifrt_client == nullptr) {
       throw XlaRuntimeError(
           "This operation is implemented for a PjRt-compatible backend only.");
     }
     TF_ASSIGN_OR_RETURN(auto ifrt_array,
-                        client->CreatePjRtArray(std::move(buffers[i])));
-    auto py_buf = PyArray::MakeFromSingleDeviceArray(
-        shared_from_this(), Traceback::Get(), std::move(ifrt_array),
-        /*weak_type=*/false,
-        /*committed=*/false);
-    // TODO(phawkins): update after nanobind transition
-    result.push_back(std::make_pair(
-        std::move(py_desc),
-        py::reinterpret_steal<py::object>(py_buf.release().ptr())));
+                        ifrt_client->CreatePjRtArray(std::move(buffers[i])));
+    auto py_buf = PyArray::MakeFromSingleDeviceArray(client, Traceback::Get(),
+                                                     std::move(ifrt_array),
+                                                     /*weak_type=*/false,
+                                                     /*committed=*/false);
+    result.append(nb::make_tuple(std::move(py_desc), std::move(py_buf)));
   }
   return result;
 }
@@ -346,7 +392,7 @@ namespace {
 // Makes IFRT `CompileOptions` from XLA `CompileOptions` and optional host
 // callbacks.
 std::unique_ptr<ifrt::CompileOptions> MakeIfrtCompileOptions(
-    CompileOptions options, std::vector<pybind11::capsule> host_callbacks) {
+    CompileOptions options, std::vector<nb::capsule> host_callbacks) {
   std::vector<tsl::RCReference<ifrt::LoadedHostCallback>>
       ifrt_loaded_host_callbacks;
   ifrt_loaded_host_callbacks.reserve(host_callbacks.size());
@@ -354,8 +400,8 @@ std::unique_ptr<ifrt::CompileOptions> MakeIfrtCompileOptions(
   // created by `PyClient::MakePythonCallbackUsingHostSendAndRecv()` or
   // `PyClient::GetEmitPythonCallbackDescriptor()`.
   for (auto& host_callback : host_callbacks) {
-    ifrt_loaded_host_callbacks.push_back(
-        tsl::FormRef(host_callback.get_pointer<ifrt::LoadedHostCallback>()));
+    ifrt_loaded_host_callbacks.push_back(tsl::FormRef(
+        static_cast<ifrt::LoadedHostCallback*>(host_callback.data())));
   }
   return std::make_unique<ifrt::XlaCompileOptions>(
       std::move(options), std::move(ifrt_loaded_host_callbacks));
@@ -364,9 +410,8 @@ std::unique_ptr<ifrt::CompileOptions> MakeIfrtCompileOptions(
 // Makes IFRT `DeserializeExecutableOptions` from XLA `CompileOptions` and
 // optional host callbacks.
 std::unique_ptr<ifrt::DeserializeExecutableOptions>
-MakeIfrtDeserializeExecutableOptions(
-    std::optional<CompileOptions> options,
-    std::vector<pybind11::capsule> host_callbacks) {
+MakeIfrtDeserializeExecutableOptions(std::optional<CompileOptions> options,
+                                     std::vector<nb::capsule> host_callbacks) {
   std::vector<tsl::RCReference<ifrt::LoadedHostCallback>>
       ifrt_loaded_host_callbacks;
   ifrt_loaded_host_callbacks.reserve(host_callbacks.size());
@@ -374,8 +419,8 @@ MakeIfrtDeserializeExecutableOptions(
   // created by `PyClient::MakePythonCallbackUsingHostSendAndRecv()` or
   // `PyClient::GetEmitPythonCallbackDescriptor()`.
   for (auto& host_callback : host_callbacks) {
-    ifrt_loaded_host_callbacks.push_back(
-        tsl::FormRef(host_callback.get_pointer<ifrt::LoadedHostCallback>()));
+    ifrt_loaded_host_callbacks.push_back(tsl::FormRef(
+        static_cast<ifrt::LoadedHostCallback*>(host_callback.data())));
   }
   return std::make_unique<ifrt::XlaDeserializeExecutableOptions>(
       std::move(options), std::move(ifrt_loaded_host_callbacks));
@@ -383,13 +428,14 @@ MakeIfrtDeserializeExecutableOptions(
 
 }  // namespace
 
-absl::StatusOr<std::shared_ptr<PyLoadedExecutable>> PyClient::Compile(
-    std::string mlir_module, CompileOptions options,
-    std::vector<pybind11::capsule> host_callbacks) {
+/* static */ absl::StatusOr<nb_class_ptr<PyLoadedExecutable>> PyClient::Compile(
+    nb_class_ptr<PyClient> client, std::string mlir_module,
+    CompileOptions options, std::vector<nb::capsule> host_callbacks) {
   // Pass allocated device memory size to compile options for pjrt compatible
   // backends.
   auto* pjrt_compatible_client =
-      llvm::dyn_cast_or_null<ifrt::PjRtCompatibleClient>(ifrt_client_.get());
+      llvm::dyn_cast_or_null<ifrt::PjRtCompatibleClient>(
+          client->ifrt_client_.get());
   if (pjrt_compatible_client != nullptr) {
     auto addressable_devices =
         pjrt_compatible_client->pjrt_client()->addressable_devices();
@@ -412,48 +458,51 @@ absl::StatusOr<std::shared_ptr<PyLoadedExecutable>> PyClient::Compile(
   auto ifrt_compile_options =
       MakeIfrtCompileOptions(std::move(options), std::move(host_callbacks));
   {
-    py::gil_scoped_release gil_release;
+    nb::gil_scoped_release gil_release;
     mlir::MLIRContext context;
     TF_ASSIGN_OR_RETURN(mlir::OwningOpRef<mlir::ModuleOp> module,
                         ParseMlirModuleString(mlir_module, context));
     TF_ASSIGN_OR_RETURN(
         ifrt_loaded_executable,
-        ifrt_client_->GetDefaultCompiler()->Compile(
+        client->ifrt_client_->GetDefaultCompiler()->Compile(
             std::make_unique<xla::ifrt::XlaProgram>(module.get()),
             std::move(ifrt_compile_options)));
     TF_ASSIGN_OR_RETURN(fingerprint, ifrt_loaded_executable->Fingerprint());
   }
   auto traceback = Traceback::Get();
-  return std::make_shared<PyLoadedExecutable>(
-      shared_from_this(), std::move(ifrt_loaded_executable),
+  return make_nb_class<PyLoadedExecutable>(
+      std::move(client), std::move(ifrt_loaded_executable),
       std::move(traceback), std::move(fingerprint));
 }
 
-absl::StatusOr<py::bytes> PyClient::SerializeExecutable(
+absl::StatusOr<nb::bytes> PyClient::SerializeExecutable(
     const PyLoadedExecutable& executable) const {
-  return executable.ifrt_loaded_executable()->Serialize();
+  TF_ASSIGN_OR_RETURN(auto serialized,
+                      executable.ifrt_loaded_executable()->Serialize());
+  return nb::bytes(serialized.data(), serialized.size());
 }
 
-absl::StatusOr<std::shared_ptr<PyLoadedExecutable>>
-PyClient::DeserializeExecutable(const std::string& serialized,
+/* static */ absl::StatusOr<nb_class_ptr<PyLoadedExecutable>>
+PyClient::DeserializeExecutable(nb_class_ptr<PyClient> client,
+                                nb::bytes serialized,
                                 std::optional<CompileOptions> options,
-                                std::vector<pybind11::capsule> host_callbacks) {
+                                std::vector<nb::capsule> host_callbacks) {
   std::unique_ptr<ifrt::LoadedExecutable> ifrt_loaded_executable;
   std::optional<std::string> fingerprint;
   auto ifrt_deserialize_options = MakeIfrtDeserializeExecutableOptions(
       std::move(options), std::move(host_callbacks));
   {
-    py::gil_scoped_release gil_release;
+    nb::gil_scoped_release gil_release;
     TF_ASSIGN_OR_RETURN(
         ifrt_loaded_executable,
-        ifrt_client_->GetDefaultCompiler()->DeserializeLoadedExecutable(
-            serialized, std::move(ifrt_deserialize_options)));
-    TF_ASSIGN_OR_RETURN(fingerprint, ifrt_loaded_executable->Fingerprint());
+        client->ifrt_client_->GetDefaultCompiler()->DeserializeLoadedExecutable(
+            std::string_view(serialized.c_str(), serialized.size()),
+            std::move(ifrt_deserialize_options)));
   }
   TF_ASSIGN_OR_RETURN(fingerprint, ifrt_loaded_executable->Fingerprint());
   auto traceback = Traceback::Get();
-  return std::make_shared<PyLoadedExecutable>(
-      shared_from_this(), std::move(ifrt_loaded_executable),
+  return make_nb_class<PyLoadedExecutable>(
+      std::move(client), std::move(ifrt_loaded_executable),
       std::move(traceback), std::move(fingerprint));
 }
 
@@ -490,7 +539,7 @@ H AbslHashValue(H h, const HeapProfileKey& key) {
 
 }  // namespace
 
-absl::StatusOr<py::bytes> PyClient::HeapProfile() {
+absl::StatusOr<nb::bytes> PyClient::HeapProfile() {
   CHECK(PyGILState_Check());
   absl::flat_hash_set<PjRtBuffer*> buffer_set;
   absl::flat_hash_map<HeapProfileKey, int64_t> entries;
@@ -563,50 +612,49 @@ absl::StatusOr<py::bytes> PyClient::HeapProfile() {
       kind_label->set_str(buffer_string_id);
       auto* device_label = sample->add_label();
       device_label->set_key(device_string_id);
-      device_label->set_str(
-          builder.StringId(std::string(entry.first.device->DebugString())));
+      std::string device_label_str(entry.first.device->DebugString());
+      device_label->set_str(builder.StringId(device_label_str));
     } else {
       kind_label->set_str(executable_string_id);
     }
   }
-  return py::bytes(builder.profile().SerializeAsString());
+  std::string serialized = builder.profile().SerializeAsString();
+  return nb::bytes(serialized.data(), serialized.size());
 }
 
-absl::StatusOr<pybind11::object>
-PyClient::MakePythonCallbackUsingHostSendAndRecv(
-    pybind11::function callable, absl::Span<Shape const> operand_shapes,
+absl::StatusOr<nb::object> PyClient::MakePythonCallbackUsingHostSendAndRecv(
+    nb::callable callable, absl::Span<Shape const> operand_shapes,
     absl::Span<Shape const> result_shapes,
     absl::Span<uint16_t const> send_channel_ids,
-    absl::Span<uint16_t const> recv_channel_ids,
-    pybind11::function serializer) {
+    absl::Span<uint16_t const> recv_channel_ids, nb::callable serializer) {
   TF_ASSIGN_OR_RETURN(
       auto loaded_host_callback,
       PyHostSendAndRecvLoadedHostCallback::Create(
-          ifrt_client(), nb::steal<nb::callable>(callable.release().ptr()),
-          operand_shapes, result_shapes, send_channel_ids, recv_channel_ids,
-          nb::steal<nb::callable>(serializer.release().ptr())));
-  py::capsule callback_capsule(loaded_host_callback.release(), [](void* ptr) {
-    static_cast<ifrt::LoadedHostCallback*>(ptr)->DropRef();
-  });
+          ifrt_client(), std::move(callable), operand_shapes, result_shapes,
+          send_channel_ids, recv_channel_ids, std::move(serializer)));
+  nb::capsule callback_capsule(
+      loaded_host_callback.release(), [](void* ptr) noexcept {
+        static_cast<ifrt::LoadedHostCallback*>(ptr)->DropRef();
+      });
   return callback_capsule;
 }
 
-absl::StatusOr<std::pair<uint64_t, pybind11::object>>
-PyClient::GetEmitPythonCallbackDescriptor(pybind11::function callable,
-                                          py::object operand_shapes,
-                                          py::object result_shapes) {
-  TF_ASSIGN_OR_RETURN(
-      auto loaded_host_callback,
-      PyCpuLoadedHostCallback::Create(
-          ifrt_client(), nb::steal<nb::callable>(callable.release().ptr()),
-          nb::cast<std::vector<Shape>>(nb::handle(operand_shapes.ptr())),
-          nb::cast<std::vector<Shape>>(nb::handle(result_shapes.ptr()))));
+absl::StatusOr<std::pair<uint64_t, nb::object>>
+PyClient::GetEmitPythonCallbackDescriptor(nb::callable callable,
+                                          nb::object operand_shapes,
+                                          nb::object result_shapes) {
+  TF_ASSIGN_OR_RETURN(auto loaded_host_callback,
+                      PyCpuLoadedHostCallback::Create(
+                          ifrt_client(), std::move(callable),
+                          nb::cast<std::vector<Shape>>(operand_shapes),
+                          nb::cast<std::vector<Shape>>(result_shapes)));
   const uint64_t descriptor = loaded_host_callback->descriptor();
 
-  py::capsule callback_capsule(loaded_host_callback.release(), [](void* ptr) {
-    static_cast<ifrt::LoadedHostCallback*>(ptr)->DropRef();
-  });
-  return std::make_pair(descriptor, py::object(std::move(callback_capsule)));
+  nb::capsule callback_capsule(
+      loaded_host_callback.release(), [](void* ptr) noexcept {
+        static_cast<ifrt::LoadedHostCallback*>(ptr)->DropRef();
+      });
+  return std::make_pair(descriptor, nb::object(std::move(callback_capsule)));
 }
 
 XLA_CPU_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM("xla_python_cpu_callback",
@@ -618,4 +666,141 @@ XLA_REGISTER_CUSTOM_CALL_TARGET_WITH_SYM(
     absl::AsciiStrToUpper(PlatformUtil::CanonicalPlatformName("gpu").value()));
 #endif
 
+/* static */ int PyClient::tp_traverse(PyObject* self, visitproc visit,
+                                       void* arg) {
+  PyClient* c = nb::inst_ptr<PyClient>(self);
+  for (const auto& [ifrt_device, py_device] : c->devices_) {
+    Py_VISIT(py_device.ptr());
+  }
+  for (const auto& [ifrt_memory, py_memory] : c->memory_spaces_) {
+    Py_VISIT(py_memory.ptr());
+  }
+  return 0;
+}
+
+/* static */ int PyClient::tp_clear(PyObject* self) {
+  PyClient* c = nb::inst_ptr<PyClient>(self);
+  absl::flat_hash_map<ifrt::Device*, nb_class_ptr<PyDevice>> devices;
+  std::swap(devices, c->devices_);
+  absl::flat_hash_map<PjRtMemorySpace*, nb_class_ptr<PyMemorySpace>>
+      memory_spaces;
+  std::swap(memory_spaces, c->memory_spaces_);
+  return 0;
+}
+
+PyType_Slot PyClient::slots_[] = {
+    {Py_tp_traverse, (void*)PyClient::tp_traverse},
+    {Py_tp_clear, (void*)PyClient::tp_clear},
+    {0, nullptr},
+};
+
+/* static */ void PyClient::RegisterPythonTypes(nb::module_& m) {
+  nb::enum_<PjRtClient::HostBufferSemantics>(m, "HostBufferSemantics")
+      .value("IMMUTABLE_ONLY_DURING_CALL",
+             PjRtClient::HostBufferSemantics::kImmutableOnlyDuringCall)
+      .value("IMMUTABLE_UNTIL_TRANSFER_COMPLETES",
+             PjRtClient::HostBufferSemantics::kImmutableUntilTransferCompletes)
+      .value("ZERO_COPY", PjRtClient::HostBufferSemantics::kZeroCopy);
+
+  nb::class_<PyClient> py_local_client(m, "Client", nb::is_weak_referenceable(),
+                                       nb::type_slots(PyClient::slots_));
+  py_local_client.def_prop_ro("platform", &PyClient::platform_name)
+      .def_prop_ro("platform_version", &PyClient::platform_version)
+      .def_prop_ro("runtime_type", &PyClient::runtime_type)
+      .def("device_count", &PyClient::device_count)
+      .def("local_device_count", &PyClient::addressable_device_count)
+      .def("devices", &PyClient::Devices)
+      .def("local_devices", &PyClient::LocalDevices)
+      .def("device_from_local_hardware_id",
+           xla::ValueOrThrowWrapper(&PyClient::DeviceFromLocalHardwareId))
+      .def("live_executables", &PyClient::LiveExecutables)
+      .def("live_arrays", &PyClient::LiveArrays)
+      .def("live_buffers", &PyClient::LiveArrays)
+      .def("process_index", &PyClient::process_index)
+      .def("host_id", &PyClient::process_index)
+      .def("task_id", &PyClient::process_index)
+      .def(
+          "buffer_from_pyval",
+          [](nb_class_ptr<PyClient> client, nb::handle argument,
+             PyDevice* device, bool force_copy,
+             PjRtClient::HostBufferSemantics host_buffer_semantics) {
+            return ValueOrThrow(
+                PyClient::BufferFromPyval(std::move(client), argument,
+                                          device ? device->device() : nullptr,
+                                          force_copy, host_buffer_semantics));
+          },
+          nb::arg("argument"), nb::arg("device").none() = nullptr,
+          nb::arg("force_copy") = false,
+          nb::arg("host_buffer_semantics") =
+              PjRtClient::HostBufferSemantics::kZeroCopy)
+      .def(
+          "make_cross_host_receive_buffers",
+          [](nb_class_ptr<PyClient> client, absl::Span<const Shape> shapes,
+             PjRtDevice* device) {
+            return ValueOrThrow(PyClient::MakeCrossHostReceiveBuffers(
+                std::move(client), shapes, device));
+          },
+          nb::arg("shapes"), nb::arg("device"))
+      .def(
+          "compile",
+          [](nb_class_ptr<PyClient> client, nb::bytes mlir_module,
+             CompileOptions options, std::vector<nb::capsule> host_callbacks) {
+            return ValueOrThrow(PyClient::Compile(
+                std::move(client),
+                std::string(mlir_module.c_str(), mlir_module.size()),
+                std::move(options), std::move(host_callbacks)));
+          },
+          nb::arg("computation"), nb::arg("compile_options") = CompileOptions(),
+          nb::arg("host_callbacks") = std::vector<nb::capsule>())
+      .def(
+          "compile",
+          [](nb_class_ptr<PyClient> client, std::string mlir_module,
+             CompileOptions options, std::vector<nb::capsule> host_callbacks) {
+            return ValueOrThrow(PyClient::Compile(
+                std::move(client), std::move(mlir_module), std::move(options),
+                std::move(host_callbacks)));
+          },
+          nb::arg("computation"), nb::arg("compile_options") = CompileOptions(),
+          nb::arg("host_callbacks") = std::vector<nb::capsule>())
+      .def("serialize_executable",
+           xla::ValueOrThrowWrapper(&PyClient::SerializeExecutable))
+      .def(
+          "deserialize_executable",
+          [](nb_class_ptr<PyClient> client, nb::bytes serialized,
+             std::optional<CompileOptions> options,
+             std::vector<nb::capsule> host_callbacks) {
+            return ValueOrThrow(PyClient::DeserializeExecutable(
+                std::move(client), std::move(serialized), std::move(options),
+                std::move(host_callbacks)));
+          },
+          nb::arg("serialized"), nb::arg("compile_options").none() = nb::none(),
+          nb::arg("host_callbacks") = std::vector<nb::capsule>())
+      .def("heap_profile", xla::ValueOrThrowWrapper(&PyClient::HeapProfile))
+      // TODO(zhangqiaorjc): Experimental.
+      .def("defragment",
+           [](PyClient& self) { xla::ThrowIfError(self.Defragment()); })
+      .def("get_emit_python_callback_descriptor",
+           xla::ValueOrThrowWrapper(&PyClient::GetEmitPythonCallbackDescriptor),
+           nb::arg("callable"), nb::arg("operand_shapes"),
+           nb::arg("result_shapes").none() = nb::none())
+      .def("make_python_callback_from_host_send_and_recv",
+           xla::ValueOrThrowWrapper(
+               &PyClient::MakePythonCallbackUsingHostSendAndRecv),
+           nb::arg("callable"), nb::arg("operand_shapes"),
+           nb::arg("result_shapes"), nb::arg("send_channel_ids"),
+           nb::arg("recv_channel_ids"),
+           nb::arg("serializer").none() = nb::none())
+      .def("__getattr__",
+           [](PyClient& client, std::string_view name) -> nb::object {
+             const auto& attrs = client.attributes();
+             auto it = attrs.find(name);
+             if (it != attrs.end()) {
+               return std::visit([](auto&& v) { return nb::cast(v); },
+                                 it->second);
+             }
+             throw nb::attribute_error(
+                 absl::StrCat("Unknown attribute ", name).c_str());
+           });
+}
+
 }  // namespace xla
diff --git a/third_party/xla/xla/python/py_client.h b/third_party/xla/xla/python/py_client.h
index 173ffdf6e9c25b..312008eb781dac 100644
--- a/third_party/xla/xla/python/py_client.h
+++ b/third_party/xla/xla/python/py_client.h
@@ -16,99 +16,48 @@ limitations under the License.
 #ifndef XLA_PYTHON_PY_CLIENT_H_
 #define XLA_PYTHON_PY_CLIENT_H_
 
+#include <Python.h>
+
 #include <cstdint>
 #include <memory>
 #include <optional>
 #include <string>
+#include <string_view>
 #include <utility>
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/types/span.h"
+#include "llvm/Support/Casting.h"
 #include "third_party/nanobind/include/nanobind/nanobind.h"
-#include "pybind11/pybind11.h"  // from @pybind11
 #include "xla/client/xla_builder.h"
 #include "xla/pjrt/exceptions.h"
 #include "xla/pjrt/pjrt_client.h"
-#include "xla/pjrt/pjrt_common.h"
+#include "xla/pjrt/pjrt_executable.h"
 #include "xla/python/ifrt/client.h"
+#include "xla/python/ifrt/device.h"
+#include "xla/python/nb_class_ptr.h"
 #include "xla/python/pjrt_ifrt/pjrt_client.h"
-#include "xla/statusor.h"
-#include "xla/types.h"
+#include "xla/shape.h"
 
 namespace xla {
 
 class PyClient;
 class PyLoadedExecutable;
 class PyArray;
+class PyDevice;
+class PyMemorySpace;
 struct PyArray_Storage;
 
-// Custom holder types.
-//
-// We must keep the PyClient object alive as long as any of the runtime
-// objects are alive. Since we don't have a lot of control over Python
-// destructor ordering, we keep the PyClient object as a std::shared_ptr<>,
-// and ensure that each Python runtime object holds a reference to the
-// PyClient. An alternative design would be to keep a single global
-// singleton PyClient, although this seems less flexible, especially for
-// writing tests.
-//
-// To maintain PyClient references, we define pybind11 holder classes that
-// are custom smart pointers that also keep a reference to a PyClient.
-// pybind11 has a `keep_alive` feature that has a similar goal, but it doesn't
-// seem sufficiently flexible to describe ownership relationships in cases where
-// the ownership doesn't pertain to a direct argument or return value of a
-// function. Another alternative to the holder classes would be to create proxy
-// objects that contain both a reference and a runtime class; holder classes
-// seem less tedious to define.
-
-// A pair of a PyClient reference and an unowned pointer to T.
-template <typename T>
-class ClientAndPtr {
- public:
-  ClientAndPtr() = default;
-  // pybind11 requires that we define a constructor that takes a raw pointer,
-  // but it should be unreachable.
-  explicit ClientAndPtr(T*) {
-    LOG(FATAL) << "ClientAndPtr should constructed via WrapWithClient.";
-  }
-
-  ClientAndPtr(const ClientAndPtr&) = default;
-  ClientAndPtr(ClientAndPtr&&) = default;
-  ClientAndPtr& operator=(const ClientAndPtr&) = default;
-  ClientAndPtr& operator=(ClientAndPtr&&) = default;
-
-  PyClient* get_client() const { return client_; }
-
-  std::shared_ptr<PyClient> client() const {
-    return std::shared_ptr<PyClient>(contents_, client_);
-  }
-
-  T* get() const { return contents_.get(); }
-  T* operator->() const { return contents_.get(); }
-  T& operator*() const { return *contents_; }
-
- private:
-  template <typename U>
-  friend ClientAndPtr<U> WrapWithClient(std::shared_ptr<PyClient> client,
-                                        U* contents);
-  std::shared_ptr<T> contents_;
-  PyClient* client_;
-};
-
-// By defining a templated helper function, we can use return type deduction
-// and avoid specifying types at the caller.
-template <typename T>
-ClientAndPtr<T> WrapWithClient(std::shared_ptr<PyClient> client, T* contents) {
-  ClientAndPtr<T> result;
-  result.client_ = client.get();
-  result.contents_ = std::shared_ptr<T>(std::move(client), contents);
-  return result;
-}
-
 // Python wrapper around PjRtClient.
 // We use a wrapper class to add Python-specific functionality.
-class PyClient : public std::enable_shared_from_this<PyClient> {
+class PyClient {
  public:
+  static nb_class_ptr<PyClient> Make(std::shared_ptr<ifrt::Client> ifrt_client);
+
+  // Do not call the constructor directly. Use `PyClient::Make` instead.
   explicit PyClient(std::shared_ptr<ifrt::Client> ifrt_client);
   virtual ~PyClient();
 
@@ -140,7 +89,7 @@ class PyClient : public std::enable_shared_from_this<PyClient> {
     return shared_ptr_pjrt_client();
   }
 
-  absl::string_view platform_name() const {
+  std::string_view platform_name() const {
     // TODO(phawkins): this is a temporary backwards compatibility shim. We
     // changed the name PJRT reports for GPU platforms to "cuda" or "rocm", but
     // we haven't yet updated JAX clients that expect "gpu". Migrate users and
@@ -152,12 +101,10 @@ class PyClient : public std::enable_shared_from_this<PyClient> {
       return ifrt_client_->platform_name();
     }
   }
-  absl::string_view platform_version() const {
+  std::string_view platform_version() const {
     return ifrt_client_->platform_version();
   }
-  absl::string_view runtime_type() const {
-    return ifrt_client_->runtime_type();
-  }
+  std::string_view runtime_type() const { return ifrt_client_->runtime_type(); }
 
   // Returns implementation-specific attributes about this client, e.g. the PJRT
   // C API version if applicable.
@@ -172,43 +119,48 @@ class PyClient : public std::enable_shared_from_this<PyClient> {
   int device_count() const { return ifrt_client_->device_count(); }
   int process_index() const { return ifrt_client_->process_index(); }
 
-  std::vector<ClientAndPtr<PjRtDevice>> Devices();
-  std::vector<ClientAndPtr<PjRtDevice>> LocalDevices();
-  StatusOr<ClientAndPtr<PjRtDevice>> DeviceFromLocalHardwareId(
+  std::vector<nb_class_ptr<PyDevice>> Devices();
+  std::vector<nb_class_ptr<PyDevice>> LocalDevices();
+  absl::StatusOr<nb_class_ptr<PyDevice>> DeviceFromLocalHardwareId(
       int local_hardware_id);
 
+  // Returns the PyDevice associated with the given PjRtDevice.
+  nb_class_ptr<PyDevice> GetPyDevice(PjRtDevice* device);
+
+  // Returns the PyMemorySpace associated with the given PjRtMemorySpace.
+  nb_class_ptr<PyMemorySpace> GetPyMemorySpace(PjRtMemorySpace* memory_space);
+
   // Returns a vector of live PyArray objects. PyArray objects may share
   // PjRtBuffers, so there may be duplicates of the same underlying device
   // buffer.
-  std::vector<pybind11::object> LiveBuffersOnDevice(PjRtDevice* device);
+  std::vector<nanobind::object> LiveBuffersOnDevice(PjRtDevice* device);
 
-  // Returns a vector of live PyLoadedExecutable objects.
-  // note: must return std::shared_ptr instead of raw ptrs
-  // https://pybind11.readthedocs.io/en/stable/advanced/smart_ptrs.html#std-shared-ptr
-  std::vector<std::shared_ptr<PyLoadedExecutable>> LiveExecutables();
+  nanobind::list LiveExecutables();
 
   // TODO(zhangqiaorjc): Remove when we have transparent defragmentation.
-  Status Defragment();
+  absl::Status Defragment();
 
-  StatusOr<std::vector<std::pair<pybind11::bytes, pybind11::object>>>
-  MakeCrossHostReceiveBuffers(absl::Span<const Shape> shapes,
-                              PjRtDevice* device);
+  static absl::StatusOr<nanobind::list> MakeCrossHostReceiveBuffers(
+      nb_class_ptr<PyClient> client, absl::Span<const Shape> shapes,
+      PjRtDevice* device);
 
-  StatusOr<pybind11::object> BufferFromPyval(
-      pybind11::handle argument, PjRtDevice* device, bool force_copy,
+  static absl::StatusOr<nanobind::object> BufferFromPyval(
+      nb_class_ptr<PyClient> client, nanobind::handle argument,
+      PjRtDevice* device, bool force_copy,
       ifrt::Client::HostBufferSemantics host_buffer_semantics);
 
-  StatusOr<std::shared_ptr<PyLoadedExecutable>> Compile(
-      std::string mlir_module, CompileOptions options,
-      std::vector<pybind11::capsule> host_callbacks);
+  static absl::StatusOr<nb_class_ptr<PyLoadedExecutable>> Compile(
+      nb_class_ptr<PyClient> client, std::string mlir_module,
+      CompileOptions options, std::vector<nanobind::capsule> host_callbacks);
 
-  StatusOr<pybind11::bytes> SerializeExecutable(
+  absl::StatusOr<nanobind::bytes> SerializeExecutable(
       const PyLoadedExecutable& executable) const;
-  StatusOr<std::shared_ptr<PyLoadedExecutable>> DeserializeExecutable(
-      const std::string& serialized, std::optional<CompileOptions> options,
-      std::vector<pybind11::capsule> host_callbacks);
+  static absl::StatusOr<nb_class_ptr<PyLoadedExecutable>> DeserializeExecutable(
+      nb_class_ptr<PyClient> client, nanobind::bytes serialized,
+      std::optional<CompileOptions> options,
+      std::vector<nanobind::capsule> host_callbacks);
 
-  StatusOr<pybind11::bytes> HeapProfile();
+  absl::StatusOr<nanobind::bytes> HeapProfile();
 
   // `GetEmitPythonCallbackDescriptor` takes in an input Python callable that
   // takes in arguments of shapes `operand_shapes` and returns values of shapes
@@ -224,19 +176,19 @@ class PyClient : public std::enable_shared_from_this<PyClient> {
   // either arrays or None values.
   // TODO(phawkins): pass operand_shapes and result_shapes as
   // absl::Span<Shape const> when nanobind transition is complete.
-  StatusOr<std::pair<uint64_t, pybind11::object>>
-  GetEmitPythonCallbackDescriptor(pybind11::function callable,
-                                  pybind11::object operand_shapes,
-                                  pybind11::object result_shapes);
+  absl::StatusOr<std::pair<uint64_t, nanobind::object>>
+  GetEmitPythonCallbackDescriptor(nanobind::callable callable,
+                                  nanobind::object operand_shapes,
+                                  nanobind::object result_shapes);
   // Deprecated; please switch to emitting a `CustomCallOp` directly.
-  StatusOr<XlaOp> EmitPythonCallbackFromDescriptor(
+  absl::StatusOr<XlaOp> EmitPythonCallbackFromDescriptor(
       XlaBuilder& builder, uint64_t descriptor,
       absl::Span<XlaOp const> operands, absl::Span<Shape const> result_shapes,
       std::optional<std::vector<Shape>> operand_layouts, bool has_side_effect);
   // Deprecated; please switch to using `GetEmitPythonCallbackDescriptor`
   // and then emitting a `CustomCall` op instead.
-  StatusOr<std::pair<XlaOp, pybind11::object>> EmitPythonCallback(
-      pybind11::function callable, XlaBuilder& builder,
+  absl::StatusOr<std::pair<XlaOp, nanobind::object>> EmitPythonCallback(
+      nanobind::callable callable, XlaBuilder& builder,
       absl::Span<XlaOp const> operands, absl::Span<Shape const> result_shapes,
       std::optional<std::vector<Shape>> operand_layouts, bool has_side_effect);
 
@@ -255,20 +207,29 @@ class PyClient : public std::enable_shared_from_this<PyClient> {
   // The callable receives as arguments NumPy arrays for arguments with array
   // types, and None for Token argument. The callable must return a tuple of
   // either arrays or None values.
-  StatusOr<pybind11::object> MakePythonCallbackUsingHostSendAndRecv(
-      pybind11::function callable, absl::Span<Shape const> operand_shapes,
+  absl::StatusOr<nanobind::object> MakePythonCallbackUsingHostSendAndRecv(
+      nanobind::callable callable, absl::Span<Shape const> operand_shapes,
       absl::Span<Shape const> result_shapes,
       absl::Span<uint16_t const> send_channel_ids,
       absl::Span<uint16_t const> recv_channel_ids,
-      pybind11::function serializer);
+      nanobind::callable serializer);
 
   std::vector<nanobind::object> LiveArrays() const;
 
+  static void RegisterPythonTypes(nanobind::module_& m);
+
+ protected:
+  static void Initialize(nb_class_ptr<PyClient> client);
+
  private:
   friend class PyLoadedExecutable;
   friend class PyArray;
   friend struct PyArray_Storage;
 
+  static int tp_traverse(PyObject* self, visitproc visit, void* arg);
+  static int tp_clear(PyObject* self);
+  static PyType_Slot slots_[];
+
   std::shared_ptr<ifrt::Client> ifrt_client_;
   absl::flat_hash_map<std::string, xla::ifrt::Client::ClientAttribute>
       client_attributes_;
@@ -278,10 +239,12 @@ class PyClient : public std::enable_shared_from_this<PyClient> {
 
   PyLoadedExecutable* executables_ = nullptr;
   PyArray_Storage* arrays_ = nullptr;
+
+  absl::flat_hash_map<ifrt::Device*, nb_class_ptr<PyDevice>> devices_;
+  absl::flat_hash_map<PjRtMemorySpace*, nb_class_ptr<PyMemorySpace>>
+      memory_spaces_;
 };
 
 }  // namespace xla
 
-PYBIND11_DECLARE_HOLDER_TYPE(T, xla::ClientAndPtr<T>);
-
 #endif  // XLA_PYTHON_PY_CLIENT_H_
diff --git a/third_party/xla/xla/python/py_client_gpu.cc b/third_party/xla/xla/python/py_client_gpu.cc
index ca6232d7ccb9df..88496bec8b1206 100644
--- a/third_party/xla/xla/python/py_client_gpu.cc
+++ b/third_party/xla/xla/python/py_client_gpu.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #endif
 #include "third_party/nanobind/include/nanobind/nanobind.h"
 #include "xla/pjrt/exceptions.h"
+#include "xla/pjrt/host_callback.h"
 #include "xla/primitive_util.h"
 #include "xla/python/callback.h"
 #include "xla/python/nb_numpy.h"
@@ -97,8 +98,10 @@ void XlaPythonGpuCallback(gpuStreamHandle stream, void** buffers,
     array.attr("flags").attr("writeable") = nb::bool_(false);
     PyTuple_SET_ITEM(host_input_arrays.ptr(), i, array.inc_ref().ptr());
   }
+  EnterHostCallback();
   std::optional<nb::tuple> maybe_result_tuple =
       callback->Call(host_input_arrays, status);
+  LeaveHostCallback();
   if (!maybe_result_tuple) {
     return;
   }
diff --git a/third_party/xla/xla/python/py_compile_only_client.cc b/third_party/xla/xla/python/py_compile_only_client.cc
index c6d88e54045a8d..a85758f921096e 100644
--- a/third_party/xla/xla/python/py_compile_only_client.cc
+++ b/third_party/xla/xla/python/py_compile_only_client.cc
@@ -15,25 +15,58 @@ limitations under the License.
 
 #include "xla/python/py_compile_only_client.h"
 
+#include <cstdint>
 #include <functional>
 #include <memory>
 #include <optional>
 #include <string>
+#include <string_view>
 #include <utility>
 #include <vector>
 
-#include "nanobind/nanobind.h"
 #include "absl/container/flat_hash_map.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
 #include "absl/types/span.h"
-#include "pybind11/stl.h"  // from @pybind11
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ExtensibleRTTI.h"
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/OwningOpRef.h"  // from @llvm-project
+#include "third_party/nanobind/include/nanobind/nanobind.h"
+#include "third_party/nanobind/include/nanobind/stl/shared_ptr.h"  // IWYU pragma: keep
+#include "third_party/nanobind/include/nanobind/stl/string_view.h"  // IWYU pragma: keep
+#include "third_party/nanobind/include/nanobind/stl/vector.h"  // IWYU pragma: keep
+#include "xla/literal.h"
 #include "xla/pjrt/mlir_to_hlo.h"
+#include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_common.h"
+#include "xla/pjrt/pjrt_compiler.h"
+#include "xla/pjrt/pjrt_device_description.h"
+#include "xla/pjrt/pjrt_executable.h"
+#include "xla/pjrt/pjrt_future.h"
 #include "xla/pjrt/status_casters.h"
+#include "xla/python/ifrt/array.h"
+#include "xla/python/ifrt/client.h"
+#include "xla/python/ifrt/compiler.h"
 #include "xla/python/ifrt/device.h"
-#include "tsl/python/lib/core/numpy.h"  //NOLINT
+#include "xla/python/ifrt/dtype.h"
+#include "xla/python/ifrt/executable.h"
+#include "xla/python/ifrt/shape.h"
+#include "xla/python/ifrt/sharding.h"
+#include "xla/python/ifrt/tuple.h"
+#include "xla/python/ifrt/value.h"
+#include "xla/python/nb_class_ptr.h"
+#include "xla/python/py_client.h"
+#include "xla/service/computation_placer.h"
+#include "xla/tsl/python/lib/core/numpy.h"
+#include "xla/util.h"
+#include "tsl/concurrency/ref_count.h"
+#include "tsl/platform/logging.h"
+#include "tsl/platform/statusor.h"
 
 namespace nb = nanobind;
-namespace py = pybind11;
 
 namespace xla {
 
@@ -66,16 +99,16 @@ class PjRtCompileOnlyDevice : public PjRtDevice {
       absl::string_view description) const override {
     return nullptr;
   }
-  Status TransferToInfeed(const LiteralSlice& literal) override {
+  absl::Status TransferToInfeed(const LiteralSlice& literal) override {
     return Unimplemented("TransferToInfeed is not supported");
   }
-  Status TransferFromOutfeed(MutableBorrowingLiteral literal) override {
+  absl::Status TransferFromOutfeed(MutableBorrowingLiteral literal) override {
     return Unimplemented("TransferFromOutfeed is not supported");
   }
   absl::Span<PjRtMemorySpace* const> memory_spaces() const override {
     return {};
   }
-  StatusOr<PjRtMemorySpace*> default_memory_space() const override {
+  absl::StatusOr<PjRtMemorySpace*> default_memory_space() const override {
     return Unimplemented("default_memory_space is not supported");
   }
 
@@ -86,13 +119,14 @@ class PjRtCompileOnlyDevice : public PjRtDevice {
 class InvalidIfrtCompiler final
     : public llvm::RTTIExtends<InvalidIfrtCompiler, ifrt::Compiler> {
  public:
-  StatusOr<std::unique_ptr<ifrt::LoadedExecutable>> Compile(
+  absl::StatusOr<std::unique_ptr<ifrt::LoadedExecutable>> Compile(
       std::unique_ptr<ifrt::Program> program,
       std::unique_ptr<ifrt::CompileOptions> options) override {
     return Unimplemented("Compile not implemented.");
   }
 
-  StatusOr<std::unique_ptr<ifrt::LoadedExecutable>> DeserializeLoadedExecutable(
+  absl::StatusOr<std::unique_ptr<ifrt::LoadedExecutable>>
+  DeserializeLoadedExecutable(
       absl::string_view serialized,
       std::unique_ptr<ifrt::DeserializeExecutableOptions> options) override {
     return Unimplemented("DeserializeLoadedExecutable not implemented.");
@@ -116,7 +150,7 @@ class CompileOnlyIfRtClient final
     }
   }
 
-  StatusOr<tsl::RCReference<ifrt::Array>> MakeArrayFromHostBuffer(
+  absl::StatusOr<tsl::RCReference<ifrt::Array>> MakeArrayFromHostBuffer(
       const void* data, ifrt::DType dtype, ifrt::Shape shape,
       std::optional<absl::Span<const int64_t>> byte_strides,
       std::shared_ptr<const ifrt::Sharding> sharding,
@@ -126,7 +160,8 @@ class CompileOnlyIfRtClient final
         "MakeArrayFromHostBuffer not available with compile-only client.");
   }
 
-  StatusOr<tsl::RCReference<ifrt::Array>> AssembleArrayFromSingleDeviceArrays(
+  absl::StatusOr<tsl::RCReference<ifrt::Array>>
+  AssembleArrayFromSingleDeviceArrays(
       ifrt::Shape shape, std::shared_ptr<const ifrt::Sharding> sharding,
       absl::Span<tsl::RCReference<ifrt::Array>> arrays,
       ifrt::ArrayCopySemantics semantics) override {
@@ -135,7 +170,7 @@ class CompileOnlyIfRtClient final
         "client.");
   }
 
-  StatusOr<tsl::RCReference<ifrt::Tuple>> MakeTuple(
+  absl::StatusOr<tsl::RCReference<ifrt::Tuple>> MakeTuple(
       absl::Span<tsl::RCReference<ifrt::Value>> values) override {
     return Unimplemented("MakeTuple not available with compile-only client.");
   }
@@ -165,17 +200,17 @@ class CompileOnlyIfRtClient final
     return {};
   }
   int process_index() const override { return 0; }
-  StatusOr<DeviceAssignment> GetDefaultDeviceAssignment(
+  absl::StatusOr<DeviceAssignment> GetDefaultDeviceAssignment(
       int num_replicas, int num_partitions) const override {
     return Unimplemented(
         "GetDefaultDeviceAssignment not available with compile-only client.");
   }
-  StatusOr<ifrt::Device*> LookupDevice(int device_id) const override {
+  absl::StatusOr<ifrt::Device*> LookupDevice(int device_id) const override {
     return Unimplemented(
         "LookupDevice not available with compile-only client.");
   }
 
-  StatusOr<ifrt::Device*> LookupAddressableDevice(
+  absl::StatusOr<ifrt::Device*> LookupAddressableDevice(
       int local_hardware_id) const override {
     return Unimplemented(
         "LookupAddressableDevice not available with compile-only client.");
@@ -187,7 +222,7 @@ class CompileOnlyIfRtClient final
 
   const PjRtTopologyDescription& topology() const { return *topology_; }
 
-  StatusOr<std::shared_ptr<const xla::PjRtTopologyDescription>>
+  absl::StatusOr<std::shared_ptr<const xla::PjRtTopologyDescription>>
   GetTopologyForDevices(
       absl::Span<ifrt::Device* const> devices) const override {
     return topology_;
@@ -207,15 +242,24 @@ class CompileOnlyPyClient : public PyClient {
  public:
   using PyClient::PyClient;
 
-  StatusOr<std::shared_ptr<PjRtExecutable>> CompileUnloaded(
-      std::string mlir_module, CompileOptions options,
-      std::vector<pybind11::capsule> host_callbacks) {
+  static nb_class_ptr<PyClient> Make(
+      std::shared_ptr<PjRtTopologyDescription> topology) {
+    auto client =
+        nb::borrow<nb_class_ptr<PyClient>>(make_nb_class<CompileOnlyPyClient>(
+            std::make_unique<CompileOnlyIfRtClient>(std::move(topology))));
+    CompileOnlyPyClient::Initialize(client);
+    return client;
+  }
+
+  absl::StatusOr<std::shared_ptr<PjRtExecutable>> CompileUnloaded(
+      std::string_view mlir_module, CompileOptions options,
+      std::vector<nb::capsule> host_callbacks) {
     if (!host_callbacks.empty()) {
       return Unimplemented(
           "Compiling with host_callbacks not available with compile-only "
           "client.");
     }
-    pybind11::gil_scoped_release gil_release;
+    nb::gil_scoped_release gil_release;
     mlir::MLIRContext context;
     TF_ASSIGN_OR_RETURN(mlir::OwningOpRef<mlir::ModuleOp> module,
                         ParseMlirModuleString(mlir_module, context));
@@ -226,42 +270,36 @@ class CompileOnlyPyClient : public PyClient {
     return PjRtCompile(std::move(options), module.get(),
                        ifrt_client->topology());
   }
+
+ private:
+  static void Initialize(nb_class_ptr<PyClient> client) {
+    PyClient::Initialize(client);
+  }
 };
 
 }  // namespace
 
-std::shared_ptr<PyClient> MakeCompileOnlyClient(
+nb_class_ptr<PyClient> MakeCompileOnlyClient(
     std::shared_ptr<PjRtTopologyDescription> topology) {
-  return std::make_shared<CompileOnlyPyClient>(
-      std::make_unique<CompileOnlyIfRtClient>(std::move(topology)));
+  return CompileOnlyPyClient::Make(std::move(topology));
 }
 
-void RegisterCompileOnlyClient(pybind11::module& m) {
-  pybind11::class_<CompileOnlyPyClient, PyClient,
-                   std::shared_ptr<CompileOnlyPyClient>>(m,
-                                                         "CompileOnlyPyClient")
+void RegisterCompileOnlyClient(nb::module_& m) {
+  nb::class_<CompileOnlyPyClient, PyClient>(m, "CompileOnlyPyClient")
       .def(
           "compile",
-          [](CompileOnlyPyClient& self, std::string mlir_module,
-             py::object options_py,
-             std::vector<pybind11::capsule> host_callbacks) {
-            // TODO(phawkins): just wrap CompileOnlyPyClient::CompileUnloaded
-            // directly when the nanobind transition is complete.
-            CompileOptions options;
-            if (!options_py.is_none()) {
-              try {
-                options =
-                    nb::cast<CompileOptions>(nb::handle(options_py.ptr()));
-              } catch (std::exception& e) {
-                throw py::type_error(e.what());
-              }
-            }
-            return ValueOrThrow(
-                self.CompileUnloaded(mlir_module, options, host_callbacks));
+          [](CompileOnlyPyClient& self, nb::bytes mlir_module,
+             CompileOptions options, std::vector<nb::capsule> host_callbacks) {
+            return ValueOrThrow(self.CompileUnloaded(
+                std::string_view(mlir_module.c_str(), mlir_module.size()),
+                std::move(options), std::move(host_callbacks)));
           },
-          pybind11::arg("computation"),
-          pybind11::arg("compile_options") = py::none(),
-          pybind11::arg("host_callbacks") = std::vector<pybind11::capsule>());
+          nb::arg("computation"), nb::arg("compile_options") = CompileOptions(),
+          nb::arg("host_callbacks") = std::vector<nb::capsule>())
+      .def(
+          "compile", ValueOrThrowWrapper(&CompileOnlyPyClient::CompileUnloaded),
+          nb::arg("computation"), nb::arg("compile_options") = CompileOptions(),
+          nb::arg("host_callbacks") = std::vector<nb::capsule>());
 }
 
 }  // namespace xla
diff --git a/third_party/xla/xla/python/py_compile_only_client.h b/third_party/xla/xla/python/py_compile_only_client.h
index e2e93720bdf234..0501cc869a276d 100644
--- a/third_party/xla/xla/python/py_compile_only_client.h
+++ b/third_party/xla/xla/python/py_compile_only_client.h
@@ -19,7 +19,9 @@ limitations under the License.
 #include <memory>
 
 // placeholder for index annotation headers
+#include "third_party/nanobind/include/nanobind/nanobind.h"
 #include "xla/pjrt/pjrt_compiler.h"
+#include "xla/python/nb_class_ptr.h"
 #include "xla/python/py_client.h"
 
 namespace xla {
@@ -33,10 +35,10 @@ namespace xla {
 // Python duck typing to treat the unloaded executable like a loaded executable
 // (except it will raise errors if you try to run it, which is what we want for
 // AOT environments).
-std::shared_ptr<PyClient> MakeCompileOnlyClient(
+nb_class_ptr<PyClient> MakeCompileOnlyClient(
     std::shared_ptr<PjRtTopologyDescription>);
 
-void RegisterCompileOnlyClient(pybind11::module& m);
+void RegisterCompileOnlyClient(nanobind::module_& m);
 
 }  // namespace xla
 
diff --git a/third_party/xla/xla/python/py_device.cc b/third_party/xla/xla/python/py_device.cc
new file mode 100644
index 00000000000000..bc947ae8f8848c
--- /dev/null
+++ b/third_party/xla/xla/python/py_device.cc
@@ -0,0 +1,321 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/python/py_device.h"
+
+#include <Python.h>
+
+#include <cstdint>
+#include <exception>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <variant>
+
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_join.h"
+#include "third_party/nanobind/include/nanobind/nanobind.h"
+#include "third_party/nanobind/include/nanobind/stl/optional.h"  // IWYU pragma: keep
+#include "third_party/nanobind/include/nanobind/stl/string.h"  // IWYU pragma: keep
+#include "third_party/nanobind/include/nanobind/stl/string_view.h"  // IWYU pragma: keep
+#include "third_party/nanobind/include/nanobind/stl/variant.h"  // IWYU pragma: keep
+#include "third_party/nanobind/include/nanobind/stl/vector.h"  // IWYU pragma: keep
+#include "xla/layout_util.h"
+#include "xla/literal.h"
+#include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/status_casters.h"
+#include "xla/python/ifrt/device.h"
+#include "xla/python/nb_class_ptr.h"
+#include "xla/python/nb_helpers.h"
+#include "xla/python/py_client.h"
+#include "xla/python/py_memory_space.h"
+#include "xla/python/python_ref_manager.h"
+#include "xla/python/types.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
+#include "xla/util.h"
+#include "tsl/framework/allocator.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/statusor.h"
+
+namespace nb = ::nanobind;
+
+namespace xla {
+
+PyDevice::PyDevice(nb_class_ptr<PyClient> client, ifrt::Device* device)
+    : client_(std::move(client)), device_(device) {}
+
+int PyDevice::id() const { return device_->id(); }
+
+int PyDevice::process_index() const { return device_->process_index(); }
+
+std::string_view PyDevice::platform() const {
+  // TODO(phawkins): this is a temporary backwards
+  // compatibility shim. We changed the name PJRT
+  // reports for GPU platforms to "cuda" or "rocm",
+  // but we haven't yet updated JAX clients that
+  // expect "gpu". Migrate users and remove this
+  // code.
+  if (client_->platform_name() == "cuda" ||
+      client_->platform_name() == "rocm") {
+    return std::string_view("gpu");
+  } else {
+    return client_->platform_name();
+  }
+}
+
+std::string_view PyDevice::device_kind() const {
+  return device_->device_kind();
+}
+
+std::optional<int> PyDevice::local_hardware_id() const {
+  int local_hardware_id = device_->local_hardware_id();
+  if (local_hardware_id == -1) {
+    return std::nullopt;
+  }
+  return local_hardware_id;
+}
+
+std::string_view PyDevice::Str() const { return device_->DebugString(); }
+
+std::string_view PyDevice::Repr() const { return device_->ToString(); }
+
+absl::Status PyDevice::TransferToInfeed(LiteralSlice literal) {
+  GlobalPyRefManager()->CollectGarbage();
+  nb::gil_scoped_release gil_release;
+  return device_->TransferToInfeed(literal);
+}
+
+absl::StatusOr<nb::object> PyDevice::TransferFromOutfeed(Shape shape) {
+  GlobalPyRefManager()->CollectGarbage();
+  std::shared_ptr<Literal> literal;
+  {
+    nb::gil_scoped_release gil_release;
+    ShapeUtil::ForEachMutableSubshape(
+        &shape, [](Shape* subshape, const ShapeIndex&) {
+          if (!subshape->has_layout()) {
+            LayoutUtil::SetToDefaultLayout(subshape);
+          }
+        });
+    literal = std::make_shared<Literal>(shape);
+    TF_RETURN_IF_ERROR(device_->TransferFromOutfeed(literal.get()));
+  }
+  return LiteralToPython(std::move(literal));
+}
+
+absl::StatusOr<nb_class_ptr<PyMemorySpace>> PyDevice::Memory(
+    std::string_view kind) const {
+  xla::PjRtMemorySpace* result_memory_space = nullptr;
+  for (auto* memory_space : device_->memory_spaces()) {
+    if (memory_space->memory_space_kind() == kind) {
+      if (result_memory_space != nullptr) {
+        std::string memories = absl::StrJoin(
+            device_->memory_spaces(), ", ",
+            [](std::string* out, const auto& memory_space) {
+              absl::StrAppend(out, memory_space->memory_space_kind());
+            });
+        auto device_kind = device_->device_kind();
+        return xla::InvalidArgument(
+            "Found more than one addressable memory for "
+            "kind %s which is not allowed. There can only "
+            "be one memory for each "
+            "kind. Device %s can address the following "
+            "memory kinds: %s",
+            kind, device_kind, memories);
+      }
+      result_memory_space = memory_space;
+    }
+  }
+  if (result_memory_space == nullptr) {
+    std::string memories =
+        absl::StrJoin(device_->memory_spaces(), ", ",
+                      [](std::string* out, const auto& memory_space) {
+                        absl::StrAppend(out, memory_space->memory_space_kind());
+                      });
+    auto device_kind = device_->device_kind();
+    return xla::InvalidArgument(
+        "Could not find memory addressable by device %s. Device %s "
+        "can address the following memory kinds: %s. "
+        "Got memory kind: %s",
+        device_kind, device_kind, memories, kind);
+  }
+  return client_->GetPyMemorySpace(result_memory_space);
+}
+
+absl::StatusOr<nb_class_ptr<PyMemorySpace>> PyDevice::DefaultMemory() const {
+  TF_ASSIGN_OR_RETURN(auto* memory_space, device_->default_memory_space());
+  return client_->GetPyMemorySpace(memory_space);
+}
+
+nb::list PyDevice::AddressableMemories() const {
+  nb::list memory_spaces;
+  for (auto* memory_space : device_->memory_spaces()) {
+    memory_spaces.append(client_->GetPyMemorySpace(memory_space));
+  }
+  return memory_spaces;
+}
+
+absl::StatusOr<std::optional<nb::dict>> PyDevice::MemoryStats() const {
+  GlobalPyRefManager()->CollectGarbage();
+  absl::StatusOr<tsl::AllocatorStats> maybe_stats =
+      device_->GetAllocatorStats();
+  if (absl::IsUnimplemented(maybe_stats.status())) {
+    return std::nullopt;
+  }
+  // Raise error if any status other than Unimplemented is returned.
+  ThrowIfError(maybe_stats.status());
+
+  nb::dict result;
+  result["num_allocs"] = maybe_stats->num_allocs;
+  result["bytes_in_use"] = maybe_stats->bytes_in_use;
+  result["peak_bytes_in_use"] = maybe_stats->peak_bytes_in_use;
+  result["largest_alloc_size"] = maybe_stats->largest_alloc_size;
+  if (maybe_stats->bytes_limit) {
+    result["bytes_limit"] = *maybe_stats->bytes_limit;
+  }
+  result["bytes_reserved"] = maybe_stats->bytes_reserved;
+  result["peak_bytes_reserved"] = maybe_stats->peak_bytes_reserved;
+  if (maybe_stats->bytes_reservable_limit) {
+    result["bytes_reservable_limit"] = *maybe_stats->bytes_reservable_limit;
+  }
+  result["largest_free_block_bytes"] = maybe_stats->largest_free_block_bytes;
+  if (maybe_stats->pool_bytes) {
+    result["pool_bytes"] = *maybe_stats->pool_bytes;
+  }
+  if (maybe_stats->peak_pool_bytes) {
+    result["peak_pool_bytes"] = *maybe_stats->peak_pool_bytes;
+  }
+  return result;
+}
+
+absl::StatusOr<std::intptr_t> PyDevice::GetStreamForExternalReadyEvents()
+    const {
+  return device_->GetStreamForExternalReadyEvents();
+}
+
+/* static */ int PyDevice::tp_traverse(PyObject* self, visitproc visit,
+                                       void* arg) {
+  PyDevice* d = nb::inst_ptr<PyDevice>(self);
+  Py_VISIT(d->client().ptr());
+  return 0;
+}
+
+/* static */ int PyDevice::tp_clear(PyObject* self) {
+  PyDevice* d = nb::inst_ptr<PyDevice>(self);
+  nb_class_ptr<PyClient> client;
+  std::swap(client, d->client_);
+  return 0;
+}
+
+PyType_Slot PyDevice::slots_[] = {
+    {Py_tp_traverse, (void*)PyDevice::tp_traverse},
+    {Py_tp_clear, (void*)PyDevice::tp_clear},
+    {0, nullptr},
+};
+
+/* static */ void PyDevice::RegisterPythonType(nb::module_& m) {
+  nb::class_<PyDevice> device(
+      m, "Device", nb::type_slots(PyDevice::slots_),
+      "A descriptor of an available device.\n\nSubclasses are used to "
+      "represent specific types of devices, e.g. CPUs, GPUs. Subclasses may "
+      "have additional properties specific to that device type.");
+  device
+      .def_prop_ro(
+          "id", &PyDevice::id,
+          "Integer ID of this device.\n\nUnique across all available devices "
+          "of this type, including remote devices on multi-host platforms.")
+      .def_prop_ro("process_index", &PyDevice::process_index,
+                   "Integer index of this device's process.\n\n"
+                   "This is always 0 except on multi-process platforms.")
+      .def_prop_ro("host_id", &PyDevice::process_index,
+                   "Deprecated; please use process_index")
+      .def_prop_ro("task_id", &PyDevice::process_index,
+                   "Deprecated; please use process_index")
+      .def_prop_ro("platform", &PyDevice::platform)
+      .def_prop_ro("device_kind", &PyDevice::device_kind)
+      .def_prop_ro("client", &PyDevice::client)
+      .def_prop_ro(
+          "local_hardware_id", &PyDevice::local_hardware_id,
+          "Opaque hardware ID, e.g., the CUDA device number. In general, not "
+          "guaranteed to be dense, and not guaranteed to be defined on all "
+          "platforms.")
+      .def("__str__", &PyDevice::Str)
+      .def("__repr__", &PyDevice::Repr)
+      .def("transfer_to_infeed",
+           ThrowIfErrorWrapper(&PyDevice::TransferToInfeed))
+      .def("transfer_from_outfeed",
+           ValueOrThrowWrapper(&PyDevice::TransferFromOutfeed))
+      .def("memory", ValueOrThrowWrapper(&PyDevice::Memory), nb::arg("kind"))
+      .def("default_memory", ValueOrThrowWrapper(&PyDevice::DefaultMemory),
+           "Returns the default memory of a device.")
+      .def("addressable_memories", &PyDevice::AddressableMemories,
+           "Returns all the memories that a device can address.")
+
+      .def("live_buffers",
+           [](nb::handle device) {
+             PythonDeprecationWarning(
+                 "Per device live_buffers() is deprecated. Please "
+                 "use the jax.live_arrays() for jax.Arrays instead.");
+             return nb::list();
+           })
+      .def(
+          "memory_stats", ValueOrThrowWrapper(&PyDevice::MemoryStats),
+          "Returns memory statistics for this device keyed by name. May not "
+          "be implemented on all platforms, and different platforms may return "
+          "different stats, or -1 for unavailable stats. 'bytes_in_use' is "
+          "usually available. Intended for diagnostic use.")
+      .def(
+          "get_stream_for_external_ready_events",
+          xla::ValueOrThrowWrapper(&PyDevice::GetStreamForExternalReadyEvents));
+  static PyMethodDef get_attr_method = {
+      "__getattr__",
+      +[](PyObject* self, PyObject* args) -> PyObject* {
+        PyObject* key;
+        if (!PyArg_ParseTuple(args, "O", &key)) {
+          PyErr_SetString(PyExc_TypeError, "__getattr__ must take 1 argument.");
+          return nullptr;
+        }
+        try {
+          auto device = nb::cast<PyDevice*>(nb::handle(self));
+          auto name = nb::cast<std::string_view>(nb::handle(key));
+          const auto& attrs = device->device_->Attributes();
+          auto it = attrs.find(name);
+          if (it != attrs.end()) {
+            auto result =
+                std::visit([](auto&& v) { return nb::cast(v); }, it->second);
+            return result.release().ptr();
+          }
+          PyErr_SetNone(PyExc_AttributeError);
+          return nullptr;
+        } catch (std::exception& e) {
+          PyErr_Format(PyExc_SystemError, "Unhandled nanobind exception: %s",
+                       e.what());
+          return nullptr;
+        } catch (...) {
+          PyErr_SetString(PyExc_SystemError, "Unhandled nanobind exception.");
+          return nullptr;
+        }
+      },
+      METH_VARARGS,
+      nullptr,
+  };
+  device.attr("__getattr__") = nb::steal<nb::object>(PyDescr_NewMethod(
+      reinterpret_cast<PyTypeObject*>(device.ptr()), &get_attr_method));
+}
+
+}  // namespace xla
diff --git a/third_party/xla/xla/python/py_device.h b/third_party/xla/xla/python/py_device.h
new file mode 100644
index 00000000000000..0576d55da325f0
--- /dev/null
+++ b/third_party/xla/xla/python/py_device.h
@@ -0,0 +1,83 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_PYTHON_PY_DEVICE_H_
+#define XLA_PYTHON_PY_DEVICE_H_
+
+#include <Python.h>
+
+#include <cstdint>
+#include <optional>
+#include <string_view>
+
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "third_party/nanobind/include/nanobind/nanobind.h"
+#include "xla/literal.h"
+#include "xla/python/ifrt/device.h"
+#include "xla/python/nb_class_ptr.h"
+#include "xla/python/py_client.h"
+#include "xla/shape.h"
+
+namespace xla {
+
+class PyDevice {
+ public:
+  PyDevice(nb_class_ptr<PyClient> client, ifrt::Device* device);
+
+  // Devices are compared using Python object identity, so we don't allow them
+  // to be copied or moved.
+  PyDevice(const PyDevice&) = delete;
+  PyDevice(PyDevice&&) = delete;
+  PyDevice& operator=(const PyDevice&) = delete;
+  PyDevice& operator=(PyDevice&&) = delete;
+
+  const nb_class_ptr<PyClient>& client() const { return client_; }
+  ifrt::Device* device() const { return device_; }
+
+  int id() const;
+  int process_index() const;
+  std::string_view platform() const;
+  std::string_view device_kind() const;
+  std::optional<int> local_hardware_id() const;
+
+  std::string_view Str() const;
+  std::string_view Repr() const;
+
+  absl::Status TransferToInfeed(LiteralSlice literal);
+  absl::StatusOr<nanobind::object> TransferFromOutfeed(Shape shape);
+
+  absl::StatusOr<nb_class_ptr<PyMemorySpace>> Memory(
+      std::string_view kind) const;
+  absl::StatusOr<nb_class_ptr<PyMemorySpace>> DefaultMemory() const;
+  nanobind::list AddressableMemories() const;
+  absl::StatusOr<std::optional<nanobind::dict>> MemoryStats() const;
+
+  absl::StatusOr<std::intptr_t> GetStreamForExternalReadyEvents() const;
+
+  static void RegisterPythonType(nanobind::module_& m);
+
+ private:
+  static int tp_traverse(PyObject* self, visitproc visit, void* arg);
+  static int tp_clear(PyObject* self);
+  static PyType_Slot slots_[];
+
+  nb_class_ptr<PyClient> client_;
+  ifrt::Device* device_;
+};
+
+}  // namespace xla
+
+#endif  // XLA_PYTHON_PY_DEVICE_H_
diff --git a/third_party/xla/xla/python/py_device_list.cc b/third_party/xla/xla/python/py_device_list.cc
index b28b67f14178d7..3ce4eb19d34538 100644
--- a/third_party/xla/xla/python/py_device_list.cc
+++ b/third_party/xla/xla/python/py_device_list.cc
@@ -19,7 +19,6 @@ limitations under the License.
 
 #include <cstddef>
 #include <cstdint>
-#include <memory>
 #include <optional>
 #include <string>
 #include <utility>
@@ -31,12 +30,12 @@ limitations under the License.
 #include "third_party/nanobind/include/nanobind/make_iterator.h"
 #include "third_party/nanobind/include/nanobind/nanobind.h"
 #include "third_party/nanobind/include/nanobind/stl/string.h"  // IWYU pragma: keep
-#include "pybind11/pybind11.h"  // from @pybind11
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/python/ifrt/device.h"
 #include "xla/python/nb_class_ptr.h"
 #include "xla/python/nb_helpers.h"
 #include "xla/python/py_client.h"
+#include "xla/python/py_device.h"
 #include "xla/python/python_ref_manager.h"
 #include "xla/python/sharding.h"
 #include "xla/python/types.h"
@@ -45,9 +44,8 @@ limitations under the License.
 namespace jax {
 
 namespace nb = ::nanobind;
-namespace py = ::pybind11;
 
-PyDeviceList::PyDeviceList(std::shared_ptr<xla::PyClient> py_client,
+PyDeviceList::PyDeviceList(xla::nb_class_ptr<xla::PyClient> py_client,
                            xla::ifrt::DeviceList device_list)
     : py_client_(std::move(py_client)), device_list_(std::move(device_list)) {}
 
@@ -61,19 +59,19 @@ PyDeviceList::PyDeviceList(nb::tuple py_device_assignment)
   xla::ifrt::DeviceList::Devices devices;
   devices.reserve(py_device_assignment.size());
   for (nb::handle obj : py_device_assignment) {
-    if (!py::isinstance<xla::PjRtDevice>(obj.ptr())) {
-      // Non-`xla::PjRtDevice` is used on an alternative JAX backend with device
+    if (!nb::isinstance<xla::PyDevice>(obj.ptr())) {
+      // Non-`xla::PyDevice` is used on an alternative JAX backend with device
       // duck typing. Use Python device objects already set in `device_list_`.
       return;
     }
-    auto py_device = py::cast<xla::ClientAndPtr<xla::PjRtDevice>>(obj.ptr());
-    if (py_client_ == nullptr) {
-      py_client_ = py_device.client();
-    } else if (py_device.client() != py_client_) {
+    auto py_device = nb::cast<xla::PyDevice*>(obj);
+    if (py_client_.get() == nullptr) {
+      py_client_ = py_device->client();
+    } else if (py_device->client().get() != py_client_.get()) {
       // If the list contains multiple clients, fall back to device duck typing.
       return;
     }
-    devices.push_back(py_device.get());
+    devices.push_back(py_device->device());
   }
   device_list_ = xla::ifrt::DeviceList(std::move(devices));
 }
@@ -154,9 +152,7 @@ nb::object PyDeviceList::GetItem(int index) {
       } else if (index < 0) {
         index += device_list.size();
       }
-      py::object d =
-          py::cast(xla::WrapWithClient(py_client_, device_list[index]));
-      return nb::steal(d.release().ptr());
+      return py_client_->GetPyDevice(device_list[index]);
     }
     case 1:
       return std::get<1>(device_list_).attr("__getitem__")(index);
@@ -176,8 +172,7 @@ nb::object PyDeviceList::GetSlice(nb::slice slice) {
       }
       nb::tuple out = nb::steal<nb::tuple>(PyTuple_New(slicelength));
       for (size_t i = 0; i < slicelength; ++i) {
-        py::object d =
-            py::cast(xla::WrapWithClient(py_client_, device_list[start]));
+        nb::object d = py_client_->GetPyDevice(device_list[start]);
         PyTuple_SET_ITEM(out.ptr(), i, d.release().ptr());
         start += step;
       }
@@ -197,7 +192,7 @@ nb::tuple PyDeviceList::AsTuple() const {
       nb::tuple out = nb::steal<nb::tuple>(PyTuple_New(device_list.size()));
       int i = 0;
       for (xla::ifrt::Device* device : device_list) {
-        py::object d = py::cast(xla::WrapWithClient(py_client_, device));
+        nb::object d = py_client_->GetPyDevice(device);
         PyTuple_SET_ITEM(out.ptr(), i, d.release().ptr());
         ++i;
       }
@@ -218,18 +213,16 @@ nb::iterator PyDeviceList::Iter() {
       struct Iterator {
         void operator++() { ++it; }
         bool operator==(const Iterator& other) const { return it == other.it; }
-        xla::ClientAndPtr<xla::PjRtDevice> operator*() const {
-          return xla::WrapWithClient(py_client, *it);
+        xla::nb_class_ptr<xla::PyDevice> operator*() const {
+          return py_client->GetPyDevice(*it);
         }
-        const std::shared_ptr<xla::PyClient>& py_client;
+        xla::nb_class_ptr<xla::PyClient> py_client;
         xla::ifrt::DeviceList::Devices::const_iterator it;
       };
-      return nb::steal<nb::iterator>(
-          py::make_iterator(
-              Iterator{py_client_, std::get<0>(device_list_).begin()},
-              Iterator{py_client_, std::get<0>(device_list_).end()})
-              .release()
-              .ptr());
+      return nb::make_iterator(
+          nb::type<PyDeviceList>(), "ifrt_device_iterator",
+          Iterator{py_client_, std::get<0>(device_list_).begin()},
+          Iterator{py_client_, std::get<0>(device_list_).end()});
     }
     case 1:
       return nb::make_iterator(
diff --git a/third_party/xla/xla/python/py_device_list.h b/third_party/xla/xla/python/py_device_list.h
index 38c551bca4de4b..a569f3958c68f8 100644
--- a/third_party/xla/xla/python/py_device_list.h
+++ b/third_party/xla/xla/python/py_device_list.h
@@ -33,7 +33,7 @@ namespace jax {
 // Device list with various caching and direct access to IFRT DeviceList.
 class PyDeviceList {
  public:
-  PyDeviceList(std::shared_ptr<xla::PyClient> py_client,
+  PyDeviceList(xla::nb_class_ptr<xla::PyClient> py_client,
                xla::ifrt::DeviceList device_list);
   explicit PyDeviceList(nanobind::tuple py_device_assignment);
   ~PyDeviceList();
@@ -44,7 +44,7 @@ class PyDeviceList {
   PyDeviceList& operator=(PyDeviceList&&) = delete;
 
   // These two methods are safe to call from C++ without GIL.
-  std::shared_ptr<xla::PyClient> py_client() const { return py_client_; }
+  xla::nb_class_ptr<xla::PyClient> py_client() const { return py_client_; }
   absl::StatusOr<xla::ifrt::DeviceList> ifrt_device_list() const;
 
   // Methods below require GIL.
@@ -78,7 +78,7 @@ class PyDeviceList {
 
   // Valid only if `device_list_` contains `xla::ifrt::DeviceList` and
   // non-empty.
-  std::shared_ptr<xla::PyClient> py_client_;
+  xla::nb_class_ptr<xla::PyClient> py_client_;
 
   // Either C++ `ifrt::DeviceList` or Python duck-type devices.
   // TODO(hyeontaek): Remove support for Python duck-type devices once all
diff --git a/third_party/xla/xla/python/py_executable.cc b/third_party/xla/xla/python/py_executable.cc
index b494aaf7f98676..837a8ed5222c40 100644
--- a/third_party/xla/xla/python/py_executable.cc
+++ b/third_party/xla/xla/python/py_executable.cc
@@ -34,17 +34,19 @@ limitations under the License.
 #include "third_party/nanobind/include/nanobind/nanobind.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/layout.h"
-#include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/pjrt_future.h"
+#include "xla/pjrt/pjrt_layout.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/device.h"
 #include "xla/python/ifrt/executable.h"
 #include "xla/python/ifrt/future.h"
 #include "xla/python/ifrt/memory.h"
 #include "xla/python/ifrt/sharding.h"
+#include "xla/python/nb_class_ptr.h"
 #include "xla/python/py_array.h"
 #include "xla/python/py_client.h"
+#include "xla/python/py_device.h"
 #include "xla/python/traceback.h"
 #include "tsl/concurrency/ref_count.h"
 #include "tsl/platform/fingerprint.h"
@@ -74,7 +76,7 @@ absl::Status PyShardedToken::Await() {
 }
 
 PyLoadedExecutable::PyLoadedExecutable(
-    std::shared_ptr<PyClient> client,
+    nb_class_ptr<PyClient> client,
     std::unique_ptr<ifrt::LoadedExecutable> ifrt_loaded_executable,
     std::optional<nb_traceback> traceback,
     std::optional<std::string> fingerprint)
@@ -111,12 +113,12 @@ PyLoadedExecutable::~PyLoadedExecutable() {
   }
 }
 
-std::vector<ClientAndPtr<PjRtDevice>> PyLoadedExecutable::AddressableDevices()
+std::vector<nb_class_ptr<PyDevice>> PyLoadedExecutable::AddressableDevices()
     const {
-  std::vector<ClientAndPtr<PjRtDevice>> devices;
+  std::vector<nb_class_ptr<PyDevice>> devices;
   devices.reserve(ifrt_loaded_executable_->addressable_devices().size());
   for (ifrt::Device* device : ifrt_loaded_executable_->addressable_devices()) {
-    devices.push_back(WrapWithClient(client_, device));
+    devices.push_back(client_->GetPyDevice(device));
   }
   return devices;
 }
@@ -175,7 +177,7 @@ struct ShardedBufferAdapter<ExecuteShardedArg> {
 };
 
 void PopulateExecuteShardedResults(
-    const std::shared_ptr<PyClient>& client,
+    const nb_class_ptr<PyClient>& client,
     std::vector<tsl::RCReference<ifrt::Array>> ifrt_arrays,
     const xla::PjRtFuture<absl::Status>& result_status, int num_computations,
     std::vector<std::vector<PyArray>>& outputs) {
@@ -199,7 +201,7 @@ void PopulateExecuteShardedResults(
 
 template <typename ArgT, typename ArgAdapter = ShardedBufferAdapter<ArgT>>
 absl::StatusOr<PyExecuteResults> ExecuteShardedOnLocalDevicesInternal(
-    const ExecuteOptions& options, const std::shared_ptr<PyClient>& client,
+    const ExecuteOptions& options, const nb_class_ptr<PyClient>& client,
     ifrt::LoadedExecutable* ifrt_loaded_executable, absl::Span<const ArgT> args,
     std::optional<std::vector<PjRtFuture<absl::Status>>>& returned_futures,
     bool attach_status_to_results) {
@@ -253,7 +255,7 @@ absl::StatusOr<PyExecuteResults> ExecuteShardedOnLocalDevicesInternal(
 }  // namespace
 
 PyExecuteResults::PyExecuteResults(
-    const std::shared_ptr<PyClient>& client,
+    const nb_class_ptr<PyClient>& client,
     std::vector<tsl::RCReference<ifrt::Array>> ifrt_arrays,
     int num_computations, PyShardedToken token,
     xla::PjRtFuture<absl::Status> result_status)
@@ -405,14 +407,14 @@ PyLoadedExecutable::GetOutputMemoryKinds() const {
   return ifrt_loaded_executable_->GetOutputMemoryKinds();
 }
 
-absl::StatusOr<std::vector<Layout>> PyLoadedExecutable::GetParameterLayouts()
-    const {
+absl::StatusOr<std::vector<std::unique_ptr<PjRtLayout>>>
+PyLoadedExecutable::GetParameterLayouts() const {
   nb::gil_scoped_release gil_release;
   return ifrt_loaded_executable_->GetParameterLayouts();
 }
 
-absl::StatusOr<std::vector<Layout>> PyLoadedExecutable::GetOutputLayouts()
-    const {
+absl::StatusOr<std::vector<std::unique_ptr<PjRtLayout>>>
+PyLoadedExecutable::GetOutputLayouts() const {
   nb::gil_scoped_release gil_release;
   return ifrt_loaded_executable_->GetOutputLayouts();
 }
diff --git a/third_party/xla/xla/python/py_executable.h b/third_party/xla/xla/python/py_executable.h
index 5aed9320f0eacc..3139f65e4e1ce0 100644
--- a/third_party/xla/xla/python/py_executable.h
+++ b/third_party/xla/xla/python/py_executable.h
@@ -41,6 +41,7 @@ limitations under the License.
 #include "xla/pjrt/pjrt_future.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/executable.h"
+#include "xla/python/nb_class_ptr.h"
 #include "xla/python/pjrt_ifrt/pjrt_executable.h"
 #include "xla/python/py_array.h"
 #include "xla/python/py_client.h"
@@ -87,7 +88,7 @@ class PyShardedToken {
 
 class PyExecuteResults {
  public:
-  PyExecuteResults(const std::shared_ptr<PyClient>& client,
+  PyExecuteResults(const nb_class_ptr<PyClient>& client,
                    std::vector<tsl::RCReference<ifrt::Array>> ifrt_arrays,
                    int num_computations, PyShardedToken token,
                    xla::PjRtFuture<absl::Status> result_status =
@@ -116,7 +117,7 @@ class PyExecuteResults {
  private:
   bool is_exploded_ = false;
   bool token_consumed_ = false;
-  std::shared_ptr<PyClient> client_;
+  nb_class_ptr<PyClient> client_;
   std::vector<tsl::RCReference<ifrt::Array>> ifrt_arrays_;
   int num_computations_;
   PyShardedToken token_;
@@ -129,17 +130,16 @@ using ExecuteShardedArg = std::variant<PyArray, std::vector<PyArray>>;
 // Python wrapper around PjRtExecutable. We use a wrapper class:
 // a) to keep the PyClient alive via a std::shared_ptr<>
 // b) to add Python-specific functionality.
-class PyLoadedExecutable
-    : public std::enable_shared_from_this<PyLoadedExecutable> {
+class PyLoadedExecutable {
  public:
   PyLoadedExecutable(
-      std::shared_ptr<PyClient> client,
+      nb_class_ptr<PyClient> client,
       std::unique_ptr<ifrt::LoadedExecutable> ifrt_loaded_executable,
       std::optional<nb_traceback> traceback,
       std::optional<std::string> fingerprint);
   ~PyLoadedExecutable();
 
-  std::shared_ptr<PyClient> client() const { return client_; }
+  nb_class_ptr<PyClient> client() const { return client_; }
   ifrt::LoadedExecutable* ifrt_loaded_executable() const {
     return ifrt_loaded_executable_.get();
   }
@@ -149,7 +149,7 @@ class PyLoadedExecutable
     return ifrt_loaded_executable_->addressable_device_logical_ids();
   }
 
-  std::vector<ClientAndPtr<PjRtDevice>> AddressableDevices() const;
+  std::vector<nb_class_ptr<PyDevice>> AddressableDevices() const;
 
   int64_t SizeOfGeneratedCodeInBytes() const {
     return ifrt_loaded_executable_->SizeOfGeneratedCodeInBytes();
@@ -191,9 +191,11 @@ class PyLoadedExecutable
   absl::StatusOr<std::vector<std::vector<std::string_view>>>
   GetOutputMemoryKinds() const;
 
-  absl::StatusOr<std::vector<Layout>> GetParameterLayouts() const;
+  absl::StatusOr<std::vector<std::unique_ptr<PjRtLayout>>> GetParameterLayouts()
+      const;
 
-  absl::StatusOr<std::vector<Layout>> GetOutputLayouts() const;
+  absl::StatusOr<std::vector<std::unique_ptr<PjRtLayout>>> GetOutputLayouts()
+      const;
 
   std::optional<std::vector<OpSharding>> GetParameterShardings() const;
 
@@ -235,7 +237,7 @@ class PyLoadedExecutable
  private:
   friend class PyClient;
 
-  std::shared_ptr<PyClient> client_;
+  nb_class_ptr<PyClient> client_;
   std::unique_ptr<ifrt::LoadedExecutable> ifrt_loaded_executable_;
   std::optional<nb_traceback> traceback_;
 
diff --git a/third_party/xla/xla/python/py_memory_space.cc b/third_party/xla/xla/python/py_memory_space.cc
new file mode 100644
index 00000000000000..73aab577e544cc
--- /dev/null
+++ b/third_party/xla/xla/python/py_memory_space.cc
@@ -0,0 +1,107 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/python/py_memory_space.h"
+
+#include <Python.h>
+
+#include <string_view>
+#include <utility>
+
+#include "third_party/nanobind/include/nanobind/nanobind.h"
+#include "third_party/nanobind/include/nanobind/stl/string_view.h"  // IWYU pragma: keep
+#include "xla/pjrt/pjrt_client.h"
+#include "xla/python/ifrt/device.h"
+#include "xla/python/nb_class_ptr.h"
+#include "xla/python/py_client.h"
+
+namespace nb = ::nanobind;
+
+namespace xla {
+
+PyMemorySpace::PyMemorySpace(nb_class_ptr<PyClient> client,
+                             PjRtMemorySpace* memory_space)
+    : client_(std::move(client)), memory_space_(memory_space) {}
+
+int PyMemorySpace::process_index() const { return client_->process_index(); }
+
+std::string_view PyMemorySpace::platform() const {
+  // TODO(phawkins): this is a temporary backwards
+  // compatibility shim. We changed the name PJRT
+  // reports for GPU platforms to "cuda" or "rocm",
+  // but we haven't yet updated JAX clients that
+  // expect "gpu". Migrate users and remove this
+  // code.
+  if (client_->platform_name() == "cuda" ||
+      client_->platform_name() == "rocm") {
+    return std::string_view("gpu");
+  } else {
+    return client_->platform_name();
+  }
+}
+
+std::string_view PyMemorySpace::kind() const {
+  return memory_space_->memory_space_kind();
+}
+
+std::string_view PyMemorySpace::Str() const {
+  return memory_space_->DebugString();
+}
+
+std::string_view PyMemorySpace::Repr() const {
+  return memory_space_->ToString();
+}
+
+nb::list PyMemorySpace::AddressableByDevices() const {
+  nb::list devices;
+  for (ifrt::Device* device : memory_space_->devices()) {
+    devices.append(client_->GetPyDevice(device));
+  }
+  return devices;
+}
+
+/* static */ int PyMemorySpace::tp_traverse(PyObject* self, visitproc visit,
+                                            void* arg) {
+  PyMemorySpace* d = nb::inst_ptr<PyMemorySpace>(self);
+  Py_VISIT(d->client().ptr());
+  return 0;
+}
+
+/* static */ int PyMemorySpace::tp_clear(PyObject* self) {
+  PyMemorySpace* d = nb::inst_ptr<PyMemorySpace>(self);
+  nb_class_ptr<PyClient> client;
+  std::swap(client, d->client_);
+  return 0;
+}
+
+PyType_Slot PyMemorySpace::slots_[] = {
+    {Py_tp_traverse, (void*)PyMemorySpace::tp_traverse},
+    {Py_tp_clear, (void*)PyMemorySpace::tp_clear},
+    {0, nullptr},
+};
+
+/* static */ void PyMemorySpace::RegisterPythonType(nb::module_& m) {
+  nb::class_<PyMemorySpace> device(m, "Memory",
+                                   nb::type_slots(PyMemorySpace::slots_));
+  device.def_prop_ro("process_index", &PyMemorySpace::process_index)
+      .def_prop_ro("platform", &PyMemorySpace::platform)
+      .def_prop_ro("kind", &PyMemorySpace::kind)
+      .def("__str__", &PyMemorySpace::Str)
+      .def("__repr__", &PyMemorySpace::Repr)
+      .def("addressable_by_devices", &PyMemorySpace::AddressableByDevices,
+           "Returns devices that can address this memory.");
+}
+
+}  // namespace xla
diff --git a/third_party/xla/xla/python/py_memory_space.h b/third_party/xla/xla/python/py_memory_space.h
new file mode 100644
index 00000000000000..8605a6bd1d37f9
--- /dev/null
+++ b/third_party/xla/xla/python/py_memory_space.h
@@ -0,0 +1,66 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_PYTHON_PY_MEMORY_SPACE_H_
+#define XLA_PYTHON_PY_MEMORY_SPACE_H_
+
+#include <Python.h>
+
+#include <string_view>
+
+#include "third_party/nanobind/include/nanobind/nanobind.h"
+#include "xla/pjrt/pjrt_client.h"
+#include "xla/python/nb_class_ptr.h"
+#include "xla/python/py_client.h"
+
+namespace xla {
+
+class PyMemorySpace {
+ public:
+  PyMemorySpace(nb_class_ptr<PyClient> client, PjRtMemorySpace* memory_space);
+
+  // Memory spaces are compared using Python object identity, so we don't allow
+  // them to be copied or moved.
+  PyMemorySpace(const PyMemorySpace&) = delete;
+  PyMemorySpace(PyMemorySpace&&) = delete;
+  PyMemorySpace& operator=(const PyMemorySpace&) = delete;
+  PyMemorySpace& operator=(PyMemorySpace&&) = delete;
+
+  const nb_class_ptr<PyClient>& client() const { return client_; }
+  PjRtMemorySpace* memory_space() const { return memory_space_; }
+
+  int process_index() const;
+  std::string_view platform() const;
+  std::string_view kind() const;
+
+  std::string_view Str() const;
+  std::string_view Repr() const;
+
+  nanobind::list AddressableByDevices() const;
+
+  static void RegisterPythonType(nanobind::module_& m);
+
+ private:
+  static int tp_traverse(PyObject* self, visitproc visit, void* arg);
+  static int tp_clear(PyObject* self);
+  static PyType_Slot slots_[];
+
+  nb_class_ptr<PyClient> client_;
+  PjRtMemorySpace* memory_space_;
+};
+
+}  // namespace xla
+
+#endif  // XLA_PYTHON_PY_MEMORY_SPACE_H_
diff --git a/third_party/xla/xla/python/py_values.cc b/third_party/xla/xla/python/py_values.cc
index 396dc542b11c1c..7454dd4e63bf93 100644
--- a/third_party/xla/xla/python/py_values.cc
+++ b/third_party/xla/xla/python/py_values.cc
@@ -50,6 +50,7 @@ limitations under the License.
 #include "xla/python/sharding.h"
 #include "xla/python/types.h"
 #include "xla/shape.h"
+#include "xla/tsl/python/lib/core/numpy.h"
 #include "xla/types.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
@@ -57,7 +58,6 @@ limitations under the License.
 #include "tsl/platform/ml_dtypes.h"
 #include "tsl/platform/statusor.h"
 #include "tsl/profiler/lib/traceme.h"
-#include "tsl/python/lib/core/numpy.h"
 
 namespace nb = nanobind;
 
diff --git a/third_party/xla/xla/python/python_ref_manager.cc b/third_party/xla/xla/python/python_ref_manager.cc
index 427658e6f9e369..9e40ced1ed9cd6 100644
--- a/third_party/xla/xla/python/python_ref_manager.cc
+++ b/third_party/xla/xla/python/python_ref_manager.cc
@@ -29,7 +29,6 @@ limitations under the License.
 namespace xla {
 
 namespace nb = nanobind;
-namespace py = pybind11;
 
 PythonRefManager::ManagedPyObjects::ManagedPyObjects(
     PythonRefManager* manager, absl::Span<nb::object> objects)
@@ -46,24 +45,6 @@ PythonRefManager::ManagedPyObjects::~ManagedPyObjects() {
   }
 }
 
-std::shared_ptr<PythonRefManager::ManagedPyObjects>
-PythonRefManager::ManageReference(py::object object) {
-  nb::object o = nb::steal(object.release().ptr());
-  return std::make_shared<ManagedPyObjects>(this,
-                                            absl::Span<nb::object>(&o, 1));
-}
-
-std::shared_ptr<PythonRefManager::ManagedPyObjects>
-PythonRefManager::ManageReferences(absl::Span<py::object> objects) {
-  std::vector<nb::object> objects_to_manage;
-  objects_to_manage.reserve(objects.size());
-  for (py::object& object : objects) {
-    objects_to_manage.push_back(nb::steal(object.release().ptr()));
-  }
-  return std::make_shared<ManagedPyObjects>(this,
-                                            absl::MakeSpan(objects_to_manage));
-}
-
 std::shared_ptr<PythonRefManager::ManagedPyObjects>
 PythonRefManager::ManageReference(nb::object object) {
   return std::make_shared<ManagedPyObjects>(this,
@@ -91,22 +72,6 @@ void PythonRefManager::AddGarbage(absl::Span<nb::object> garbage) {
   }
 }
 
-void PythonRefManager::AddGarbage(py::object garbage) {
-  absl::MutexLock lock(&mu_);
-  // We want to collect arbitrary python garbage (e.g., buffers) aggressively.
-  garbage_count_.fetch_add(100, std::memory_order_relaxed);
-  python_garbage_.push_back(nb::steal(garbage.release().ptr()));
-}
-
-void PythonRefManager::AddGarbage(absl::Span<py::object> garbage) {
-  absl::MutexLock lock(&mu_);
-  // We want to collect arbitrary python garbage (e.g., buffers) aggressively.
-  garbage_count_.fetch_add(100, std::memory_order_relaxed);
-  for (py::object& o : garbage) {
-    python_garbage_.push_back(nb::steal(o.release().ptr()));
-  }
-}
-
 void PythonRefManager::AddGarbage(
     absl::Span<std::pair<PyCodeObject*, int> const> garbage) {
   absl::MutexLock lock(&mu_);
diff --git a/third_party/xla/xla/python/python_ref_manager.h b/third_party/xla/xla/python/python_ref_manager.h
index 27df2fe7494102..fd7b4cfbe12221 100644
--- a/third_party/xla/xla/python/python_ref_manager.h
+++ b/third_party/xla/xla/python/python_ref_manager.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef XLA_PYTHON_PYTHON_REF_MANAGER_H_
 #define XLA_PYTHON_PYTHON_REF_MANAGER_H_
 
+#include <Python.h>
+
 #include <atomic>
 #include <deque>
 #include <memory>
@@ -26,7 +28,6 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
 #include "third_party/nanobind/include/nanobind/nanobind.h"
-#include "pybind11/pybind11.h"  // from @pybind11
 
 namespace xla {
 
@@ -43,7 +44,7 @@ class PythonRefManager {
  public:
   PythonRefManager() = default;
 
-  // Holds references to a set of pybind11::objects, adding the references to
+  // Holds references to a set of nanobind::objects, adding the references to
   // the PythonRefManager on destruction.
   class ManagedPyObjects {
    public:
@@ -66,9 +67,6 @@ class PythonRefManager {
   // Creates a managed std::shared_ptr to an object. When the shared_ptr is
   // destroyed, the reference to 'object' will be added to python_garbage_,
   // and collected next time CollectGarbage() is called.
-  std::shared_ptr<ManagedPyObjects> ManageReference(pybind11::object object);
-  std::shared_ptr<ManagedPyObjects> ManageReferences(
-      absl::Span<pybind11::object> objects);
   std::shared_ptr<ManagedPyObjects> ManageReference(nanobind::object object);
   std::shared_ptr<ManagedPyObjects> ManageReferences(
       absl::Span<nanobind::object> objects);
@@ -76,8 +74,6 @@ class PythonRefManager {
   // Adds garbage objects to the manager.
   void AddGarbage(nanobind::object garbage);
   void AddGarbage(absl::Span<nanobind::object> garbage);
-  void AddGarbage(pybind11::object garbage);
-  void AddGarbage(absl::Span<pybind11::object> garbage);
   void AddGarbage(absl::Span<std::pair<PyCodeObject*, int> const> garbage);
 
   // Releases the contents of python_garbage_. Requires that the GIL is held.
diff --git a/third_party/xla/xla/python/python_utils.h b/third_party/xla/xla/python/python_utils.h
deleted file mode 100644
index b71d5b2ab5db24..00000000000000
--- a/third_party/xla/xla/python/python_utils.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright 2021 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_PYTHON_PYTHON_UTILS_H_
-#define XLA_PYTHON_PYTHON_UTILS_H_
-
-#include <Python.h>
-
-#include <optional>
-#include <string>
-
-#include "pybind11/pybind11.h"  // from @pybind11
-#include "pybind11/pytypes.h"  // from @pybind11
-#include "xla/status_macros.h"
-#include "xla/util.h"
-
-namespace jax {
-
-// This file contains utilities to write Python wrappers using the C API.
-// It's used for performance critical code such as PyArray, jax.jit or
-// jax.pmap.
-
-// Helpers for building Python properties
-template <typename Func>
-pybind11::object property_readonly(Func&& get) {
-  pybind11::handle property(reinterpret_cast<PyObject*>(&PyProperty_Type));
-  return property(pybind11::cpp_function(std::forward<Func>(get)),
-                  pybind11::none(), pybind11::none(), "");
-}
-
-template <typename GetFunc, typename SetFunc>
-pybind11::object property(GetFunc&& get, SetFunc&& set) {
-  pybind11::handle property(reinterpret_cast<PyObject*>(&PyProperty_Type));
-  return property(pybind11::cpp_function(std::forward<GetFunc>(get)),
-                  pybind11::cpp_function(std::forward<SetFunc>(set)),
-                  pybind11::none(), "");
-}
-
-}  // namespace jax
-
-#endif  // XLA_PYTHON_PYTHON_UTILS_H_
diff --git a/third_party/xla/xla/python/pytree.cc b/third_party/xla/xla/python/pytree.cc
index edd43c8dd74a31..2d9dd09c3bb69f 100644
--- a/third_party/xla/xla/python/pytree.cc
+++ b/third_party/xla/xla/python/pytree.cc
@@ -18,7 +18,11 @@ limitations under the License.
 
 #include "xla/python/pytree.h"
 
+#include <Python.h>
+
 #include <algorithm>
+#include <cstddef>
+#include <cstdint>
 #include <iterator>
 #include <limits>
 #include <memory>
@@ -93,6 +97,28 @@ void PyTreeRegistry::Register(nb::object type, nb::callable to_iterable,
   }
 }
 
+std::pair<nanobind::iterable, nanobind::object>
+PyTreeRegistry::Registration::ToIterable(nanobind::handle o) const {
+  nb::object out = to_iterable(o);
+  nb::tuple leaves_and_aux_data;
+  if (!nb::try_cast<nb::tuple>(out, leaves_and_aux_data) ||
+      leaves_and_aux_data.size() != 2) {
+    throw std::invalid_argument(absl::StrCat(
+        "The to_iterable function for a custom PyTree node should return "
+        "a (children, aux_data) tuple, got ",
+        nb::cast<std::string_view>(nb::repr(out))));
+  }
+  nb::iterable leaves;
+  if (!nb::try_cast<nb::iterable>(leaves_and_aux_data[0], leaves)) {
+    throw std::invalid_argument(absl::StrCat(
+        "The to_iterable function for a custom PyTree node should return "
+        "a (children, aux_data) tuple where 'children' is iterable, "
+        "got ",
+        nb::cast<std::string_view>(nb::repr(out))));
+  }
+  return std::make_pair(std::move(leaves), nb::object(leaves_and_aux_data[1]));
+}
+
 // Computes the node kind of a given Python object.
 PyTreeKind PyTreeRegistry::KindOfObject(
     nb::handle obj, PyTreeRegistry::Registration const** custom) const {
@@ -192,6 +218,45 @@ bool PyTreeDef::operator==(const PyTreeDef& other) const {
   return true;
 }
 
+nb::object PyTreeRegistry::FlattenOneLevel(nb::handle x) const {
+  PyTreeRegistry::Registration const* custom;
+  PyTreeKind kind = KindOfObject(x, &custom);
+  switch (kind) {
+    case PyTreeKind::kNone:
+      return nb::make_tuple(nb::make_tuple(), nb::none());
+    case PyTreeKind::kTuple:
+    case PyTreeKind::kList:
+      return nb::make_tuple(nb::borrow(x), nb::none());
+    case PyTreeKind::kDict: {
+      nb::dict dict = nb::borrow<nb::dict>(x);
+      std::vector<nb::object> sorted_keys = GetSortedPyDictKeys(dict.ptr());
+      nb::tuple keys = nb::steal<nb::tuple>(PyTuple_New(sorted_keys.size()));
+      nb::tuple values = nb::steal<nb::tuple>(PyTuple_New(sorted_keys.size()));
+      for (size_t i = 0; i < sorted_keys.size(); ++i) {
+        PyTuple_SET_ITEM(values.ptr(), i,
+                         nb::object(dict[sorted_keys[i]]).release().ptr());
+        PyTuple_SET_ITEM(keys.ptr(), i, sorted_keys[i].release().ptr());
+      }
+      return nb::make_tuple(std::move(values), std::move(keys));
+    }
+    case PyTreeKind::kNamedTuple: {
+      nb::tuple in = nb::borrow<nb::tuple>(x);
+      nb::list out;
+      for (size_t i = 0; i < in.size(); ++i) {
+        out.append(in[i]);
+      }
+      return nb::make_tuple(std::move(out), x.type());
+    }
+    case PyTreeKind::kCustom: {
+      auto [leaves, aux_data] = custom->ToIterable(x);
+      return nb::make_tuple(std::move(leaves), std::move(aux_data));
+    }
+    default:
+      DCHECK(kind == PyTreeKind::kLeaf);
+      return nb::none();
+  }
+}
+
 template <typename T>
 void PyTreeDef::FlattenImpl(nb::handle handle, T& leaves,
                             const std::optional<nb::callable>& leaf_predicate) {
@@ -257,14 +322,10 @@ void PyTreeDef::FlattenImpl(nb::handle handle, T& leaves,
         break;
       }
       case PyTreeKind::kCustom: {
-        nb::tuple out = nb::cast<nb::tuple>(node.custom->to_iterable(handle));
-        if (out.size() != 2) {
-          throw xla::XlaRuntimeError(
-              "PyTree custom to_iterable function should return a pair");
-        }
-        node.node_data = out[1];
+        auto [leaves, aux_data] = node.custom->ToIterable(handle);
+        node.node_data = std::move(aux_data);
         node.arity = 0;
-        for (nb::handle entry : nb::cast<nb::iterable>(out[0])) {
+        for (nb::handle entry : leaves) {
           ++node.arity;
           recurse(entry);
         }
@@ -558,20 +619,16 @@ nb::list PyTreeDef::FlattenUpTo(nb::handle xs) const {
               nb::cast<std::string_view>(nb::repr(node.custom->type)),
               nb::cast<std::string_view>(nb::repr(object))));
         }
-        nb::tuple out = nb::cast<nb::tuple>(node.custom->to_iterable(object));
-        if (out.size() != 2) {
-          throw xla::XlaRuntimeError(
-              "PyTree custom to_iterable function should return a pair");
-        }
-        if (node.node_data.not_equal(out[1])) {
+        auto [leaves, aux_data] = node.custom->ToIterable(object);
+        if (node.node_data.not_equal(aux_data)) {
           throw std::invalid_argument(absl::StrFormat(
               "Mismatch custom node data: %s != %s; value: %s.",
               nb::cast<std::string_view>(nb::repr(node.node_data)),
-              nb::cast<std::string_view>(nb::repr(out[1])),
+              nb::cast<std::string_view>(nb::repr(aux_data)),
               nb::cast<std::string_view>(nb::repr(object))));
         }
         int arity = 0;
-        for (nb::handle entry : nb::cast<nb::iterable>(out[0])) {
+        for (nb::handle entry : leaves) {
           ++arity;
           agenda.push_back(nb::borrow<nb::object>(entry));
         }
@@ -1104,6 +1161,8 @@ void BuildPytreeSubmodule(nb::module_& m) {
         return nb::make_tuple(std::move(leaves), std::move(def));
       },
       nb::arg("tree").none(), nb::arg("leaf_predicate").none() = std::nullopt);
+  registry.def("flatten_one_level", &PyTreeRegistry::FlattenOneLevel,
+               nb::arg("tree").none());
   registry.def("register_node", &PyTreeRegistry::Register);
   registry.def("__reduce__",
                [](nb::object self) { return self.attr("__name__"); });
diff --git a/third_party/xla/xla/python/pytree.h b/third_party/xla/xla/python/pytree.h
index 266af78b56c552..2cce9c5c3c7abc 100644
--- a/third_party/xla/xla/python/pytree.h
+++ b/third_party/xla/xla/python/pytree.h
@@ -19,9 +19,9 @@ limitations under the License.
 // See https://jax.readthedocs.io/en/latest/pytrees.html for the documentation
 // about pytree.
 
+#include <cstddef>
 #include <memory>
 #include <optional>
-#include <stdexcept>
 #include <string>
 #include <utility>
 #include <vector>
@@ -30,6 +30,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/hash/hash.h"
+#include "absl/types/span.h"
 #include "third_party/nanobind/include/nanobind/nanobind.h"
 #include "xla/python/nb_class_ptr.h"
 #include "xla/python/pytree.pb.h"
@@ -67,6 +68,11 @@ class PyTreeRegistry : public std::enable_shared_from_this<PyTreeRegistry> {
     nanobind::callable to_iterable;
     // A function with signature: (aux_data, iterable) -> object
     nanobind::callable from_iterable;
+
+    // Helper that calls to_iterable and validates that it returns a pair
+    // of an iterable and an aux_data object
+    std::pair<nanobind::iterable, nanobind::object> ToIterable(
+        nanobind::handle o) const;
   };
 
   // Registers a new custom type. Objects of `type` will be treated as container
@@ -81,6 +87,10 @@ class PyTreeRegistry : public std::enable_shared_from_this<PyTreeRegistry> {
   PyTreeKind KindOfObject(nanobind::handle obj,
                           PyTreeRegistry::Registration const** custom) const;
 
+  // Flattens a pytree one level, returning either a tuple of the leaves and
+  // the node data, or None, if the entry is a leaf.
+  nanobind::object FlattenOneLevel(nanobind::handle x) const;
+
  private:
   struct TypeHash {
     using is_transparent = void;
diff --git a/third_party/xla/xla/python/refine_polymorphic_shapes.cc b/third_party/xla/xla/python/refine_polymorphic_shapes.cc
index 465a3d3565b54d..dbf681c4d0be32 100644
--- a/third_party/xla/xla/python/refine_polymorphic_shapes.cc
+++ b/third_party/xla/xla/python/refine_polymorphic_shapes.cc
@@ -254,6 +254,10 @@ absl::Status RefinePolymorphicShapes(mlir::ModuleOp module,
 
   // TODO(necula): we should not need the inliner.
   pm.addPass(mlir::createInlinerPass());
+  // Efficiently remove dead code to avoid issues in subsequent passes.
+  // Too much dead code can cause e.g. the shape refinement pass to fail to
+  // converge.
+  pm.addPass(mlir::stablehlo::experimental::createStablehloTrivialDcePass());
   pm.addPass(mlir::createCSEPass());
   pm.addPass(mlir::stablehlo::experimental::createChloRecomposeOpsPass());
   pm.addPass(mlir::stablehlo::experimental::createStablehloRefineShapesPass());
diff --git a/third_party/xla/xla/python/sharding.cc b/third_party/xla/xla/python/sharding.cc
index 75e83084acb718..3ae6809b54e36e 100644
--- a/third_party/xla/xla/python/sharding.cc
+++ b/third_party/xla/xla/python/sharding.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <Python.h>
 
 #include <cstdlib>
-#include <memory>
 #include <string>
 #include <string_view>
 #include <utility>
@@ -30,10 +29,7 @@ limitations under the License.
 #include "third_party/nanobind/include/nanobind/nanobind.h"
 #include "third_party/nanobind/include/nanobind/stl/string.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/string_view.h"  // IWYU pragma: keep
-#include "pybind11/pybind11.h"  // from @pybind11
 #include "xla/hlo/ir/hlo_sharding.h"
-#include "xla/pjrt/pjrt_client.h"
-#include "xla/pjrt/status_casters.h"
 #include "xla/python/ifrt/device.h"
 #include "xla/python/nb_class_ptr.h"
 #include "xla/python/nb_helpers.h"
@@ -41,7 +37,6 @@ limitations under the License.
 #include "xla/python/py_client.h"
 #include "xla/python/py_device_list.h"
 #include "xla/python/sharded_device_array.h"
-#include "xla/util.h"
 #include "tsl/platform/logging.h"
 
 namespace jax {
@@ -176,45 +171,6 @@ bool ShardingEqual(nb::handle a, nb::handle b) {
   return a.equal(b);
 }
 
-xla::ClientAndPtr<xla::PjRtMemorySpace> GetMemory(
-    const xla::ClientAndPtr<xla::PjRtDevice>& device, const std::string& kind) {
-  xla::PjRtMemorySpace* result_memory_space = nullptr;
-  for (auto* memory_space : device->memory_spaces()) {
-    if (memory_space->memory_space_kind() == kind) {
-      if (result_memory_space != nullptr) {
-        std::string memories = absl::StrJoin(
-            device->memory_spaces(), ", ",
-            [](std::string* out, const auto& memory_space) {
-              absl::StrAppend(out, memory_space->memory_space_kind());
-            });
-        auto device_kind = device->device_kind();
-        xla::ThrowIfError(
-            xla::InvalidArgument("Found more than one addressable memory for "
-                                 "kind %s which is not allowed. There can only "
-                                 "be one memory for each "
-                                 "kind. Device %s can address the following "
-                                 "memory kinds: %s",
-                                 kind, device_kind, memories));
-      }
-      result_memory_space = memory_space;
-    }
-  }
-  if (result_memory_space == nullptr) {
-    std::string memories =
-        absl::StrJoin(device->memory_spaces(), ", ",
-                      [](std::string* out, const auto& memory_space) {
-                        absl::StrAppend(out, memory_space->memory_space_kind());
-                      });
-    auto device_kind = device->device_kind();
-    xla::ThrowIfError(xla::InvalidArgument(
-        "Could not find memory addressable by device %s. Device %s "
-        "can address the following memory kinds: %s. "
-        "Got memory kind: %s",
-        device_kind, device_kind, memories, kind));
-  }
-  return WrapWithClient(device.client(), result_memory_space);
-}
-
 NamedSharding::NamedSharding(nb::object mesh, nb::object spec,
                              nb::object memory_kind, nb::object parsed_pspec,
                              nb::object manual_axes)
@@ -249,15 +205,10 @@ SingleDeviceSharding::SingleDeviceSharding(nb::object device,
 }
 
 SingleDeviceSharding::SingleDeviceSharding(
-    std::shared_ptr<xla::PyClient> client, xla::ifrt::DeviceList device_list,
+    xla::nb_class_ptr<xla::PyClient> client, xla::ifrt::DeviceList device_list,
     nb::object memory_kind)
     : XLACompatibleSharding(/*num_devices=*/1),
-      // TODO(phawkins): remove pybind11 translation when nanobind transition is
-      // complete.
-      device_(nb::steal<nb::object>(
-          pybind11::cast(WrapWithClient(client, device_list.front()))
-              .release()
-              .ptr())),
+      device_(client->GetPyDevice(device_list.front())),
       memory_kind_(std::move(memory_kind)),
       internal_device_list_(xla::make_nb_class<PyDeviceList>(
           std::move(client), std::move(device_list))) {
diff --git a/third_party/xla/xla/python/sharding.h b/third_party/xla/xla/python/sharding.h
index 96438c091b557b..600d2abf7103f4 100644
--- a/third_party/xla/xla/python/sharding.h
+++ b/third_party/xla/xla/python/sharding.h
@@ -17,16 +17,13 @@ limitations under the License.
 #define XLA_PYTHON_SHARDING_H_
 
 #include <cstddef>
-#include <memory>
 #include <optional>
-#include <string>
 #include <utility>
 
 // placeholder for index annotation headers
 #include "absl/hash/hash.h"
 #include "third_party/nanobind/include/nanobind/nanobind.h"
 #include "xla/hlo/ir/hlo_sharding.h"
-#include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/status_casters.h"
 #include "xla/python/ifrt/device.h"
 #include "xla/python/nb_class_ptr.h"
@@ -69,9 +66,6 @@ size_t ShardingHash(nanobind::handle sharding);
 
 bool ShardingEqual(nanobind::handle a, nanobind::handle b);
 
-xla::ClientAndPtr<xla::PjRtMemorySpace> GetMemory(
-    const xla::ClientAndPtr<xla::PjRtDevice>& device, const std::string& kind);
-
 class XLACompatibleSharding : public Sharding {
  public:
   using Sharding::Sharding;
@@ -118,7 +112,7 @@ class SingleDeviceSharding : public XLACompatibleSharding {
       nanobind::object device, nanobind::object memory_kind = nanobind::none());
 
   // Used only in C++ to accelerate `PyArray::MakeFromSingleDeviceArray()`.
-  SingleDeviceSharding(std::shared_ptr<xla::PyClient> client,
+  SingleDeviceSharding(xla::nb_class_ptr<xla::PyClient> client,
                        xla::ifrt::DeviceList device_list,
                        nanobind::object memory_kind);
 
diff --git a/third_party/xla/xla/python/tools/BUILD b/third_party/xla/xla/python/tools/BUILD
new file mode 100644
index 00000000000000..2b6ce41ba112b0
--- /dev/null
+++ b/third_party/xla/xla/python/tools/BUILD
@@ -0,0 +1,94 @@
+load("@local_tsl//tsl:tsl.default.bzl", "tsl_pybind_extension")
+
+# NOTE: We can't use `pytype_pybind_extension` nor `pytype_strict_contrib_test`
+# because the OSS versions of these files do not include ports of those rules.
+# We must instead use `tsl_pybind_extension` and `py_strict_test`.
+load("//xla:pytype.default.bzl", "pytype_strict_library")
+load("//xla:strict.default.bzl", "py_strict_test")
+
+package(
+    # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
+    licenses = ["notice"],
+)
+
+exports_files([
+    "__init__.py",
+    "types.py",
+    "_types.pyi",
+])
+
+# NOTE: This wrapper library is necessary in order to capture the Python
+# dependencies of our extension (namely `ml_dtypes`).  Although the
+# underlying `pybind_extension` rule has a `py_deps` argument for capturing
+# such dependencies directly, the `tsl_pybind_extension` rule doesn't expose
+# that `py_deps` argument for us to use.
+#
+# NOTE: On the OSS side, the `pytype_strict_library` rule is changed into
+# the non-typed rule, which in turn causes an error about the `pytype_srcs`
+# field.  The "..:xla_client" target gets around this by adding a custom
+# copybara rule; but in lieu of adding yet another custom rule to maintain,
+# we just use the generic copybara mechanism for commenting the field out
+# on the OSS side.
+# TODO(wrengr,phawkins): Once cl/619904840 lands, we can remove the
+# pragma and the preceding commentary.
+pytype_strict_library(
+    name = "types",
+    srcs = ["types.py"],
+    # copybara:uncomment pytype_srcs = ["_types.pyi"],
+    srcs_version = "PY3",
+    # Cannot build this on OSS because the ":xla_data_proto_py_pb2"
+    # dependency isn't part of the public API.
+    tags = ["no_oss"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":_types",  # buildcleaner: keep
+        "//third_party/py/numpy",
+        "//xla:xla_data_proto_py_pb2",
+        "@ml_dtypes",
+    ],
+)
+
+# NOTE: Copybara detects the `tsl_pybind_extension` rule and automatically
+# injects the "@com_google_protobuf//:protobuf_python" python dependency
+# required by "@pybind11_protobuf//pybind11_protobuf:native_proto_caster".
+tsl_pybind_extension(
+    name = "_types",
+    srcs = ["_types.cc"],
+    pytype_deps = ["//third_party/py/numpy"],
+    pytype_srcs = ["_types.pyi"],
+    # Users should depend on ":types" instead.
+    visibility = ["//visibility:private"],
+    deps = [
+        "//third_party/nanobind",
+        "//xla:literal",
+        "//xla:xla_data_proto_cc",
+        "//xla/python:logging",
+        "//xla/python:nb_numpy",
+        "//xla/python:types",
+        "//xla/tsl/python/lib/core:numpy",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@pybind11",
+        "@pybind11_abseil//pybind11_abseil:status_casters",
+        "@pybind11_protobuf//pybind11_protobuf:native_proto_caster",
+    ],
+)
+
+py_strict_test(
+    name = "types_test",
+    size = "small",
+    srcs = ["types_test.py"],
+    python_version = "PY3",
+    srcs_version = "PY3",
+    # Cannot build this on OSS because the ":xla_data_proto_py_pb2"
+    # dependency isn't part of the public API.
+    tags = ["no_oss"],
+    deps = [
+        ":types",
+        #internal proto upb dep
+        "//third_party/py/numpy",
+        "//xla:xla_data_proto_py_pb2",
+        "@absl_py//absl/testing:absltest",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
diff --git a/third_party/xla/xla/python/tools/__init__.py b/third_party/xla/xla/python/tools/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/third_party/xla/xla/python/tools/_types.cc b/third_party/xla/xla/python/tools/_types.cc
new file mode 100644
index 00000000000000..cd2a65382642c9
--- /dev/null
+++ b/third_party/xla/xla/python/tools/_types.cc
@@ -0,0 +1,159 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
+#include "third_party/nanobind/include/nanobind/nanobind.h"
+#include "third_party/nanobind/include/nanobind/stl/shared_ptr.h"  // IWYU pragma: keep
+#include "pybind11/detail/common.h"  // from @pybind11
+#include "pybind11/numpy.h"  // from @pybind11
+#include "pybind11/pybind11.h"  // from @pybind11
+#include "pybind11/pytypes.h"  // from @pybind11
+#include "pybind11_abseil/status_casters.h"  // from @pybind11_abseil
+#include "pybind11_protobuf/native_proto_caster.h"  // from @pybind11_protobuf
+#include "xla/literal.h"
+#include "xla/python/logging.h"
+#include "xla/python/nb_numpy.h"
+#include "xla/python/types.h"
+#include "xla/xla_data.pb.h"
+// NOTE: The tsl-numpy header forbids importing the actual NumPy arrayobject.h
+// header before tsl-numpy (whereas, importing pybind11-numpy before tsl-numpy
+// is fine); however, tsl-numpy does reexport NumPy's arrayobject.h header.
+// Since one of the TF headers above already includes tsl-numpy, therefore
+// we must include it down here rather than including actual NumPy directly.
+#include "xla/tsl/python/lib/core/numpy.h"
+
+namespace py = ::pybind11;
+namespace nb = ::nanobind;
+
+namespace {
+absl::StatusOr<py::object> MakeNdarray(const xla::LiteralProto& proto) {
+  auto m_lit = xla::Literal::CreateFromProto(proto);
+  if (!m_lit.ok()) {
+    // NOTE: The OSS version of XLA is still using an old version of
+    // Abseil (LTS branch, Aug 2023, Patch 1) which does not have the
+    // `AbslStringify` interface for implicitly converting `absl::Status`
+    // into the `absl::AlphaNum` required by `absl::StrCat`.  Therefore we
+    // inline the latest definition of the `AbslStringify` overload.
+    throw py::value_error(absl::StrCat(
+        "Cannot `xla::Literal::CreateFromProto`: ",
+        m_lit.status().ToString(absl::StatusToStringMode::kWithEverything)));
+  }
+
+  // Move (not copy) the literal onto the heap, for sharing with Python.
+  auto lit = std::make_shared<xla::Literal>(std::move(m_lit).value());
+
+  TF_ASSIGN_OR_RETURN(auto nbobj, xla::LiteralToPython(std::move(lit)));
+
+  // Convert `nb::object` into `py::object`.
+  return py::reinterpret_steal<py::object>(nbobj.release().ptr());
+}
+
+// Partial reversion of cl/617156835, until we can get the proto-casters
+// (and hence the extension) switched over to nanobind.
+// TODO(wrengr): Or can we mix `{py,nb}::module_::def` calls??
+absl::StatusOr<xla::PrimitiveType> DtypeToEtype(const py::dtype& py_d) {
+  auto nb_d = nb::borrow<xla::nb_dtype>(py_d.ptr());
+  return xla::DtypeToPrimitiveType(nb_d);
+}
+
+absl::StatusOr<py::dtype> EtypeToDtype(xla::PrimitiveType p) {
+  TF_ASSIGN_OR_RETURN(xla::nb_dtype nb_d, xla::PrimitiveTypeToNbDtype(p));
+  return py::reinterpret_steal<py::dtype>(nb_d.release().ptr());
+}
+}  // namespace
+
+// NOTE: It seems insurmountable to get "native_proto_caster.h" to work
+// with nanobind modules; therefore, we define our extension as a pybind11
+// module so that we can use `pybind11::module_::def`.
+PYBIND11_MODULE(_types, py_m) {
+  // Initialize ABSL logging because code within XLA uses it.
+  // (As per `xla::Init` in "xla.cc"; though we don't need it ourselves.)
+#ifndef PLATFORM_GOOGLE
+  xla::InitializeAbslLogging();
+#endif  // PLATFORM_GOOGLE
+
+  // Normally this would happen at the start of NB_MODULE, but since
+  // this is a pybind11 module we have to do this ourselves.
+  // (As per `xla::Init` in "xla.cc".)
+  nb::detail::init(NB_DOMAIN_STR);
+
+  // Import implicit conversions from Python protobuf objects to C++
+  // protobuf objects.
+  pybind11_protobuf::ImportNativeProtoCasters();
+
+  // Import implicit conversions from `absl::StatusOr` to Python exceptions.
+  // (The code for performing conversions is easy enough to port to nanobind;
+  // albeit, the conversion calls themselves have to be made explicit,
+  // since `nb::detail::type_caster` disallows raising exceptions.)
+  py::google::ImportStatusModule();
+
+  // Import the 'ml_dtypes' module; which is implicitly required by
+  // `xla::LiteralToPython`.
+  // NOTE: If the `tsl_pybind_extension` build rule allowed us to specify
+  // this as a py_dep, then importing the module here would mean that
+  // client Python code need not import the hidden dependency themselves.
+  // However, since `tsl_pybind_extension` does not allow specifying py_deps,
+  // if client rules do not themselves declare the dependency then this will
+  // generate a `ModuleNotFoundError` / `ImportError` exception.  Hence why
+  // we define the "types.py" wrapper library to encapsulate the dependency.
+  py::module_::import("ml_dtypes");
+
+  // Ensure that tsl-numpy initializes datastructures of the actual-NumPy
+  // implementation, and does whatever else tsl-numpy needs.  This is
+  // also necessary for using the `xla::nb_dtype` type.
+  tsl::ImportNumpy();
+
+  // Declare that C++ can `nb::cast` from `std::shared_ptr<xla::Literal>`
+  // to `nb::object`; which is implicitly required by `xla::LiteralToPython`.
+  // (FWIW: This also enables using `nb::type<xla::Literal>()` to get
+  // the Python-type-object associated with the C++ class.)
+  //
+  // NOTE: This does *not* mean that C++ can `py::cast` from `xla::Literal`
+  // to `py::object`.  It's unclear whether we can simultaneously provide
+  // both nanobind and pybind11 bindings (if we wanted the latter).
+  nb::module_ nb_m = nb::cast<nb::module_>(nb::borrow(py_m.ptr()));
+  nb::class_<xla::Literal>(nb_m, "Literal")
+      .def("__repr__", &xla::Literal::ToString);
+
+  // We do not define `py_m.doc()` here, since it wouldn't be inherited
+  // by the "types.py" wrapper library.  See there for the python docstring.
+
+  // LINT.IfChange
+  py_m.def("make_ndarray", &MakeNdarray, py::arg("proto").none(false),
+           py::pos_only(), R"pbdoc(
+    Converts `tensorflow.compiler.xla.xla_data_pb2.LiteralProto`
+    into an `xla::Literal` and then converts that literal into a tree
+    of tuples with leaves being `numpy.ndarray` views of array-shaped
+    sub-literals.
+  )pbdoc");
+
+  // This method name is based on `xla_client.dtype_to_etype`.
+  // NOTE: `xla_client` uses a Python class wrapping the protobuf-enum,
+  // rather than using the protobuf-enum directly.  See the module docstring
+  // in "types.py" for more explanation on why.
+  py_m.def("dtype_to_etype", &DtypeToEtype, py::arg("dtype").none(false),
+           py::pos_only(), R"pbdoc(
+    Converts `numpy.dtype` into
+    `tensorflow.compiler.xla.xla_data_pb2.PrimitiveType`.
+  )pbdoc");
+
+  py_m.def("etype_to_dtype", &EtypeToDtype, py::arg("ptype").none(false),
+           py::pos_only(), R"pbdoc(
+    Converts `tensorflow.compiler.xla.xla_data_pb2.PrimitiveType` into
+    `numpy.dtype`.
+  )pbdoc");
+  // LINT.ThenChange(_types.pyi)
+}
diff --git a/third_party/xla/xla/python/tools/_types.pyi b/third_party/xla/xla/python/tools/_types.pyi
new file mode 100644
index 00000000000000..f355656f05b674
--- /dev/null
+++ b/third_party/xla/xla/python/tools/_types.pyi
@@ -0,0 +1,25 @@
+# Copyright 2024 The OpenXLA Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Union
+import numpy as np
+from xla import xla_data_pb2
+
+# LINT.IfChange
+NdarrayTree = Union[np.ndarray, tuple['NdarrayTree', ...]]
+def make_ndarray(proto: xla_data_pb2.LiteralProto, /) -> NdarrayTree: ...
+def dtype_to_etype(dtype: np.dtype, /) -> xla_data_pb2.PrimitiveType: ...
+def etype_to_dtype(ptype: xla_data_pb2.PrimitiveType, /) -> np.dtype: ...
+# LINT.ThenChange(types.py, _types.cc)
diff --git a/third_party/xla/xla/python/tools/types.py b/third_party/xla/xla/python/tools/types.py
new file mode 100644
index 00000000000000..189758f1e749c8
--- /dev/null
+++ b/third_party/xla/xla/python/tools/types.py
@@ -0,0 +1,53 @@
+# Copyright 2024 The OpenXLA Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""tensorflow.compiler.xla.python.tools.types.
+
+This module provides Python bindings for various functions in
+'tensorflow/compiler/xla/python/types.h'.  It is primarily intended
+to assist internal users in debugging things; and is not considered
+part of the public API for OpenXLA.
+
+NOTE: This module *does* depend on Python protocol buffers; so beware!
+The XLA Python bindings are currently packaged both as part of jaxlib and
+as part of TensorFlow.  Therefore, since we use protocol buffers here,
+importing both jaxlib and TensorFlow may fail with duplicate protocol
+buffer message definitions.
+"""
+
+from typing import Union
+# NOTE: `ml_dtypes` is implicitly required by `xla::LiteralToPython`.
+# The entire goal of this wrapper library is to capture this dependency,
+# so that client code need not be aware of it.
+import ml_dtypes  # pylint: disable=unused-import
+import numpy
+# NOTE: These protos are not part of TensorFlow's public API, therefore
+# we cannot abide by [g-direct-tensorflow-import].
+# pylint: disable=g-direct-tensorflow-import,unused-import
+from local_xla.xla import xla_data_pb2
+# pylint: enable=g-direct-tensorflow-import,unused-import
+
+# NOTE: `import <name> as <name>` is required for names to be exported.
+# See PEP 484 & <https://github.com/google/jax/issues/7570>
+# pylint: disable=g-importing-member,useless-import-alias,unused-import,g-multiple-import
+# LINT.IfChange
+from ._types import (
+    make_ndarray as make_ndarray,
+    dtype_to_etype as dtype_to_etype,
+    etype_to_dtype as etype_to_dtype,
+)
+# TODO(wrengr): We can't import the `NdarrayTree` defined in the pyi file.
+# So re-defining it here for now.
+NdarrayTree = Union[numpy.ndarray, tuple['NdarrayTree', ...]]
+# LINT.ThenChange(_types.pyi)
diff --git a/third_party/xla/xla/python/tools/types_test.py b/third_party/xla/xla/python/tools/types_test.py
new file mode 100644
index 00000000000000..a6cdb1d0f76b13
--- /dev/null
+++ b/third_party/xla/xla/python/tools/types_test.py
@@ -0,0 +1,181 @@
+# Copyright 2024 The OpenXLA Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import itertools
+import math
+import re
+from typing import List, NamedTuple
+
+from absl.testing import absltest
+from absl.testing import parameterized
+import numpy as np
+
+# NOTE: These protos are not part of the public API, therefore we cannot
+# abide by [g-direct-tensorflow-import].
+# pylint: disable=g-direct-tensorflow-import
+from local_xla.xla import xla_data_pb2
+from xla.python.tools import types
+# pylint: enable=g-direct-tensorflow-import
+
+
+class MakeNdarrayInvalidTest(absltest.TestCase):
+  """Tests for invalid/unsupported arguments to `make_ndarray`."""
+
+  def setUp(self):
+    super().setUp()
+    self.assert_cannot_create_from_proto = self.assertRaisesRegex(
+        ValueError, re.escape('Cannot `xla::Literal::CreateFromProto`')
+    )
+
+  # NOTE: The `Literal(const Shape&, bool, ArrayValueState)` ctor does
+  # a CHECK forbidding `element_size_in_bits` from being specified;
+  # so we can't test anything about custom sizes here.
+
+  def testMissingLayout(self):
+    # NOTE: `CreateFromProto` requires explicit `shape.layout.minor_to_major`.
+    # Though in principle it could use a default ctor instead, like we
+    # do in `make_named_parameter` below`.
+    pb = xla_data_pb2.LiteralProto(
+        shape=xla_data_pb2.ShapeProto(
+            element_type=xla_data_pb2.PrimitiveType.F64,
+            dimensions=[1, 2, 3],
+        )
+    )
+    with self.assert_cannot_create_from_proto:
+      types.make_ndarray(pb)
+
+  def testMissingMinorToMajor(self):
+    # NOTE: `CreateFromProto` requires explicit `shape.layout.minor_to_major`.
+    # Though in principle it could use a default ctor instead, like we
+    # do in `make_named_parameter` below`.
+    pb = xla_data_pb2.LiteralProto(
+        shape=xla_data_pb2.ShapeProto(
+            element_type=xla_data_pb2.PrimitiveType.F64,
+            dimensions=[1, 2, 3],
+            layout=xla_data_pb2.LayoutProto(),
+        )
+    )
+    with self.assert_cannot_create_from_proto:
+      types.make_ndarray(pb)
+
+  def testInvalidPrimitiveType(self):
+    # NOTE: The `is_dynamic_dimension` field isn't required by
+    # `CreateFromProto`; however, the `Shape(const ShapeProto&)` ctor
+    # will log warnings if we leave it unspecified.
+    pb = xla_data_pb2.LiteralProto(
+        shape=xla_data_pb2.ShapeProto(
+            element_type=xla_data_pb2.PrimitiveType.PRIMITIVE_TYPE_INVALID,
+            dimensions=[1, 2, 3],
+            is_dynamic_dimension=[False, False, False],
+            layout=xla_data_pb2.LayoutProto(
+                minor_to_major=[0, 1, 2],
+            ),
+        )
+    )
+    with self.assert_cannot_create_from_proto:
+      types.make_ndarray(pb)
+
+  def testHasDimLevelTypes(self):
+    # NOTE: `CreateFromProto` forbids `dim_level_types` (even if all-dense).
+    pb = xla_data_pb2.LiteralProto(
+        shape=xla_data_pb2.ShapeProto(
+            element_type=xla_data_pb2.PrimitiveType.F64,
+            dimensions=[1, 2, 3],
+            is_dynamic_dimension=[False, False, False],
+            layout=xla_data_pb2.LayoutProto(
+                dim_level_types=[
+                    xla_data_pb2.DimLevelType.DIM_DENSE,
+                    xla_data_pb2.DimLevelType.DIM_DENSE,
+                    xla_data_pb2.DimLevelType.DIM_DENSE,
+                ],
+                minor_to_major=[0, 1, 2],
+            ),
+        )
+    )
+    with self.assert_cannot_create_from_proto:
+      types.make_ndarray(pb)
+
+
+class MakeNdarrayValidTestParameter(NamedTuple):
+  testcase_name: str
+  proto: xla_data_pb2.LiteralProto
+  arr: np.ndarray
+
+
+def make_named_parameter(
+    testcase_name: str,
+    dimensions: List[int],
+    data: List[float],
+) -> MakeNdarrayValidTestParameter:
+  """Helper function to construct parameters for `MakeNdarrayValidTest`."""
+  assert math.prod(dimensions) == len(data)
+  nd = len(dimensions)
+  proto = xla_data_pb2.LiteralProto(
+      shape=xla_data_pb2.ShapeProto(
+          element_type=xla_data_pb2.PrimitiveType.F64,
+          dimensions=dimensions,
+          is_dynamic_dimension=itertools.repeat(False, nd),
+          layout=xla_data_pb2.LayoutProto(
+              minor_to_major=range(nd),
+          ),
+      ),
+      f64s=data,
+  )
+  arr = types.make_ndarray(proto)
+  return MakeNdarrayValidTestParameter(testcase_name, proto, arr)
+
+
+@parameterized.named_parameters(
+    make_named_parameter('A', [2, 3], [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]),
+    make_named_parameter('B', [1, 2, 3], [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]),
+    make_named_parameter('C', [2, 3], [5.0, 4.0, 3.0, 2.0, 1.0, 0.0]),
+    make_named_parameter('D', [3, 2], [5.0, 4.0, 3.0, 2.0, 1.0, 0.0]),
+)
+class MakeNdarrayValidTest(parameterized.TestCase):
+  """Correctness tests for valid arguments to `make_ndarray`."""
+
+  def testHasCorrectDtype(self, proto, arr):
+    """Test that the result has the right dtype."""
+    e = proto.shape.element_type
+    d = arr.dtype
+    with self.subTest(msg='etype_to_dtype'):
+      self.assertEqual(types.etype_to_dtype(e), d)
+    with self.subTest(msg='dtype_to_etype'):
+      self.assertEqual(e, types.dtype_to_etype(d))
+
+  def testHasCorrectRank(self, proto, arr):
+    """Test that the result has the right rank."""
+    self.assertLen(proto.shape.dimensions, arr.ndim)
+
+  def testHasCorrectShape(self, proto, arr):
+    """Test that the result has the same/right shape."""
+    self.assertTupleEqual(tuple(proto.shape.dimensions), arr.shape)
+
+  def testHasCorrectData(self, proto, arr):
+    """Test that the result has the same/right data."""
+    # TODO(wrengr): Figure out a way to abstract away the name of the
+    # proto field containing the data; so that we can test multiple types.
+    self.assertSequenceAlmostEqual(proto.f64s, list(np.nditer(arr)))
+
+  # TODO(wrengr): Add tests for:
+  # * dynamic dimension sizes.
+  # * non-trivial `minor_to_major`.
+  # * problematic types {PRED,F16,C64,C128} are all handled correctly.
+  # * BF16 is handled correctly.
+  # * tuples are handled correctly
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/third_party/xla/xla/python/types.cc b/third_party/xla/xla/python/types.cc
index 0c7c17472982c5..5ecb181fb2592e 100644
--- a/third_party/xla/xla/python/types.cc
+++ b/third_party/xla/xla/python/types.cc
@@ -35,8 +35,6 @@ limitations under the License.
 #include "third_party/nanobind/include/nanobind/stl/shared_ptr.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/string.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/string_view.h"  // IWYU pragma: keep
-#include "pybind11/numpy.h"  // from @pybind11
-#include "pybind11/pytypes.h"  // from @pybind11
 #include "xla/layout.h"
 #include "xla/literal.h"
 #include "xla/pjrt/exceptions.h"
@@ -47,16 +45,15 @@ limitations under the License.
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/status_macros.h"
+#include "xla/tsl/python/lib/core/numpy.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/statusor.h"
-#include "tsl/python/lib/core/numpy.h"
 
 namespace xla {
 
 namespace nb = nanobind;
-namespace py = pybind11;
 
 namespace {
 
@@ -154,10 +151,6 @@ absl::StatusOr<PrimitiveType> DtypeToPrimitiveType(const nb_dtype& np_type) {
                          np_type.char_(), np_type.kind(), np_type.itemsize());
 }
 
-absl::StatusOr<PrimitiveType> DtypeToPrimitiveType(const py::dtype& np_type) {
-  return DtypeToPrimitiveType(nb::borrow<nb_dtype>(np_type.ptr()));
-}
-
 absl::StatusOr<nb_dtype> PrimitiveTypeToNbDtype(PrimitiveType type) {
   const CustomDtypes& custom_dtypes = GetCustomDtypes();
   auto to_nb_dtype = [](int typenum) -> nb_dtype {
@@ -215,11 +208,6 @@ absl::StatusOr<nb_dtype> PrimitiveTypeToNbDtype(PrimitiveType type) {
   }
 }
 
-absl::StatusOr<py::dtype> PrimitiveTypeToDtype(PrimitiveType type) {
-  TF_ASSIGN_OR_RETURN(nb_dtype np_type, PrimitiveTypeToNbDtype(type));
-  return py::reinterpret_steal<py::dtype>(np_type.release().ptr());
-}
-
 absl::StatusOr<nb_dtype> IfrtDtypeToNbDtype(ifrt::DType dtype) {
   const CustomDtypes& custom_dtypes = GetCustomDtypes();
   auto to_nb_dtype = [](int typenum) -> nb_dtype {
@@ -285,11 +273,6 @@ absl::StatusOr<nb_dtype> IfrtDtypeToNbDtype(ifrt::DType dtype) {
   }
 }
 
-absl::StatusOr<pybind11::dtype> IfrtDtypeToDtype(ifrt::DType dtype) {
-  TF_ASSIGN_OR_RETURN(nb_dtype np_type, IfrtDtypeToNbDtype(dtype));
-  return py::reinterpret_steal<py::dtype>(np_type.release().ptr());
-}
-
 absl::StatusOr<ifrt::DType> DtypeToIfRtDType(nb_dtype dtype) {
   TF_ASSIGN_OR_RETURN(auto primitive_type, DtypeToPrimitiveType(dtype));
   return ifrt::ToDType(primitive_type);
@@ -499,24 +482,6 @@ nb::tuple MutableSpanToNbTuple(absl::Span<nb::object> xs) {
   return out;
 }
 
-template <typename IntType>
-static py::tuple IntSpanToTupleHelper(absl::Span<IntType const> xs) {
-  py::tuple out(xs.size());
-  for (int i = 0; i < xs.size(); ++i) {
-    out[i] = py::int_(xs[i]);
-  }
-  return out;
-}
-
-template <>
-pybind11::tuple SpanToTuple(absl::Span<int const> xs) {
-  return IntSpanToTupleHelper(xs);
-}
-template <>
-pybind11::tuple SpanToTuple(absl::Span<int64_t const> xs) {
-  return IntSpanToTupleHelper(xs);
-}
-
 std::optional<CastToArrayResult> CastToArray(nb::handle h) {
   auto array =
       nb_numpy_ndarray::ensure(h, NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_ALIGNED);
diff --git a/third_party/xla/xla/python/types.h b/third_party/xla/xla/python/types.h
index 5cffba8376c61a..00f28dbdbc3607 100644
--- a/third_party/xla/xla/python/types.h
+++ b/third_party/xla/xla/python/types.h
@@ -26,11 +26,6 @@ limitations under the License.
 #include "absl/container/inlined_vector.h"
 #include "absl/types/span.h"
 #include "third_party/nanobind/include/nanobind/nanobind.h"
-#include "pybind11/numpy.h"  // from @pybind11
-#include "pybind11/pybind11.h"  // from @pybind11
-#include "pybind11/pytypes.h"  // from @pybind11
-#include "pybind11/stl.h"  // from @pybind11
-#include "pybind11_abseil/absl_casters.h"  // from @pybind11_abseil
 #include "xla/layout.h"
 #include "xla/literal.h"
 #include "xla/python/ifrt/dtype.h"
@@ -44,20 +39,15 @@ limitations under the License.
 namespace xla {
 
 // Converts a NumPy dtype to a PrimitiveType.
-absl::StatusOr<PrimitiveType> DtypeToPrimitiveType(
-    const pybind11::dtype& np_type);
-
 absl::StatusOr<PrimitiveType> DtypeToPrimitiveType(const nb_dtype& np_type);
 
 // Converts a PrimitiveType to a Numpy dtype.
-absl::StatusOr<pybind11::dtype> PrimitiveTypeToDtype(PrimitiveType type);
 absl::StatusOr<nb_dtype> PrimitiveTypeToNbDtype(PrimitiveType type);
 
 // Converts an IFRT dtype to a NumPy dtype.
-absl::StatusOr<pybind11::dtype> IfrtDtypeToDtype(ifrt::DType dtype);
 absl::StatusOr<nb_dtype> IfrtDtypeToNbDtype(ifrt::DType dtype);
 
-StatusOr<ifrt::DType> DtypeToIfRtDType(nb_dtype dtype);
+absl::StatusOr<ifrt::DType> DtypeToIfRtDType(nb_dtype dtype);
 
 // Returns a Python buffer protocol (PEP 3118) format descriptor string for
 // `type`. Return nullptr if there is no suitable choice of format string.
@@ -115,22 +105,6 @@ std::vector<int64_t> StridesForShape(PrimitiveType element_type,
 absl::StatusOr<nanobind::object> LiteralToPython(
     std::shared_ptr<Literal> literal);
 
-// Converts a sequence of C++ ints to a Python tuple of ints.
-// Pybind11 by default converts a std::vector<T> to a Python list;
-// we frequently want a tuple instead e.g. for shapes.
-template <typename T>
-pybind11::tuple SpanToTuple(absl::Span<T const> xs) {
-  pybind11::tuple out(xs.size());
-  for (int i = 0; i < xs.size(); ++i) {
-    out[i] = pybind11::cast(xs[i]);
-  }
-  return out;
-}
-template <>
-pybind11::tuple SpanToTuple(absl::Span<int const> xs);
-template <>
-pybind11::tuple SpanToTuple(absl::Span<int64_t const> xs);
-
 template <typename T>
 nanobind::tuple SpanToNbTuple(absl::Span<T const> xs) {
   nanobind::tuple out =
@@ -145,24 +119,6 @@ nanobind::tuple SpanToNbTuple(absl::Span<T const> xs) {
 // references to the objects.
 nanobind::tuple MutableSpanToNbTuple(absl::Span<nanobind::object> xs);
 
-// Converts a Python iterable/sequence of T to std::vector<T>
-template <typename T>
-std::vector<T> IterableToVector(const pybind11::iterable& iterable) {
-  std::vector<T> output;
-  for (auto item : iterable) {
-    output.push_back(item.cast<T>());
-  }
-  return output;
-}
-template <typename T>
-std::vector<T> SequenceToVector(const pybind11::sequence& sequence) {
-  std::vector<T> output;
-  output.reserve(sequence.size());
-  for (auto item : sequence) {
-    output.push_back(item.cast<T>());
-  }
-  return output;
-}
 
 template <typename T>
 std::vector<T> IterableToVector(const nanobind::iterable& iterable) {
@@ -194,81 +150,6 @@ std::optional<CastToArrayResult> CastToArray(nanobind::handle h);
 
 }  // namespace xla
 
-// This namespace is a documented pybind11 extension point.
-// Caution: Unusually for Google code, this code uses C++ exceptions because
-// they are the only mechanism for reporting cast failures to pybind11. However,
-// the exceptions are local to the binding code.
-namespace pybind11 {
-namespace detail {
-
-// Literals.
-// Literal data can be passed to XLA as a NumPy array; its value can be
-// cast to an xla::BorrowingLiteral or xla::LiteralSlice in a zero-copy way.
-// We don't have any literal -> numpy conversions here, since all the methods
-// that want to return arrays build Python objects directly.
-
-template <>
-struct type_caster<xla::BorrowingLiteral> {
- public:
-  PYBIND11_TYPE_CASTER(xla::BorrowingLiteral, _("xla::BorrowingLiteral"));
-
-  // Pybind appears to keep type_casters alive until the callee has run.
-  absl::InlinedVector<pybind11::array, 1> arrays;
-
-  bool load(handle input, bool) {
-    // TODO(b/79707221): support nested tuples if/when XLA adds support for
-    // nested BorrowingLiterals.
-    if (pybind11::isinstance<pybind11::tuple>(input)) {
-      pybind11::tuple tuple =
-          pybind11::reinterpret_borrow<pybind11::tuple>(input);
-      std::vector<xla::Shape> shapes;
-      std::vector<const char*> buffers;
-      arrays.reserve(tuple.size());
-      shapes.reserve(tuple.size());
-      buffers.reserve(tuple.size());
-      for (pybind11::handle entry : tuple) {
-        auto c = xla::CastToArray(entry.ptr());
-        if (!c) {
-          return false;
-        }
-        arrays.push_back(reinterpret_borrow<object>(c->array.ptr()));
-        buffers.push_back(c->buf_ptr);
-        shapes.push_back(c->shape);
-      }
-      value = xla::BorrowingLiteral(buffers,
-                                    xla::ShapeUtil::MakeTupleShape(shapes));
-    } else {
-      auto c = xla::CastToArray(input.ptr());
-      if (!c) {
-        return false;
-      }
-      arrays.push_back(reinterpret_borrow<object>(c->array.ptr()));
-      value = xla::BorrowingLiteral(c->buf_ptr, c->shape);
-    }
-    return true;
-  }
-};
-
-template <>
-struct type_caster<xla::LiteralSlice> {
- public:
-  PYBIND11_TYPE_CASTER(xla::LiteralSlice, const_name("xla::LiteralSlice"));
-
-  // Pybind appears to keep type_casters alive until the callee has run.
-  type_caster<xla::BorrowingLiteral> literal_caster;
-
-  bool load(handle handle, bool convert) {
-    if (!literal_caster.load(handle, convert)) {
-      return false;
-    }
-    value = static_cast<const xla::BorrowingLiteral&>(literal_caster);
-    return true;
-  }
-};
-
-}  // namespace detail
-}  // namespace pybind11
-
 namespace nanobind {
 namespace detail {
 
diff --git a/third_party/xla/xla/python/util.h b/third_party/xla/xla/python/util.h
index 58d986167bc441..3e9f26d2bbd086 100644
--- a/third_party/xla/xla/python/util.h
+++ b/third_party/xla/xla/python/util.h
@@ -19,74 +19,12 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "absl/strings/str_format.h"
 #include "absl/types/span.h"
-#include "pybind11/pybind11.h"  // from @pybind11
-#include "xla/pjrt/pjrt_client.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/status.h"
 
 namespace xla {
 
-template <typename T>
-bool is_pybind_reinterpret_cast_ok(pybind11::handle h) {
-  static pybind11::detail::type_info* const type_info = []() {
-    auto* type_info =
-        pybind11::detail::get_type_info(typeid(T), /*throw_if_missing=*/false);
-    CHECK(type_info);
-    CHECK(type_info->simple_type);
-    return type_info;
-  }();
-  PyTypeObject* srctype = Py_TYPE(h.ptr());
-  // Exact type match.
-  if (srctype == type_info->type) {
-    return true;
-  }
-  // If we have a subtype, then look for a base type that matches.
-  if (PyType_IsSubtype(srctype, type_info->type)) {
-    const auto& bases = pybind11::detail::all_type_info(srctype);
-    for (auto* base : bases) {
-      if (PyType_IsSubtype(base->type, type_info->type)) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-// Faster version of the pybind11 cast x.cast<T*>.
-// pybind11's cast is fairly slow because it looks up the type information
-// in a global hash table. It's not a particularly fast hash table and the
-// lookup is pointless when we know the target type and can cache the lookup.
-// This function does depend on a number of pybind11 internals;
-// if it ever bitrots, one option is to replace it with a pybind11 cast.
-// Return nullptr if the cast fails.
-template <typename T>
-T* fast_cast(pybind11::handle h) {
-  if (!is_pybind_reinterpret_cast_ok<T>(h)) {
-    // Fall back to pybind11's usual cast.
-    return h.cast<T*>();
-  }
-  auto* instance = reinterpret_cast<pybind11::detail::instance*>(h.ptr());
-  if (instance->simple_layout) {
-    return reinterpret_cast<T*>(instance->simple_value_holder[0]);
-  } else {
-    return reinterpret_cast<T*>(
-        pybind11::detail::values_and_holders(instance).begin()->value_ptr());
-  }
-}
-
-// Issues a Python deprecation warning. Throws a C++ exception if issuing the
-// Python warning causes a Python exception to be raised.
-template <typename... Args>
-void PythonDeprecationWarning(const absl::FormatSpec<Args...>& format,
-                              const Args&... args) {
-  if (PyErr_WarnEx(PyExc_DeprecationWarning,
-                   absl::StrFormat(format, args...).c_str(), 1) < 0) {
-    throw pybind11::error_already_set();
-  }
-}
-
 // Requests if given buffers are ready, awaits for results and returns OK if
 // all of the buffers are ready or the last non-ok status.
 Status AwaitBuffersReady(absl::Span<ifrt::Array* const> ifrt_arrays);
diff --git a/third_party/xla/xla/python/weakref_lru_cache_test.py b/third_party/xla/xla/python/weakref_lru_cache_test.py
index e6a5c40326fb85..a3b3d5ee6bf5cf 100644
--- a/third_party/xla/xla/python/weakref_lru_cache_test.py
+++ b/third_party/xla/xla/python/weakref_lru_cache_test.py
@@ -126,8 +126,8 @@ def __hash__(self):
 
     cache = xla_client.weakref_lru_cache(lambda: None, lambda x, y: y, 2048)
     wrkey = WRKey()
-    for _ in range(3):
-      with self.assertRaises(ValueError):
+    with self.assertRaises(ValueError):
+      for _ in range(100):
         cache(wrkey, CrashingKey())
 
 
diff --git a/third_party/xla/xla/python/xla.cc b/third_party/xla/xla/python/xla.cc
index 05752d3ff715e1..cde953367c498d 100644
--- a/third_party/xla/xla/python/xla.cc
+++ b/third_party/xla/xla/python/xla.cc
@@ -13,14 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/python/xla.h"
-
 #include <Python.h>
 
 #include <cstdint>
-#include <exception>
 #include <functional>
-#include <map>
 #include <memory>
 #include <optional>
 #include <set>
@@ -29,45 +25,38 @@ limitations under the License.
 #include <variant>
 #include <vector>
 
-// clang-format off
 #include "absl/base/casts.h"
-// Must be included first
 #include "absl/container/flat_hash_map.h"
 #include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
-#include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
 #include "absl/time/time.h"
 #include "absl/types/span.h"
 #include "third_party/nanobind/include/nanobind/nanobind.h"
 #include "third_party/nanobind/include/nanobind/nb_defs.h"
+#include "third_party/nanobind/include/nanobind/stl/function.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/optional.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/pair.h"  // IWYU pragma: keep
+#include "third_party/nanobind/include/nanobind/stl/set.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/shared_ptr.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/string.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/string_view.h"  // IWYU pragma: keep
+#include "third_party/nanobind/include/nanobind/stl/unique_ptr.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/variant.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/vector.h"  // IWYU pragma: keep
-#include "pybind11/attr.h"  // from @pybind11
-#include "pybind11/cast.h"  // from @pybind11
-#include "pybind11/detail/common.h"  // from @pybind11
-#include "pybind11/numpy.h"  // from @pybind11
-#include "pybind11/pybind11.h"  // from @pybind11
-#include "pybind11/pytypes.h"  // from @pybind11
-#include "pybind11/stl.h"  // from @pybind11
-#include "pybind11/stl_bind.h"  // from @pybind11
-#include "pybind11_abseil/absl_casters.h"  // from @pybind11_abseil
 #include "xla/ffi/ffi_api.h"
-#include "xla/layout_util.h"
 #include "xla/pjrt/c/pjrt_c_api.h"
 #include "xla/pjrt/distributed/client.h"
 #include "xla/pjrt/distributed/distributed.h"
 #include "xla/pjrt/distributed/protocol.pb.h"
 #include "xla/pjrt/distributed/service.h"
 #include "xla/pjrt/pjrt_compiler.h"
+#include "xla/pjrt/status_casters.h"
+#include "xla/python/ifrt_proxy/client/py_module.h"
 #include "xla/python/py_client.h"
 #include "xla/service/cpu/collectives_interface.h"
-#include "tsl/python/lib/core/numpy.h"  //NOLINT
+#include "xla/tsl/python/lib/core/numpy.h"  //NOLINT
 #ifdef XLA_PYTHON_ENABLE_GPU
 #include "xla/pjrt/gpu/se_gpu_pjrt_client.h"
 #endif  // XLA_PYTHON_ENABLE_GPU
@@ -79,23 +68,28 @@ limitations under the License.
 #include "xla/pjrt/cpu/gloo_kv_store.h"
 #endif  // __linux__
 
-#include "xla/literal.h"
+#if !defined(_WIN32) && !defined(PLATFORM_GOOGLE)
+#include "xla/pjrt/cpu/mpi_collectives.h"
+#endif  // !_WIN32 && !PLATFORM_GOOGLE
+
 #include "xla/pjrt/cpu/cpu_client.h"
 #include "xla/pjrt/distributed/key_value_store_interface.h"
 #include "xla/pjrt/exceptions.h"
+#include "xla/pjrt/gpu/gpu_helpers.h"
 #include "xla/pjrt/pjrt_api.h"
 #include "xla/pjrt/pjrt_c_api_client.h"
 #include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/pjrt_common.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/pjrt_layout.h"
-#include "xla/pjrt/status_casters.h"
 #include "xla/python/custom_call_sharding.h"
 #include "xla/python/dlpack.h"
 #include "xla/python/jax_jit.h"
-#include "xla/python/logging.h"
+#include "xla/python/logging.h"  // IWYU pragma: keep
 #include "xla/python/mlir.h"
 #include "xla/python/nb_absl_flat_hash_map.h"  // IWYU pragma: keep
 #include "xla/python/nb_absl_span.h"  // IWYU pragma: keep
+#include "xla/python/nb_class_ptr.h"
 #include "xla/python/ops.h"
 #include "xla/python/outfeed_receiver_py.h"
 #include "xla/python/pjit.h"
@@ -105,22 +99,20 @@ limitations under the License.
 #include "xla/python/profiler.h"
 #include "xla/python/py_array.h"
 #include "xla/python/py_compile_only_client.h"
+#include "xla/python/py_device.h"
 #include "xla/python/py_device_list.h"
 #include "xla/python/py_executable.h"
+#include "xla/python/py_memory_space.h"
 #include "xla/python/python_ref_manager.h"
 #include "xla/python/pytree.h"
 #include "xla/python/sharding.h"
 #include "xla/python/traceback.h"
 #include "xla/python/transfer_guard_lib.h"
-#include "xla/python/types.h"
-#include "xla/python/util.h"
 #include "xla/python/weakref_lru_cache.h"
 #include "xla/python/xla_compiler.h"
-#include "xla/shape.h"
-#include "xla/shape_util.h"
-#include "xla/statusor.h"
 #include "tsl/distributed_runtime/preemption/preemption_sync_manager.h"
 #include "tsl/platform/platform.h"
+#include "tsl/platform/status.h"
 
 // TODO(phawkins): remove host_id properties after JAX is update to avoid them.
 
@@ -128,7 +120,6 @@ namespace xla {
 namespace {
 
 namespace nb = nanobind;
-namespace py = pybind11;
 
 bool IsOptimizedBuild() {
 #if NDEBUG
@@ -168,40 +159,21 @@ bool IsSanitized() { return IsAsan() || IsMsan() || IsTsan(); }
 
 }  // namespace
 
-static void Init(py::module_& m) {
+NB_MODULE(xla_extension, m_nb) {
   // Initialize ABSL logging because code within XLA uses it.
 #ifndef PLATFORM_GOOGLE
   InitializeAbslLogging();
 #endif  // PLATFORM_GOOGLE
 
-  // Normally this would happen at the start of NB_MODULE, but since this is a
-  // pybind11 module we have to do this ourselves.
-  nb::detail::init(NB_DOMAIN_STR);
-
   // We seem to get a fair number of leak warnings from nanobind. It's unclear
   // whether these are false positives or not.
   nb::set_leak_warnings(false);
 
   tsl::ImportNumpy();
 
-  nb::module_ m_nb = nb::cast<nb::module_>(nb::borrow(m.ptr()));
-
   // Exceptions
-  py::register_exception<XlaRuntimeError>(m, "XlaRuntimeError",
-                                          PyExc_RuntimeError);
-
-  // TODO(phawkins): use nb::exception<> once we have migrated all the pybind11
-  // code to nanobind. We use nb::register_exception_translator because we don't
-  // want to define the exception twice.
-  nb::register_exception_translator(
-      [](const std::exception_ptr& p, void* payload) {
-        try {
-          std::rethrow_exception(p);
-        } catch (const XlaRuntimeError& e) {
-          PyErr_SetString(reinterpret_cast<PyObject*>(payload), e.what());
-        }
-      },
-      nb::getattr(m_nb, "XlaRuntimeError").ptr());
+  nb::exception<XlaRuntimeError> xla_runtime_error(m_nb, "XlaRuntimeError",
+                                                   PyExc_RuntimeError);
 
   // Types
   nb::enum_<PrimitiveType>(m_nb, "PrimitiveType")
@@ -235,430 +207,43 @@ static void Init(py::module_& m) {
   // Must be before PyClient.compile.
   BuildXlaCompilerSubmodule(m_nb);
 
-  py::class_<PjRtDevice, ClientAndPtr<PjRtDevice>> device(
-      m, "Device",
-      "A descriptor of an available device.\n\nSubclasses are used to "
-      "represent specific types of devices, e.g. CPUs, GPUs. Subclasses may "
-      "have additional properties specific to that device type.");
-  device
-      .def_property_readonly(
-          "id", &PjRtDevice::id,
-          "Integer ID of this device.\n\nUnique across all available devices "
-          "of this type, including remote devices on multi-host platforms.")
-      .def_property_readonly(
-          "process_index", &PjRtDevice::process_index,
-          "Integer index of this device's process.\n\n"
-          "This is always 0 except on multi-process platforms.")
-      .def_property_readonly("host_id", &PjRtDevice::process_index,
-                             "Deprecated; please use process_index")
-      .def_property_readonly("task_id", &PjRtDevice::process_index,
-                             "Deprecated; please use process_index")
-      .def_property_readonly("platform",
-                             [](const ClientAndPtr<PjRtDevice>& device) {
-                               // TODO(phawkins): this is a temporary backwards
-                               // compatibility shim. We changed the name PJRT
-                               // reports for GPU platforms to "cuda" or "rocm",
-                               // but we haven't yet updated JAX clients that
-                               // expect "gpu". Migrate users and remove this
-                               // code.
-                               if (device.client()->platform_name() == "cuda" ||
-                                   device.client()->platform_name() == "rocm") {
-                                 return absl::string_view("gpu");
-                               } else {
-                                 return device.client()->platform_name();
-                               }
-                             })
-      .def_property_readonly("device_kind", &PjRtDevice::device_kind)
-      .def_property_readonly("client",
-                             [](const ClientAndPtr<PjRtDevice>& device) {
-                               return device.client();
-                             })
-      .def_property_readonly(
-          "local_hardware_id",
-          [](const ClientAndPtr<PjRtDevice>& device) -> std::optional<int> {
-            int local_hardware_id = device->local_hardware_id();
-            if (local_hardware_id == -1) {
-              return std::nullopt;
-            }
-            return local_hardware_id;
-          },
-          "Opaque hardware ID, e.g., the CUDA device number. In general, not "
-          "guaranteed to be dense, and not guaranteed to be defined on all "
-          "platforms.")
-      .def("__str__", &PjRtDevice::DebugString)
-      .def("__repr__", &PjRtDevice::ToString)
-      .def("transfer_to_infeed",
-           [](PjRtDevice& device, py::handle literal_py) {
-             // TODO(phawkins): just accept a Shape argument after nanobind
-             // transition is complete.
-             // We use a type caster directly because we need the value to
-             // alive until the transfer completes.
-             nb::detail::type_caster<LiteralSlice> literal_caster;
-             if (!literal_caster.from_python(literal_py.ptr(), 0, nullptr)) {
-               throw py::cast_error();
-             }
-             GlobalPyRefManager()->CollectGarbage();
-             py::gil_scoped_release gil_release;
-             xla::ThrowIfError(device.TransferToInfeed(literal_caster.value));
-           })
-      .def("transfer_from_outfeed",
-           [](PjRtDevice& device, py::handle shape_py) -> py::object {
-             // TODO(phawkins): just accept a Shape argument after nanobind
-             // transition is complete.
-             Shape shape = nb::cast<Shape>(nb::borrow(shape_py.ptr()));
-             GlobalPyRefManager()->CollectGarbage();
-             std::shared_ptr<Literal> literal;
-             {
-               py::gil_scoped_release gil_release;
-               ShapeUtil::ForEachMutableSubshape(
-                   &shape, [](Shape* subshape, const ShapeIndex&) {
-                     if (!subshape->has_layout()) {
-                       LayoutUtil::SetToDefaultLayout(subshape);
-                     }
-                   });
-               literal = std::make_shared<Literal>(shape);
-               xla::ThrowIfError(device.TransferFromOutfeed(literal.get()));
-             }
-             nb::object out = ValueOrThrow(LiteralToPython(std::move(literal)));
-             return py::reinterpret_steal<py::object>(out.release().ptr());
-           })
-      .def(
-          "memory",
-          [](const ClientAndPtr<PjRtDevice>& device, const std::string& kind) {
-            return jax::GetMemory(device, kind);
-          },
-          py::arg("kind"))
-      // Returns the default memory of a device.
-      .def("default_memory",
-           [](const ClientAndPtr<PjRtDevice>& device) {
-             auto* memory_space =
-                 xla::ValueOrThrow(device->default_memory_space());
-             return WrapWithClient(device.client(), memory_space);
-           })
-      // Returns all the memories that a device can address.
-      .def("addressable_memories",
-           [](const ClientAndPtr<PjRtDevice>& device) {
-             std::vector<ClientAndPtr<PjRtMemorySpace>> memory_spaces;
-             auto span = device->memory_spaces();
-             memory_spaces.reserve(span.size());
-             for (auto* memory_space : span) {
-               memory_spaces.push_back(
-                   WrapWithClient(device.client(), memory_space));
-             }
-             return memory_spaces;
-           })
-      .def("live_buffers",
-           [](const ClientAndPtr<PjRtDevice>& device) {
-             PythonDeprecationWarning(
-                 "Per device live_buffers() is going to be deprecated. Please "
-                 "use the jax.live_arrays() for jax.Arrays instead.");
-             return py::list();
-           })
-      .def(
-          "memory_stats",
-          [](const PjRtDevice& device)
-              -> std::optional<std::map<std::string, int64_t>> {
-            GlobalPyRefManager()->CollectGarbage();
-            xla::StatusOr<tsl::AllocatorStats> maybe_stats =
-                device.GetAllocatorStats();
-            if (absl::IsUnimplemented(maybe_stats.status())) {
-              return std::nullopt;
-            }
-            // Raise error if any status other than Unimplemented is returned.
-            ThrowIfError(maybe_stats.status());
-
-            std::map<std::string, int64_t> result;
-            result["num_allocs"] = maybe_stats->num_allocs;
-            result["bytes_in_use"] = maybe_stats->bytes_in_use;
-            result["peak_bytes_in_use"] = maybe_stats->peak_bytes_in_use;
-            result["largest_alloc_size"] = maybe_stats->largest_alloc_size;
-            if (maybe_stats->bytes_limit) {
-              result["bytes_limit"] = *maybe_stats->bytes_limit;
-            }
-            result["bytes_reserved"] = maybe_stats->bytes_reserved;
-            result["peak_bytes_reserved"] = maybe_stats->peak_bytes_reserved;
-            if (maybe_stats->bytes_reservable_limit) {
-              result["bytes_reservable_limit"] =
-                  *maybe_stats->bytes_reservable_limit;
-            }
-            result["largest_free_block_bytes"] =
-                maybe_stats->largest_free_block_bytes;
-            if (maybe_stats->pool_bytes) {
-              result["pool_bytes"] = *maybe_stats->pool_bytes;
-            }
-            if (maybe_stats->peak_pool_bytes) {
-              result["peak_pool_bytes"] = *maybe_stats->peak_pool_bytes;
-            }
-            return result;
-          },
-          "Returns memory statistics for this device keyed by name. May not be "
-          "implemented on all platforms, and different platforms may return "
-          "different stats, or -1 for unavailable stats. 'bytes_in_use' is "
-          "usually available. Intended for diagnostic use.")
-      .def("get_stream_for_external_ready_events",
-           xla::ValueOrThrowWrapper(
-               &PjRtDevice::GetStreamForExternalReadyEvents));
-  static PyMethodDef get_attr_method = {
-      "__getattr__",
-      +[](PyObject* self, PyObject* args) -> PyObject* {
-        PyObject* key;
-        if (!PyArg_ParseTuple(args, "O", &key)) {
-          PyErr_SetString(PyExc_TypeError, "__getattr__ must take 1 argument.");
-          return nullptr;
-        }
-        try {
-          auto device = py::cast<PjRtDevice*>(py::handle(self));
-          auto name = py::cast<std::string>(py::handle(key));
-          const auto& attrs = device->Attributes();
-          auto it = attrs.find(name);
-          if (it != attrs.end()) {
-            auto result =
-                std::visit([](auto&& v) { return py::cast(v); }, it->second);
-            return result.release().ptr();
-          }
-          PyErr_SetNone(PyExc_AttributeError);
-          return nullptr;
-        } catch (std::exception& e) {
-          PyErr_Format(PyExc_SystemError,
-                       "Some unhandled pybind11 exception: %s", e.what());
-          return nullptr;
-        } catch (...) {
-          PyErr_SetString(PyExc_SystemError,
-                          "Some unhandled pybind11 exception.");
-          return nullptr;
-        }
-      },
-      METH_VARARGS,
-      nullptr,
-  };
-  device.attr("__getattr__") =
-      py::reinterpret_steal<py::object>(PyDescr_NewMethod(
-          reinterpret_cast<PyTypeObject*>(device.ptr()), &get_attr_method));
-
-  py::class_<PjRtMemorySpace, ClientAndPtr<PjRtMemorySpace>> memory_space(
-      m, "Memory");
-  memory_space
-      .def_property_readonly(
-          "process_index",
-          [](const ClientAndPtr<PjRtMemorySpace>& memory_space) {
-            return memory_space.client()->process_index();
-          })
-      .def_property_readonly(
-          "platform",
-          [](const ClientAndPtr<PjRtMemorySpace>& memory_space) {
-            // TODO(phawkins): this is a temporary backwards
-            // compatibility shim. We changed the name PJRT
-            // reports for GPU platforms to "cuda" or "rocm",
-            // but we haven't yet updated JAX clients that
-            // expect "gpu". Migrate users and remove this
-            // code.
-            if (memory_space.client()->platform_name() == "cuda" ||
-                memory_space.client()->platform_name() == "rocm") {
-              return absl::string_view("gpu");
-            } else {
-              return memory_space.client()->platform_name();
-            }
-          })
-      .def_property_readonly("kind", &PjRtMemorySpace::memory_space_kind)
-      .def("__str__", &PjRtMemorySpace::DebugString)
-      .def("__repr__", &PjRtMemorySpace::ToString)
-      // Returns the devices that can address this `Memory`.
-      .def("addressable_by_devices",
-           [](const ClientAndPtr<PjRtMemorySpace>& memory_space) {
-             std::vector<ClientAndPtr<PjRtDevice>> devices;
-             auto span = memory_space->devices();
-             devices.reserve(span.size());
-             for (PjRtDevice* device : span) {
-               devices.push_back(WrapWithClient(memory_space.client(), device));
-             }
-             return devices;
-           });
+  PyDevice::RegisterPythonType(m_nb);
+  PyMemorySpace::RegisterPythonType(m_nb);
+  PyClient::RegisterPythonTypes(m_nb);
 
-  py::class_<PjRtLayout>(m, "PjRtLayout")
+  nb::class_<PjRtLayout>(m_nb, "PjRtLayout")
       .def("__str__", &PjRtLayout::ToString)
       .def("__eq__", [](const PjRtLayout& layout,
                         const PjRtLayout& other) { return layout == other; })
       .def("__hash__",
-           [](const PjRtLayout& layout) { return absl::HashOf(layout); })
-      .def(py::pickle(
-          [](const PjRtLayout& layout) -> py::tuple {
-            StatusOr<std::string> serialized = layout.Serialize();
-            ThrowIfError(serialized.status());
-            return py::make_tuple(py::bytes(*serialized));
-          },
-          [](py::tuple t) {
-            // TODO(b/328671718): don't assume PjRtXlaLayout. We probably want a
-            // generic method on PjRtCompiler instead, although we'll have
-            // somehow have to attach a compiler to this PjRtLayout (something
-            // like ClientAndPtr).
-            StatusOr<PjRtXlaLayout> layout =
-                PjRtXlaLayout::Deserialize(t[0].cast<std::string>());
-            ThrowIfError(layout.status());
-            return std::unique_ptr<PjRtLayout>(
-                new PjRtXlaLayout(std::move(*layout)));
-          }));
-
-  // Local XLA client methods.
-
-  py::enum_<PjRtClient::HostBufferSemantics>(m, "HostBufferSemantics")
-      .value("IMMUTABLE_ONLY_DURING_CALL",
-             PjRtClient::HostBufferSemantics::kImmutableOnlyDuringCall)
-      .value("IMMUTABLE_UNTIL_TRANSFER_COMPLETES",
-             PjRtClient::HostBufferSemantics::kImmutableUntilTransferCompletes)
-      .value("ZERO_COPY", PjRtClient::HostBufferSemantics::kZeroCopy);
-
-  jax::BuildWeakrefLRUCacheAPI(m_nb);
-
-  py::class_<PyClient, std::shared_ptr<PyClient>> py_local_client(m, "Client");
-  py_local_client.def_property_readonly("platform", &PyClient::platform_name)
-      .def_property_readonly("platform_version", &PyClient::platform_version)
-      .def_property_readonly("runtime_type", &PyClient::runtime_type)
-      .def("device_count", &PyClient::device_count)
-      .def("local_device_count", &PyClient::addressable_device_count)
-      .def("devices", &PyClient::Devices)
-      .def("local_devices", &PyClient::LocalDevices)
-      .def("device_from_local_hardware_id",
-           xla::ValueOrThrowWrapper(&PyClient::DeviceFromLocalHardwareId))
-      // TODO(phawkins): revert to the following after nanobind transition is
-      // complete
-      // .def("live_executables", &PyClient::LiveExecutables)
-      // .def("live_arrays", &PyClient::LiveArrays)
-      // .def("live_buffers", &PyClient::LiveArrays)
-      .def("live_executables",
-           [](PyClient& client) {
-             return py::reinterpret_steal<py::object>(
-                 nb::cast(client.LiveExecutables()).release().ptr());
-           })
-      .def("live_arrays",
-           [](const PyClient& client) {
-             return py::reinterpret_steal<py::object>(
-                 nb::cast(client.LiveArrays()).release().ptr());
+           [](const PjRtLayout& layout) { return absl::HashOf(layout); });
+
+  nb::class_<PjRtXlaLayout, PjRtLayout>(m_nb, "PjRtXlaLayout")
+      .def("__getstate__",
+           [](const PjRtXlaLayout& layout) -> nb::tuple {
+             absl::StatusOr<std::string> serialized = layout.Serialize();
+             ThrowIfError(serialized.status());
+             return nb::make_tuple(
+                 nb::bytes(serialized->data(), serialized->size()));
            })
-      .def("live_buffers",
-           [](const PyClient& client) {
-             return py::reinterpret_steal<py::object>(
-                 nb::cast(client.LiveArrays()).release().ptr());
-           })
-      .def("process_index", &PyClient::process_index)
-      .def("host_id", &PyClient::process_index)
-      .def("task_id", &PyClient::process_index)
-      .def(
-          "buffer_from_pyval",
-          [](py::handle py_client, py::handle argument, py::handle py_device,
-             bool force_copy,
-             PjRtClient::HostBufferSemantics host_buffer_semantics) {
-            PyClient* client = fast_cast<PyClient>(py_client);
-            PjRtDevice* device = py_device.is_none()
-                                     ? nullptr
-                                     : fast_cast<PjRtDevice>(py_device);
-            return ValueOrThrow(client->BufferFromPyval(
-                argument, device, force_copy, host_buffer_semantics));
-          },
-          py::arg("argument"), py::arg("device") = nullptr,
-          py::arg("force_copy") = false,
-          py::arg("host_buffer_semantics") =
-              PjRtClient::HostBufferSemantics::kZeroCopy)
-      .def("make_cross_host_receive_buffers",
-           xla::ValueOrThrowWrapper(&PyClient::MakeCrossHostReceiveBuffers),
-           py::arg("shapes"), py::arg("device"))
-      .def(
-          "compile",
-          [](PyClient& self, std::string mlir_module, py::object options_py,
-             std::vector<pybind11::capsule> host_callbacks) {
-            // TODO(phawkins): just wrap PyClient::Compile directly when the
-            // nanobind transition is complete.
-            CompileOptions options;
-            if (!options_py.is_none()) {
-              try {
-                options =
-                    nb::cast<CompileOptions>(nb::handle(options_py.ptr()));
-              } catch (std::exception& e) {
-                throw py::type_error(e.what());
-              }
-            }
-            return py::reinterpret_steal<py::object>(
-                nb::cast(ValueOrThrow(self.Compile(mlir_module, options,
-                                                   host_callbacks)))
-                    .release()
-                    .ptr());
-          },
-          py::arg("computation"), py::arg("compile_options") = py::none(),
-          py::arg("host_callbacks") = std::vector<py::capsule>())
-      .def("serialize_executable",
-           // TODO(phawkins): revert to the following after nanobind transition
-           // xla::ValueOrThrowWrapper(&PyClient::SerializeExecutable))
-           [](const PyClient& self, py::object executable_py) {
-             const PyLoadedExecutable* executable =
-                 nb::cast<const PyLoadedExecutable*>(
-                     nb::handle(executable_py.ptr()));
-             return xla::ValueOrThrow(self.SerializeExecutable(*executable));
-           })
-      .def(
-          "deserialize_executable",
-          // TODO(phawkins): revert to the following after nanobind transition
-          // is complete
-          // xla::ValueOrThrowWrapper(&PyClient::DeserializeExecutable),
-          [](PyClient& self, const std::string& serialized,
-             py::object options_py,
-             std::vector<pybind11::capsule> host_callbacks) {
-            std::optional<CompileOptions> options;
-            if (!options_py.is_none()) {
-              try {
-                options =
-                    nb::cast<CompileOptions>(nb::handle(options_py.ptr()));
-              } catch (std::exception& e) {
-                throw py::type_error(e.what());
-              }
-            }
-            auto out = nb::cast(xla::ValueOrThrow(self.DeserializeExecutable(
-                serialized, options, host_callbacks)));
-            return py::reinterpret_steal<py::object>(out.release().ptr());
-          },
-          py::arg("serialized"), py::arg("compile_options") = py::none(),
-          py::arg("host_callbacks") = std::vector<py::capsule>())
-      .def("heap_profile", xla::ValueOrThrowWrapper(&PyClient::HeapProfile))
-      // TODO(zhangqiaorjc): Experimental.
-      .def("defragment",
-           [](PyClient& self) { xla::ThrowIfError(self.Defragment()); })
-      .def("get_emit_python_callback_descriptor",
-           xla::ValueOrThrowWrapper(&PyClient::GetEmitPythonCallbackDescriptor),
-           py::arg("callable"), py::arg("operand_shapes"),
-           py::arg("result_shapes") = py::none())
-      .def(
-          "make_python_callback_from_host_send_and_recv",
-          // TODO(phawkins): revert to
-          //  xla::ValueOrThrowWrapper(
-          //      &PyClient::MakePythonCallbackUsingHostSendAndRecv),
-          // when the nanobind transition is done.
-          [](PyClient& self, py::function callable, py::object operand_shapes,
-             py::object result_shapes,
-             absl::Span<uint16_t const> send_channel_ids,
-             absl::Span<uint16_t const> recv_channel_ids,
-             py::function serializer) {
-            return ValueOrThrow(self.MakePythonCallbackUsingHostSendAndRecv(
-                callable,
-                nb::cast<std::vector<Shape>>(nb::handle(operand_shapes.ptr())),
-                nb::cast<std::vector<Shape>>(nb::handle(result_shapes.ptr())),
-                send_channel_ids, recv_channel_ids, serializer));
-          },
-
-          py::arg("callable"), py::arg("operand_shapes"),
-          py::arg("result_shapes"), py::arg("send_channel_ids"),
-          py::arg("recv_channel_ids"), py::arg("serializer") = py::none())
-      .def("__getattr__", [](PyClient& client, std::string name) -> py::object {
-        const auto& attrs = client.attributes();
-        auto it = attrs.find(name);
-        if (it != attrs.end()) {
-          return std::visit([](auto&& v) { return py::cast(v); }, it->second);
-        }
-        throw py::attribute_error(absl::StrCat("Unknown attribute ", name));
+      .def("__setstate__", [](PjRtXlaLayout* self, nb::tuple t) {
+        // TODO(b/328671718): don't assume PjRtXlaLayout. We probably want a
+        // generic method on PjRtCompiler instead, although we'll have
+        // somehow have to attach a compiler to this PjRtLayout (something
+        // like ClientAndPtr).
+        nb::bytes serialized = nb::cast<nb::bytes>(t[0]);
+        absl::StatusOr<PjRtXlaLayout> layout = PjRtXlaLayout::Deserialize(
+            std::string_view(serialized.c_str(), serialized.size()));
+        ThrowIfError(layout.status());
+        new (self) PjRtXlaLayout(std::move(*layout));
       });
 
-  py::class_<xla::cpu::CollectivesInterface,
-             std::shared_ptr<xla::cpu::CollectivesInterface>>
-      cpu_collectives(m, "CpuCollectives");
+  jax::BuildWeakrefLRUCacheAPI(m_nb);
 
-  m.def(
+  nb::class_<xla::cpu::CollectivesInterface> cpu_collectives(m_nb,
+                                                             "CpuCollectives");
+
+  m_nb.def(
       "make_gloo_tcp_collectives",
       [](std::shared_ptr<DistributedRuntimeClient> distributed_client,
 
@@ -687,177 +272,209 @@ static void Init(py::module_& m) {
             "make_gloo_tcp_collectives only implemented for linux");
 #endif  // __linux__
       },
-      py::arg("distributed_client"), py::arg("hostname") = std::nullopt,
-      py::arg("interface") = std::nullopt);
+      nb::arg("distributed_client"), nb::arg("hostname").none() = std::nullopt,
+      nb::arg("interface").none() = std::nullopt);
+
+#if !defined(_WIN32) && !defined(PLATFORM_GOOGLE)
+  nb::class_<cpu::MpiCollectives> mpi_collectives(m_nb, "MpiCollectives",
+                                                  cpu_collectives);
+  mpi_collectives.def("Init", &cpu::MpiCollectives::Init);
+  mpi_collectives.def("Finalize", &cpu::MpiCollectives::Finalize);
+  m_nb.def("make_mpi_collectives",
+           []() -> std::shared_ptr<cpu::MpiCollectives> {
+             return std::make_shared<cpu::MpiCollectives>();
+           });
+#else   // !_WIN32 && !PLATFORM_GOOGLE
+  m_nb.def("make_mpi_collectives",
+           []() -> std::shared_ptr<xla::cpu::CollectivesInterface> {
+             throw xla::XlaRuntimeError(
+                 "make_mpi_collectives is not implemented for Windows");
+           });
+#endif  // !_WIN32 && !PLATFORM_GOOGLE
 
-  m.def(
+  m_nb.def(
       "get_tfrt_cpu_client",
       [](bool asynchronous,
          std::shared_ptr<DistributedRuntimeClient> distributed_client,
          int node_id, int num_nodes,
          std::shared_ptr<xla::cpu::CollectivesInterface> collectives)
-          -> std::shared_ptr<PyClient> {
-        py::gil_scoped_release gil_release;
-        CpuClientOptions options;
-        if (distributed_client != nullptr) {
-          options.kv_store = GetDistributedKeyValueStore(distributed_client,
-                                                         /*key_prefix=*/"cpu:");
-          options.node_id = node_id;
-          options.num_nodes = num_nodes;
+          -> nb_class_ptr<PyClient> {
+        std::unique_ptr<ifrt::PjRtClient> ifrt_client;
+        {
+          nb::gil_scoped_release gil_release;
+          CpuClientOptions options;
+          if (distributed_client != nullptr) {
+            options.kv_store =
+                GetDistributedKeyValueStore(distributed_client,
+                                            /*key_prefix=*/"cpu:");
+            options.node_id = node_id;
+            options.num_nodes = num_nodes;
+
+            options.collectives = std::move(collectives);
+          }
 
-          options.collectives = std::move(collectives);
+          options.asynchronous = asynchronous;
+          std::unique_ptr<PjRtClient> client =
+              xla::ValueOrThrow(GetTfrtCpuClient(options));
+          ifrt_client = ifrt::PjRtClient::Create(std::move(client));
         }
-
-        options.asynchronous = asynchronous;
-        std::unique_ptr<PjRtClient> client =
-            xla::ValueOrThrow(GetTfrtCpuClient(options));
-        return std::make_shared<PyClient>(
-            ifrt::PjRtClient::Create(std::move(client)));
+        return PyClient::Make(std::move(ifrt_client));
       },
-      py::arg("asynchronous") = true, py::arg("distributed_client") = nullptr,
-      py::arg("node_id") = 0, py::arg("num_nodes") = 1,
-      py::arg("collectives") =
+      nb::arg("asynchronous") = true, nb::arg("distributed_client") = nullptr,
+      nb::arg("node_id") = 0, nb::arg("num_nodes") = 1,
+      nb::arg("collectives").none() =
           std::shared_ptr<xla::cpu::CollectivesInterface>());
-  m.def("pjrt_plugin_loaded", [](std::string platform_name) -> bool {
-    xla::StatusOr<const PJRT_Api*> pjrt_api = pjrt::PjrtApi(platform_name);
+  m_nb.def("pjrt_plugin_loaded", [](std::string platform_name) -> bool {
+    absl::StatusOr<const PJRT_Api*> pjrt_api = pjrt::PjrtApi(platform_name);
     return pjrt_api.ok();
   });
-  m.def(
+  m_nb.def(
       "load_pjrt_plugin",
       [](std::string platform_name, std::optional<std::string> library_path,
-         std::optional<py::capsule> c_api) -> py::capsule {
+         std::optional<nb::capsule> c_api) -> nb::capsule {
         if (library_path.has_value()) {
           const PJRT_Api* api = xla::ValueOrThrow(
               pjrt::LoadPjrtPlugin(platform_name, *library_path));
-          return py::capsule(absl::bit_cast<void*>(api), "pjrt_c_api");
+          return nb::capsule(absl::bit_cast<void*>(api), "pjrt_c_api");
         }
         if (absl::string_view(c_api->name()) != "pjrt_c_api") {
-          throw py::value_error(
+          throw nb::value_error(
               "c_api argument to load_pjrt_plugin is not a pjrt_c_api "
               "capsule.");
         }
         xla::ThrowIfError(pjrt::SetPjrtApi(
-            platform_name, static_cast<const PJRT_Api*>(*c_api)));
+            platform_name, static_cast<const PJRT_Api*>(c_api->data())));
         return *c_api;
       },
-      py::arg("platform_name"), py::arg("library_path") = std::nullopt,
-      py::arg("c_api") = std::nullopt);
-  m.def("pjrt_plugin_initialized", [](std::string platform_name) -> bool {
+      nb::arg("platform_name"), nb::arg("library_path").none() = std::nullopt,
+      nb::arg("c_api").none() = std::nullopt);
+  m_nb.def("pjrt_plugin_initialized", [](std::string platform_name) -> bool {
     return xla::ValueOrThrow(pjrt::IsPjrtPluginInitialized(platform_name));
   });
-  m.def("initialize_pjrt_plugin", [](std::string platform_name) {
+  m_nb.def("initialize_pjrt_plugin", [](std::string platform_name) {
     return xla::ThrowIfError(pjrt::InitializePjrtPlugin(platform_name));
   });
 
 #ifdef XLA_PYTHON_ENABLE_GPU
-  py::class_<GpuAllocatorConfig> alloc_config(m, "GpuAllocatorConfig");
-  alloc_config.def(py::init<>())
-      .def_readwrite("kind", &GpuAllocatorConfig::kind)
-      .def_readwrite("memory_fraction", &GpuAllocatorConfig::memory_fraction)
-      .def_readwrite("preallocate", &GpuAllocatorConfig::preallocate)
-      .def_readwrite("collective_memory_size",
-                     &GpuAllocatorConfig::collective_memory_size);
-  py::enum_<GpuAllocatorConfig::Kind>(alloc_config, "Kind")
+  nb::class_<GpuAllocatorConfig> alloc_config(m_nb, "GpuAllocatorConfig");
+  alloc_config.def(nb::init<>())
+      .def_rw("kind", &GpuAllocatorConfig::kind)
+      .def_rw("memory_fraction", &GpuAllocatorConfig::memory_fraction)
+      .def_rw("preallocate", &GpuAllocatorConfig::preallocate)
+      .def_rw("collective_memory_size",
+              &GpuAllocatorConfig::collective_memory_size);
+  nb::enum_<GpuAllocatorConfig::Kind>(alloc_config, "Kind")
       .value("DEFAULT", GpuAllocatorConfig::Kind::kDefault)
       .value("PLATFORM", GpuAllocatorConfig::Kind::kPlatform)
       .value("BFC", GpuAllocatorConfig::Kind::kBFC)
       .value("CUDA_ASYNC", GpuAllocatorConfig::Kind::kCudaAsync);
 
-  m.def(
+  m_nb.def(
       "get_gpu_client",
       [](bool asynchronous, const GpuAllocatorConfig& allocator_config,
          std::shared_ptr<DistributedRuntimeClient> distributed_client,
          int node_id, int num_nodes,
          std::optional<std::set<int>> allowed_devices,
          std::optional<std::string> platform_name,
-         std::optional<bool> mock = false) -> std::shared_ptr<PyClient> {
-        py::gil_scoped_release gil_release;
-        std::shared_ptr<KeyValueStoreInterface> kv_store = nullptr;
-        if (distributed_client != nullptr) {
-          kv_store = GetDistributedKeyValueStore(distributed_client,
-                                                 /*key_prefix=*/"gpu:");
+         std::optional<bool> mock = false) -> nb_class_ptr<PyClient> {
+        std::unique_ptr<ifrt::PjRtClient> ifrt_client;
+        {
+          nb::gil_scoped_release gil_release;
+          std::shared_ptr<KeyValueStoreInterface> kv_store = nullptr;
+          if (distributed_client != nullptr) {
+            kv_store = GetDistributedKeyValueStore(distributed_client,
+                                                   /*key_prefix=*/"gpu:");
+          }
+          GpuClientOptions options;
+          options.allocator_config = allocator_config;
+          options.node_id = node_id;
+          options.num_nodes = num_nodes;
+          options.allowed_devices = allowed_devices;
+          options.platform_name = platform_name;
+          options.kv_store = kv_store;
+          options.enable_mock_nccl = mock.value_or(false);
+          std::unique_ptr<PjRtClient> pjrt_client =
+              xla::ValueOrThrow(GetStreamExecutorGpuClient(options));
+          ifrt_client = ifrt::PjRtClient::Create(std::move(pjrt_client));
         }
-        GpuClientOptions options;
-        options.allocator_config = allocator_config;
-        options.node_id = node_id;
-        options.num_nodes = num_nodes;
-        options.allowed_devices = allowed_devices;
-        options.platform_name = platform_name;
-        options.kv_store = kv_store;
-        options.enable_mock_nccl = mock.value_or(false);
-        std::unique_ptr<PjRtClient> client =
-            xla::ValueOrThrow(GetStreamExecutorGpuClient(options));
-        return std::make_shared<PyClient>(
-            ifrt::PjRtClient::Create(std::move(client)));
+        return PyClient::Make(std::move(ifrt_client));
       },
-      py::arg("asynchronous") = true,
-      py::arg("allocator_config") = GpuAllocatorConfig(),
-      py::arg("distributed_client") = nullptr, py::arg("node_id") = 0,
-      py::arg("num_nodes") = 1, py::arg("allowed_devices") = std::nullopt,
-      py::arg("platform_name") = std::nullopt, py::arg("mock") = std::nullopt);
+      nb::arg("asynchronous") = true,
+      nb::arg("allocator_config") = GpuAllocatorConfig(),
+      nb::arg("distributed_client") = nullptr, nb::arg("node_id") = 0,
+      nb::arg("num_nodes") = 1,
+      nb::arg("allowed_devices").none() = std::nullopt,
+      nb::arg("platform_name").none() = std::nullopt,
+      nb::arg("mock").none() = std::nullopt);
 #endif  // XLA_PYTHON_ENABLE_GPU
 
-  m.def(
+  m_nb.def(
       "get_c_api_client",
       [](std::string platform_name,
          const absl::flat_hash_map<std::string, PjRtValueType>& options,
          std::shared_ptr<DistributedRuntimeClient> distributed_client)
-          -> std::shared_ptr<PyClient> {
-        py::gil_scoped_release gil_release;
-        std::shared_ptr<KeyValueStoreInterface> kv_store = nullptr;
-        if (distributed_client != nullptr) {
-          kv_store = GetDistributedKeyValueStore(
-              distributed_client,
-              /*key_prefix=*/absl::StrCat(platform_name, ":"));
+          -> nb_class_ptr<PyClient> {
+        std::unique_ptr<ifrt::PjRtClient> ifrt_client;
+        {
+          nb::gil_scoped_release gil_release;
+          std::shared_ptr<KeyValueStoreInterface> kv_store = nullptr;
+          if (distributed_client != nullptr) {
+            kv_store = GetDistributedKeyValueStore(
+                distributed_client,
+                /*key_prefix=*/absl::StrCat(platform_name, ":"));
+          }
+          std::unique_ptr<PjRtClient> c_api_client = xla::ValueOrThrow(
+              GetCApiClient(platform_name, options, kv_store));
+          ifrt_client = ifrt::PjRtClient::Create(std::move(c_api_client));
         }
-        std::unique_ptr<PjRtClient> c_api_client =
-            xla::ValueOrThrow(GetCApiClient(platform_name, options, kv_store));
-        return std::make_shared<PyClient>(
-            ifrt::PjRtClient::Create(std::move(c_api_client)));
+        return PyClient::Make(std::move(ifrt_client));
       },
-      py::arg("platform_name"),
-      py::arg("options") = absl::flat_hash_map<std::string, PjRtValueType>(),
-      py::arg("distributed_client") = nullptr);
+      nb::arg("platform_name"),
+      nb::arg("options") = absl::flat_hash_map<std::string, PjRtValueType>(),
+      nb::arg("distributed_client").none() = nullptr);
   // TODO(b/322357665): Delete this method after TPU plugin changes to use the
   // standard registration.
-  m.def("get_default_c_api_topology",
-        [](std::string platform_name, std::string topology_name,
-           const absl::flat_hash_map<std::string, PjRtValueType>& options)
-            -> std::shared_ptr<PjRtTopologyDescription> {
-          return xla::ValueOrThrow(
-              GetCApiTopology(platform_name, topology_name, options));
-        });
-  m.def("get_c_api_topology",
-        [](py::capsule c_api, std::string topology_name,
-           const absl::flat_hash_map<std::string, PjRtValueType>& options)
-            -> std::shared_ptr<PjRtTopologyDescription> {
-          if (absl::string_view(c_api.name()) != "pjrt_c_api") {
-            throw py::value_error(
-                "Argument to get_c_api_topology was not a pjrt_c_api capsule.");
-          }
-          return xla::ValueOrThrow(GetCApiTopology(
-              static_cast<const PJRT_Api*>(c_api), topology_name, options));
-        });
-  m.def("get_topology_for_devices",
-        [](std::vector<ClientAndPtr<PjRtDevice>> devices_and_clients) {
-          if (devices_and_clients.empty()) {
-            throw py::value_error(
-                "get_topology_for_devices requires >= 1 devices.");
-          }
-          auto client = devices_and_clients[0].client();
-          std::vector<PjRtDevice*> devices;
-          devices.reserve(devices_and_clients.size());
-          for (const ClientAndPtr<PjRtDevice>& device : devices_and_clients) {
-            if (device.get_client() != client.get()) {
-              throw py::value_error(
-                  "devices passed to get_topology_for_devices come from "
-                  "different clients.");
-            }
-            devices.push_back(device.get());
+  m_nb.def("get_default_c_api_topology",
+           [](std::string platform_name, std::string topology_name,
+              const absl::flat_hash_map<std::string, PjRtValueType>& options)
+               -> std::shared_ptr<PjRtTopologyDescription> {
+             return xla::ValueOrThrow(
+                 GetCApiTopology(platform_name, topology_name, options));
+           });
+  m_nb.def(
+      "get_c_api_topology",
+      [](nb::capsule c_api, std::string topology_name,
+         const absl::flat_hash_map<std::string, PjRtValueType>& options)
+          -> std::shared_ptr<PjRtTopologyDescription> {
+        if (absl::string_view(c_api.name()) != "pjrt_c_api") {
+          throw nb::value_error(
+              "Argument to get_c_api_topology was not a pjrt_c_api capsule.");
+        }
+        return xla::ValueOrThrow(
+            GetCApiTopology(static_cast<const PJRT_Api*>(c_api.data()),
+                            topology_name, options));
+      });
+  m_nb.def(
+      "get_topology_for_devices",
+      [](const std::vector<nb_class_ptr<PyDevice>>& py_devices) {
+        if (py_devices.empty()) {
+          throw nb::value_error(
+              "get_topology_for_devices requires >= 1 devices.");
+        }
+        auto client = py_devices[0]->client();
+        std::vector<PjRtDevice*> ifrt_devices;
+        ifrt_devices.reserve(py_devices.size());
+        for (const auto& py_device : py_devices) {
+          if (py_device->client().get() != client.get()) {
+            throw nb::value_error(
+                "devices passed to get_topology_for_devices come from "
+                "different clients.");
           }
-          return xla::ValueOrThrow(client->ifrt_client()->GetTopologyForDevices(
-              absl::MakeSpan(devices)));
-        });
+          ifrt_devices.push_back(py_device->device());
+        }
+        return xla::ValueOrThrow(client->ifrt_client()->GetTopologyForDevices(
+            absl::MakeSpan(ifrt_devices)));
+      });
 
   TF_CHECK_OK(PyArray::RegisterTypes(m_nb));
   jax::RegisterDeviceList(m_nb);
@@ -899,12 +516,7 @@ static void Init(py::module_& m) {
       .def("consume_token", &PyExecuteResults::ConsumeToken);
 
   nb::class_<PyLoadedExecutable>(m_nb, "LoadedExecutable")
-      .def_prop_ro(
-          "client",
-          // TODO(phawkins): directly wrap method after nanobind transition.
-          [](const PyLoadedExecutable& self) -> nb::object {
-            return nb::borrow(py::cast(self.client()).ptr());
-          })
+      .def_prop_ro("client", &PyLoadedExecutable::client)
       .def("local_logical_device_ids",
            [](PyLoadedExecutable* exec) {
              auto span = exec->addressable_device_logical_ids();
@@ -916,12 +528,7 @@ static void Init(py::module_& m) {
                    logical_device_id.replica, logical_device_id.partition));
              }
            })
-      // TODO(phawkins): directly wrap after nanobind transition
-      // .def("local_devices", &PyLoadedExecutable::AddressableDevices)
-      .def("local_devices",
-           [](const PyLoadedExecutable& self) {
-             return nb::borrow(py::cast(self.AddressableDevices()).ptr());
-           })
+      .def("local_devices", &PyLoadedExecutable::AddressableDevices)
       .def("size_of_generated_code_in_bytes",
            &PyLoadedExecutable::SizeOfGeneratedCodeInBytes)
       .def(
@@ -977,33 +584,31 @@ static void Init(py::module_& m) {
   });
   sharded_token.def("get_token", &PyShardedToken::GetPyToken);
 
-  m.def("buffer_to_dlpack_managed_tensor",
-        xla::ValueOrThrowWrapper(BufferToDLPackManagedTensor),
-        py::arg("buffer"), py::arg("stream") = py::none());
-  m.def("dlpack_managed_tensor_to_buffer",
-        [](const pybind11::capsule& tensor, ClientAndPtr<PjRtDevice> device,
-           std::optional<std::intptr_t> stream) {
-          return xla::ValueOrThrow(DLPackManagedTensorToBuffer(
-              tensor, device.get(), device.client(), stream));
-        });
+  m_nb.def("buffer_to_dlpack_managed_tensor",
+           xla::ValueOrThrowWrapper(BufferToDLPackManagedTensor),
+           nb::arg("buffer"), nb::arg("stream").none() = nb::none());
+  m_nb.def(
+      "dlpack_managed_tensor_to_buffer",
+      [](const nb::capsule& tensor, nb_class_ptr<PyDevice> device,
+         std::optional<std::intptr_t> stream) {
+        return xla::ValueOrThrow(DLPackManagedTensorToBuffer(
+            tensor, device->device(), device->client(), stream));
+      },
+      nb::arg("dlpack"), nb::arg("device"), nb::arg("stream").none());
   // Legacy overload
-  m.def(
+  m_nb.def(
       "dlpack_managed_tensor_to_buffer",
-      [](const pybind11::capsule& tensor, std::shared_ptr<PyClient> cpu_client,
-         std::shared_ptr<PyClient> gpu_client) {
+      [](const nb::capsule& tensor,
+         std::optional<nb_class_ptr<PyClient>> cpu_client,
+         std::optional<nb_class_ptr<PyClient>> gpu_client) {
         return xla::ValueOrThrow(DLPackManagedTensorToBuffer(
             tensor, std::move(cpu_client), std::move(gpu_client)));
       },
-      py::arg("dlpack"), py::arg("cpu_backend") = nullptr,
-      py::arg("gpu_backend") = nullptr);
-  m.def("cuda_array_interface_to_buffer",
-        [](py::handle cai_py, std::shared_ptr<PyClient> cuda_client) {
-          // TODO(phawkins): simplify after nanobind transition is complete.
-          nb::dict cai = nb::cast<nb::dict>(nb::handle(cai_py.ptr()));
-          auto out = xla::ValueOrThrow(
-              CudaArrayInterfaceToBuffer(cai, std::move(cuda_client)));
-          return py::reinterpret_steal<py::object>(out.release().ptr());
-        });
+      nb::arg("dlpack"), nb::arg("cpu_backend").none() = nb::none(),
+      nb::arg("gpu_backend").none() = nb::none());
+  m_nb.def("cuda_array_interface_to_buffer",
+           xla::ValueOrThrowWrapper(CudaArrayInterfaceToBuffer));
+
   BuildProfilerSubmodule(m_nb);
   BuildOpsSubmodule(m_nb);
   BuildOutfeedReceiverSubmodule(m_nb);
@@ -1016,9 +621,12 @@ static void Init(py::module_& m) {
   BuildMlirSubmodule(m_nb);
   BuildCustomCallShardingPybindAPI(m_nb);
 
-  py::class_<tsl::PreemptionSyncManager,
-             std::unique_ptr<tsl::PreemptionSyncManager>>
-      preemption_sync_manager(m, "PreemptionSyncManager");
+  // The following uses python bindings for PyClient defined above using
+  // pybind11, and hence needs pybind11::module_ (not just nanobind::module_).
+  xla::ifrt::proxy::BuildIfrtProxySubmodule(m_nb);
+
+  nb::class_<tsl::PreemptionSyncManager> preemption_sync_manager(
+      m_nb, "PreemptionSyncManager");
   preemption_sync_manager
       .def(
           "initialize",
@@ -1028,32 +636,30 @@ static void Init(py::module_& m) {
                 xla::ValueOrThrow(client->GetCoordinationServiceAgent());
             xla::ThrowIfError(manager.Initialize(agent));
           },
-          py::arg("distributed_client"))
+          nb::arg("distributed_client"))
       .def("reached_sync_point",
            [](tsl::PreemptionSyncManager& manager, int step_counter) {
              return manager.ReachedSyncPoint(step_counter);
            });
-  m.def("create_preemption_sync_manager",
-        []() { return tsl::CreatePreemptionSyncManager(); });
+  m_nb.def("create_preemption_sync_manager",
+           []() { return tsl::CreatePreemptionSyncManager(); });
 
-  py::class_<DistributedRuntimeService,
-             std::unique_ptr<DistributedRuntimeService>>
-      distributed_runtime_service(m, "DistributedRuntimeService");
+  nb::class_<DistributedRuntimeService> distributed_runtime_service(
+      m_nb, "DistributedRuntimeService");
   distributed_runtime_service.def("shutdown",
                                   &DistributedRuntimeService::Shutdown,
-                                  py::call_guard<py::gil_scoped_release>());
-  py::class_<DistributedRuntimeClient,
-             std::shared_ptr<DistributedRuntimeClient>>
-      distributed_runtime_client(m, "DistributedRuntimeClient");
+                                  nb::call_guard<nb::gil_scoped_release>());
+  nb::class_<DistributedRuntimeClient> distributed_runtime_client(
+      m_nb, "DistributedRuntimeClient");
   distributed_runtime_client
       .def("connect",
            [](DistributedRuntimeClient& self) {
-             py::gil_scoped_release gil_release;
+             nb::gil_scoped_release gil_release;
              xla::ThrowIfError(self.Connect());
            })
       .def("shutdown",
            [](DistributedRuntimeClient& self) {
-             py::gil_scoped_release gil_release;
+             nb::gil_scoped_release gil_release;
              xla::ThrowIfError(self.Shutdown());
            })
       // This method assumes that the value is a Python string. Use
@@ -1063,32 +669,32 @@ static void Init(py::module_& m) {
           "blocking_key_value_get",
           [](DistributedRuntimeClient& client, std::string key,
              int64_t timeout_in_ms) {
-            py::gil_scoped_release gil_release;
+            nb::gil_scoped_release gil_release;
             return xla::ValueOrThrow(client.BlockingKeyValueGet(
                 key, absl::Milliseconds(timeout_in_ms)));
           },
-          py::arg("key"), py::arg("timeout_in_ms"))
+          nb::arg("key"), nb::arg("timeout_in_ms"))
       // Same as `blocking_key_value_get()`, but retrieves the raw Python byte
       // values explicitly.
       .def(
           "blocking_key_value_get_bytes",
           [](DistributedRuntimeClient& client, std::string key,
-             int64_t timeout_in_ms) -> py::bytes {
-            py::gil_scoped_release gil_release;
+             int64_t timeout_in_ms) -> nb::bytes {
+            nb::gil_scoped_release gil_release;
             std::string result = xla::ValueOrThrow(client.BlockingKeyValueGet(
                 key, absl::Milliseconds(timeout_in_ms)));
-            return py::bytes(result);
+            return nb::bytes(result.data(), result.size());
           },
-          py::arg("key"), py::arg("timeout_in_ms"))
+          nb::arg("key"), nb::arg("timeout_in_ms"))
       .def(
           "wait_at_barrier",
           [](DistributedRuntimeClient& client, std::string barrier_id,
              int64_t timeout_in_ms) {
-            py::gil_scoped_release gil_release;
+            nb::gil_scoped_release gil_release;
             xla::ThrowIfError(client.WaitAtBarrier(
                 barrier_id, absl::Milliseconds(timeout_in_ms)));
           },
-          py::arg("barrier_id"), py::arg("timeout_in_ms"))
+          nb::arg("barrier_id"), nb::arg("timeout_in_ms"))
       // The key must be a string, but the value can either be a Python string
       // or bytes object.
       // With Python string values, use `key_value_set()` and
@@ -1097,58 +703,70 @@ static void Init(py::module_& m) {
       // `blocking_key_value_get_bytes()`.
       .def(
           "key_value_set",
-          [](DistributedRuntimeClient& client, std::string key,
-             std::string value) {
-            py::gil_scoped_release gil_release;
+          [](DistributedRuntimeClient& client, std::string_view key,
+             std::string_view value) {
+            nb::gil_scoped_release gil_release;
             xla::ThrowIfError(client.KeyValueSet(key, value));
           },
-          py::arg("key"), py::arg("value"))
-      // The key must be a string, but the value must a Python bytes object.
+          nb::arg("key"), nb::arg("value"))
+      .def(
+          "key_value_set",
+          [](DistributedRuntimeClient& client, std::string_view key,
+             nb::bytes value) {
+            nb::gil_scoped_release gil_release;
+            xla::ThrowIfError(client.KeyValueSet(
+                key, std::string_view(value.c_str(), value.size())));
+          },
+          nb::arg("key"), nb::arg("value"))
+      // The key must be a string, but the value must a
+      // Python bytes object.
       // Use `key_value_set_bytes()` and `blocking_key_value_get_bytes()`.
       .def(
           "key_value_set_bytes",
-          [](DistributedRuntimeClient& client, std::string key,
-             py::bytes value) {
-            py::gil_scoped_release gil_release;
-            xla::ThrowIfError(client.KeyValueSet(key, value));
+          [](DistributedRuntimeClient& client, std::string_view key,
+             nb::bytes value) {
+            nb::gil_scoped_release gil_release;
+            xla::ThrowIfError(client.KeyValueSet(
+                key, std::string_view(value.c_str(), value.size())));
           },
-          py::arg("key"), py::arg("value"))
+          nb::arg("key"), nb::arg("value"))
       // Assumes that all values in the directory are Python strings.
       .def(
           "key_value_dir_get",
-          [](DistributedRuntimeClient& client, std::string key) {
-            py::gil_scoped_release gil_release;
+          [](DistributedRuntimeClient& client, std::string_view key) {
+            nb::gil_scoped_release gil_release;
             return xla::ValueOrThrow(client.KeyValueDirGet(key));
           },
-          py::arg("key"))
+          nb::arg("key"))
       // Assumes that all values in the directory are Python byte objects.
       // Same as `key_value_dir_get()`, but retrieves Python byte values
       // explicitly.
       .def(
           "key_value_dir_get_bytes",
-          [](DistributedRuntimeClient& client, std::string key)
-              -> std::vector<std::pair<std::string, py::bytes>> {
-            py::gil_scoped_release gil_release;
+          [](DistributedRuntimeClient& client, std::string_view key)
+              -> std::vector<std::pair<std::string, nb::bytes>> {
+            nb::gil_scoped_release gil_release;
             std::vector<std::pair<std::string, std::string>> result =
                 xla::ValueOrThrow(client.KeyValueDirGet(key));
-            // Convert std::string values to py::bytes.
-            std::vector<std::pair<std::string, py::bytes>> kvs;
+            // Convert std::string values to nb::bytes.
+            std::vector<std::pair<std::string, nb::bytes>> kvs;
             kvs.reserve(result.size());
             for (const auto& kv : result) {
-              kvs.push_back(std::pair(kv.first, py::bytes(kv.second)));
+              kvs.push_back(std::pair(
+                  kv.first, nb::bytes(kv.second.data(), kv.second.size())));
             }
             return kvs;
           },
-          py::arg("key"))
+          nb::arg("key"))
       .def(
           "key_value_delete",
-          [](DistributedRuntimeClient& client, std::string key) {
-            py::gil_scoped_release gil_release;
+          [](DistributedRuntimeClient& client, std::string_view key) {
+            nb::gil_scoped_release gil_release;
             return client.KeyValueDelete(key);
           },
-          py::arg("key"));
+          nb::arg("key"));
 
-  m.def(
+  m_nb.def(
       "get_distributed_runtime_service",
       [](std::string address, int num_nodes,
          std::optional<int> heartbeat_interval,
@@ -1175,19 +793,19 @@ static void Init(py::module_& m) {
             xla::ValueOrThrow(GetDistributedRuntimeService(address, options));
         return service;
       },
-      py::arg("address"), py::arg("num_nodes"), py::kw_only(),
-      py::arg("heartbeat_interval") = std::nullopt,
-      py::arg("max_missing_heartbeats") = std::nullopt,
-      py::arg("cluster_register_timeout") = std::nullopt,
-      py::arg("shutdown_timeout") = std::nullopt);
+      nb::arg("address"), nb::arg("num_nodes"),
+      nb::arg("heartbeat_interval").none() = std::nullopt,
+      nb::arg("max_missing_heartbeats").none() = std::nullopt,
+      nb::arg("cluster_register_timeout").none() = std::nullopt,
+      nb::arg("shutdown_timeout").none() = std::nullopt);
 
-  m.def(
+  m_nb.def(
       "get_distributed_runtime_client",
       [](std::string address, int node_id, std::optional<int> rpc_timeout,
          std::optional<int> init_timeout, std::optional<int> shutdown_timeout,
          std::optional<int> heartbeat_interval,
          std::optional<int> max_missing_heartbeats,
-         std::optional<std::function<void(xla::Status,
+         std::optional<std::function<void(absl::Status,
                                           bool coordinator_reported_failure)>>
              missed_heartbeat_callback,
          std::optional<bool> shutdown_on_destruction)
@@ -1218,18 +836,18 @@ static void Init(py::module_& m) {
         }
         return GetDistributedRuntimeClient(address, options);
       },
-      py::arg("address"), py::arg("node_id"), py::kw_only(),
-      py::arg("rpc_timeout") = std::nullopt,
-      py::arg("init_timeout") = std::nullopt,
-      py::arg("shutdown_timeout") = std::nullopt,
-      py::arg("heartbeat_interval") = std::nullopt,
-      py::arg("max_missing_heartbeats") = std::nullopt,
-      py::arg("missed_heartbeat_callback") = std::nullopt,
-      py::arg("shutdown_on_destruction") = std::nullopt);
+      nb::arg("address"), nb::arg("node_id"),
+      nb::arg("rpc_timeout").none() = std::nullopt,
+      nb::arg("init_timeout").none() = std::nullopt,
+      nb::arg("shutdown_timeout").none() = std::nullopt,
+      nb::arg("heartbeat_interval").none() = std::nullopt,
+      nb::arg("max_missing_heartbeats").none() = std::nullopt,
+      nb::arg("missed_heartbeat_callback").none() = std::nullopt,
+      nb::arg("shutdown_on_destruction").none() = std::nullopt);
 
-  m.def("collect_garbage", []() { GlobalPyRefManager()->CollectGarbage(); });
+  m_nb.def("collect_garbage", []() { GlobalPyRefManager()->CollectGarbage(); });
 
-  m.def("is_optimized_build", &IsOptimizedBuild);
+  m_nb.def("is_optimized_build", &IsOptimizedBuild);
 
   m_nb.def("json_to_pprof_profile",
            xla::ValueOrThrowWrapper(JsonToPprofProfile),
@@ -1240,140 +858,83 @@ static void Init(py::module_& m) {
            "Decodes an uncompressed pprof Profile protocol buffer into a JSON "
            "representation");
 
-  RegisterCompileOnlyClient(m);
-  py::class_<PjRtTopologyDescription, std::shared_ptr<PjRtTopologyDescription>>(
-      m, "DeviceTopology")
+  RegisterCompileOnlyClient(m_nb);
+  nb::class_<PjRtTopologyDescription>(m_nb, "DeviceTopology")
       .def("_make_compile_only_devices",
            [](std::shared_ptr<PjRtTopologyDescription> topology) {
              return MakeCompileOnlyClient(topology)->Devices();
            })
-      .def_property_readonly("platform",
-                             [](PjRtTopologyDescription& topology) {
-                               return topology.platform_name();
-                             })
-      .def_property_readonly("platform_version",
-                             [](PjRtTopologyDescription& topology) {
-                               return topology.platform_version();
-                             })
+      .def_prop_ro("platform",
+                   [](PjRtTopologyDescription& topology) {
+                     return topology.platform_name();
+                   })
+      .def_prop_ro("platform_version",
+                   [](PjRtTopologyDescription& topology) {
+                     return topology.platform_version();
+                   })
       .def("serialize",
-           [](PjRtTopologyDescription& topology) -> py::bytes {
-             return py::bytes(ValueOrThrow(topology.Serialize()));
+           [](PjRtTopologyDescription& topology) -> nb::bytes {
+             std::string serialized = ValueOrThrow(topology.Serialize());
+             return nb::bytes(serialized.data(), serialized.size());
            })
-      .def(
-          "__getattr__",
-          [](PjRtTopologyDescription& topology,
-             std::string name) -> py::object {
-            const auto& attrs = topology.Attributes();
-            auto it = attrs.find(name);
-            if (it != attrs.end()) {
-              return std::visit([](auto&& v) { return py::cast(v); },
-                                it->second);
-            }
-            throw py::attribute_error(absl::StrCat("Unknown attribute ", name));
-          });
+      .def("__getattr__",
+           [](PjRtTopologyDescription& topology,
+              std::string_view name) -> nb::object {
+             const auto& attrs = topology.Attributes();
+             auto it = attrs.find(name);
+             if (it != attrs.end()) {
+               return std::visit([](auto&& v) { return nb::cast(v); },
+                                 it->second);
+             }
+             throw nb::attribute_error(
+                 absl::StrCat("Unknown attribute ", name).c_str());
+           });
 
-  py::class_<PjRtExecutable, std::shared_ptr<PjRtExecutable>>(m, "Executable")
-      .def("hlo_modules",
-           [](const PjRtExecutable& self) {
-             // TODO(phawkins): revert to a direct wrapping of
-             // PyLoadedExecutable::GetParameterLayouts when nanobind transition
-             // is complete.
-             return py::reinterpret_steal<py::object>(
-                 nb::cast(ValueOrThrow(self.GetHloModules())).release().ptr());
-           })
+  nb::class_<PjRtExecutable>(m_nb, "Executable")
+      .def("hlo_modules", ValueOrThrowWrapper(&PjRtExecutable::GetHloModules))
       .def("get_output_memory_kinds",
            xla::ValueOrThrowWrapper(&PjRtExecutable::GetOutputMemoryKinds))
-      .def("get_output_shardings",
-           [](const PjRtExecutable& self) {
-             return py::reinterpret_borrow<py::object>(
-                 nb::cast(self.GetOutputShardings()).release().ptr());
-           })
+      .def("get_output_shardings", &PjRtExecutable::GetOutputShardings)
       .def("get_parameter_layouts",
-           [](const PjRtExecutable& self) {
-             // TODO(phawkins): revert to a direct wrapping of
-             // PjRtExecutable::GetParameterLayouts when nanobind transition
-             // is complete.
-             // xla::ValueOrThrowWrapper(&PjRtExecutable::GetParameterLayouts)
-             return py::reinterpret_steal<py::object>(
-                 nb::cast(ValueOrThrow(self.GetParameterLayouts()))
-                     .release()
-                     .ptr());
-           })
+           ValueOrThrowWrapper(&PjRtExecutable::GetParameterLayouts))
       .def("get_output_layouts",
-           [](const PjRtExecutable& self) {
-             // TODO(phawkins): revert to a direct wrapping of
-             // PjRtExecutable::GetOutputLayouts when nanobind transition
-             // is complete.
-             // xla::ValueOrThrowWrapper(&PjRtExecutable::GetOutputLayouts)
-             return py::reinterpret_steal<py::object>(
-                 nb::cast(ValueOrThrow(self.GetOutputLayouts()))
-                     .release()
-                     .ptr());
-           })
-      .def("get_parameter_shardings",
-           [](const PjRtExecutable& self) {
-             return py::reinterpret_borrow<py::object>(
-                 nb::cast(self.GetParameterShardings()).release().ptr());
-           })
+           xla::ValueOrThrowWrapper(&PjRtExecutable::GetOutputLayouts))
+      .def("get_parameter_shardings", &PjRtExecutable::GetParameterShardings)
       .def("get_compiled_memory_stats",
-           [](const PjRtExecutable& self) {
-             // TODO(phawkins): revert to a direct wrapping of
-             // PjRtExecutable::GetCompiledMemoryStats when nanobind transition
-             // is complete.
-             // xla::ValueOrThrowWrapper(&PjRtExecutable::GetCompiledMemoryStats)
-             return py::reinterpret_steal<py::object>(
-                 nb::cast(ValueOrThrow(self.GetCompiledMemoryStats()))
-                     .release()
-                     .ptr());
-           })
+           xla::ValueOrThrowWrapper(&PjRtExecutable::GetCompiledMemoryStats))
       .def("compile_options",
-           // TODO(phawkins): revert to the following when nanobind transition
-           // complete
-           // xla::ValueOrThrowWrapper(&PjRtExecutable::GetCompileOptions))
-           [](const PjRtExecutable& self) {
-             return py::reinterpret_steal<py::object>(
-                 nb::cast(ValueOrThrow(self.GetCompileOptions()))
-                     .release()
-                     .ptr());
-           })
+           xla::ValueOrThrowWrapper(&PjRtExecutable::GetCompileOptions))
       .def("serialize",
-           [](const PjRtExecutable& exec) -> py::bytes {
-             return ValueOrThrow(exec.SerializeExecutable());
+           [](const PjRtExecutable& exec) -> nb::bytes {
+             std::string serialized = ValueOrThrow(exec.SerializeExecutable());
+             return nb::bytes(serialized.data(), serialized.size());
            })
       .def("cost_analysis",
            xla::ValueOrThrowWrapper(&PjRtExecutable::GetCostAnalysis));
 
-  m.def("is_asan", IsAsan);
-  m.def("is_msan", IsMsan);
-  m.def("is_tsan", IsTsan);
-  m.def("is_sanitized", IsSanitized);
+  m_nb.def("is_asan", IsAsan);
+  m_nb.def("is_msan", IsMsan);
+  m_nb.def("is_tsan", IsTsan);
+  m_nb.def("is_sanitized", IsSanitized);
 
-  m.def(
+  m_nb.def(
       "batched_device_put",
-      [](py::object aval, py::object sharding, py::object xs_py,
-         std::vector<ClientAndPtr<PjRtDevice>> dst_devices, bool committed,
+      [](nb::object aval, nb::object sharding, std::vector<nb::object> xs,
+         std::vector<const PyDevice*> dst_devices, bool committed,
          bool force_copy,
-         PjRtClient::HostBufferSemantics host_buffer_semantics) -> py::object {
-        // TODO(phawkins): simplify after nanobind transition is complete.
-        auto xs = nb::cast<std::vector<nb::object>>(nb::handle(xs_py.ptr()));
-        return py::reinterpret_steal<py::object>(
-            ValueOrThrow(PyArray::BatchedDevicePut(
-                             nb::borrow(aval.ptr()), nb::borrow(sharding.ptr()),
-                             std::move(xs), std::move(dst_devices), committed,
-                             force_copy, host_buffer_semantics,
-                             jax::GetEnableX64()))
-                .release()
-                .ptr());
+         PjRtClient::HostBufferSemantics host_buffer_semantics) -> nb::object {
+        return ValueOrThrow(PyArray::BatchedDevicePut(
+            nb::borrow(aval.ptr()), nb::borrow(sharding.ptr()), std::move(xs),
+            std::move(dst_devices), committed, force_copy,
+            host_buffer_semantics, jax::GetEnableX64()));
       },
-      py::arg("aval"), py::arg("sharding"), py::arg("xs"), py::arg("devices"),
-      py::arg("committed") = true, py::arg("force_copy") = false,
-      py::arg("host_buffer_semantics") =
+      nb::arg("aval"), nb::arg("sharding"), nb::arg("xs"), nb::arg("devices"),
+      nb::arg("committed") = true, nb::arg("force_copy") = false,
+      nb::arg("host_buffer_semantics") =
           PjRtClient::HostBufferSemantics::kZeroCopy);
 
-  m.def("batched_block_until_ready", [](py::object xs_py) {
-    // TODO(phawkins): simplify after nanobind transition is complete.
-    auto xs = nb::cast<std::vector<nb::object>>(nb::handle(xs_py.ptr()));
-    xla::ThrowIfError(PyArray::BatchedBlockUntilReady(std::move(xs)));
+  m_nb.def("batched_block_until_ready", [](std::vector<nb::object> xs) {
+    ThrowIfError(PyArray::BatchedBlockUntilReady(std::move(xs)));
   });
 
   m_nb.def("check_and_canonicalize_memory_kind",
@@ -1381,24 +942,4 @@ static void Init(py::module_& m) {
            nb::arg("device_list"));
 }  // NOLINT(readability/fn_size)
 
-// This code in essence is a copy of PYBIND11_MODULE(). We can't just call
-// PYBIND11_MODULE because we want the entry point of the module to be in
-// the py_extension() translation unit but we don't want anything else to be
-// defined there. Inside Google, py_extension() translation units are linked
-// differently and they end up with a different instance of the
-// py::module_local() state, breaking that feature of pybind11.
-static py::module_::module_def xla_module_def;
-
-PyObject* InitializeXlaExtension() {
-  PYBIND11_CHECK_PYTHON_VERSION
-  PYBIND11_ENSURE_INTERNALS_READY
-  auto m = py::module_::create_extension_module("xla_extension", nullptr,
-                                                &xla_module_def);
-  try {
-    Init(m);
-    return m.ptr();
-  }
-  PYBIND11_CATCH_INIT_EXCEPTIONS
-}
-
 }  // namespace xla
diff --git a/third_party/xla/xla/python/xla_client.py b/third_party/xla/xla/python/xla_client.py
index d8b24aba09dcb5..a6b8b6e8dc4e58 100644
--- a/third_party/xla/xla/python/xla_client.py
+++ b/third_party/xla/xla/python/xla_client.py
@@ -48,7 +48,7 @@
 
 # Just an internal arbitrary increasing number to help with backward-compatible
 # changes. In JAX, reference this via jax._src.lib.xla_extension_version.
-_version = 246
+_version = 252
 
 # Version number for MLIR:Python components.
 mlir_api_version = 55
@@ -543,6 +543,7 @@ def window_padding_type_to_pad_values(
 SingleDeviceSharding = _xla.SingleDeviceSharding
 PmapSharding = _xla.PmapSharding
 GSPMDSharding = _xla.GSPMDSharding
+PjRtLayout = _xla.PjRtLayout
 
 
 def LoadedExecutable_execute(self, arguments, device=None):
diff --git a/third_party/xla/xla/python/xla_client.pyi b/third_party/xla/xla/python/xla_client.pyi
index cedaa61db5cb96..cd8f5bd09615ee 100644
--- a/third_party/xla/xla/python/xla_client.pyi
+++ b/third_party/xla/xla/python/xla_client.pyi
@@ -42,6 +42,7 @@ from .xla_extension import OpSharding as OpSharding
 from .xla_extension import HloSharding as HloSharding
 from .xla_extension import PrimitiveType as PrimitiveType
 from .xla_extension import Traceback as Traceback
+from .xla_extension import PjRtLayout as PjRtLayout
 from .xla_extension import XlaBuilder as XlaBuilder
 from .xla_extension import XlaComputation as XlaComputation
 from .xla_extension import XlaOp as XlaOp
diff --git a/third_party/xla/xla/python/xla_client_test.py b/third_party/xla/xla/python/xla_client_test.py
index 9814c487802303..e0f131334c2f3a 100644
--- a/third_party/xla/xla/python/xla_client_test.py
+++ b/third_party/xla/xla/python/xla_client_test.py
@@ -533,6 +533,12 @@ def testScalarMinusVectorExplicitNumbering(self, dtype):
   class LayoutsTest(ComputationTest):
     """Tests related to getting and setting on-device memory layouts."""
 
+    def _minor_to_major(self, layout: xla_client.PjRtLayout):  # pylint: disable=invalid-name
+      m2m_str = re.search("{([0-9,]*)", str(layout)).group(1)
+      if not m2m_str:
+        return ()
+      return tuple(int(x) for x in m2m_str.split(","))
+
     @unittest.skipIf(pathways, "not implemented")
     def testGetArgumentLayouts(self):
       # Create computation with a few parameters.
@@ -557,9 +563,9 @@ def MakeArg(shape, dtype):
       # Test that compiled executable returns plausible layouts.
       layouts: Sequence[xla_client.Layout] = executable.get_parameter_layouts()
       self.assertLen(layouts, 3)
-      self.assertLen(layouts[0].minor_to_major(), 3)
-      self.assertLen(layouts[1].minor_to_major(), 2)
-      self.assertEmpty(layouts[2].minor_to_major())
+      self.assertLen(self._minor_to_major(layouts[0]), 3)
+      self.assertLen(self._minor_to_major(layouts[1]), 2)
+      self.assertEmpty(self._minor_to_major(layouts[2]))
 
     @unittest.skipIf(pathways, "not implemented")
     def testGetArgumentLayoutsTupled(self):
@@ -590,9 +596,9 @@ def testGetArgumentLayoutsTupled(self):
       # Test that compiled executable returns plausible layouts.
       layouts: Sequence[xla_client.Layout] = executable.get_parameter_layouts()
       self.assertLen(layouts, 3)
-      self.assertLen(layouts[0].minor_to_major(), 3)
-      self.assertEmpty(layouts[1].minor_to_major())
-      self.assertLen(layouts[2].minor_to_major(), 1)
+      self.assertLen(self._minor_to_major(layouts[0]), 3)
+      self.assertEmpty(self._minor_to_major(layouts[1]))
+      self.assertLen(self._minor_to_major(layouts[2]), 1)
 
     @unittest.skipIf(pathways, "not implemented")
     def testGetOutputLayouts(self):
@@ -616,9 +622,9 @@ def testGetOutputLayouts(self):
       # Test that compiled executable returns plausible layouts.
       layouts: Sequence[xla_client.Layout] = executable.get_output_layouts()
       self.assertLen(layouts, 3)
-      self.assertLen(layouts[0].minor_to_major(), 2)
-      self.assertEmpty(layouts[1].minor_to_major())
-      self.assertLen(layouts[2].minor_to_major(), 1)
+      self.assertLen(self._minor_to_major(layouts[0]), 2)
+      self.assertEmpty(self._minor_to_major(layouts[1]))
+      self.assertLen(self._minor_to_major(layouts[2]), 1)
 
     @unittest.skipIf(pathways, "not implemented")
     def testSetArgumentLayouts(self):
@@ -652,9 +658,9 @@ def testSetArgumentLayouts(self):
       # Check input layouts.
       input_layouts = executable.get_parameter_layouts()
       self.assertLen(input_layouts, 3)
-      self.assertEqual(input_layouts[0].minor_to_major(), (0, 1, 2))
-      self.assertEqual(input_layouts[1].minor_to_major(), ())
-      self.assertEqual(input_layouts[2].minor_to_major(), (0,))
+      self.assertEqual(self._minor_to_major(input_layouts[0]), (0, 1, 2))
+      self.assertEqual(self._minor_to_major(input_layouts[1]), ())
+      self.assertEqual(self._minor_to_major(input_layouts[2]), (0,))
 
       # Compile a version with default arg0 layout so we can make sure we
       # actually set it above.
@@ -662,8 +668,9 @@ def testSetArgumentLayouts(self):
           module_str.replace('"{0,1,2}"', '"default"')
       )
       self.assertNotEqual(
-          input_layouts[0].minor_to_major(),
-          default_executable.get_parameter_layouts()[0].minor_to_major())
+          self._minor_to_major(input_layouts[0]),
+          self._minor_to_major(default_executable.get_parameter_layouts()[0]),
+      )
 
     @unittest.skipIf(pathways or pathways_ifrt, "not implemented")
     def testSetArgumentLayoutsLegacy(self):
@@ -706,8 +713,10 @@ def MakeArg(shape, dtype, layout):
           executable.get_parameter_layouts())
       self.assertEqual(len(actual_layouts), len(expected_layouts))
       for actual, expected in zip(actual_layouts, expected_layouts):
-        self.assertEqual(actual.minor_to_major(),
-                         expected.layout().minor_to_major())
+        self.assertEqual(
+            self._minor_to_major(actual),
+            expected.layout().minor_to_major(),
+        )
 
     @unittest.skipIf(pathways, "not implemented")
     def testSetOutputLayouts(self):
@@ -741,9 +750,9 @@ def testSetOutputLayouts(self):
       # Check output layouts.
       output_layouts = executable.get_output_layouts()
       self.assertLen(output_layouts, 3)
-      self.assertEqual(output_layouts[0].minor_to_major(), (0, 1, 2))
-      self.assertEqual(output_layouts[1].minor_to_major(), ())
-      self.assertEqual(output_layouts[2].minor_to_major(), (0,))
+      self.assertEqual(self._minor_to_major(output_layouts[0]), (0, 1, 2))
+      self.assertEqual(self._minor_to_major(output_layouts[1]), ())
+      self.assertEqual(self._minor_to_major(output_layouts[2]), (0,))
 
       # Compile a version with default first output layout so we can make sure
       # we actually set it above.
@@ -751,8 +760,9 @@ def testSetOutputLayouts(self):
           module_str.replace('"{0,1,2}"', '"default"')
       )
       self.assertNotEqual(
-          output_layouts[0].minor_to_major(),
-          default_executable.get_output_layouts()[0].minor_to_major())
+          self._minor_to_major(output_layouts[0]),
+          self._minor_to_major(default_executable.get_output_layouts()[0]),
+      )
 
     @unittest.skipIf(pathways, "not implemented")
     def SetLayoutsSharded(self):
@@ -788,13 +798,13 @@ def SetLayoutsSharded(self):
       # Check input layouts.
       input_layouts = executable.get_parameter_layouts()
       self.assertLen(input_layouts, 2)
-      self.assertEqual(input_layouts[0].minor_to_major(), (0, 1))
-      self.assertEqual(input_layouts[1].minor_to_major(), ())
+      self.assertEqual(self._minor_to_major(input_layouts[0]), (0, 1))
+      self.assertEqual(self._minor_to_major(input_layouts[1]), ())
 
       # Check output layout.
       output_layouts = executable.get_output_layouts()
       self.assertLen(output_layouts, 1)
-      self.assertEqual(input_layouts[0].minor_to_major(), (0, 1))
+      self.assertEqual(self._minor_to_major(input_layouts[0]), (0, 1))
 
       # Compile a version with default layouts so we can make sure we actually
       # set it above.
@@ -802,11 +812,13 @@ def SetLayoutsSharded(self):
           module_str.replace('"{0,1}"', '"default"')
       )
       self.assertNotEqual(
-          input_layouts[0].minor_to_major(),
-          default_executable.get_parameter_layouts()[0].minor_to_major())
+          self._minor_to_major(input_layouts[0]),
+          self._minor_to_major(default_executable.get_parameter_layouts()[0]),
+      )
       self.assertNotEqual(
-          output_layouts[0].minor_to_major(),
-          default_executable.get_output_layouts()[0].minor_to_major())
+          self._minor_to_major(output_layouts[0]),
+          self._minor_to_major(default_executable.get_output_layouts()[0]),
+      )
 
     @unittest.skipIf(pathways, "not implemented")
     def testAutoArgumentLayouts(self):
@@ -838,8 +850,8 @@ def testAutoArgumentLayouts(self):
 
       # Check input layouts.
       input_layouts = executable.get_parameter_layouts()
-      self.assertEqual(input_layouts[0].minor_to_major(), (1, 0))
-      self.assertEqual(input_layouts[1].minor_to_major(), (2, 0, 1))
+      self.assertEqual(self._minor_to_major(input_layouts[0]), (1, 0))
+      self.assertEqual(self._minor_to_major(input_layouts[1]), (2, 0, 1))
 
       # Compile a version with default layouts so we can make sure the compiler
       # is actually choosing above.
@@ -849,8 +861,8 @@ def testAutoArgumentLayouts(self):
       # We expect the compiler to choose a non-default layout for the second
       # (1024,8,128) argument.
       self.assertNotEqual(
-          input_layouts[1].minor_to_major(),
-          default_executable.get_parameter_layouts()[1].minor_to_major(),
+          self._minor_to_major(input_layouts[1]),
+          self._minor_to_major(default_executable.get_parameter_layouts()[1]),
       )
 
     @unittest.skipIf(pathways, "not implemented")
@@ -881,7 +893,7 @@ def testAutoOutputLayouts(self):
 
       # Check output layout
       output_layout, = executable.get_output_layouts()
-      self.assertEqual(output_layout.minor_to_major(), (2, 0, 1))
+      self.assertEqual(self._minor_to_major(output_layout), (2, 0, 1))
 
       # Compile a version with default layouts so we can make sure the compiler
       # is actually choosing above.
@@ -890,8 +902,8 @@ def testAutoOutputLayouts(self):
       )
       # We expect the compiler to choose a non-default output layout.
       self.assertNotEqual(
-          output_layout.minor_to_major(),
-          default_executable.get_output_layouts()[0].minor_to_major(),
+          self._minor_to_major(output_layout),
+          self._minor_to_major(default_executable.get_output_layouts()[0]),
       )
 
   tests.append(LayoutsTest)
diff --git a/third_party/xla/xla/python/xla_compiler.cc b/third_party/xla/xla/python/xla_compiler.cc
index 69bb526a05683d..b5ef8148f142bc 100644
--- a/third_party/xla/xla/python/xla_compiler.cc
+++ b/third_party/xla/xla/python/xla_compiler.cc
@@ -40,7 +40,6 @@ limitations under the License.
 #include "third_party/nanobind/include/nanobind/stl/string_view.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/variant.h"  // IWYU pragma: keep
 #include "third_party/nanobind/include/nanobind/stl/vector.h"  // IWYU pragma: keep
-#include "pybind11/pybind11.h"  // from @pybind11
 #include "xla/array.h"
 #include "xla/client/executable_build_options.h"
 #include "xla/client/xla_builder.h"
@@ -715,9 +714,8 @@ void BuildXlaCompilerSubmodule(nb::module_& m) {
         });
   m.def(
       "hlo_module_cost_analysis",
-      xla::ValueOrThrowWrapper([](nb::handle client_py, const HloModule& module)
+      xla::ValueOrThrowWrapper([](PyClient* client, const HloModule& module)
                                    -> StatusOr<nb::dict> {
-        PyClient* client = pybind11::cast<PyClient*>(client_py.ptr());
         TF_ASSIGN_OR_RETURN(auto analysis,
                             client->pjrt_client()->GetHloCostAnalysis());
         TF_RETURN_IF_ERROR(module.entry_computation()->Accept(analysis.get()));
@@ -943,10 +941,10 @@ void BuildXlaCompilerSubmodule(nb::module_& m) {
           targets[nb::str(name.data(), name.size())] = nb::capsule(target);
         }
 
-        for (const auto& [name, target] :
+        for (const auto& [name, registration] :
              ffi::StaticRegisteredHandlers(platform)) {
           targets[nb::str(name.data(), name.size())] =
-              nb::capsule(reinterpret_cast<void*>(target));
+              nb::capsule(reinterpret_cast<void*>(registration.handler));
         }
         return targets;
       },
diff --git a/third_party/xla/xla/python/xla_extension/__init__.pyi b/third_party/xla/xla/python/xla_extension/__init__.pyi
index 294a62a9136cfb..b8a02ae8f1f41e 100644
--- a/third_party/xla/xla/python/xla_extension/__init__.pyi
+++ b/third_party/xla/xla/python/xla_extension/__init__.pyi
@@ -37,6 +37,7 @@ from typing import (
 
 import numpy as np
 
+from . import ifrt_proxy
 from . import jax_jit
 from . import mlir
 from . import ops
@@ -266,6 +267,7 @@ def register_custom_call_partitioner(
     partition: Callable,
     infer_sharding_from_operands: Callable,
     can_side_effecting_have_replicated_sharding: bool,
+    c_api: Optional[Any],
 ) -> None: ...
 def encode_inspect_sharding_callback(handler: Any) -> bytes: ...
 
diff --git a/third_party/xla/xla/python/xla_extension/ifrt_proxy.pyi b/third_party/xla/xla/python/xla_extension/ifrt_proxy.pyi
new file mode 100644
index 00000000000000..f65685025e5166
--- /dev/null
+++ b/third_party/xla/xla/python/xla_extension/ifrt_proxy.pyi
@@ -0,0 +1,32 @@
+# Copyright 2024 The OpenXLA Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Any, Optional, Callable
+
+from xla.python import xla_extension
+
+_Status = Any
+Client = xla_extension.Client
+
+
+class ClientConnectionOptions:
+  on_disconnect: Optional[Callable[[_Status], None]] = None
+  on_connection_update: Optional[Callable[[str], None]] = None
+
+
+def get_client(
+    proxy_server_address: str,
+    options: ClientConnectionOptions
+) -> Client: ...
diff --git a/third_party/xla/xla/python/xla_extension/ops.pyi b/third_party/xla/xla/python/xla_extension/ops.pyi
index 7fcac04eddbf5f..55624f47446a60 100644
--- a/third_party/xla/xla/python/xla_extension/ops.pyi
+++ b/third_party/xla/xla/python/xla_extension/ops.pyi
@@ -59,6 +59,7 @@ class CustomCallApiVersion(enum.IntEnum):
   API_VERSION_ORIGINAL: int
   API_VERSION_STATUS_RETURNING: int
   API_VERSION_STATUS_RETURNING_UNIFIED: int
+  API_VERSION_TYPED_FFI: int
 
 def AfterAll(builder: XlaBuilder, tokens: Sequence[XlaOp]) -> XlaOp: ...
 def AllGather(
diff --git a/third_party/xla/xla/python/xla_extension/profiler.pyi b/third_party/xla/xla/python/xla_extension/profiler.pyi
index e606f3b4416f6d..92dbb02639b7f3 100644
--- a/third_party/xla/xla/python/xla_extension/profiler.pyi
+++ b/third_party/xla/xla/python/xla_extension/profiler.pyi
@@ -14,7 +14,7 @@
 # ==============================================================================
 
 from types import TracebackType
-from typing import Any, Optional, Type
+from typing import Any, Optional, Type, Union
 
 _Status = Any
 
@@ -24,7 +24,9 @@ def start_server(port: int) -> ProfilerServer: ...
 def register_plugin_profiler(c_api: Any) -> None: ...
 
 def get_profiled_instructions_proto(tensorboard_dir: str) -> bytes: ...
-def get_fdo_profile(xspace: bytes) -> bytes: ...
+def get_fdo_profile(
+    xspace: bytes, as_textproto: bool = ...
+) -> Union[bytes, str]: ...
 
 class ProfilerSession:
   def __init__(self, options: Optional[ProfileOptions] = ...) -> None: ...
diff --git a/third_party/xla/xla/python/xla_extension/pytree.pyi b/third_party/xla/xla/python/xla_extension/pytree.pyi
index 24421a857ca8dd..e493fda1dfde55 100644
--- a/third_party/xla/xla/python/xla_extension/pytree.pyi
+++ b/third_party/xla/xla/python/xla_extension/pytree.pyi
@@ -29,6 +29,9 @@ class PyTreeRegistry:
       tree: Any,
       leaf_predicate: Optional[Callable[[Any], bool]] = ...,
   ) -> Tuple[List[Any], PyTreeDef]: ...
+  def flatten_one_level(
+      self, tree: Any
+  ) -> Optional[Tuple[Iterable[Any], Any]]: ...
   def register_node(
       self,
       __type: Type[_T],
@@ -74,6 +77,5 @@ class PyTreeDef:
   ) -> PyTreeDef:
     ...
 
-
 _Children = TypeVar("_Children", bound=Iterable[Any])
 _AuxData = TypeVar("_AuxData", bound=Hashable)
diff --git a/third_party/xla/xla/pytype.default.bzl b/third_party/xla/xla/pytype.default.bzl
index 05143e8a715181..b63011cc1b8e48 100644
--- a/third_party/xla/xla/pytype.default.bzl
+++ b/third_party/xla/xla/pytype.default.bzl
@@ -10,5 +10,6 @@ def pytype_strict_binary(name, **kwargs):
     native.py_binary(name = name, **kwargs)
 
 # Placeholder to use until bazel supports pytype_strict_library.
-def pytype_strict_library(name, **kwargs):
+def pytype_strict_library(name, pytype_deps = [], pytype_srcs = [], **kwargs):
+    _ = (pytype_deps, pytype_srcs)  # @unused
     native.py_library(name = name, **kwargs)
diff --git a/third_party/xla/xla/runtime/BUILD b/third_party/xla/xla/runtime/BUILD
index 647fddb8c87b4f..45f2217fcf09b1 100644
--- a/third_party/xla/xla/runtime/BUILD
+++ b/third_party/xla/xla/runtime/BUILD
@@ -1,4 +1,3 @@
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load("@local_tsl//tsl:tsl.default.bzl", "filegroup", "get_compatible_with_portable")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_platform_deps")
@@ -11,6 +10,7 @@ load(
     "if_llvm_x86_available",
 )
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 1ce30da506c7f3..e9c73e6fd54c35 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -1,26 +1,14 @@
 # Description:
 #   XLA service implementation.
 
-load("//xla:strict.default.bzl", "py_strict_library", "py_strict_test")
-load("//xla/tests:build_defs.bzl", "xla_test")
 load("@bazel_skylib//rules:build_test.bzl", "build_test")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
-load(
-    "//xla:xla.bzl",
-    "xla_cc_binary",
-    "xla_cc_test",
-    "xla_py_proto_library",
-    "xla_py_test_deps",
-    "xla_symbol_repository_deps",
-)
-load("//xla/service:xla_compile.bzl", "xla_aot_compile_cpu", "xla_aot_compile_gpu", "xla_aot_compile_gpu_runtime_autotuning")
-load("//xla/stream_executor:build_defs.bzl", "if_gpu_is_configured")
 load(
     "@local_config_rocm//rocm:build_defs.bzl",
     "if_rocm",
     "if_rocm_is_configured",
 )
-load("@local_tsl//tsl:tsl.bzl", "if_google", "if_libtpu", "internal_visibility")
+load("@local_tsl//tsl:tsl.bzl", "if_google", "if_libtpu", "internal_visibility", "tsl_copts")
 load("@local_tsl//tsl:tsl.default.bzl", "filegroup", "get_compatible_with_portable", "internal_hlo_deps")
 load(
     "@local_tsl//tsl/platform:build_config.bzl",
@@ -32,6 +20,18 @@ load(
     "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
     "if_cuda_is_configured",
 )
+load("//xla:strict.default.bzl", "py_strict_library", "py_strict_test")
+load(
+    "//xla:xla.bzl",
+    "xla_cc_binary",
+    "xla_cc_test",
+    "xla_py_proto_library",
+    "xla_py_test_deps",
+    "xla_symbol_repository_deps",
+)
+load("//xla/service:xla_compile.bzl", "xla_aot_compile_cpu", "xla_aot_compile_gpu", "xla_aot_compile_gpu_runtime_autotuning")
+load("//xla/stream_executor:build_defs.bzl", "if_gpu_is_configured")
+load("//xla/tests:build_defs.bzl", "xla_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -79,6 +79,7 @@ tf_proto_library(
     name = "metrics_proto",
     srcs = ["metrics.proto"],
     cc_api_version = 2,
+    visibility = ["//visibility:public"],
 )
 
 xla_py_proto_library(
@@ -381,6 +382,7 @@ cc_library(
     deps = [
         ":collective_ops_utils",
         ":hlo_pass",
+        "//xla:shape_util",
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/service/gpu:backend_configs_cc",
@@ -594,7 +596,6 @@ xla_cc_test(
         ":hlo_parser",
         ":shape_inference",
         "//xla:shape_util",
-        "//xla:status",
         "//xla:statusor",
         "//xla:test",
         "//xla:test_helpers",
@@ -607,7 +608,7 @@ xla_cc_test(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest_main",
-        "@local_tsl//tsl/platform:status",
+        "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:statusor",
     ],
 )
@@ -2089,12 +2090,15 @@ cc_library(
     srcs = ["op_expander_pass.cc"],
     hdrs = ["op_expander_pass.h"],
     deps = [
-        ":hlo_creation_utils",
         ":hlo_pass",
-        "//xla:statusor",
         "//xla:util",
         "//xla/hlo/ir:hlo",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
+        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
@@ -3183,11 +3187,11 @@ cc_library(
         "//xla/hlo/utils:hlo_query",
         "@com_google_absl//absl/algorithm",
         "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:statusor",
@@ -3266,14 +3270,21 @@ cc_library(
         "//xla:comparison_util",
         "//xla:literal_util",
         "//xla:shape_util",
+        "//xla:status_macros",
         "//xla:statusor",
         "//xla:union_find",
+        "//xla:util",
+        "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_query",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:statusor",
     ],
@@ -3290,10 +3301,12 @@ xla_cc_test(
         "//xla:literal_util",
         "//xla:shape_util",
         "//xla:test",
+        "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_matchers",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest",
         "@local_tsl//tsl/lib/core:status_test_util",
@@ -3310,6 +3323,9 @@ cc_library(
         "//xla:statusor",
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings:string_view",
+        "@local_tsl//tsl/platform:errors",
     ],
 )
 
@@ -3317,14 +3333,12 @@ xla_cc_test(
     name = "while_loop_trip_count_annotator_test",
     srcs = ["while_loop_trip_count_annotator_test.cc"],
     deps = [
-        ":pattern_matcher",
-        ":while_loop_simplifier",
         ":while_loop_trip_count_annotator",
-        "//xla:status_macros",
         "//xla:test",
+        "//xla:xla_data_proto_cc",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",  # fixdeps: keep
-        "@local_tsl//tsl/lib/core:status_test_util",
+        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
@@ -4571,7 +4585,6 @@ cc_library(
     deps = [
         ":call_graph",
         ":computation_layout",
-        ":hlo_alias_analysis",
         ":hlo_dce",
         ":hlo_graph_dumper",
         ":hlo_pass",
@@ -4581,6 +4594,7 @@ cc_library(
         "//xla:permutation_util",
         "//xla:shape_layout",
         "//xla:shape_util",
+        "//xla:status",
         "//xla:status_macros",
         "//xla:statusor",
         "//xla:types",
@@ -4598,8 +4612,8 @@ cc_library(
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:logging",
-        "@local_tsl//tsl/platform:protobuf",
         "@local_tsl//tsl/platform:status",
+        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
@@ -4773,6 +4787,7 @@ cc_library(
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:errors",
     ],
@@ -5039,6 +5054,7 @@ cc_library(
         "//xla:shape_util",
         "//xla/hlo/ir:hlo",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/status:statusor",
         "@local_tsl//tsl/platform:errors",
     ],
 )
@@ -5060,6 +5076,7 @@ xla_cc_test(
         "//xla/tests:hlo_test_base",
         "//xla/tests:literal_test_util",
         "//xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -5500,12 +5517,12 @@ cc_library(
         ":hlo_pass",
         "//xla:literal",
         "//xla:shape_util",
-        "//xla:status_macros",
+        "//xla:statusor",
         "//xla:util",
         "//xla/hlo/ir:hlo",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings:string_view",
         "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:logging",
-        "@local_tsl//tsl/platform:status",
     ],
 )
 
@@ -5513,18 +5530,16 @@ xla_cc_test(
     name = "zero_sized_hlo_elimination_test",
     srcs = ["zero_sized_hlo_elimination_test.cc"],
     deps = [
-        ":shape_inference",
         ":zero_sized_hlo_elimination",
-        "//xla:literal",
+        "//xla:literal_util",
         "//xla:shape_util",
-        "//xla:status_macros",
         "//xla:test",
-        "//xla:test_helpers",
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",
         "@local_tsl//tsl/platform:logging",
+        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
@@ -5866,17 +5881,13 @@ cc_library(
     srcs = ["host_offload_legalize.cc"],
     hdrs = ["host_offload_legalize.h"],
     deps = [
+        ":call_graph",
         ":hlo_alias_analysis",
-        ":hlo_buffer",
         ":hlo_pass",
         ":hlo_value",
         ":host_memory_offload_annotations_hdr",
-        ":host_offloader",
-        ":pattern_matcher",
-        "//xla:literal_util",
         "//xla:shape_util",
         "//xla:status",
-        "//xla:statusor",
         "//xla:util",
         "//xla/hlo/ir:hlo",
         "@com_google_absl//absl/algorithm:container",
@@ -5884,7 +5895,9 @@ cc_library(
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:statusor",
     ],
@@ -5927,14 +5940,13 @@ cc_library(
         "//xla:literal_util",
         "//xla:shape_util",
         "//xla:status",
-        "//xla:statusor",
         "//xla:util",
         "//xla/hlo/ir:hlo",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_set",
-        "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -5960,7 +5972,9 @@ xla_cc_test(
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_googletest//:gtest",
         "@local_tsl//tsl/lib/core:status_test_util",
         "@local_tsl//tsl/platform:statusor",
@@ -5998,13 +6012,16 @@ xla_cc_test(
     srcs = ["while_util_test.cc"],
     deps = [
         ":while_util",
+        "//xla:statusor",
         "//xla:test",
         "//xla:util",
+        "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_matchers",
         "//xla/tests:hlo_test_base",
         "//xla/tests:verified_hlo_module",
         "//xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/algorithm:container",
+        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
@@ -6100,7 +6117,6 @@ cc_library(
     deps = [
         ":hlo_dce",
         ":hlo_pass",
-        ":tuple_util",
         ":while_loop_analysis",
         ":while_util",
         "//xla:shape_util",
@@ -6111,7 +6127,11 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
@@ -6121,11 +6141,17 @@ xla_cc_test(
     deps = [
         ":hlo_parser",
         ":while_loop_invariant_code_motion",
+        "//xla:literal_util",
+        "//xla:shape_util",
         "//xla:test",
+        "//xla:xla_data_proto_cc",
+        "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_matchers",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/log",
         "@local_tsl//tsl/lib/core:status_test_util",
+        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
@@ -6201,6 +6227,7 @@ cc_library(
     deps = [
         ":hlo_pass",
         ":while_util",
+        "//xla:shape_util",
         "//xla:statusor",
         "//xla:util",
         "//xla/hlo/ir:hlo",
@@ -6249,7 +6276,7 @@ xla_cc_test(
         "//xla/hlo/utils:hlo_matchers",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",
-        "@local_tsl//tsl/lib/core:status_test_util",
+        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
@@ -6990,7 +7017,10 @@ cc_library(
         ":hlo_proto_cc",
         ":hlo_proto_util",
         "//xla/hlo/ir:hlo",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/synchronization",
         "@local_tsl//tsl/platform:status",
     ],
 )
@@ -6999,11 +7029,15 @@ xla_cc_test(
     name = "xla_debug_info_manager_test",
     srcs = ["xla_debug_info_manager_test.cc"],
     deps = [
+        ":hlo_module_config",
         ":hlo_proto_cc",
         ":xla_debug_info_manager",
+        "//xla/hlo/ir:hlo",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_googletest//:gtest_main",
     ],
 )
 
@@ -7272,9 +7306,11 @@ cc_library(
     name = "change_op_data_type",
     srcs = ["change_op_data_type.cc"],
     hdrs = ["change_op_data_type.h"],
+    copts = tsl_copts(),
     deps = [
         ":hlo_creation_utils",
         ":hlo_pass",
+        "//xla/service/cpu:onednn_matmul_rewriter",
     ],
 )
 
@@ -7421,12 +7457,12 @@ xla_cc_binary(
         ":cpu_plugin",
         "//xla:status",
         "//xla/tools:xla_compile_lib",
+        "//xla/tsl/util:command_line_flags",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:platform_port",
         "@local_tsl//tsl/platform:types",
-        "@local_tsl//tsl/util:command_line_flags",
     ] + if_cuda_is_configured([
         "//xla/service/gpu:executable_proto_cc",
         "//xla/service/gpu:gpu_compiler",
@@ -7504,11 +7540,15 @@ xla_cc_test(
     deps = [
         ":cpu_plugin",
         ":platform_util",
+        ":shaped_buffer",
         "//xla:executable_run_options",
+        "//xla:literal",
         "//xla:literal_util",
         "//xla/client:client_library",
+        "//xla/client:executable_build_options",
         "//xla/client:local_client",
         "//xla/service/cpu:cpu_compiler",
+        "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest_main",
         "@local_tsl//tsl/lib/core:status_test_util",
         "@local_tsl//tsl/platform:env",
@@ -7525,9 +7565,13 @@ xla_cc_test(
     deps = [
         ":cpu_plugin",
         ":platform_util",
+        ":shaped_buffer",
+        "//xla:error_spec",
         "//xla:executable_run_options",
+        "//xla:literal",
         "//xla:literal_util",
         "//xla/client:client_library",
+        "//xla/client:executable_build_options",
         "//xla/client:local_client",
         "//xla/service/cpu:cpu_compiler",
         "//xla/tests:literal_test_util",
@@ -7608,10 +7652,10 @@ cc_library(
         "//xla:statusor",
         "//xla:util",
         "//xla:xla_proto_cc",
+        "//xla/tsl/util:command_line_flags",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:protobuf",
         "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/util:command_line_flags",
     ],
 )
 
diff --git a/third_party/xla/xla/service/algebraic_simplifier.cc b/third_party/xla/xla/service/algebraic_simplifier.cc
index 49a021487ce46b..c38802d03b5b4e 100644
--- a/third_party/xla/xla/service/algebraic_simplifier.cc
+++ b/third_party/xla/xla/service/algebraic_simplifier.cc
@@ -8082,7 +8082,8 @@ Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) {
   // Replace reshape of a transpose of a reshape with concatenated slicing if
   // the reshape/transpose combination can be interpreted as a space-to-depth
   // transformation.
-  if (operand->opcode() == HloOpcode::kReshape &&
+  if (!options_.is_layout_sensitive() &&
+      operand->opcode() == HloOpcode::kReshape &&
       transpose->user_count() == 1 &&
       HloOpcode::kReshape == transpose->users()[0]->opcode()) {
     VLOG(2) << "trying depth-to-space transform";
diff --git a/third_party/xla/xla/service/algebraic_simplifier_test.cc b/third_party/xla/xla/service/algebraic_simplifier_test.cc
index 35d62966805a5b..e08a4b7fa7847e 100644
--- a/third_party/xla/xla/service/algebraic_simplifier_test.cc
+++ b/third_party/xla/xla/service/algebraic_simplifier_test.cc
@@ -3100,7 +3100,6 @@ TEST_F(AlgebraicSimplifierTest, DoNotRemoveUnaryConcatenateWithCtrlDep) {
 
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Concatenate(m::Parameter(0))));
-  LOG(ERROR) << "module: " << m->ToString();
 
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_FALSE(simplifier.Run(m.get()).value());
@@ -5663,7 +5662,7 @@ TEST_F(AlgebraicSimplifierTest, TransposeReshapeToConcatSlice) {
 HloModule TransposeReshapeDepthToSpace
 
 ENTRY entry {
-  %param = f32[8,14,14,128]{0,1,2,3} parameter(0)
+  %param = f32[8,14,14,128] parameter(0)
   %reshape.1 = f32[8,14,14,2,64] reshape(%param)
   %transpose = transpose(%reshape.1), dimensions={0,1,3,2,4}
   ROOT %reshape.2 = f32[8,28,14,64] reshape(%transpose)
@@ -5690,7 +5689,7 @@ TEST_F(AlgebraicSimplifierTest, TransposeReshapeTooLarge) {
 HloModule TransposeReshapeDepthToSpaceBig
 
 ENTRY entry {
-  %param = f32[8,14,14,128]{0,1,2,3} parameter(0)
+  %param = f32[8,14,14,128] parameter(0)
   %reshape.1 = f32[8,14,14,8,16] reshape(%param)
   %transpose = transpose(%reshape.1), dimensions={0,1,3,2,4}
   ROOT %reshape.2 = f32[8,112,14,16] reshape(%transpose)
@@ -5710,7 +5709,7 @@ TEST_F(AlgebraicSimplifierTest, TransposeReshapeNotDepthToSpace) {
 HloModule TransposeReshapeDepthToSpace
 
 ENTRY entry {
-  %param = f32[8,14,14,128]{0,1,2,3} parameter(0)
+  %param = f32[8,14,14,128] parameter(0)
   %reshape.1 = f32[8,14,14,2,64] reshape(%param)
   %transpose = transpose(%reshape.1), dimensions={0,3,1,2,4}
   ROOT %reshape.2 = f32[8,28,14,64] reshape(%transpose)
@@ -6434,6 +6433,11 @@ TEST_F(AlgebraicSimplifierTest, TransposeOfNonCanonicalBatchDotCantSimplify) {
 }
 
 TEST_F(AlgebraicSimplifierTest, DynamicSliceOfTranspose) {
+  // This test is without layouts so we have to set the verifier to be layout
+  // insensitive.
+  verifier_layout_sensitive_ = false;
+  instruction_can_change_layout_func_ = {};
+
   const char* hlo_string = R"(
     HloModule module
 
@@ -7959,6 +7963,7 @@ TEST_F(AlgebraicSimplifierTest, DividedByConstantInstructionWithoutLayout) {
   // This test is without layouts so we have to set the verifier to be layout
   // insensitive.
   verifier_layout_sensitive_ = false;
+  instruction_can_change_layout_func_ = {};
 
   Shape shape = ShapeUtil::MakeShape(F32, {});
   shape.clear_layout();
diff --git a/third_party/xla/xla/service/backend.cc b/third_party/xla/xla/service/backend.cc
index 459b0d37c0d521..aac138f395caf9 100644
--- a/third_party/xla/xla/service/backend.cc
+++ b/third_party/xla/xla/service/backend.cc
@@ -114,9 +114,9 @@ absl::StatusOr<StreamPool::Ptr> Backend::BorrowStream(
     se::StreamExecutor* executor, se::StreamPriority priority) {
   absl::MutexLock l(&mu_);
   if (!stream_pools_.contains(executor)) {
-    stream_pools_.emplace(executor, std::make_unique<StreamPool>());
+    stream_pools_.emplace(executor, std::make_unique<StreamPool>(executor));
   }
-  return stream_pools_.at(executor)->BorrowStream(executor, priority);
+  return stream_pools_.at(executor)->BorrowStream(priority);
 }
 
 absl::StatusOr<std::vector<StreamPool::Ptr>> Backend::BorrowStreams(
@@ -124,13 +124,12 @@ absl::StatusOr<std::vector<StreamPool::Ptr>> Backend::BorrowStreams(
   absl::MutexLock l(&mu_);
   TF_ASSIGN_OR_RETURN(auto executor, stream_executor(device_ordinal));
   if (!stream_pools_.contains(executor)) {
-    stream_pools_.emplace(executor, std::make_unique<StreamPool>());
+    stream_pools_.emplace(executor, std::make_unique<StreamPool>(executor));
   }
 
   std::vector<StreamPool::Ptr> ptrs;
   for (int i = 0; i < num_streams; i++) {
-    StreamPool::Ptr ptr =
-        stream_pools_.at(executor)->BorrowStream(executor, priority);
+    StreamPool::Ptr ptr = stream_pools_.at(executor)->BorrowStream(priority);
     ptrs.push_back(std::move(ptr));
   }
   return ptrs;
diff --git a/third_party/xla/xla/service/buffer_assignment.h b/third_party/xla/xla/service/buffer_assignment.h
index 367151df0528e7..e908b617641f05 100644
--- a/third_party/xla/xla/service/buffer_assignment.h
+++ b/third_party/xla/xla/service/buffer_assignment.h
@@ -72,7 +72,6 @@ class BufferAllocation {
 
   BufferAllocation(Index index, int64_t size, LogicalBuffer::Color color)
       : index_(index), size_(size), color_(color) {}
-  ~BufferAllocation() {}
 
   // Returns the index of this allocation.
   Index index() const { return index_; }
diff --git a/third_party/xla/xla/service/change_op_data_type.cc b/third_party/xla/xla/service/change_op_data_type.cc
index 6765ebc7f3c62c..7f06bc76acc4da 100644
--- a/third_party/xla/xla/service/change_op_data_type.cc
+++ b/third_party/xla/xla/service/change_op_data_type.cc
@@ -18,6 +18,9 @@ limitations under the License.
 #include <optional>
 
 #include "xla/service/hlo_creation_utils.h"
+#if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)
+#include "xla/service/cpu/onednn_matmul_rewriter.h"
+#endif  // INTEL_MKL && ENABLE_ONEDNN_V3
 
 namespace xla {
 namespace {
@@ -59,6 +62,12 @@ absl::StatusOr<bool> ChangeOpDataType::Run(
       if (it == to_type_map_.end()) {
         continue;
       }
+#if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)
+      if (instr->opcode() == HloOpcode::kDot &&
+          cpu::OneDnnMatMulRewriter::ShouldRewrite(instr)) {
+        continue;
+      }
+#endif  // INTEL_MKL && ENABLE_ONEDNN_V3
       const PrimitiveType to_type = it->second;
       absl::InlinedVector<HloInstruction*, 8> new_operands;
       for (HloInstruction* operand : instr->mutable_operands()) {
diff --git a/third_party/xla/xla/service/collective_permute_decomposer.cc b/third_party/xla/xla/service/collective_permute_decomposer.cc
index f02ae73757e94d..784d3c7318ec25 100644
--- a/third_party/xla/xla/service/collective_permute_decomposer.cc
+++ b/third_party/xla/xla/service/collective_permute_decomposer.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "xla/service/collective_ops_utils.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/graphcycles/graphcycles.h"
+#include "xla/shape_util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/errors.h"
 
@@ -71,10 +72,10 @@ bool HasCycles(const SourceTargetPairs& pairs) {
   return false;
 }
 
-// Returns true if the CollectivePermuteStart instruction should be transformed
-// to Send/Recv. We currently limit the transformation to asynchronous
-// CollectivePermuteStart without any cycle in the (source, target)
-// relationship, with only one input and without any context data.
+// Returns true if the CollectivePermute instruction should be transformed
+// to Send/Recv. We currently limit the transformation to CollectivePermute
+// operations without any cycle in their (source, target) relationship,
+// with only one input and without any context data.
 bool ShouldDecompose(const HloCollectivePermuteInstruction& collective_permute,
                      int64_t threshold_in_bytes) {
   // TODO(b/316043789): enable the transformation for the no channel_id case.
@@ -82,25 +83,14 @@ bool ShouldDecompose(const HloCollectivePermuteInstruction& collective_permute,
     return false;
   }
 
-  auto backend_config =
-      collective_permute.backend_config<xla::gpu::GpuBackendConfig>()
-          ->collective_backend_config();
-  if (backend_config.is_sync()) {
-    return false;
-  }
-  if (collective_permute.operand_count() != 1) {
-    return false;
-  }
-
   const Shape& result_shape = collective_permute.shape();
-  // Skip the transformation if there is any context data.
-  if (result_shape.tuple_shapes_size() != 2) {
+  // Skip the transformation if result is not an array, such as containing
+  // context data.
+  if (!result_shape.IsArray()) {
     return false;
   }
 
-  const Shape& shape = result_shape.tuple_shapes(0);
-  CHECK(shape.IsArray());
-  if (ShapeUtil::ByteSizeOf(shape) < threshold_in_bytes) {
+  if (ShapeUtil::ByteSizeOf(result_shape) < threshold_in_bytes) {
     return false;
   }
   return !HasCycles(collective_permute.source_target_pairs());
@@ -116,14 +106,13 @@ bool MayPipeline(const HloCollectivePermuteInstruction& collective_permute) {
 }
 
 // Decomposes a collective-permute and adds frontend attributes to record
-// pipeline decision. The absence of the frontend attribute means the
-// collective-permute will not be pipelined.
+// pipeline decision. The present of the frontend attribute means that the
+// collective-permute will be pipelined and the value of the attribute
+// represents the runtime stream to execute the instruction. Without the
+// frontend attribute, the collective-permute will not be pipelined.
 Status DecomposeCollectivePermute(
     HloCollectivePermuteInstruction* collective_permute,
     HloComputation* computation, const std::string& pipeline_decision) {
-  // The HLO verifier ensures that CollectivePermuteStart's single user is
-  // CollectivePermuteDone.
-  HloInstruction* collective_permute_done = collective_permute->users().front();
   // We currently only decompose collective-permute with a channel_id.
   int64_t channel_id = collective_permute->channel_id().value();
   HloInstruction* data = collective_permute->mutable_operand(0);
@@ -164,13 +153,12 @@ Status DecomposeCollectivePermute(
 
   HloInstruction* recv_done =
       computation->AddInstruction(HloInstruction::CreateRecvDone(recv));
-  computation->AddInstruction(HloInstruction::CreateSendDone(send));
+  HloInstruction* send_done =
+      computation->AddInstruction(HloInstruction::CreateSendDone(send));
 
   HloInstruction* recv_data = computation->AddInstruction(
       HloInstruction::CreateGetTupleElement(recv_done, 0));
-  TF_RETURN_IF_ERROR(collective_permute_done->ReplaceAllUsesWith(recv_data));
-  TF_RETURN_IF_ERROR(
-      computation->RemoveInstructionAndUnusedOperands(collective_permute_done));
+  TF_RETURN_IF_ERROR(collective_permute->ReplaceAllUsesWith(recv_data));
   TF_RETURN_IF_ERROR(
       computation->RemoveInstructionAndUnusedOperands(collective_permute));
 
@@ -178,7 +166,9 @@ Status DecomposeCollectivePermute(
     xla::FrontendAttributes attributes;
     (*attributes.mutable_map())[kSendRecvPipelineAttr] = pipeline_decision;
     send->add_frontend_attributes(attributes);
+    send_done->add_frontend_attributes(attributes);
     recv->add_frontend_attributes(attributes);
+    recv_done->add_frontend_attributes(attributes);
   }
 
   return OkStatus();
@@ -287,7 +277,7 @@ absl::StatusOr<bool> CollectivePermuteDecomposer::Run(
         while_bodies.insert(hlo->while_body());
         continue;
       }
-      if (hlo->opcode() != HloOpcode::kCollectivePermuteStart) {
+      if (hlo->opcode() != HloOpcode::kCollectivePermute) {
         continue;
       }
 
diff --git a/third_party/xla/xla/service/collective_permute_decomposer.h b/third_party/xla/xla/service/collective_permute_decomposer.h
index 26c3018aba5945..f0d4c0c0df9ae5 100644
--- a/third_party/xla/xla/service/collective_permute_decomposer.h
+++ b/third_party/xla/xla/service/collective_permute_decomposer.h
@@ -21,16 +21,15 @@ limitations under the License.
 
 namespace xla {
 
-// CollectivePermuteDecomposer is a pass that (1) converts asynchronous
-// CollectivePermute operations without any cycle in the (source, target)
-// relationship to Send/Recv, and (2) annotates the Send/Recv for pipelining
-// with a frontend attribute. We currently restrict the decomposition
-// to CollectivePermuteStart with one input and without any context data.
+// CollectivePermuteDecomposer is a pass that (1) converts CollectivePermute
+// operations without any cycle in their (source, target) relationship to
+// Send/Recv, and (2) annotates the Send/Recv for pipelining with a frontend
+// frontend attribute. We currently restrict the decomposition to
+// CollectivePermute with one input and without any context data.
 //
 // before transformation:
-//     start = (<rt>, <rt>) collective-permute-start(data),
+//     cp = (<rt>, <rt>) collective-permute(data),
 //       source_target_pairs={...}
-//     done = <rt> collective-permute-done(start)
 //
 // after transformation:
 //    after-all = token[] after-all()
@@ -42,7 +41,7 @@ namespace xla {
 //    recv-done = (<rt>, token[]) recv-done(recv), channel_id=0
 //    send-done = token[] send-done(send), channel_id=0,
 //      control-predecessors={recv-done}
-//    done = <rt> get-tuple-element(recv-done), index=0
+//    cp = <rt> get-tuple-element(recv-done), index=0
 //
 // For pipelining, we first make pipelining decision on CollectivePermute
 // operations, and then record the decision on the decomposed Send/Recv via
diff --git a/third_party/xla/xla/service/collective_permute_decomposer_test.cc b/third_party/xla/xla/service/collective_permute_decomposer_test.cc
index 97d20a77d3b871..e7a707743109dd 100644
--- a/third_party/xla/xla/service/collective_permute_decomposer_test.cc
+++ b/third_party/xla/xla/service/collective_permute_decomposer_test.cc
@@ -35,33 +35,13 @@ using ::testing::HasSubstr;
 namespace op = xla::testing::opcode_matchers;
 using CollectivePermuteDecomposerTest = HloTestBase;
 
-TEST_F(CollectivePermuteDecomposerTest, SyncNotTransformed) {
-  const absl::string_view kModuleStr = R"(
-      HloModule test
-      ENTRY test_computation {
-        p = u32[] replica-id()
-        start = (u32[], u32[]) collective-permute-start(p), channel_id=1,
-          source_target_pairs={{0,1}, {1,2}},
-          backend_config="{ \"collective_backend_config\": {\"is_sync\":true}}"
-        ROOT done = u32[] collective-permute-done(start)
-      }
-    )";
-
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseAndReturnUnverifiedModule((kModuleStr)));
-  CollectivePermuteDecomposer decomposer(/*threshold_in_bytes=*/0);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, decomposer.Run(module.get()));
-  EXPECT_FALSE(changed);
-}
-
 TEST_F(CollectivePermuteDecomposerTest, WithCycleNotTransformed) {
   const absl::string_view kModuleStr = R"(
       HloModule test
       ENTRY test_computation {
-        p = (u32[], u32[]) replica-id()
-        start = u32[] collective-permute-start(p), channel_id=1,
+        p = u32[] replica-id()
+        ROOT cp = u32[] collective-permute(p), channel_id=1,
           source_target_pairs={{0,1}, {1,0}}
-        ROOT done = u32[] collective-permute-done(start)
       }
     )";
 
@@ -77,9 +57,8 @@ TEST_F(CollectivePermuteDecomposerTest, WithContextDataNotTransformed) {
   HloModule test
   ENTRY test_computation {
     p = u32[] replica-id()
-    start = (u32[], u32[], u32[], u32[]) collective-permute-start(p), channel_id=1,
+    ROOT cp = (u32[], u32[], u32[], u32[]) collective-permute(p), channel_id=1,
       source_target_pairs={{0,1}, {1,2}, {2,3}, {3,4}}
-    ROOT done = u32[] collective-permute-done(start)
   }
   )";
 
@@ -95,10 +74,9 @@ TEST_F(CollectivePermuteDecomposerTest, TransformedExplicitChannelId) {
   HloModule test
   ENTRY test_computation {
     p = u32[] replica-id()
-    start = (u32[], u32[]) collective-permute-start(p), channel_id=1,
+    ROOT cp = u32[] collective-permute(p), channel_id=1,
       source_target_pairs={{0,1}, {1,2}, {2,3}, {3,4}},
       metadata={op_name="op1/op2/add" source_file="foo/bar/mysource.py" source_line=35}
-    ROOT done = u32[] collective-permute-done(start)
   }
   )";
 
@@ -154,9 +132,8 @@ TEST_F(CollectivePermuteDecomposerTest, NotTransformedDefaultChannelId) {
   HloModule test
   ENTRY test_computation {
     p = u32[] replica-id()
-    start = (u32[], u32[]) collective-permute-start(p),
+    ROOT cp = u32[] collective-permute(p),
       source_target_pairs={{0,1}, {1,2}, {2,3}, {3,4}}
-    ROOT done = u32[] collective-permute-done(start)
   }
   )";
 
@@ -172,10 +149,9 @@ TEST_F(CollectivePermuteDecomposerTest, ThresholdNotTransformed) {
   HloModule test
   ENTRY test_computation {
     p = u32[] replica-id()
-    start = (u32[], u32[]) collective-permute-start(p), channel_id=1,
+    ROOT cp = u32[] collective-permute(p), channel_id=1,
       source_target_pairs={{0,1}, {1,2}, {2,3}, {3,4}},
       metadata={op_name="op1/op2/add" source_file="foo/bar/mysource.py" source_line=35}
-    ROOT done = u32[] collective-permute-done(start)
   }
   )";
 
@@ -201,10 +177,9 @@ TEST_F(CollectivePermuteDecomposerTest, Pipeline1) {
     count = get-tuple-element(param), index=0
     send-data = get-tuple-element(param), index=1
 
-    start = (u32[2], u32[2]) collective-permute-start(send-data), channel_id=1,
+    recv-data = u32[2] collective-permute(send-data), channel_id=1,
       source_target_pairs={{0,1}, {1,2}, {2,3}, {3,4}},
       frontend_attributes={_xla_other_attribute="xyz"}
-    recv-data = u32[2] collective-permute-done(start)
 
     c1 = u32[] constant(1)
     new_count = u32[] add(count, c1)
@@ -239,6 +214,9 @@ TEST_F(CollectivePermuteDecomposerTest, Pipeline1) {
           "_xla_send_recv_source_target_pairs=\"{{0,1},{1,2},{2,3},{3,4}}\""));
   EXPECT_THAT(recv->ToString(), HasSubstr("_xla_send_recv_pipeline=\"0\""));
   EXPECT_THAT(recv->ToString(), HasSubstr("_xla_other_attribute=\"xyz\""));
+  HloInstruction* recv_done = FindInstruction(module.get(), "recv-done");
+  EXPECT_THAT(recv_done->ToString(),
+              HasSubstr("_xla_send_recv_pipeline=\"0\""));
 
   HloInstruction* send = FindInstruction(module.get(), "send");
   EXPECT_EQ(send->channel_id().value(), 1);
@@ -248,6 +226,9 @@ TEST_F(CollectivePermuteDecomposerTest, Pipeline1) {
           "_xla_send_recv_source_target_pairs=\"{{0,1},{1,2},{2,3},{3,4}}\""));
   EXPECT_THAT(send->ToString(), HasSubstr("_xla_send_recv_pipeline=\"0\""));
   EXPECT_THAT(send->ToString(), HasSubstr("_xla_other_attribute=\"xyz\""));
+  HloInstruction* send_done = FindInstruction(module.get(), "send-done");
+  EXPECT_THAT(send_done->ToString(),
+              HasSubstr("_xla_send_recv_pipeline=\"0\""));
 }
 
 TEST_F(CollectivePermuteDecomposerTest, ForwardPipeline2) {
@@ -265,13 +246,11 @@ TEST_F(CollectivePermuteDecomposerTest, ForwardPipeline2) {
     count = get-tuple-element(param), index=0
     send-data = get-tuple-element(param), index=1
 
-    start.0 = (u32[2], u32[2]) collective-permute-start(send-data), channel_id=1,
+    recv-data.0 = u32[2] collective-permute(send-data), channel_id=1,
       source_target_pairs={{3,0}}
-    recv-data.0 = u32[2] collective-permute-done(start.0)
 
-    start.1 = (u32[2], u32[2]) collective-permute-start(send-data), channel_id=2,
+    recv-data.1 = u32[2] collective-permute(send-data), channel_id=2,
       source_target_pairs={{0,1}, {1,2}, {2,3}}
-    recv-data.1 = u32[2] collective-permute-done(start.1)
 
     replica = u32[] replica-id()
     constant0 = u32[] constant(0)
@@ -320,11 +299,17 @@ TEST_F(CollectivePermuteDecomposerTest, ForwardPipeline2) {
       recv1->ToString(),
       HasSubstr("_xla_send_recv_source_target_pairs=\"{{0,1},{1,2},{2,3}}\""));
   EXPECT_THAT(recv1->ToString(), HasSubstr("_xla_send_recv_pipeline=\"1\""));
+  HloInstruction* recv_done1 = FindInstruction(module.get(), "recv-done.1");
+  EXPECT_THAT(recv_done1->ToString(),
+              HasSubstr("_xla_send_recv_pipeline=\"1\""));
   HloInstruction* send1 = FindInstruction(module.get(), "send.1");
   EXPECT_THAT(
       send1->ToString(),
       HasSubstr("_xla_send_recv_source_target_pairs=\"{{0,1},{1,2},{2,3}}\""));
   EXPECT_THAT(send1->ToString(), HasSubstr("_xla_send_recv_pipeline=\"1\""));
+  HloInstruction* send_done1 = FindInstruction(module.get(), "send-done.1");
+  EXPECT_THAT(send_done1->ToString(),
+              HasSubstr("_xla_send_recv_pipeline=\"1\""));
 }
 
 TEST_F(CollectivePermuteDecomposerTest, BackwardPipeline2) {
@@ -342,13 +327,11 @@ TEST_F(CollectivePermuteDecomposerTest, BackwardPipeline2) {
     count = get-tuple-element(param), index=0
     send-data = get-tuple-element(param), index=1
 
-    start.0 = (u32[2], u32[2]) collective-permute-start(send-data), channel_id=1,
+    recv-data.0 = u32[2] collective-permute(send-data), channel_id=1,
       source_target_pairs={{1,0},{2,1},{3,2}}
-    recv-data.0 = u32[2] collective-permute-done(start.0)
 
-    start.1 = (u32[2], u32[2]) collective-permute-start(send-data), channel_id=2,
+    recv-data.1 = u32[2] collective-permute(send-data), channel_id=2,
       source_target_pairs={{0,3}}
-    recv-data.1 = u32[2] collective-permute-done(start.1)
 
     replica = u32[] replica-id()
     constant0 = u32[] constant(0)
diff --git a/third_party/xla/xla/service/collective_pipeliner_test.cc b/third_party/xla/xla/service/collective_pipeliner_test.cc
index 75e373e9b37777..f28d00b4d2fbd7 100644
--- a/third_party/xla/xla/service/collective_pipeliner_test.cc
+++ b/third_party/xla/xla/service/collective_pipeliner_test.cc
@@ -1751,12 +1751,16 @@ TEST_F(CollectivePipelinerTest, TransformRecvSendBackwards) {
 
     after-all = token[] after-all()
     recv = (f32[1, 1024, 1024], u32[], token[]) recv(after-all), channel_id=1, frontend_attributes={
-      _xla_send_recv_source_target_pairs="{{0, 1}, {1, 2}, {2, 3}, {3, 4}}"
+      _xla_send_recv_source_target_pairs="{{0, 1}, {1, 2}, {2, 3}, {3, 4}}",
+      _xla_send_recv_pipeline="0"
     }
     send = (f32[1, 1024, 1024], u32[], token[]) send(p, after-all), channel_id=1, frontend_attributes={
-      _xla_send_recv_source_target_pairs="{{0, 1}, {1, 2}, {2, 3}, {3, 4}}"
+      _xla_send_recv_source_target_pairs="{{0, 1}, {1, 2}, {2, 3}, {3, 4}}",
+      _xla_send_recv_pipeline="0"
+    }
+    recv-done = (f32[1, 1024, 1024], token[]) recv-done(recv), channel_id=1, frontend_attributes={
+       _xla_send_recv_pipeline="0"
     }
-    recv-done = (f32[1, 1024, 1024], token[]) recv-done(recv), channel_id=1
     recv-data = f32[1, 1024, 1024] get-tuple-element(recv-done), index=0
 
     replica = u32[] replica-id()
@@ -1769,7 +1773,9 @@ TEST_F(CollectivePipelinerTest, TransformRecvSendBackwards) {
     d = f32[1, 1024, 1024] tan(c)
     s = f32[1, 1024, 1024] dot(c, d), lhs_batch_dims={0}, lhs_contracting_dims={1}, rhs_batch_dims={0}, rhs_contracting_dims={1}
 
-    send-done = token[] send-done(send), channel_id=1
+    send-done = token[] send-done(send), channel_id=1, frontend_attributes={
+       _xla_send_recv_pipeline="0"
+    }
     ROOT result = (u32[], f32[1, 1024, 1024]) tuple(new_count, s)
   }
 
@@ -1848,7 +1854,10 @@ TEST_F(CollectivePipelinerTest,
         _xla_send_recv_source_target_pairs="{{3,0}}",
         _xla_other_attr="0"
       }
-    recv-done.0 = (u32[2], token[]) recv-done(recv.0), channel_id=1
+    recv-done.0 = (u32[2], token[]) recv-done(recv.0), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
     recv-data = u32[2] get-tuple-element(recv-done.0), index=0
 
     c1 = u32[] constant(1)
@@ -1857,7 +1866,10 @@ TEST_F(CollectivePipelinerTest,
     r = u32[2] broadcast(c1), dimensions={}
     s = u32[2] add(r, recv-data)
 
-    send-done.0 = token[] send-done(send.0), channel_id=1
+    send-done.0 = token[] send-done(send.0), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
     ROOT result = (u32[], u32[2]) tuple(new_count, s)
   }
 
diff --git a/third_party/xla/xla/service/compiler.h b/third_party/xla/xla/service/compiler.h
index 2f3ef5973bbe9e..bfb6b45c043689 100644
--- a/third_party/xla/xla/service/compiler.h
+++ b/third_party/xla/xla/service/compiler.h
@@ -448,7 +448,7 @@ class AotCompilationOptions {
     return target_config_;
   }
   void set_target_config(const Compiler::TargetConfig& target_config) {
-    target_config_ = std::move(target_config);
+    target_config_ = target_config;
   }
 
  protected:
diff --git a/third_party/xla/xla/service/cpu/BUILD b/third_party/xla/xla/service/cpu/BUILD
index 0bcefd568698d5..a69aafc5c113f3 100644
--- a/third_party/xla/xla/service/cpu/BUILD
+++ b/third_party/xla/xla/service/cpu/BUILD
@@ -3,21 +3,6 @@
 
 load("@bazel_skylib//rules:build_test.bzl", "build_test")
 load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
-load(
-    "//xla:xla.bzl",
-    "ORC_JIT_MEMORY_MAPPER_TARGETS",
-    "xla_cc_binary",
-    "xla_cc_test",
-)
-load(
-    "//xla/tsl/mkl:build_defs.bzl",
-    "mkl_deps",
-)
-load(
-    "//third_party/compute_library:build_defs.bzl",
-    "acl_deps",
-    "if_enable_acl",
-)
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility", "tf_openmp_copts", "tsl_copts")
 load("@local_tsl//tsl:tsl.default.bzl", "filegroup", "get_compatible_with_portable")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
@@ -29,6 +14,21 @@ load(
     "if_llvm_x86_available",
 )
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load(
+    "//third_party/compute_library:build_defs.bzl",
+    "acl_deps",
+    "if_enable_acl",
+)
+load(
+    "//xla:xla.bzl",
+    "ORC_JIT_MEMORY_MAPPER_TARGETS",
+    "xla_cc_binary",
+    "xla_cc_test",
+)
+load(
+    "//xla/tsl/mkl:build_defs.bzl",
+    "mkl_deps",
+)
 load(":build_defs.bzl", "runtime_copts")
 
 package(
@@ -1602,7 +1602,7 @@ cc_library(
     srcs = ["onednn_matmul.cc"],
     hdrs = [
         "onednn_matmul.h",
-        "@local_tsl//tsl/util:onednn_util_hdrs",
+        "//xla/tsl/util:onednn_util_hdrs",
     ],
     copts = runtime_copts() + tsl_copts(),
     visibility = ["//visibility:public"],
@@ -1627,7 +1627,7 @@ cc_library(
     srcs = ["onednn_layer_norm.cc"],
     hdrs = [
         "onednn_layer_norm.h",
-        "@local_tsl//tsl/util:onednn_util_hdrs",
+        "//xla/tsl/util:onednn_util_hdrs",
     ],
     copts = runtime_copts() + tsl_copts(),
     visibility = ["//visibility:public"],
@@ -1650,7 +1650,7 @@ cc_library(
     srcs = ["onednn_softmax.cc"],
     hdrs = [
         "onednn_softmax.h",
-        "@local_tsl//tsl/util:onednn_util_hdrs",
+        "//xla/tsl/util:onednn_util_hdrs",
     ],
     copts = runtime_copts() + tsl_copts(),
     visibility = ["//visibility:public"],
@@ -1680,7 +1680,7 @@ cc_library(
     hdrs = [
         "onednn_matmul.h",
         "onednn_matmul_rewriter.h",
-        "@local_tsl//tsl/util:onednn_util_hdrs",
+        "//xla/tsl/util:onednn_util_hdrs",
     ],
     copts = tsl_copts(),
     deps = [
diff --git a/third_party/xla/xla/service/cpu/backend_config.proto b/third_party/xla/xla/service/cpu/backend_config.proto
index e32159c8044c5e..7e82b08f3da5db 100644
--- a/third_party/xla/xla/service/cpu/backend_config.proto
+++ b/third_party/xla/xla/service/cpu/backend_config.proto
@@ -45,4 +45,5 @@ message OneDnnLayerNormConfig {
     SCALE_AND_SHIFT = 3;
   }
   FusionKind fused_ops = 1;
+  int32 epsilon_typecast = 2;
 }
diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc
index 263dbbb0c5ec28..a5603c5b63f669 100644
--- a/third_party/xla/xla/service/cpu/cpu_compiler.cc
+++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc
@@ -711,12 +711,17 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
   // Convert BF16 and F8 operations to F32 and F16 respectively so that the CPU
   // backend can support BF16/F8 operations without directly implementing a
   // BF16/F8 lowering for most ops.
+  FloatSupport bf16_support(BF16);
 #if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)
-  CpuFloatSupport bf16_support(BF16);
+  CpuFloatSupport onednn_bf16_support(BF16);
+  if (!is_aot_compile) {
+    pipeline.AddPass<FloatNormalization>(&onednn_bf16_support);
+  } else {
+    pipeline.AddPass<FloatNormalization>(&bf16_support);
+  }
 #else
-  FloatSupport bf16_support(BF16);
-#endif
   pipeline.AddPass<FloatNormalization>(&bf16_support);
+#endif
   FloatSupport f8e5m2_support(F8E5M2, F16);
   pipeline.AddPass<FloatNormalization>(&f8e5m2_support);
   FloatSupport f8e4m3fn_support(F8E4M3FN, F16);
@@ -905,14 +910,20 @@ Status CpuCompiler::RunHloPassesAfterLayoutAssn(
 #if defined(INTEL_MKL) && defined(ENABLE_ONEDNN_V3)
   // AOT compiled code runs in single thread.
   if (!is_aot_compile) {
+    auto debug_options = module->config().debug_options();
     // Run SimplifyFPConversions pass to simplify the BF16 pattern and make it
     // easier to match.
-    pipeline.AddPass<SimplifyFPConversions>();
+    // Remove `f32 -> bf16 -> f32` casts inserted by bf16 normalization.
+    if (debug_options.xla_allow_excess_precision()) {
+      pipeline.AddPass<SimplifyFPConversions>();
+    }
     pipeline.AddPass<OneDnnMatMulRewriter>(max_parallelism,
                                            compile_options.thread_pool);
     // Run SimplifyFPConversions pass again to remove redundant Convert ops
     // that may exist as a result of running OneDnnMatMulRewriter pass.
-    pipeline.AddPass<SimplifyFPConversions>();
+    if (debug_options.xla_allow_excess_precision()) {
+      pipeline.AddPass<SimplifyFPConversions>();
+    }
   }
 #endif  // INTEL_MKL && ENABLE_ONEDNN_V3
 
diff --git a/third_party/xla/xla/service/cpu/cpu_float_support.cc b/third_party/xla/xla/service/cpu/cpu_float_support.cc
index 336a8709725ea7..0bb4dd8e875a75 100644
--- a/third_party/xla/xla/service/cpu/cpu_float_support.cc
+++ b/third_party/xla/xla/service/cpu/cpu_float_support.cc
@@ -24,6 +24,10 @@ namespace cpu {
 
 bool CpuFloatSupport::IsSupported(const HloInstruction& hlo) const {
   switch (hlo.opcode()) {
+    // oneDNN rewritable ops
+    case HloOpcode::kDot:
+      return LowPrecisionType() == BF16 &&
+             OneDnnMatMulRewriter::ShouldRewrite(&hlo);
     // Collective ops.
     case HloOpcode::kAllGather:
     case HloOpcode::kAllReduce:
@@ -32,9 +36,6 @@ bool CpuFloatSupport::IsSupported(const HloInstruction& hlo) const {
     case HloOpcode::kAllToAll:
     case HloOpcode::kCollectivePermute:
     case HloOpcode::kReduceScatter:
-    case HloOpcode::kDot:
-      return LowPrecisionType() == BF16 &&
-             OneDnnMatMulRewriter::ShouldRewrite(&hlo) && DotSupported(hlo);
     // Data movement only ops.
     case HloOpcode::kBroadcast:
     case HloOpcode::kConcatenate:
@@ -58,18 +59,6 @@ bool CpuFloatSupport::IsSupported(const HloInstruction& hlo) const {
   }
 }
 
-bool CpuFloatSupport::DotSupported(const HloInstruction& hlo) const {
-  bool supported = true;
-  const Shape& lhs_shape = hlo.operand(0)->shape();
-  const Shape& rhs_shape = hlo.operand(1)->shape();
-  if (lhs_shape.rank() == rhs_shape.rank() && lhs_shape.rank() == 2) {
-    // If first dim size is 1, it may be removed by a later pass which makes it
-    // unsupported case.
-    supported &= lhs_shape.dimensions(0) != 1;
-  }
-  return supported;
-}
-
 }  // namespace cpu
 }  // namespace xla
 
diff --git a/third_party/xla/xla/service/cpu/hlo_xla_runtime_pipeline.cc b/third_party/xla/xla/service/cpu/hlo_xla_runtime_pipeline.cc
index 6cc266d9d8d3e8..1bcab94dae3df8 100644
--- a/third_party/xla/xla/service/cpu/hlo_xla_runtime_pipeline.cc
+++ b/third_party/xla/xla/service/cpu/hlo_xla_runtime_pipeline.cc
@@ -179,13 +179,13 @@ static Status CreateHloXlaPipeline(
   }
   pm.addPass(mlir::createCSEPass());
   pm.addPass(mlir::createCanonicalizerPass());
-  mlir::bufferization::BufferResultsToOutParamsOptions out_params_options;
-  out_params_options.filterFn = [](mlir::func::FuncOp* func) {
+  mlir::bufferization::BufferResultsToOutParamsOpts out_params_opts;
+  out_params_opts.filterFn = [](mlir::func::FuncOp* func) {
     // Only transform the entry point.
     return func->getSymName() == "main";
   };
-  pm.addPass(mlir::bufferization::createBufferResultsToOutParamsPass(
-      out_params_options));
+  pm.addPass(
+      mlir::bufferization::createBufferResultsToOutParamsPass(out_params_opts));
 
   pm.addNestedPass<FuncOp>(
       mlir::bufferization::createPromoteBuffersToStackPass(nullptr));
diff --git a/third_party/xla/xla/service/cpu/onednn_layer_norm.cc b/third_party/xla/xla/service/cpu/onednn_layer_norm.cc
index 79a6a2290a063a..d2109a1bc2f956 100644
--- a/third_party/xla/xla/service/cpu/onednn_layer_norm.cc
+++ b/third_party/xla/xla/service/cpu/onednn_layer_norm.cc
@@ -30,7 +30,7 @@ limitations under the License.
 #include "xla/service/cpu/backend_config.pb.h"
 #include "xla/service/cpu/onednn_memory_util.h"
 #include "xla/service/cpu/runtime_lightweight_check.h"
-#include "tsl/util/onednn_threadpool.h"
+#include "xla/tsl/util/onednn_threadpool.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
 namespace xla {
@@ -83,7 +83,8 @@ ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_OneDnnLayerNorm(
   auto shift_mem = memory(scaleshift_md, cpu_engine, beta_minfo.Data());
 
   // TODO(intel-tf): Move epsilon to OneDnnLayerNormConfig.
-  const float epsilon = 1.e-5f;
+  float epsilon;
+  *(reinterpret_cast<int32_t*>(&epsilon)) = ln_config.epsilon_typecast();
 
   auto lnorm_pd = layer_normalization_forward::primitive_desc(
       cpu_engine, prop_kind::forward_inference, src_md, dst_md, epsilon,
diff --git a/third_party/xla/xla/service/cpu/onednn_matmul.cc b/third_party/xla/xla/service/cpu/onednn_matmul.cc
index 3686827198df7a..4c01c732a96da9 100644
--- a/third_party/xla/xla/service/cpu/onednn_matmul.cc
+++ b/third_party/xla/xla/service/cpu/onednn_matmul.cc
@@ -35,8 +35,8 @@ limitations under the License.
 #include "xla/service/cpu/runtime_lightweight_check.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
+#include "xla/tsl/util/onednn_threadpool.h"
 #include "tsl/platform/logging.h"
-#include "tsl/util/onednn_threadpool.h"
 
 namespace xla {
 namespace cpu {
diff --git a/third_party/xla/xla/service/cpu/onednn_matmul_rewriter.cc b/third_party/xla/xla/service/cpu/onednn_matmul_rewriter.cc
index 840661310f8744..c9b9e9b4a04a41 100644
--- a/third_party/xla/xla/service/cpu/onednn_matmul_rewriter.cc
+++ b/third_party/xla/xla/service/cpu/onednn_matmul_rewriter.cc
@@ -29,10 +29,11 @@ limitations under the License.
 #include "xla/service/cpu/onednn_matmul.h"
 #include "xla/service/cpu/onednn_memory_util.h"
 #include "xla/service/cpu/onednn_util.h"
+#include "xla/service/hlo_cost_analysis.h"
 #include "xla/service/pattern_matcher.h"
 #include "xla/status_macros.h"
+#include "xla/tsl/util/onednn_threadpool.h"
 #include "tsl/platform/logging.h"  // IWYU pragma: keep
-#include "tsl/util/onednn_threadpool.h"
 
 namespace xla {
 namespace cpu {
@@ -55,16 +56,6 @@ inline Status ValidateDotDimensionNumbers(
   return OkStatus();
 }
 
-// We also check if the convert instruction has only one use.
-inline bool AllOperandsConvertedFromBF16ToF32(const HloInstruction* instr) {
-  return absl::c_all_of(instr->operands(), [](HloInstruction* operand) {
-    return Match(operand,
-                 m::Convert(m::Op().WithElementType(PrimitiveType::BF16))
-                     .WithElementType(PrimitiveType::F32)
-                     .WithOneUse());
-  });
-}
-
 template <typename Pattern>
 auto ElementwiseSafeIntermediate(HloInstruction** instr, Pattern pattern) {
   return m::AnyOf<HloInstruction>(m::Broadcast(instr, pattern.WithOneUser()),
@@ -254,10 +245,10 @@ inline bool IsRowMajor(const Shape& shape) {
 // TODO(intel-tf): Restict compatible types based on instruction kind.
 inline bool CompatibleElementType(const HloInstruction* instr) {
   PrimitiveType element_type = instr->shape().element_type();
-  return element_type == BF16 || element_type == F32;
+  return element_type == BF16 || element_type == F32 || element_type == F16;
 }
 
-// Type conversion from and to any of BF16 and FP32.
+// Type conversion from and to any of BF16, F16 and FP32.
 // TODO(intel-tf): Support more types when enabled.
 template <typename Pattern>
 inline auto SupportedConvert(Pattern pattern) {
@@ -305,14 +296,13 @@ inline auto OptionalConvertAndBitcast(HloInstruction** optional_convert,
   // Checks the presence of some intermediate operations that can be moved /
   // folded to allow dot fusion with add.
   // Try to match either of the following:
-  //   1. pattern-root -> bf16-to-fp32 convert -> bitcast
-  //   2. pattern-root -> bf16-to-fp32 convert
+  //   1. pattern-root -> bf16/f16-to-fp32 convert -> bitcast
+  //   2. pattern-root -> bf16/f16-to-fp32 convert
   //   3. pattern-root -> bitcast
   //   4. pattern-root
   auto common =
       m::AnyOf<HloInstruction>(
           SupportedConvert(optional_convert, std::move(pattern).WithOneUser())
-              .WithOperand(0, m::Op().WithElementType(PrimitiveType::BF16))
               .WithElementType(PrimitiveType::F32),
           std::move(pattern).WithOneUser())
           .WithOneUser();
@@ -343,12 +333,15 @@ bool OneDnnMatMulRewriter::ShouldRewrite(const HloInstruction* dot_instr) {
       ShapeUtil::IsZeroElementArray(output_shape)) {
     return false;
   }
-  // OneDNN only supports 2 <= rank <= kOneDnnMaxNDims.
-  if (lhs_shape.rank() != rhs_shape.rank() ||
-      rhs_shape.rank() != output_shape.rank() || lhs_shape.rank() < 2 ||
-      lhs_shape.rank() > kOneDnnMaxNDims) {
+  // OneDNN only supports rank <= kOneDnnMaxNDims and singular non-contracting
+  // dimensions. We should not rewrite if any of these conditions are violated.
+  if (lhs_shape.rank() <= 0 || lhs_shape.rank() > kOneDnnMaxNDims ||
+      rhs_shape.rank() <= 0 || rhs_shape.rank() > kOneDnnMaxNDims ||
+      output_shape.rank() > std::min({lhs_shape.rank(), rhs_shape.rank(),
+                                      static_cast<int64_t>(kOneDnnMaxNDims)})) {
     return false;
   }
+
   // Layout should be row-major, contraction dimensions captures transpose
   // scenarios in last two dimensions.
   if (!IsRowMajor(lhs_shape) || !IsRowMajor(rhs_shape) ||
@@ -365,15 +358,15 @@ bool OneDnnMatMulRewriter::ShouldRewrite(const HloInstruction* dot_instr) {
   }
 
   // OneDNN matmul has scratch allocation and copy overheads. The overheads
-  // can be amortized if there is sufficient MAC (multiply-accumulate)
-  // operations. We don't rewrite for small cases (determined empirically).
+  // can be amortized if there is sufficient number of flops. We don't rewrite
+  // for small cases (determined empirically).
   // TODO(intel-tf): Relax the condition when more optimizations in oneDNN
   // matmul is achieved.
-  auto rank = lhs_shape.rank();
-  auto rhs_dims = rhs_shape.dimensions();
-  int64_t num_mac_ops = ShapeUtil::ElementsIn(lhs_shape) * rhs_dims.back();
-  int mac_ops_threshold = (rank == 2) ? (1 << 23) : (1 << 18);
-  return (num_mac_ops >= mac_ops_threshold);
+  auto num_flops = xla::HloCostAnalysis::GetDotFlops(lhs_shape, output_shape,
+                                                     dot_dim_numbers);
+  auto rank = output_shape.rank();
+  auto flops_threshold = (rank <= 2) ? (1 << 24) : (1 << 19);
+  return (num_flops >= flops_threshold);
 }
 
 class OneDnnMatMulRewriteVisitor : public DfsHloRewriteVisitor {
@@ -385,9 +378,11 @@ class OneDnnMatMulRewriteVisitor : public DfsHloRewriteVisitor {
     auto pattern = m::Op(&dot_instr).WithOpcode(HloOpcode::kDot);
     if (!Match(instr, pattern)) return OkStatus();
 
-    auto dot_dim_numbers = dot_instr->dot_dimension_numbers();
-    TF_RETURN_IF_ERROR(ValidateDotDimensionNumbers(dot_dim_numbers));
+    TF_RETURN_IF_ERROR(
+        ValidateDotDimensionNumbers(dot_instr->dot_dimension_numbers()));
     if (!OneDnnMatMulRewriter::ShouldRewrite(dot_instr)) return OkStatus();
+    TF_ASSIGN_OR_RETURN(dot_instr, ReconfigureDotDimensions(dot_instr));
+    auto dot_dim_numbers = dot_instr->dot_dimension_numbers();
     const Shape& lhs_shape = dot_instr->operand(0)->shape();
     const Shape& rhs_shape = dot_instr->operand(1)->shape();
     const Shape& output_shape = dot_instr->shape();
@@ -413,30 +408,6 @@ class OneDnnMatMulRewriteVisitor : public DfsHloRewriteVisitor {
     return OkStatus();
   }
 
-  Status HandleConvert(HloInstruction* convert) override {
-    HloInstruction* matmul_instr;
-    auto pattern =
-        m::Convert(m::CustomCall(&matmul_instr, {"__onednn$matmul"})
-                       .WithOneUse()
-                       .WithElementType(PrimitiveType::F32)
-                       .WithPredicate(AllOperandsConvertedFromBF16ToF32))
-            .WithElementType(PrimitiveType::BF16);
-
-    if (!Match(convert, pattern)) return OkStatus();
-    if (!IsSupportedType(convert->shape().element_type())) return OkStatus();
-
-    // BFloat16 operands.
-    std::vector<HloInstruction*> bf16_operands;
-    for (auto operand : matmul_instr->operands()) {
-      bf16_operands.push_back(operand->mutable_operand(0));
-    }
-
-    HloInstruction* matmul_call = convert->AddInstruction(
-        matmul_instr->CloneWithNewOperands(convert->shape(), bf16_operands));
-    TF_RETURN_IF_ERROR(ReplaceInstruction(convert, matmul_call));
-    return OkStatus();
-  }
-
   Status HandleAdd(HloInstruction* instr) override {
     // Try to do a fusion for Dot(onednn-matmul) + Add. However,
     // HLO Add instruction might receive the addends after additional
@@ -467,7 +438,7 @@ class OneDnnMatMulRewriteVisitor : public DfsHloRewriteVisitor {
         OptionalConvertAndBitcast(&optional_dot_convert, &optional_dot_bitcast,
                                   OneDnnMatmulInstr(&dot))
             .WithOneUser(),
-        m::Op(&addend_intermediate).WithOneUser());
+        m::Op(&addend_intermediate));
 
     if (Match(instr, pattern)) {
       if (!IsSupportedType(dot->shape().element_type())) return OkStatus();
@@ -544,30 +515,34 @@ class OneDnnMatMulRewriteVisitor : public DfsHloRewriteVisitor {
       HloInstruction* new_instr;
       // If matched pattern has custom-call -> bitcast -> add, then we need to
       // insert bitcast after the new fusion to maintain the correct shape
-      // (new-custom-call -> bitcast). Also, this will be followed by -> convert
-      // for bf16 case to avoid datatype mismatch.
+      // (new-custom-call -> bitcast). Also, this will optionally be followed
+      // by -> convert for bf16 case to avoid datatype mismatch.
       if (optional_dot_bitcast != nullptr &&
           optional_dot_bitcast->opcode() == HloOpcode::kBitcast) {
-        if (matmul_call->shape().element_type() == PrimitiveType::BF16) {
+        if (optional_dot_convert != nullptr &&
+            optional_dot_convert->opcode() == HloOpcode::kConvert) {
           auto bitcast_call =
               matmul_call->AddInstruction(HloInstruction::CreateBitcast(
-                  ShapeUtil::ChangeElementType(instr->shape(),
-                                               PrimitiveType::BF16),
+                  ShapeUtil::ChangeElementType(
+                      instr->shape(), matmul_call->shape().element_type()),
                   matmul_call));
           new_instr =
               bitcast_call->AddInstruction(HloInstruction::CreateConvert(
-                  ShapeUtil::ChangeElementType(bitcast_call->shape(),
-                                               PrimitiveType::F32),
+                  ShapeUtil::ChangeElementType(
+                      bitcast_call->shape(),
+                      optional_dot_convert->shape().element_type()),
                   bitcast_call));
         } else {
           new_instr = matmul_call->AddInstruction(
               HloInstruction::CreateBitcast(instr->shape(), matmul_call));
         }
       } else {
-        if (matmul_call->shape().element_type() == PrimitiveType::BF16) {
+        if (optional_dot_convert != nullptr &&
+            optional_dot_convert->opcode() == HloOpcode::kConvert) {
           new_instr = matmul_call->AddInstruction(HloInstruction::CreateConvert(
-              ShapeUtil::ChangeElementType(matmul_call->shape(),
-                                           PrimitiveType::F32),
+              ShapeUtil::ChangeElementType(
+                  matmul_call->shape(),
+                  optional_dot_convert->shape().element_type()),
               matmul_call));
         } else {
           new_instr = matmul_call;
@@ -616,7 +591,7 @@ class OneDnnMatMulRewriteVisitor : public DfsHloRewriteVisitor {
                                .WithOneUser()
                                .WithOpcode(HloOpcode::kCustomCall)
                                .WithCustomCallTarget({"__onednn$matmul"}),
-                           m::Broadcast(m::Constant(&constant)).WithOneUser());
+                           m::Broadcast(m::Constant(&constant)));
 
     if (Match(instr, pattern)) {
       std::vector<HloInstruction*> new_operands;
@@ -659,6 +634,83 @@ class OneDnnMatMulRewriteVisitor : public DfsHloRewriteVisitor {
 
     return ReplaceWithNewInstruction(activation, std::move(output));
   }
+
+  // This function changes dot instruction for supported matrix
+  // multiplication scenarios. In particular, it changes the shape
+  // of lhs, rhs and result arrays.
+  //    - lhs configuration scenario
+  //      lhs:    [batch_dims,contracting_dim] to [batch_dims,1,contracting_dim]
+  //      result: [batch_dims,feature_dim] to [batch_dims,1,feature_dim]
+  //
+  //    - rhs configuration scenario
+  //      rhs:    [batch_dims,contracting_dim] to [batch_dims,contracting_dim,1]
+  //      result: [batch_dims,feature_dim] to [batch_dims,feature_dim, 1]
+  //
+  //    - both lhs and rhs configuration scenario
+  //      lhs:    [batch_dims,contracting_dim] to [batch_dims,1,contracting_dim]
+  //      rhs:    [batch_dims,contracting_dim] to [batch_dims,contracting_dim,1]
+  //      result: [batch_dims] to [batch_dims,1,1]
+  StatusOr<HloInstruction*> ReconfigureDotDimensions(
+      HloInstruction* dot_instr) {
+    HloInstruction* lhs = dot_instr->mutable_operand(0);
+    HloInstruction* rhs = dot_instr->mutable_operand(1);
+    DotDimensionNumbers dim_numbers = dot_instr->dot_dimension_numbers();
+
+    auto lhs_batch_dims = dim_numbers.lhs_batch_dimensions();
+    auto lhs_contraction_dims = dim_numbers.lhs_contracting_dimensions();
+    bool is_lhs_vector = lhs->shape().rank() ==
+                         (lhs_batch_dims.size() + lhs_contraction_dims.size());
+
+    auto rhs_batch_dims = dim_numbers.rhs_batch_dimensions();
+    auto rhs_contraction_dims = dim_numbers.rhs_contracting_dimensions();
+    bool is_rhs_vector = rhs->shape().rank() ==
+                         (rhs_batch_dims.size() + rhs_contraction_dims.size());
+
+    if (!is_lhs_vector && !is_rhs_vector) return dot_instr;
+
+    std::vector<int64_t> adjusted_lhs_dims(lhs->shape().dimensions().begin(),
+                                           lhs->shape().dimensions().end());
+    std::vector<int64_t> adjusted_rhs_dims(rhs->shape().dimensions().begin(),
+                                           rhs->shape().dimensions().end());
+    std::vector<int64_t> adjusted_dot_dims(
+        dot_instr->shape().dimensions().begin(),
+        dot_instr->shape().dimensions().end());
+
+    if (is_lhs_vector) {
+      auto lhs_it = adjusted_lhs_dims.begin() + lhs_batch_dims.size();
+      adjusted_lhs_dims.insert(lhs_it, 1, 1);
+      auto result_it = adjusted_dot_dims.begin() + lhs_batch_dims.size();
+      adjusted_dot_dims.insert(result_it, 1, 1);
+      auto lhs_contraction_dim =
+          dot_instr->dot_dimension_numbers().lhs_contracting_dimensions(0);
+      dim_numbers.set_lhs_contracting_dimensions(0, lhs_contraction_dim + 1);
+      lhs = lhs->AddInstruction(HloInstruction::CreateBitcast(
+          ShapeUtil::MakeShape(lhs->shape().element_type(), adjusted_lhs_dims),
+          lhs));
+    }
+
+    if (is_rhs_vector) {
+      auto it = adjusted_rhs_dims.end();
+      adjusted_rhs_dims.insert(it, 1, 1);
+      auto result_it = adjusted_dot_dims.end();
+      adjusted_dot_dims.insert(result_it, 1, 1);
+      rhs = rhs->AddInstruction(HloInstruction::CreateBitcast(
+          ShapeUtil::MakeShape(rhs->shape().element_type(), adjusted_rhs_dims),
+          rhs));
+    }
+
+    HloInstruction* adjusted_dot =
+        dot_instr->AddInstruction(HloInstruction::CreateDot(
+            ShapeUtil::MakeShape(dot_instr->shape().element_type(),
+                                 adjusted_dot_dims),
+            lhs, rhs, dim_numbers, dot_instr->precision_config()));
+
+    HloInstruction* replacement_instr = adjusted_dot->AddInstruction(
+        HloInstruction::CreateBitcast(dot_instr->shape(), adjusted_dot));
+
+    TF_RETURN_IF_ERROR(ReplaceInstruction(dot_instr, replacement_instr));
+    return adjusted_dot;
+  }
 };
 
 class OneDnnMatMulReorderVisitor : public DfsHloRewriteVisitor {
diff --git a/third_party/xla/xla/service/cpu/onednn_matmul_rewriter.h b/third_party/xla/xla/service/cpu/onednn_matmul_rewriter.h
index 4c508e811aa473..36cab7ee949c37 100644
--- a/third_party/xla/xla/service/cpu/onednn_matmul_rewriter.h
+++ b/third_party/xla/xla/service/cpu/onednn_matmul_rewriter.h
@@ -37,7 +37,7 @@ class OneDnnMatMulRewriter : public HloModulePass {
                        const tsl::thread::ThreadPool* compile_threadpool)
       : intra_op_parallelism_(intra_op_parallelism),
         compile_threadpool_(compile_threadpool) {}
-
+  OneDnnMatMulRewriter() = default;
   absl::string_view name() const override { return "onednn-matmul-rewriter"; }
 
   using HloPassInterface::Run;
diff --git a/third_party/xla/xla/service/cpu/onednn_ops_rewriter.cc b/third_party/xla/xla/service/cpu/onednn_ops_rewriter.cc
index ed904f44658876..058355223cd5ee 100644
--- a/third_party/xla/xla/service/cpu/onednn_ops_rewriter.cc
+++ b/third_party/xla/xla/service/cpu/onednn_ops_rewriter.cc
@@ -29,16 +29,16 @@ namespace cpu {
 namespace {
 namespace m = match;
 
-auto ConvertPattern(HloInstruction** instr) {
-  return m::Convert(m::Op(instr).WithElementType(PrimitiveType::BF16))
-      .WithElementType(PrimitiveType::F32);
-}
-
 template <typename Pattern>
 auto OptionalConvert(Pattern pattern) {
   return m::AnyOf<HloInstruction>(m::Convert(pattern), std::move(pattern));
 }
 
+inline auto OneDnnConvertibleInstr(HloInstruction** instr) {
+  return m::AnyOf<HloInstruction>(m::CustomCall(instr, {"__onednn$layernorm"}),
+                                  m::CustomCall(instr, {"__onednn$softmax"}));
+}
+
 HloInstruction* FindLayerNormScale(HloInstruction* instr) {
   HloInstruction* scale = nullptr;
   auto scalePattern = m::Multiply().WithBinaryOperandsAnyOrder(
@@ -98,29 +98,30 @@ std::optional<HloInstruction*> MatchSoftmax(HloInstruction* instr) {
   HloInstruction* right_producer;
 
   // Lower diamond
-  if (!Match(
-          instr,
-          m::Divide(
-              m::Exp(&left_exponential, m::Op()),
-              m::Broadcast(m::Reshape(m::Broadcast(OptionalConvert(m::Reshape(
-                  m::Reduce(
-                      OptionalConvert(m::Exp(&right_exponential, m::Op())),
-                      m::Op())
-                      .WithPredicate([](const HloInstruction* reduce) {
-                        HloComputation* reducer = reduce->to_apply();
-                        return (reducer->root_instruction()->opcode() ==
-                                    HloOpcode::kAdd &&
-                                reduce->dimensions().size() == 1 &&
-                                reduce->dimensions()[0] !=
-                                    reduce->shape().rank() - 1);
-                      })
-                      .WithOneUse())))))))) {
+  if (!Match(instr,
+             m::Divide(
+                 m::Exp(&left_exponential, m::Op()),
+                 m::Broadcast(m::Reshape(
+                     m::Broadcast(OptionalConvert(m::Reshape(OptionalConvert(
+                         m::Reduce(OptionalConvert(
+                                       m::Exp(&right_exponential, m::Op())),
+                                   m::Op())
+                             .WithPredicate([](const HloInstruction* reduce) {
+                               HloComputation* reducer = reduce->to_apply();
+                               return (reducer->root_instruction()->opcode() ==
+                                           HloOpcode::kAdd &&
+                                       reduce->dimensions().size() == 1 &&
+                                       reduce->dimensions()[0] !=
+                                           reduce->shape().rank() - 1);
+                             })
+                             .WithOneUse()))))))))) {
     return std::nullopt;
   }
 
   if (left_exponential != right_exponential ||
-      left_exponential->user_count() != 2)
+      left_exponential->user_count() != 2) {
     return std::nullopt;
+  }
 
   // Upper diamond
   if (!Match(left_exponential->mutable_operand(0),
@@ -143,183 +144,304 @@ std::optional<HloInstruction*> MatchSoftmax(HloInstruction* instr) {
     return std::nullopt;
   }
 
-  if (left_producer != right_producer || left_producer->user_count() != 2)
+  if (left_producer != right_producer || left_producer->user_count() != 2) {
     return std::nullopt;
+  }
 
   return left_producer;
 }
 
+auto MeanPattern(HloInstruction** input) {
+  return m::Reshape(
+      m::Convert(m::Divide(m::Reduce(m::Convert(m::Op(input)), m::Op()),
+                           m::Broadcast(m::Convert()))));
+}
+
+template <typename Pattern>
+auto Square(Pattern pattern) {
+  return m::Multiply()
+      .WithBinaryOperandsAnyOrder(pattern, pattern)
+      .WithPredicate([](const HloInstruction* instr) {
+        return instr->unique_operands().size() == 1;
+      });
+}
+
+std::optional<bool> MatchTFKerasLayerNorm(HloInstruction* instr,
+                                          HloInstruction** src,
+                                          HloInstruction** scale,
+                                          HloInstruction** bias, float* eps) {
+  // variance = Mean((X - Mean(x))^2)
+  // Z = scale / sqrt(variance + eps)
+  // LN(X) = X*Z + Bias - Mean(X)*Z
+
+  HloInstruction *src_a, *src_b, *src_c;
+  HloInstruction *bias_node, *scaled_norm_a, *scaled_norm_b, *mean0_a, *epsilon,
+      *sqrd_diff_mean, *scale_node, *sqrd_diff;
+
+  // First Match X*Z + Bias - Mean(X)*Z
+  if (!Match(
+          instr,
+          m::Add().WithBinaryOperandsAnyOrder(
+              m::Multiply()
+                  .WithBinaryOperandsAnyOrder(m::Op(src), m::Op(&scaled_norm_a))
+                  .WithOneUser(),
+              m::Subtract(m::Op(&bias_node),
+                          m::Multiply().WithBinaryOperandsAnyOrder(
+                              m::Broadcast(m::Reshape(m::Op(&mean0_a))),
+                              m::Op(&scaled_norm_b)))
+                  .WithOneUser()))) {
+    return std::nullopt;
+  }
+
+  if (scaled_norm_a != scaled_norm_b) return std::nullopt;
+
+  const Shape& src_shape = (*src)->shape();
+  if (!IsSupportedType(src_shape.element_type())) return std::nullopt;
+
+  // Get bias
+  if (!Match(bias_node, m::Broadcast(m::Op(bias)))) return std::nullopt;
+
+  // Match Z = scale / sqrt(variance + eps)
+  if (!Match(scaled_norm_a,
+             m::Multiply().WithBinaryOperandsAnyOrder(
+                 m::Op(&scale_node),
+                 m::Broadcast(
+                     m::Reshape(m::Rsqrt(m::Add().WithBinaryOperandsAnyOrder(
+                         m::Broadcast(m::ConstantScalar(&epsilon)),
+                         m::Op(&sqrd_diff_mean)))))))) {
+    return std::nullopt;
+  }
+
+  // get epsilon
+  *eps = static_cast<float>(epsilon->literal().GetAsDouble({}).value());
+  // get scale
+  if (!Match(scale_node, m::Broadcast(m::Op(scale)))) return std::nullopt;
+
+  // match variance
+  if (!Match(sqrd_diff_mean, MeanPattern(&sqrd_diff))) return std::nullopt;
+
+  if (!Match(sqrd_diff, Square(m::Subtract(
+                            m::Op(&src_a),
+                            m::Broadcast(m::Reshape(MeanPattern(&src_b))))))) {
+    return std::nullopt;
+  }
+
+  if (src_a != src_b && src_a != *src) return std::nullopt;
+
+  // Match mean from Bias - Mean(X)*Z
+  if (!Match(mean0_a, MeanPattern(&src_c))) return std::nullopt;
+
+  if (src_c != *src) return std::nullopt;
+
+  return true;
+}
+
+bool MatchFlaxLayerNorm(HloInstruction* instr, HloInstruction** src,
+                        HloInstruction** scale, HloInstruction** bias,
+                        float* eps, bool* is_bf16orfp16_convert,
+                        bool* is_producer_bf16orfp16,
+                        HloInstruction** convert_instr) {
+  HloInstruction *prod_s, *hinge;
+  HloInstruction *div0, *div1, *div_red;
+  HloInstruction *mul_in0, *mul_in1, *main_pipe_mul_in0;
+  HloInstruction *reduce_in0, *epsilon;
+  HloInstruction *broadcast0, *broadcast1;
+
+  bool scaleFound = false;
+  bool shiftFound = false;
+
+  auto spine = m::Add().WithBinaryOperandsAnyOrder(
+      m::Broadcast(),
+      m::Multiply()
+          .WithBinaryOperandsAnyOrder(
+              m::Op(&hinge).WithOneUser(),
+              m::Subtract(
+                  OptionalConvert(m::Op(&prod_s)),
+                  m::Broadcast(
+                      m::Reshape(
+                          m::Broadcast(m::Reshape(m::Op(&div_red).WithOpcode(
+                                                      HloOpcode::kDivide))
+                                           .WithOneUser())
+                              .WithOneUser())
+                          .WithOneUser())
+                      .WithOneUser())
+                  .WithOneUser())
+          .WithOneUser());
+
+  if (!Match(instr, spine)) return false;
+
+  const Shape& prod_shape = prod_s->shape();
+  if (!IsSupportedType(prod_shape.element_type())) return false;
+
+  HloInstruction* shift = FindLayerNormShift(instr);
+  shiftFound = (shift != nullptr);
+
+  HloInstruction* scale_gamma = FindLayerNormScale(hinge);
+  scaleFound = (scale_gamma != nullptr);
+
+  // Currently patterns without scale and shift are not supported.
+  // OneDNN only supports 2 <= rank <= 5
+  if (!(prod_shape.rank() >= 2 && prod_shape.rank() <= 5) || !shiftFound ||
+      !scaleFound) {
+    return false;
+  }
+
+  // NOLINTBEGIN
+  auto main_pipeline = m::Multiply().WithBinaryOperandsAnyOrder(
+      m::Op(),
+      m::Broadcast(
+          m::Reshape(
+              m::Broadcast(
+                  m::Rsqrt(
+                      m::Add()
+                          .WithBinaryOperandsAnyOrder(
+                              m::Broadcast(m::ConstantScalar(&epsilon)),
+                              m::Reshape(
+                                  m::Maximum()
+                                      .WithBinaryOperandsAnyOrder(
+                                          m::Broadcast(),
+                                          m::Subtract(
+                                              m::Op(&div0).WithOpcode(
+                                                  HloOpcode::kDivide),
+                                              m::Multiply()
+                                                  .WithBinaryOperandsAnyOrder(
+                                                      m::Op(&main_pipe_mul_in0),
+                                                      m::Op(&div1).WithOpcode(
+                                                          HloOpcode::kDivide))
+                                                  .WithOneUser())
+                                              .WithOneUser())
+                                      .WithOneUser())
+                                  .WithOneUser())
+                          .WithOneUser())
+                      .WithOneUser())
+                  .WithOneUser())
+              .WithOneUser())
+          .WithOneUser());
+  // NOLINTEND
+
+  if (!Match(hinge, main_pipeline)) return false;
+
+  if ((div_red != div1) || (main_pipe_mul_in0 != div1)) return false;
+
+  auto div_red_mul_src =
+      m::Divide()
+          .WithOperand(0, m::Reduce(m::Multiply().WithBinaryOperandsAnyOrder(
+                                        OptionalConvert(m::Op(&mul_in0)),
+                                        OptionalConvert(m::Op(&mul_in1))),
+                                    m::Constant())
+                              .WithPredicate([](const HloInstruction* reduce) {
+                                HloComputation* reducer = reduce->to_apply();
+                                return (reducer->root_instruction()->opcode() ==
+                                            HloOpcode::kAdd &&
+                                        reduce->dimensions().size() == 1 &&
+                                        reduce->dimensions()[0] ==
+                                            reduce->shape().rank());
+                              }))
+          .WithOperand(1, m::Op(&broadcast0).WithOpcode(HloOpcode::kBroadcast))
+          .WithOneUser();
+
+  if (!Match(div0, div_red_mul_src)) return false;
+
+  if (mul_in0 != mul_in1) return false;
+
+  auto div_red_subgraph =
+      m::Divide()
+          .WithOperand(
+              0,
+              m::Reduce(OptionalConvert(m::Op(&reduce_in0)), m::Constant())
+                  .WithPredicate([](const HloInstruction* reduce) {
+                    HloComputation* reducer = reduce->to_apply();
+                    return (reducer->root_instruction()->opcode() ==
+                                HloOpcode::kAdd &&
+                            reduce->dimensions().size() == 1 &&
+                            reduce->dimensions()[0] == reduce->shape().rank());
+                  }))
+          .WithOperand(1, m::Op(&broadcast1).WithOpcode(HloOpcode::kBroadcast));
+
+  if (!Match(div1, div_red_subgraph)) return false;
+
+  if (broadcast1 != broadcast0 || reduce_in0 != mul_in0 || mul_in0 != prod_s) {
+    return false;
+  }
+
+  *is_producer_bf16orfp16 =
+      (prod_s->shape().element_type() == PrimitiveType::F16) ||
+      (prod_s->shape().element_type() == PrimitiveType::BF16);
+  if (instr->user_count() == 1 &&
+      instr->users().at(0)->opcode() == HloOpcode::kConvert) {
+    *convert_instr = instr->users().at(0);
+    *is_bf16orfp16_convert =
+        ((*convert_instr)->shape().element_type() == PrimitiveType::F16 ||
+         (*convert_instr)->shape().element_type() == PrimitiveType::BF16);
+  }
+
+  *src = prod_s;
+  *scale = scale_gamma;
+  *bias = shift;
+  // get epsilon
+  *eps = static_cast<float>(epsilon->literal().GetAsDouble({}).value());
+
+  return true;
+}
+
 }  // namespace
 
 class OneDnnOpsRewriterVisitor : public DfsHloRewriteVisitor {
  public:
   Status HandleAdd(HloInstruction* instr) override {
-    HloInstruction *slicemu1, *slicemu2;
-    HloInstruction *slicesource1, *slicesource2;
-    HloInstruction *musquare1, *musquare2;
-    HloInstruction *prod_c, *prod_l, *prod_s, *prod_r;
-    HloInstruction *slicevar, *hinge;
-
-    bool scaleFound = false;
-    bool shiftFound = false;
-
-    auto spine = m::Add().WithBinaryOperandsAnyOrder(
-        m::Broadcast(),
-        m::Multiply()
-            .WithBinaryOperandsAnyOrder(
-                m::Op(&hinge).WithOneUser(),
-                m::Subtract(
-                    m::Op(&prod_s),
-                    m::Broadcast(
-                        m::Reshape(
-                            m::Broadcast(
-                                m::Reshape(
-                                    m::Op(&slicemu1)
-                                        .WithOpcode(HloOpcode::kSlice)
-                                        .WithOperand(
-                                            0, m::Op(&slicesource1)
-                                                   .WithOpcode(
-                                                       HloOpcode::kDivide)))
-                                    .WithOneUser())
-                                .WithOneUser())
-                            .WithOneUser())
-                        .WithOneUser())
-                    .WithOneUser())
-            .WithOneUser());
-
-    if (!Match(instr, spine)) {
-      return OkStatus();
+    HloInstruction *src, *scale, *bias;
+    float eps;
+    bool is_bf16orfp16_convert = false;
+    bool is_producer_bf16orfp16 = false;
+    HloInstruction* convert_instr;
+
+    bool found_ln =
+        MatchTFKerasLayerNorm(instr, &src, &scale, &bias, &eps).value_or(false);
+
+    if (!found_ln) {
+      found_ln = MatchFlaxLayerNorm(instr, &src, &scale, &bias, &eps,
+                                    &is_bf16orfp16_convert,
+                                    &is_producer_bf16orfp16, &convert_instr);
     }
 
-    const Shape& prod_shape = prod_s->shape();
-    if (!IsSupportedType(prod_shape.element_type())) return OkStatus();
+    if (!found_ln) return OkStatus();
 
-    HloInstruction* shift = FindLayerNormShift(instr);
-    shiftFound = (shift != nullptr);
+    const Shape& src_shape = src->shape();
+    auto scale_type = scale->shape().element_type();
+    auto bias_type = bias->shape().element_type();
+    HloInstruction* scale_operand = scale;
+    HloInstruction* bias_operand = bias;
 
-    HloInstruction* scale = FindLayerNormScale(hinge);
-    scaleFound = (scale != nullptr);
-
-    // Currently patterns without scale and shift are
-    // not supported.
-    // OneDNN only supports 2 <= rank <= 5
-    if (!(prod_shape.rank() >= 2 && prod_shape.rank() <= 5) || !shiftFound ||
-        !scaleFound) {
-      return OkStatus();
+    // oneDNN requires scale and shift float32
+    if ((scale_type == PrimitiveType::BF16) ||
+        (scale_type == PrimitiveType::F16)) {
+      scale_operand = instr->AddInstruction(HloInstruction::CreateConvert(
+          ShapeUtil::ChangeElementType(scale->shape(), PrimitiveType::F32),
+          scale));
     }
 
-    // NOLINTBEGIN
-    auto main_pipeline = m::Multiply().WithBinaryOperandsAnyOrder(
-        m::Op(),
-        m::Broadcast(
-            m::Reshape(
-                m::Broadcast(
-                    m::Rsqrt(
-                        m::Add()
-                            .WithBinaryOperandsAnyOrder(
-                                m::Broadcast(m::Constant()),
-                                m::Reshape(
-                                    m::Maximum()
-                                        .WithBinaryOperandsAnyOrder(
-                                            m::Broadcast(),
-                                            m::Subtract(
-                                                m::Reshape(
-                                                    m::Op(&slicevar)
-                                                        .WithOpcode(
-                                                            HloOpcode::kSlice)
-                                                        .WithOperand(
-                                                            0,
-                                                            m::Op(&slicesource2)
-                                                                .WithOpcode(
-                                                                    HloOpcode::
-                                                                        kDivide)))
-                                                    .WithOneUser(),
-                                                m::Multiply(
-                                                    m::Op(&musquare1),
-                                                    m::Op(&musquare2)
-                                                        .WithOperand(
-                                                            0,
-                                                            m::Op(&slicemu2)
-                                                                .WithOpcode(
-                                                                    HloOpcode::
-                                                                        kSlice)))
-                                                    .WithOneUser())
-                                                .WithOneUser())
-                                        .WithOneUser())
-                                    .WithOneUser())
-                            .WithOneUser())
-                        .WithOneUser())
-                    .WithOneUser())
-                .WithOneUser())
-            .WithOneUser());
-    // NOLINTEND
-
-    if (!Match(hinge, main_pipeline) || slicemu1 != slicemu2 ||
-        musquare1 != musquare2 || slicesource1 != slicesource2) {
-      return OkStatus();
+    if ((bias_type == PrimitiveType::BF16) ||
+        (bias_type == PrimitiveType::F16)) {
+      bias_operand = instr->AddInstruction(HloInstruction::CreateConvert(
+          ShapeUtil::ChangeElementType(bias->shape(), PrimitiveType::F32),
+          bias));
     }
 
-    // Check if the slices are compatible
-    if (!(absl::c_all_of(slicemu1->slice_starts(),
-                         [](int64_t i) { return i == 0; }) &&
-          absl::c_equal(slicemu1->slice_limits(),
-                        slicemu1->shape().dimensions())) &&
-        !(absl::c_all_of(slicevar->slice_starts(),
-                         [](int64_t i) { return i == 0; }) &&
-          absl::c_equal(slicevar->slice_limits(),
-                        slicevar->shape().dimensions()))) {
-      return OkStatus();
-    }
-
-    auto empirical_expectations = m::Divide(
-        m::Reduce(m::Concatenate()
-                      .WithBinaryOperandsAnyOrder(
-                          m::Reshape(m::Multiply(m::Op(&prod_l), m::Op(&prod_c))
-                                         .WithOneUser())
-                              .WithOneUser(),
-                          m::Reshape(m::Op(&prod_r)).WithOneUser())
-                      .WithPredicate([](const HloInstruction* comb) {
-                        return (comb->dimensions().size() == 1 &&
-                                comb->dimensions()[0] == 0 &&
-                                comb->shape().dimensions(0) == 2);
-                      })
-                      .WithOneUser(),
-                  m::Constant())
-            .WithPredicate([](const HloInstruction* reduce) {
-              HloComputation* reducer = reduce->to_apply();
-              return (reducer->root_instruction()->opcode() ==
-                          HloOpcode::kAdd &&
-                      reduce->dimensions().size() == 1 &&
-                      reduce->dimensions()[0] == reduce->shape().rank());
-            })
-            .WithOneUser(),
-        m::Broadcast(m::ConstantScalar().WithPredicate(
-            [orig = prod_s](const HloInstruction* divisor) {
-              std::optional<double> actual =
-                  static_cast<const HloConstantInstruction*>(divisor)
-                      ->literal()
-                      .GetAsDouble({});
-              return (actual.has_value() &&
-                      orig->shape().dimensions(orig->shape().rank() - 1) ==
-                          *actual);
-            })));
-
-    HloInstruction *src1, *src2;
-    if (Match(slicesource2, empirical_expectations) &&
-        // Float32 pattern check
-        ((prod_l == prod_c && prod_c == prod_r && prod_l == prod_s) ||
-         // Bfloat16 pattern check
-         (prod_l == prod_c && prod_c == prod_r &&
-          Match(prod_l, ConvertPattern(&src1)) &&
-          Match(prod_s, ConvertPattern(&src2)) && src1 == src2))) {
-      HloInstruction* ln_call =
-          instr->AddInstruction(HloInstruction::CreateCustomCall(
-              prod_shape, {prod_r, scale, shift}, "__onednn$layernorm"));
-      BackendConfig backend_config;
-      OneDnnLayerNormConfig* ln_config =
-          backend_config.mutable_onednn_layer_norm_config();
-      ln_config->set_fused_ops(OneDnnLayerNormConfig::SCALE_AND_SHIFT);
-      TF_RETURN_IF_ERROR(ln_call->set_backend_config(backend_config));
+    HloInstruction* ln_call =
+        instr->AddInstruction(HloInstruction::CreateCustomCall(
+            src_shape, {src, scale_operand, bias_operand},
+            "__onednn$layernorm"));
+    BackendConfig backend_config;
+    OneDnnLayerNormConfig* ln_config =
+        backend_config.mutable_onednn_layer_norm_config();
+    ln_config->set_fused_ops(OneDnnLayerNormConfig::SCALE_AND_SHIFT);
+    ln_config->set_epsilon_typecast(*(reinterpret_cast<int32_t*>(&eps)));
+    TF_RETURN_IF_ERROR(ln_call->set_backend_config(backend_config));
+
+    if (convert_instr != nullptr && is_bf16orfp16_convert &&
+        is_producer_bf16orfp16) {
+      TF_RETURN_IF_ERROR(ReplaceInstruction(convert_instr, ln_call));
+    } else {
       TF_RETURN_IF_ERROR(ReplaceInstruction(instr, ln_call));
     }
 
@@ -327,16 +449,21 @@ class OneDnnOpsRewriterVisitor : public DfsHloRewriteVisitor {
   }
 
   Status HandleConvert(HloInstruction* instr) override {
-    HloInstruction* ln_instr;
-    auto pattern = m::Convert(m::Op(&ln_instr)
-                                  .WithOneUser()
-                                  .WithOpcode(HloOpcode::kCustomCall)
-                                  .WithCustomCallTarget({"__onednn$layernorm"})
-                                  .WithElementType(PrimitiveType::F32))
-                       .WithElementType(PrimitiveType::BF16);
+    HloInstruction* custom_call;
+    HloInstruction* convert_instr;
+    auto pattern =
+        m::Op(&convert_instr)
+            .WithOpcode(HloOpcode::kConvert)
+            .WithOperand(0, OneDnnConvertibleInstr(&custom_call)
+                                .WithOneUser()
+                                .WithElementType(PrimitiveType::F32));
 
     if (!IsSupportedType(instr->shape().element_type())) return OkStatus();
     if (Match(instr, pattern)) {
+      bool is_bf16orfp16_convert =
+          (convert_instr->shape().element_type() == PrimitiveType::BF16) ||
+          (convert_instr->shape().element_type() == PrimitiveType::F16);
+      if (!is_bf16orfp16_convert) return OkStatus();
       HloInstruction* producer = instr->mutable_operand(0)->mutable_operand(0);
       HloInstruction* newinp =
           producer->AddInstruction(HloInstruction::CreateConvert(
@@ -344,11 +471,11 @@ class OneDnnOpsRewriterVisitor : public DfsHloRewriteVisitor {
                                            instr->shape().element_type()),
               producer));
       absl::InlinedVector<HloInstruction*, 2> newoperands =
-          ln_instr->mutable_operands();
+          custom_call->mutable_operands();
       newoperands.at(0) = newinp;
-      HloInstruction* ln_call = instr->AddInstruction(
-          ln_instr->CloneWithNewOperands(instr->shape(), newoperands));
-      TF_RETURN_IF_ERROR(ReplaceInstruction(instr, ln_call));
+      HloInstruction* updated_call = instr->AddInstruction(
+          custom_call->CloneWithNewOperands(instr->shape(), newoperands));
+      TF_RETURN_IF_ERROR(ReplaceInstruction(instr, updated_call));
     }
 
     return OkStatus();
diff --git a/third_party/xla/xla/service/cpu/onednn_softmax.cc b/third_party/xla/xla/service/cpu/onednn_softmax.cc
index 18efb700eb3fde..5af6de54078596 100644
--- a/third_party/xla/xla/service/cpu/onednn_softmax.cc
+++ b/third_party/xla/xla/service/cpu/onednn_softmax.cc
@@ -36,7 +36,7 @@ limitations under the License.
 #include "xla/service/cpu/backend_config.pb.h"
 #include "xla/service/cpu/onednn_memory_util.h"
 #include "xla/service/cpu/runtime_lightweight_check.h"
-#include "tsl/util/onednn_threadpool.h"
+#include "xla/tsl/util/onednn_threadpool.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
 namespace xla {
diff --git a/third_party/xla/xla/service/cpu/tests/BUILD b/third_party/xla/xla/service/cpu/tests/BUILD
index c0044b17d31f58..32d785885e0ab5 100644
--- a/third_party/xla/xla/service/cpu/tests/BUILD
+++ b/third_party/xla/xla/service/cpu/tests/BUILD
@@ -1,9 +1,9 @@
 # Description:
 #    Tests for LLVM-based CPU backend for XLA.
 
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl:tsl.bzl", "tsl_copts")
 load("@local_tsl//tsl:tsl.default.bzl", "filegroup")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/service/dot_merger.cc b/third_party/xla/xla/service/dot_merger.cc
index fb4f08731bd10c..71dbd9e73d3b5f 100644
--- a/third_party/xla/xla/service/dot_merger.cc
+++ b/third_party/xla/xla/service/dot_merger.cc
@@ -392,9 +392,10 @@ absl::StatusOr<bool> MergeDots(HloComputation* comp,
         int32_t b_id = graph_id(b);
 
         if (dead_instrs.contains(a) || dead_instrs.contains(b) ||
+            (!is_merge_candidate(a) && !is_merge_candidate(b)) ||
+            // Perform reachability checks last since they can be expensive.
             graph.IsReachableNonConst(a_id, b_id) ||
-            graph.IsReachableNonConst(b_id, a_id) ||
-            (!is_merge_candidate(a) && !is_merge_candidate(b))) {
+            graph.IsReachableNonConst(b_id, a_id)) {
           continue;
         }
 
diff --git a/third_party/xla/xla/service/elemental_ir_emitter.cc b/third_party/xla/xla/service/elemental_ir_emitter.cc
index a3f434d412576a..4dab68b3b40cea 100644
--- a/third_party/xla/xla/service/elemental_ir_emitter.cc
+++ b/third_party/xla/xla/service/elemental_ir_emitter.cc
@@ -1019,16 +1019,21 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
     }
     case HloOpcode::kExpm1: {
       // e^(a+bi)-1 = (e^a*cos(b)-1)+e^a*sin(b)i
-      TF_ASSIGN_OR_RETURN(
-          auto exp_a,
-          EmitExp(component_type, EmitExtractReal(operand_value), ""));
-      TF_ASSIGN_OR_RETURN(
-          auto cos_b, EmitCos(component_type, EmitExtractImag(operand_value)));
-      TF_ASSIGN_OR_RETURN(
-          auto sin_b, EmitSin(component_type, EmitExtractImag(operand_value)));
-      auto one = llvm::ConstantFP::get(exp_a->getType(), 1.0);
-      auto real_result = FSub(FMul(exp_a, cos_b), one);
-      auto imag_result = FMul(exp_a, sin_b);
+      //            [handle inaccuracies when a and/or b are small]
+      //            = ((e^a - 1) * cos(b) + cos(b) - 1) + e^a*sin(b)i
+      //            = (expm1(a) * cos(b) + cosm1(b)) + e^a*sin(b)i
+      auto a = EmitExtractReal(operand_value);
+      auto b = EmitExtractImag(operand_value);
+      auto zero = llvm::ConstantFP::get(b->getType(), 0.0);
+      auto one = llvm::ConstantFP::get(b->getType(), 1.0);
+      auto b_is_zero = FCmpOEQ(b, zero);
+      TF_ASSIGN_OR_RETURN(auto expm1_a, EmitExpm1(component_type, a));
+      auto exp_a = FAdd(expm1_a, one);
+      TF_ASSIGN_OR_RETURN(auto sin_b, EmitSin(component_type, b));
+      TF_ASSIGN_OR_RETURN(auto cos_b_minus_one, EmitCosm1(component_type, b));
+      auto cos_b = FAdd(cos_b_minus_one, one);
+      auto real_result = FAdd(FMul(expm1_a, cos_b), cos_b_minus_one);
+      auto imag_result = Select(b_is_zero, zero, FMul(exp_a, sin_b));
       return EmitComposeComplex(op, real_result, imag_result);
     }
     case HloOpcode::kCos:
@@ -2018,6 +2023,45 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitCos(
                                       {value->getType()}, b_);
 }
 
+absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitCosm1(
+    PrimitiveType prim_type, llvm::Value* value) {
+  auto x = value;
+  auto type = llvm_ir::PrimitiveTypeToIrType(prim_type, module_);
+  auto negative_half = llvm::ConstantFP::get(type, -0.5);
+  auto negative_one = llvm::ConstantFP::get(type, -1.0);
+
+  // Algorithm copied from cephes cosm1:
+  //   cosm1(x) = -0.5 * x^2 + x^4 * P(x^2);
+  // that is suitable when abs(x) < pi/4, otherwise we'll use cos(x)-1.
+  //
+  // This is an alternative algorithm
+  //   cosm1(x) = -2 * sin(x/2)^2
+  // that is only slightly less accurate around abs(x) == 0.1 but
+  // otherwise equivalent accuracy-wise compared to cephes cosm1.
+  // However, we are not using it because it is notably less
+  // performant than cephes cosm1.
+
+  // TODO: define cosm1(x) as cosm1(x mod (2*pi)) to increase accuracy
+  // for large x values that are close to 2*pi*n where n is some integer.
+  static const std::array<double, 7> kCoeffs{
+      4.7377507964246204691685E-14, -1.1470284843425359765671E-11,
+      2.0876754287081521758361E-9,  -2.7557319214999787979814E-7,
+      2.4801587301570552304991E-5,  -1.3888888888888872993737E-3,
+      4.1666666666666666609054E-2,
+  };
+  TF_ASSIGN_OR_RETURN(auto cos_x, EmitCos(prim_type, x));
+  auto for_large_x = FAdd(cos_x, negative_one);
+
+  auto xx = FMul(x, x);
+  auto xxxx = FMul(xx, xx);
+  TF_ASSIGN_OR_RETURN(auto poly, EvaluatePolynomial(type, xx, kCoeffs));
+  auto for_small_x = FAdd(FMul(xxxx, poly), FMul(negative_half, xx));
+
+  // (pi/4)^2 is approximately 0.61685
+  return Select(FCmpOGT(xx, llvm::ConstantFP::get(type, 0.61685)), for_large_x,
+                for_small_x);
+}
+
 absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitExp(
     PrimitiveType prim_type, llvm::Value* value, absl::string_view name) {
   return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::exp, {value},
@@ -2279,7 +2323,7 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitPredBinaryOp(
     case HloOpcode::kShiftRightArithmetic:
     case HloOpcode::kShiftRightLogical:
       return Internal("Invalid binary op '%s' for pred",
-                           HloOpcodeString(op->opcode()));
+                      HloOpcodeString(op->opcode()));
 
     default:
       return Unimplemented("binary pred op '%s'",
@@ -3098,12 +3142,12 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
         return EmitReducePrecision(hlo, operand_value);
       };
     case HloOpcode::kConcatenate:
-      return [this, hlo,
-              &operand_to_generator](const IrArray::Index target_index)
-                 -> absl::StatusOr<llvm::Value*> {
-        return EmitElementalConcatenate(hlo, operand_to_generator,
-                                        target_index);
-      };
+      return
+          [this, hlo, &operand_to_generator](const IrArray::Index target_index)
+              -> absl::StatusOr<llvm::Value*> {
+            return EmitElementalConcatenate(hlo, operand_to_generator,
+                                            target_index);
+          };
     case HloOpcode::kReverse:
       return [this, hlo,
               &operand_to_generator](const IrArray::Index& target_index)
@@ -3120,16 +3164,16 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
         return operand_to_generator.at(operand)(source_index);
       };
     case HloOpcode::kBroadcast:
-      return [this, hlo,
-              &operand_to_generator](const IrArray::Index& target_index)
-                 -> absl::StatusOr<llvm::Value*> {
-        const HloInstruction* operand = hlo->operand(0);
-        // The `dimensions` member of the broadcast instruction maps from
-        // input dimensions to output dimensions.
-        return operand_to_generator.at(operand)(
-            target_index.SourceIndexOfBroadcast(hlo->shape(), operand->shape(),
-                                                hlo->dimensions(), b_));
-      };
+      return
+          [this, hlo, &operand_to_generator](const IrArray::Index& target_index)
+              -> absl::StatusOr<llvm::Value*> {
+            const HloInstruction* operand = hlo->operand(0);
+            // The `dimensions` member of the broadcast instruction maps from
+            // input dimensions to output dimensions.
+            return operand_to_generator.at(operand)(
+                target_index.SourceIndexOfBroadcast(
+                    hlo->shape(), operand->shape(), hlo->dimensions(), b_));
+          };
     case HloOpcode::kIota:
       return [this, hlo](const IrArray::Index& target_index)
                  -> absl::StatusOr<llvm::Value*> {
diff --git a/third_party/xla/xla/service/elemental_ir_emitter.h b/third_party/xla/xla/service/elemental_ir_emitter.h
index 6d36065b70e22e..0544304103b17c 100644
--- a/third_party/xla/xla/service/elemental_ir_emitter.h
+++ b/third_party/xla/xla/service/elemental_ir_emitter.h
@@ -157,6 +157,9 @@ class ElementalIrEmitter : public IrBuilderMixin<ElementalIrEmitter> {
   virtual absl::StatusOr<llvm::Value*> EmitCos(PrimitiveType prim_type,
                                                llvm::Value* value);
 
+  virtual absl::StatusOr<llvm::Value*> EmitCosm1(PrimitiveType prim_type,
+                                                 llvm::Value* value);
+
   virtual absl::StatusOr<llvm::Value*> EmitTan(PrimitiveType prim_type,
                                                llvm::Value* value);
 
diff --git a/third_party/xla/xla/service/gather_simplifier.cc b/third_party/xla/xla/service/gather_simplifier.cc
index 051f346bd7877e..354d26b4026a68 100644
--- a/third_party/xla/xla/service/gather_simplifier.cc
+++ b/third_party/xla/xla/service/gather_simplifier.cc
@@ -125,19 +125,19 @@ absl::StatusOr<HloInstruction*> GatherSimplifier::ExpandInstruction(
   return MaybeTranspose(result, output_perm);
 }
 
-bool GatherSimplifier::InstructionMatchesPattern(HloInstruction* inst) {
-  auto* gather = DynCast<HloGatherInstruction>(inst);
-  if (!gather) {
-    return false;
-  }
-
+bool GatherSimplifier::IsSimplifiedGather(const HloGatherInstruction* gather) {
   auto* start_indices = gather->operands()[1];
   const auto& dims = gather->gather_dimension_numbers();
-  return start_indices->shape().rank() != 2 || dims.index_vector_dim() != 1 ||
-         !IsIdentityPermutation(dims.start_index_map()) ||
-         !dims.collapsed_slice_dims().empty() ||
-         *dims.offset_dims().begin() != 1 ||
-         *dims.offset_dims().rbegin() != dims.offset_dims().size();
+  return start_indices->shape().rank() == 2 && dims.index_vector_dim() == 1 &&
+         IsIdentityPermutation(dims.start_index_map()) &&
+         dims.collapsed_slice_dims().empty() &&
+         *dims.offset_dims().begin() == 1 &&
+         *dims.offset_dims().rbegin() == dims.offset_dims().size();
+}
+
+bool GatherSimplifier::InstructionMatchesPattern(HloInstruction* inst) {
+  auto* gather = DynCast<HloGatherInstruction>(inst);
+  return gather && !IsSimplifiedGather(gather);
 }
 
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gather_simplifier.h b/third_party/xla/xla/service/gather_simplifier.h
index 6d4d70170f852c..6d2b37502cd4c0 100644
--- a/third_party/xla/xla/service/gather_simplifier.h
+++ b/third_party/xla/xla/service/gather_simplifier.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef XLA_SERVICE_GATHER_SIMPLIFIER_H_
 #define XLA_SERVICE_GATHER_SIMPLIFIER_H_
 
+#include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/op_expander_pass.h"
 
 namespace xla {
@@ -36,6 +37,8 @@ class GatherSimplifier : public OpExpanderPass {
  public:
   absl::string_view name() const override { return "gather_simplifier"; }
 
+  static bool IsSimplifiedGather(const HloGatherInstruction* gather);
+
  protected:
   bool InstructionMatchesPattern(HloInstruction* inst) override;
 
diff --git a/third_party/xla/xla/service/generic_transfer_manager.h b/third_party/xla/xla/service/generic_transfer_manager.h
index bc376a6932c017..c80d89187073ee 100644
--- a/third_party/xla/xla/service/generic_transfer_manager.h
+++ b/third_party/xla/xla/service/generic_transfer_manager.h
@@ -83,12 +83,6 @@ class GenericTransferManager : public TransferManager {
   Shape HostShapeToDeviceShape(const Shape& host_shape) const override;
 
  private:
-  // Returns whether subbyte types (types less than 1 byte, e.g. U4) should
-  // have multiple values packed into a single byte on the device. Subbyte
-  // bytes are never packed on the host. By default, returns false, so a byte
-  // can only hold one value, but subclasses can override this.
-  virtual bool PackSubbyteTypes() const { return false; }
-
   // Transfer a memory block of the given size from the device source into the
   // 'destination' buffer.
   //
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 58315facc03a8d..1c05556dfa2198 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -1,20 +1,8 @@
 # Description:
 #   GPU-specific components in XLA service implementation.
 
-load("//xla/tests:build_defs.bzl", "xla_test")
 load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library")
-load("//xla:xla.bzl", "xla_cc_test", "xla_cub_deps", "xla_export_hlo_deps")
-load(
-    "//xla/service/gpu:build_defs.bzl",
-    "build_cub_sort_kernels",
-    "get_cub_sort_kernel_types",
-    "gpu_kernel_library",
-)
-load(
-    "//xla/stream_executor:build_defs.bzl",
-    "if_gpu_is_configured",
-)
 load(
     "@local_config_rocm//rocm:build_defs.bzl",
     "if_rocm_hipblaslt",
@@ -44,6 +32,18 @@ load(
     "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
     "if_cuda_is_configured",
 )
+load("//xla:xla.bzl", "xla_cc_test", "xla_cub_deps", "xla_export_hlo_deps")
+load(
+    "//xla/service/gpu:build_defs.bzl",
+    "build_cub_sort_kernels",
+    "get_cub_sort_kernel_types",
+    "gpu_kernel_library",
+)
+load(
+    "//xla/stream_executor:build_defs.bzl",
+    "if_gpu_is_configured",
+)
+load("//xla/tests:build_defs.bzl", "xla_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -185,6 +185,7 @@ xla_cc_test(
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest",
         "@local_tsl//tsl/lib/core:status_test_util",
         "@local_tsl//tsl/platform:statusor",
@@ -296,10 +297,11 @@ cc_library(
         ":gpu_executable",
         ":ir_emission_utils",
         ":kernel_reuse_cache",
-        ":nccl_collective_thunks",
         "//xla/hlo/ir:hlo",
         "//xla/service:buffer_assignment",
         "//xla/service:name_uniquer",
+        "//xla/service/gpu/model:indexing_map",
+        "//xla/service/gpu/runtime:nccl_collective_thunk",
         "//xla/stream_executor:device_description",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -337,9 +339,7 @@ cc_library(
         ":launch_dimensions",
         ":matmul_utils",
         ":nccl_api",
-        ":nccl_collective_thunks",
         ":parallel_loop_emitter",
-        ":thunk",
         ":triton_call",
         "//xla:autotuning_proto_cc",
         "//xla:literal",
@@ -381,6 +381,7 @@ cc_library(
         "//xla/service/gpu/runtime:nccl_all_to_all_thunk",
         "//xla/service/gpu/runtime:nccl_collective_broadcast_thunk",
         "//xla/service/gpu/runtime:nccl_collective_permute_thunk",
+        "//xla/service/gpu/runtime:nccl_collective_thunk",
         "//xla/service/gpu/runtime:nccl_recv_thunk",
         "//xla/service/gpu/runtime:nccl_send_thunk",
         "//xla/service/gpu/runtime:norm_thunk",
@@ -388,6 +389,7 @@ cc_library(
         "//xla/service/gpu/runtime:replica_id_thunk",
         "//xla/service/gpu/runtime:send_recv_thunk",
         "//xla/service/gpu/runtime:sequential_thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/service/gpu/runtime:wait_for_streams_thunk",
         "//xla/service/gpu/runtime:while_thunk",
         "//xla/service/llvm_ir:buffer_assignment_util",
@@ -527,7 +529,12 @@ cc_library(
         "//xla/service:algorithm_util",
         "//xla/service:dump",
         "//xla/service:hlo_module_config",
+        "//xla/service/gpu/fusions/mlir:elemental_hlo_to_mlir",
         "//xla/service/gpu/llvm_gpu_backend",
+        "//xla/service/gpu/model:affine_map_printer",
+        "//xla/service/gpu/model:indexing_analysis",
+        "//xla/service/gpu/model:indexing_map",
+        "//xla/service/gpu/model:symbolic_tile_analysis",
         "//xla/service/llvm_ir:llvm_util",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:launch_dim",
@@ -547,6 +554,8 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//llvm:TargetParser",
         "@llvm-project//llvm:ir_headers",
+        "@llvm-project//mlir:AffineDialect",
+        "@llvm-project//mlir:AffineToStandard",
         "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:ArithToLLVM",
         "@llvm-project//mlir:BuiltinToLLVMIRTranslation",
@@ -590,7 +599,6 @@ xla_test(
     srcs = if_cuda_is_configured(["ir_emitter_triton_test.cc"]),
     backends = [
         "gpu_a100",
-        "gpu_v100",
     ],
     shard_count = 20,
     tags = [
@@ -613,6 +621,7 @@ xla_test(
         "//xla/hlo/ir:hlo",
         "//xla/service:pattern_matcher",
         "//xla/service:pattern_matcher_gmock",
+        "//xla/service/gpu/model:indexing_test_utils",
         "//xla/service/gpu/tests:gpu_codegen_test",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor/cuda:cublas_plugin",
@@ -622,12 +631,12 @@ xla_test(
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest",
         "@llvm-project//llvm:Support",
         "@llvm-project//llvm:ir_headers",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
-        "@llvm-project//mlir:Transforms",
         "@local_tsl//tsl/lib/core:status_test_util",
         "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:errors",
@@ -643,7 +652,7 @@ xla_test(
     name = "ir_emitter_triton_large_test",
     srcs = if_cuda_is_configured(["ir_emitter_triton_large_test.cc"]),
     backend_tags = {"gpu": [
-        "requires-gpu-sm70",
+        "requires-gpu-sm80",
     ]},
     backends = ["gpu"],
     tags = [
@@ -668,7 +677,6 @@ xla_test(
     srcs = if_cuda_is_configured(["ir_emitter_triton_parametrized_test.cc"]),
     backends = [
         "gpu_a100",
-        "gpu_v100",
     ],
     shard_count = 10,
     tags = ["nomac"],
@@ -748,7 +756,7 @@ cc_library(
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/util/proto:proto_utils",
+        "//xla/tsl/util/proto:proto_utils",
     ]),
 )
 
@@ -756,7 +764,7 @@ xla_test(
     name = "gemm_fusion_autotuner_test",
     srcs = if_cuda_is_configured(["gemm_fusion_autotuner_test.cc"]),
     backend_tags = {"gpu": [
-        "requires-gpu-sm70",
+        "requires-gpu-sm80",
     ]},
     backends = [
         "gpu",
@@ -769,6 +777,7 @@ xla_test(
         ":backend_configs_cc",
         ":gemm_fusion",
         ":gemm_fusion_autotuner",
+        ":ir_emission_utils",
         ":matmul_utils",
         "//xla:autotuning_proto_cc",
         "//xla:error_spec",
@@ -858,109 +867,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "thunk",
-    srcs = ["thunk.cc"],
-    hdrs = ["thunk.h"],
-    deps = [
-        ":backend_configs_cc",
-        ":buffer_allocations",
-        ":gpu_executable_run_options",
-        ":nccl_api",
-        ":nccl_clique",
-        ":nccl_clique_key",
-        "//xla:executable_run_options",
-        "//xla/hlo/ir:hlo",
-        "//xla/service:buffer_assignment",
-        "//xla/service:executable",
-        "//xla/service:global_device_id",
-        "//xla/stream_executor",
-        "//xla/translate/mhlo_to_hlo:location_exporter",
-        "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/container:inlined_vector",
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/strings:string_view",
-        "@com_google_absl//absl/types:span",
-        "@llvm-project//mlir:IR",
-        "@local_tsl//tsl/lib/gtl:int_type",
-        "@local_tsl//tsl/platform:statusor",
-    ],
-)
-
-cc_library(
-    name = "nccl_collective_thunks",
-    srcs = [
-        "nccl_collective_thunk.cc",
-    ],
-    hdrs = [
-        "nccl_collective_thunk.h",
-    ],
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
-        "TENSORFLOW=1",
-    ]),
-    deps = [
-        ":backend_configs_cc",
-        ":buffer_allocations",
-        ":gpu_executable_run_options",
-        ":ir_emission_utils",
-        ":nccl_api",
-        ":nccl_clique",
-        ":nccl_clique_key",
-        ":thunk",
-        "//xla:debug_options_flags",
-        "//xla:shape_util",
-        "//xla:status",
-        "//xla:status_macros",
-        "//xla:statusor",
-        "//xla:util",
-        "//xla:xla_data_proto_cc",
-        "//xla/hlo/ir:hlo",
-        "//xla/service:buffer_assignment",
-        "//xla/service:collective_ops_utils",
-        "//xla/service:computation_placer",
-        "//xla/service:global_device_id",
-        "//xla/service:hlo_parser",
-        "//xla/service:rendezvous",
-        "//xla/service/llvm_ir:llvm_util",
-        "//xla/stream_executor",
-        "//xla/stream_executor:stream_executor_headers",
-        "//xla/stream_executor/gpu:gpu_activation_header",
-        "//xla/stream_executor/gpu:gpu_driver_header",
-        "//xla/stream_executor/gpu:gpu_stream",
-        "//xla/stream_executor/gpu:gpu_types_header",
-        "//xla/translate/hlo_to_mhlo:hlo_utils",
-        "//xla/translate/mhlo_to_hlo:attribute_exporter",
-        "//xla/translate/mhlo_to_hlo:type_to_shape",
-        "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/container:flat_hash_set",
-        "@com_google_absl//absl/container:node_hash_map",
-        "@com_google_absl//absl/functional:function_ref",
-        "@com_google_absl//absl/log",
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/strings:string_view",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/time",
-        "@com_google_absl//absl/types:span",
-        "@llvm-project//mlir:IR",
-        "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:logging",
-        "@local_tsl//tsl/platform:statusor",
-    ] + if_cuda_is_configured([
-        "@local_config_nccl//:nccl",
-    ]) + if_rocm_is_configured([
-        "@local_config_rocm//rocm:rccl",
-    ]),
-)
-
 #===-------------------------------------------------------------------------------------------===//
 # NCCL integration
 #===-------------------------------------------------------------------------------------------===//
@@ -1182,12 +1088,10 @@ cc_library(
     deps = if_cuda_is_configured([
         ":gpu_executable_run_options",
         ":mock_nccl_xml_google",
-        ":nccl_collective_thunks",
         ":nccl_api",
         ":nccl_clique_key",
         ":nccl_clique",
         ":sleep_kernel",
-        ":thunk",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -1214,7 +1118,9 @@ cc_library(
         "//xla/service:global_device_id",
         "//xla/service:rendezvous",
         "//xla/service:lockable",
+        "//xla/service/gpu/runtime:nccl_collective_thunk",
         "//xla/service/gpu/runtime:nccl_p2p_thunk_common",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "//xla/stream_executor/gpu:gpu_activation",
         "//xla/stream_executor/gpu:gpu_stream",
@@ -1237,10 +1143,8 @@ cc_library(
     deps = if_gpu_is_configured([
         ":gpu_executable_run_options",
         ":nccl_api",
-        ":nccl_collective_thunks",
         ":nccl_clique_key",
         ":nccl_clique",
-        ":thunk",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:str_format",
@@ -1249,7 +1153,9 @@ cc_library(
         "//xla:status",
         "//xla:statusor",
         "//xla:util",
+        "//xla/service/gpu/runtime:nccl_collective_thunk",
         "//xla/service/gpu/runtime:nccl_p2p_thunk_common",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/service:collective_ops_utils",
         "//xla/service:global_device_id",
         "//xla/service:lockable",
@@ -1288,7 +1194,6 @@ cc_library(
         ":nccl_clique",
         ":nccl_clique_key",
         ":stream_executor_util",
-        ":thunk",
         "//xla:executable_run_options",
         "//xla:shape_tree",
         "//xla:shape_util",
@@ -1308,6 +1213,7 @@ cc_library(
         "//xla/service:stream_pool",
         "//xla/service:xla_debug_info_manager",
         "//xla/service/gpu/runtime:annotation",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:device_memory",
@@ -1402,6 +1308,7 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:MemRefDialect",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:Support",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:ml_dtypes",
@@ -1561,23 +1468,6 @@ cc_library(
     ]),
 )
 
-xla_cc_test(
-    name = "gemm_rewriter_test",
-    srcs = ["gemm_rewriter_test.cc"],
-    deps = [
-        ":gemm_rewriter",
-        "//xla:autotuning_proto_cc",
-        "//xla:xla_data_proto_cc",
-        "//xla:xla_proto_cc",
-        "//xla/stream_executor:device_description",
-        "//xla/tests:hlo_test_base",
-        "//xla/tests:xla_internal_test_main",  # fixdeps: keep
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest",
-        "@local_tsl//tsl/platform:statusor",
-    ],
-)
-
 cc_library(
     name = "triton_support",
     srcs = ["triton_support.cc"],
@@ -1586,8 +1476,49 @@ cc_library(
         ":variant_visitor",
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
+        "//xla/service:instruction_fusion",
         "//xla/stream_executor:device_description",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/log:check",
+        "@local_tsl//tsl/platform:tensor_float_32_utils",
+    ],
+)
+
+xla_test(
+    name = "triton_support_test",
+    srcs = if_cuda_is_configured(["triton_support_test.cc"]),
+    backends = [
+        "gpu_a100",
+    ],
+    shard_count = 10,
+    tags = ["nomac"],
+    deps = [
+        ":gpu_device_info_for_tests",
+        ":gpu_float_support",
+        ":ir_emission_utils",
+        ":ir_emitter_triton",
+        ":matmul_utils",
+        ":triton_fusion_analysis",
+        ":triton_support",
+        "//xla:error_spec",
+        "//xla:shape_util",
+        "//xla:xla_data_proto_cc",
+        "//xla:xla_proto_cc",
+        "//xla/hlo/ir:hlo",
+        "//xla/hlo/utils:hlo_query",
+        "//xla/service:float_normalization",
+        "//xla/service:hlo_pass_pipeline",
+        "//xla/service/gpu/tests:gpu_codegen_test",
+        "//xla/stream_executor:device_description",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest_main",
+        "@llvm-project//llvm:ir_headers",
+        "@llvm-project//mlir:IR",
+        "@local_tsl//tsl/platform:status_matchers",
+        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
@@ -1613,6 +1544,17 @@ cc_library(
     ],
 )
 
+xla_cc_test(
+    name = "triton_tiling_propagation_test",
+    srcs = ["triton_tiling_propagation_test.cc"],
+    deps = [
+        ":triton_tiling_propagation",
+        "//xla/tests:hlo_test_base",
+        "//xla/tests:xla_internal_test_main",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
     name = "triton_fusion_analysis",
     srcs = ["triton_fusion_analysis.cc"],
@@ -1636,6 +1578,7 @@ cc_library(
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
@@ -1712,8 +1655,45 @@ xla_cc_test(
         "//xla/tests:hlo_test_base",
         "//xla/tests:verified_hlo_module",
         "//xla/tests:xla_internal_test_main",  # fixdeps: keep
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest",
+        "@local_tsl//tsl/platform:status_matchers",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
+cc_library(
+    name = "gemv_rewriter",
+    srcs = ["gemv_rewriter.cc"],
+    hdrs = ["gemv_rewriter.h"],
+    deps = [
+        "//xla:shape_util",
+        "//xla:util",
+        "//xla:xla_data_proto_cc",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:hlo_pass",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_absl//absl/types:span",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
+xla_cc_test(
+    name = "gemv_rewriter_test",
+    srcs = ["gemv_rewriter_test.cc"],
+    deps = [
+        ":gemv_rewriter",
+        "//xla/hlo/ir:hlo",
+        "//xla/tests:hlo_test_base",
+        "//xla/tests:xla_internal_test_main",  # fixdeps: keep
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_googletest//:gtest_main",
         "@local_tsl//tsl/platform:statusor",
     ],
 )
@@ -1779,53 +1759,6 @@ xla_cc_test(
     ],
 )
 
-cc_library(
-    name = "fusion_merger_triton",
-    srcs = ["fusion_merger_triton.cc"],
-    hdrs = ["fusion_merger_triton.h"],
-    deps = [
-        ":backend_configs_cc",
-        ":gpu_fusible",
-        ":ir_emission_utils",
-        ":triton_fusion_analysis",
-        "//xla:status",
-        "//xla:statusor",
-        "//xla:util",
-        "//xla:xla_data_proto_cc",
-        "//xla/hlo/ir:hlo",
-        "//xla/service:hlo_pass",
-        "@com_google_absl//absl/container:flat_hash_set",
-        "@com_google_absl//absl/log",
-        "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/platform:logging",
-        "@local_tsl//tsl/platform:status",
-    ],
-)
-
-xla_test(
-    name = "fusion_merger_triton_test",
-    srcs = ["fusion_merger_triton_test.cc"],
-    backend_tags = {"gpu": [
-        "requires-gpu-sm70",
-    ]},
-    backends = [
-        "gpu",
-    ],
-    deps = [
-        ":fusion_merger_triton",
-        "//xla:autotune_results_proto_cc",
-        "//xla/service:pattern_matcher",
-        "//xla/service:pattern_matcher_gmock",
-        "//xla/tests:hlo_test_base",
-        "//xla/tests:xla_internal_test_main",  # build_cleaner: keep
-        "@com_google_absl//absl/log",
-        "@com_google_googletest//:gtest",
-        "@local_tsl//tsl/platform:status_matchers",
-    ],
-)
-
 cc_library(
     name = "softmax_rewriter_triton",
     srcs = ["softmax_rewriter_triton.cc"],
@@ -1893,6 +1826,7 @@ cc_library(
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor:device_memory_allocator",
         "//xla/stream_executor/gpu:redzone_allocator",
+        "//xla/tsl/util/proto:proto_utils",
         "//xla:util",
         "//xla:autotuning_proto_cc",
         "//xla:shape_util",
@@ -1900,7 +1834,6 @@ cc_library(
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/util/proto:proto_utils",
     ]),
 )
 
@@ -2240,7 +2173,7 @@ cc_library(
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:numbers",
         "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/util/proto:proto_utils",
+        "//xla/tsl/util/proto:proto_utils",
     ]),
 )
 
@@ -2431,6 +2364,7 @@ cc_library(
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:statusor",
@@ -2442,6 +2376,7 @@ xla_cc_test(
     srcs = ["move_copy_to_users_test.cc"],
     deps = [
         ":move_copy_to_users",
+        "//xla/service:layout_assignment",
         "//xla/tests:hlo_test_base",
         "@com_google_absl//absl/strings:string_view",
         "@local_tsl//tsl/platform:test",
@@ -2779,7 +2714,6 @@ xla_cc_test(
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest_main",
     ],
 )
 
@@ -3175,6 +3109,7 @@ cc_library(
     deps = [
         ":gemm_fusion",
         ":ir_emission_utils",
+        ":triton_support",
         "//xla:literal_util",
         "//xla:shape_util",
         "//xla:statusor",
@@ -3228,7 +3163,6 @@ cc_library(
     srcs = if_cuda_is_configured(["cudnn_fusion_compiler.cc"]),
     hdrs = if_cuda_is_configured(["cudnn_fusion_compiler.h"]),
     deps = if_cuda_is_configured([
-        ":autotuner_util",
         ":backend_configs_cc",
         ":ir_emission_utils",
         ":kernel_reuse_cache",
@@ -3239,7 +3173,6 @@ cc_library(
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:string_view",
         "@local_config_cuda//cuda:cudnn_header",
         "//xla:shape_util",
@@ -3382,7 +3315,6 @@ cc_library(
         ":ir_emitter_unnested",
         ":metrics",
         ":runtime_intrinsics",
-        ":thunk",
         "//xla:shape_util",
         "//xla:status",
         "//xla:util",
@@ -3397,6 +3329,7 @@ cc_library(
         "//xla/service:logical_buffer",
         "//xla/service/gpu/runtime:conditional_thunk",
         "//xla/service/gpu/runtime:sequential_thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/service/gpu/runtime:while_thunk",
         "//xla/stream_executor",
         "//xla/stream_executor:device_description",
@@ -3433,6 +3366,7 @@ cc_library(
         "//xla:status",
         "//xla:statusor",
         "//xla:util",
+        "//xla/ffi:ffi_api",
         "//xla/hlo/ir:hlo",
         "//xla/service:hlo_pass",
         "//xla/stream_executor:device_description",
@@ -3518,7 +3452,7 @@ cc_library(
         ":hlo_traversal",
         ":ir_emission_utils",
         "//xla:shape_util",
-        "//xla:statusor",
+        "//xla:status",
         "//xla:util",
         "//xla/ffi:ffi_api",
         "//xla/ffi/api:c_api",
@@ -3530,6 +3464,7 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
@@ -3846,13 +3781,12 @@ cc_library(
         "@local_tsl//tsl/platform:numbers",
     ]) + xla_export_hlo_deps() + [
         ":command_buffer_scheduling",
-        ":fusion_merger_triton",
         ":fusion_pipeline",
         ":ir_emitter_context",
         ":ir_emitter_unnested",
         ":prepare_hlo_for_ir_emitting_pipeline",
         ":rename_fusions",
-        ":thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor:platform_manager",
         "@llvm-project//mlir:FuncDialect",
         "@local_tsl//tsl/lib/monitoring:counter",
@@ -3930,6 +3864,7 @@ xla_cc_test(
     srcs = ["auto_sharding_gpu_compiler_test.cc"],
     tags = tf_cuda_tests_tags() + ["no_oss"],  # TODO(b/277355322): Make autosharding work in OSS
     deps = [
+        "//xla:shape_util",
         "//xla/hlo/ir:hlo",
         "//xla/service:gpu_plugin",
         "//xla/service:hlo_module_config",
@@ -3980,6 +3915,7 @@ cc_library(
         ":cudnn_fused_conv_rewriter",
         ":cudnn_fused_mha_rewriter",
         ":cudnn_fused_mha_transpose_fusion",
+        ":cudnn_fusion_compiler",
         ":cudnn_norm_rewriter",
         ":cudnn_pad_for_convolutions",
         ":cudnn_simplify_padding",
@@ -3999,7 +3935,6 @@ cc_library(
         ":move_copy_to_users",
         ":target_constants",
         ":triangular_solve_rewriter",
-        ":cudnn_fusion_compiler",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/base:core_headers",
@@ -4066,30 +4001,31 @@ cc_library(
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
         "@local_tsl//tsl/profiler/lib:traceme",
-        "@local_tsl//tsl/util:env_var",
+        "//xla/tsl/util:env_var",
     ]),
 )
 
-xla_cc_test(
+xla_test(
     name = "nvptx_compiler_test",
     srcs = if_gpu_is_configured([
         "nvptx_compiler_test.cc",
     ]),
+    backends = [
+        "gpu_v100",
+        "gpu_a100",
+    ],
     tags = [
         "nomsan",  # Pulls in precompiled NVIDIA libraries which cause false positives in msan.
-        "requires-gpu-sm70",
     ],
     deps = [
-        ":gpu_compiler",
         ":nvptx_compiler_impl",
-        "//xla:statusor",
         "//xla:util",
         "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_query",
         "//xla/service:backend",
         "//xla/service:buffer_assignment",
-        "//xla/service:gpu_plugin",
+        "//xla/stream_executor:device_description",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/status:statusor",
@@ -4538,6 +4474,8 @@ cc_library(
         "//xla/service:hlo_module_config",
         "//xla/stream_executor",
         "//xla/stream_executor:launch_dim",
+        "//xla/tsl/util:env_var",
+        "//xla/tsl/util/proto:proto_utils",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/log",
@@ -4553,8 +4491,6 @@ cc_library(
         "@local_tsl//tsl/platform:ml_dtypes",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/util:env_var",
-        "@local_tsl//tsl/util/proto:proto_utils",
     ],
 )
 
@@ -4565,10 +4501,10 @@ xla_cc_test(
         ":stream_executor_util",
         "//xla:autotuning_proto_cc",
         "//xla/service:hlo_module_config",
+        "//xla/tsl/util/proto:proto_utils",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest_main",
-        "@local_tsl//tsl/util/proto:proto_utils",
     ],
 )
 
@@ -5970,6 +5906,7 @@ xla_cc_test(
         "//xla/tests:hlo_test_base",
         "@local_tsl//tsl/lib/core:status_test_util",
         "@local_tsl//tsl/platform:env",
+        "@local_tsl//tsl/platform:status_matchers",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:protobuf",
     ]) + [
@@ -6022,13 +5959,15 @@ xla_cc_test(
     ],
 )
 
-xla_cc_test(
+xla_test(
     name = "determinism_test",
     srcs = ["determinism_test.cc"],
+    backends = [
+        "gpu_a100",
+    ],
     local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
         "TENSORFLOW_USE_ROCM=1",
     ]),
-    tags = tf_gpu_tests_tags(),
     deps = [
         ":autotuner_util",
         "//xla:literal",
@@ -6065,13 +6004,17 @@ cc_library(
         "//xla:literal_util",
         "//xla:shape_util",
         "//xla:status",
+        "//xla:util",
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_query",
+        "//xla/service:collective_ops_utils",
+        "//xla/service:hlo_parser",
         "//xla/service:hlo_pass",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:string_view",
         "@local_tsl//tsl/platform:errors",
     ],
@@ -6098,14 +6041,15 @@ cc_library(
     hdrs = ["stream_attribute_annotator.h"],
     deps = [
         ":backend_configs_cc",
-        ":thunk",
         "//xla:comparison_util",
         "//xla:status",
         "//xla:statusor",
         "//xla:util",
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
+        "//xla/hlo/utils:hlo_query",
         "//xla/service:hlo_pass",
+        "//xla/service/gpu/runtime:thunk",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/status:statusor",
@@ -6148,7 +6092,6 @@ cc_library(
     hdrs = ["stream_attribute_async_wrapper.h"],
     deps = [
         ":backend_configs_cc",
-        ":thunk",
         "//xla:comparison_util",
         "//xla:status",
         "//xla:statusor",
@@ -6156,6 +6099,7 @@ cc_library(
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/service:hlo_pass",
+        "//xla/service/gpu/runtime:thunk",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/status:statusor",
diff --git a/third_party/xla/xla/service/gpu/address_computation_fusion_rewriter.cc b/third_party/xla/xla/service/gpu/address_computation_fusion_rewriter.cc
index ebee4e06f65d6d..5b92eb3423ebbb 100644
--- a/third_party/xla/xla/service/gpu/address_computation_fusion_rewriter.cc
+++ b/third_party/xla/xla/service/gpu/address_computation_fusion_rewriter.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <cstddef>
 #include <cstdint>
 #include <functional>
+#include <iterator>
 #include <optional>
 #include <string>
 #include <utility>
@@ -26,6 +27,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
+#include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
@@ -45,6 +47,7 @@ limitations under the License.
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
+#include "xla/status.h"
 #include "xla/util.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/statusor.h"
@@ -54,6 +57,24 @@ namespace gpu {
 
 namespace {
 
+// A dataflow path flowing from a definition to a user.
+using DefUseDataflowPath = absl::InlinedVector<HloInstruction*, 2>;
+
+// All dataflow paths flowing from a definition to all users. Each user will
+// have a separate entry in the vector.
+using DefUseDataflowPaths = absl::InlinedVector<DefUseDataflowPath, 4>;
+
+// A dataflow path flowing from a user to a definition.
+using UseDefDataflowPath = absl::InlinedVector<HloInstruction*, 4>;
+
+// All dataflow paths flowing from a user to all definitions of its operands.
+using UseDefDataflowPaths = absl::InlinedVector<HloInstruction*, 8>;
+
+using DataflowPathView = absl::Span<HloInstruction* const>;
+using DataflowPathsView = absl::Span<DataflowPathView>;
+
+using InstructionSet = absl::flat_hash_set<HloInstruction*>;
+
 bool IsNoOp(const HloInstruction* hlo) {
   return HloPredicateIsOp<HloOpcode::kBitcast, HloOpcode::kTuple,
                           HloOpcode::kGetTupleElement>(hlo);
@@ -80,12 +101,12 @@ bool IsCustomCall(const HloInstruction* hlo, absl::string_view platform_name) {
   void* call_target = CustomCallTargetRegistry::Global()->Lookup(
       call_target_name, std::string(platform_name));
 
-  absl::StatusOr<XLA_FFI_Handler*> handler =
+  absl::StatusOr<ffi::HandlerRegistration> handler_registration =
       ffi::FindHandler(call_target_name, platform_name);
 
   // At least one implementation should be available at run time.
   bool found_custom_call = !is_ffi_custom_call && call_target != nullptr;
-  bool found_ffi_handler = is_ffi_custom_call && handler.ok();
+  bool found_ffi_handler = is_ffi_custom_call && handler_registration.ok();
 
   return found_custom_call || found_ffi_handler;
 }
@@ -93,12 +114,13 @@ bool IsCustomCall(const HloInstruction* hlo, absl::string_view platform_name) {
 // Returns true if the slice is 128-byte-aligned. The slice starting
 // address is determined by the product of all non-sliced dimensions and an
 // offset defined by `slice_starts` of the slice op.
-bool IsAlignedSlice(const HloInstruction& instr) {
-  if (!IsContiguousSlice(instr)) return false;
-
-  auto slice = Cast<HloSliceInstruction>(&instr);
-  const Shape& src_shape = instr.operand(0)->shape();
-  const Shape& dst_shape = instr.shape();
+//
+// For dynamic cases, we don't have info about the start indices, so we have to
+// be conservative by only accepting sliced shapes that have the product of all
+// non-sliced dimensions being a multiple of `kXlaAllocatedBufferAlignBytes`.
+bool IsAlignedSlice(const Shape& src_shape, const Shape& dst_shape,
+                    const HloSliceInstruction* slice) {
+  if (!IsContiguousSlice(src_shape, dst_shape)) return false;
 
   auto strides = ShapeUtil::ByteStrides(dst_shape);
   if (!strides.has_value()) return false;
@@ -107,20 +129,22 @@ bool IsAlignedSlice(const HloInstruction& instr) {
     if ((strides.value()[dim] % kXlaAllocatedBufferAlignBytes) == 0)
       return true;
     if (dst_shape.dimensions(dim) < src_shape.dimensions(dim)) {
-      return ((strides.value()[dim] * slice->slice_starts(dim)) %
-                  kXlaAllocatedBufferAlignBytes ==
-              0);
+      return (slice != nullptr &&
+              ((strides.value()[dim] * slice->slice_starts(dim)) %
+                   kXlaAllocatedBufferAlignBytes ==
+               0));
     }
   }
   return true;
 }
 
-absl::InlinedVector<HloInstruction*, 8> GetSlicedOperandChains(
-    const HloInstruction* instr) {
-  absl::InlinedVector<HloInstruction*, 8> sliced_operand_chains = {
-      const_cast<HloInstruction*>(instr)};
+UseDefDataflowPaths GetSlicedOperandPaths(const HloInstruction* instr) {
+  UseDefDataflowPaths sliced_operand_paths;
+
   auto fusion = HloFusionAdaptor::ForComputation(instr->parent());
-  absl::flat_hash_set<HloInstruction*> processed_sliced_chain_set;
+  // This set is used to avoid duplicates in the matched results. It contains
+  // the matched instructions that we have seen so far.
+  InstructionSet processed_instrs;
 
   const auto& aliasing_pairs =
       Cast<HloCustomCallInstruction>(instr)->output_to_operand_aliasing();
@@ -134,44 +158,126 @@ absl::InlinedVector<HloInstruction*, 8> GetSlicedOperandChains(
     // is against the whole idea of address computation fusion. Skip this
     // operand.
     if (aliased_operands.contains(instr->operand_index(operand))) continue;
-    absl::InlinedVector<HloInstruction*, 4> maybe_sliced_operand_chain;
+    UseDefDataflowPath maybe_sliced_operand_path;
+    bool slice_found = false;
+    // TODO: currently HloFindIf exits upon encountering the first node that
+    // matches. This works well if each operand only has 1 data flow (i.e. only
+    // flows through unary op). We might want to keep finding until the queue is
+    // empty: if the operand is a tuple, it might have different data flows
+    // (i.e. 1 for each element).
     auto maybe_slice_adaptor =
         HloFindIf({HloInstructionAdaptor(*operand)}, *fusion, [&](auto node) {
           const HloInstruction* cur = &node.instruction();
-          if (processed_sliced_chain_set.contains(cur)) return true;
-          maybe_sliced_operand_chain.push_back(
-              const_cast<HloInstruction*>(cur));
+
+          // If the node is a match that has been processed, stop the traversal.
+          if (processed_instrs.contains(cur)) return true;
+
+          maybe_sliced_operand_path.push_back(const_cast<HloInstruction*>(cur));
+
+          if (IsOpcodeAnyOf<HloOpcode::kDynamicSlice, HloOpcode::kSlice>(
+                  node)) {
+            if (IsAlignedSlice(cur->operand(0)->shape(), cur->shape(),
+                               DynCast<HloSliceInstruction>(cur))) {
+              slice_found = true;
+              return slice_found;
+            }
+          }
+
           // TODO(vuson): lift the first restriction by considering fusing other
           // uses of the operand to reuse the address computation. Only worth it
           // if other uses are also custom calls though.
-          // TODO(vuson): lift the second restriction by considering fusing the
-          // non-noop instructions to the computation if possible.
-          return cur->user_count() > 1 || !IsNoOp(cur) || IsAlignedSlice(*cur);
+          return cur->user_count() > 1 || !IsNoOp(cur);
         });
+
     if (maybe_slice_adaptor == std::nullopt) continue;
+
     const auto& maybe_slice_instr = maybe_slice_adaptor->instruction();
-    if (IsAlignedSlice(maybe_slice_instr) ||
-        processed_sliced_chain_set.contains(&maybe_slice_instr)) {
-      sliced_operand_chains.insert(sliced_operand_chains.end(),
-                                   maybe_sliced_operand_chain.begin(),
-                                   maybe_sliced_operand_chain.end());
-      processed_sliced_chain_set.insert(maybe_sliced_operand_chain.begin(),
-                                        maybe_sliced_operand_chain.end());
+
+    if (slice_found || processed_instrs.contains(&maybe_slice_instr)) {
+      // Even in the case of stopping at a match that has been processed, we
+      // still need to add instructions encountered in the sliced operand path
+      // during the latest traversal.
+      sliced_operand_paths.insert(sliced_operand_paths.end(),
+                                  maybe_sliced_operand_path.rbegin(),
+                                  maybe_sliced_operand_path.rend());
+      processed_instrs.insert(maybe_sliced_operand_path.begin(),
+                              maybe_sliced_operand_path.end());
+    }
+  }
+
+  sliced_operand_paths.push_back(const_cast<HloInstruction*>(instr));
+  return sliced_operand_paths;
+}
+
+// Each user of `instr` that goes into a DUS will have an entry in the returned
+// vector.
+// Each entry contains the sliced paths for that user, i.e. the sequence of ops
+// following the dataflow from the user itself to the DUS (included).
+DefUseDataflowPaths GetSlicedUserPaths(const HloInstruction* instr) {
+  DefUseDataflowPaths sliced_user_paths;
+  auto fusion = HloFusionAdaptor::ForComputation(instr->parent());
+  // This set is used to avoid duplicates in the matched results. It contains
+  // the matched instructions that we have seen so far.
+  InstructionSet processed_instrs;
+
+  auto traverse_hlo_and_collect = [&](HloInstruction* start) {
+    DefUseDataflowPath maybe_sliced_user_path;
+    bool dus_found = false;
+    auto maybe_dus_adaptor = HloFindIf(
+        {HloInstructionAdaptor(*start)}, *fusion,
+        [&](auto node) {
+          const HloInstruction* cur = &node.instruction();
+          // If the node is a match that has been processed, stop the
+          // traversal.
+          if (processed_instrs.contains(cur)) return true;
+          maybe_sliced_user_path.push_back(const_cast<HloInstruction*>(cur));
+          if (const auto slice_instr =
+                  DynCast<HloDynamicUpdateSliceInstruction>(cur)) {
+            if (IsAlignedSlice(slice_instr->shape(),
+                               slice_instr->update()->shape(), nullptr)) {
+              dus_found = true;
+              return true;
+            }
+          }
+          return cur->user_count() > 1 || !IsNoOp(cur);
+        },
+        /*visit_operands=*/false);
+    if (maybe_dus_adaptor == std::nullopt) return;
+    const auto& maybe_dus_instr = maybe_dus_adaptor->instruction();
+    if (dus_found || processed_instrs.contains(&maybe_dus_instr)) {
+      // Even in the case of stopping at a match that has been processed, we
+      // still need to add instructions encountered in the sliced user path
+      // during the latest traversal.
+      processed_instrs.insert(maybe_sliced_user_path.begin(),
+                              maybe_sliced_user_path.end());
+      sliced_user_paths.push_back(std::move(maybe_sliced_user_path));
+    }
+  };
+
+  if (instr->shape().IsTuple()) {
+    for (auto* user : instr->users()) {
+      if (DynCast<HloGetTupleElementInstruction>(user)) {
+        traverse_hlo_and_collect(user);
+      }
+    }
+  } else {
+    if (instr->user_count() == 1) {
+      traverse_hlo_and_collect(instr->users().front());
     }
   }
-  return sliced_operand_chains;
+
+  return sliced_user_paths;
 }
 
 absl::InlinedVector<HloInstruction*, 4> GetPatternCaptures(
-    absl::Span<HloInstruction* const> matched) {
+    DataflowPathView matches) {
   absl::InlinedVector<HloInstruction*, 4> captures;
 
-  absl::flat_hash_set<HloInstruction*> instructions_set(matched.begin(),
-                                                        matched.end());
+  InstructionSet matched_instrs(matches.begin(), matches.end());
 
-  for (HloInstruction* instr : matched) {
+  for (HloInstruction* instr : matches) {
     for (HloInstruction* operand : instr->operands()) {
-      if (!instructions_set.contains(operand) &&
+      if (!matched_instrs.contains(operand) &&
           absl::c_find(captures, operand) == captures.end()) {
         captures.emplace_back(operand);
       }
@@ -181,51 +287,43 @@ absl::InlinedVector<HloInstruction*, 4> GetPatternCaptures(
   return captures;
 }
 
-absl::InlinedVector<HloInstruction*, 8> GetSortedMatched(
-    absl::Span<HloInstruction* const> matched) {
-  absl::InlinedVector<HloInstruction*, 8> sorted_matched;
-  absl::flat_hash_set<HloInstruction*> instructions_set(matched.begin(),
-                                                        matched.end());
-  absl::flat_hash_set<HloInstruction*> processed_set;
-  // Topologically sort `matched`
-  for (auto it = matched.rbegin(); it != matched.rend(); ++it) {
-    if (processed_set.contains(*it)) continue;
-    for (auto* operand : (*it)->operands()) {
-      if (!instructions_set.contains(operand)) {
-        continue;
-      }
-      if (!processed_set.contains(operand)) {
-        sorted_matched.emplace_back(operand);
-        processed_set.insert(operand);
-      }
-    }
-    sorted_matched.emplace_back(*it);
-    processed_set.insert(*it);
-  }
+Status CreateRootTuple(HloInstruction* hero, HloComputation::Builder& builder,
+                       DataflowPathsView sliced_user_paths,
+                       absl::flat_hash_map<const HloInstruction*,
+                                           HloInstruction*>& instr_mapping) {
+  unsigned tuple_size = hero->shape().tuple_shapes_size();
 
-  return sorted_matched;
-}
+  std::vector<HloInstruction*> sliced_elems(tuple_size, nullptr);
+  for (auto& sliced_user_path : sliced_user_paths) {
+    auto gte = Cast<HloGetTupleElementInstruction>(sliced_user_path.front());
+    sliced_elems[gte->tuple_index()] = sliced_user_path.back();
+  }
 
-void CreateRootTuple(HloInstruction* root, HloComputation::Builder& builder) {
   std::vector<HloInstruction*> elements;
-  elements.reserve(root->shape().tuple_shapes_size());
-  for (size_t i = 0; i < root->shape().tuple_shapes_size(); ++i) {
-    if (root->shape().tuple_shapes(i).IsTuple()) {
-      HloInstruction* gte = builder.AddInstruction(
-          HloInstruction::CreateGetTupleElement(root, i));
-      CreateRootTuple(gte, builder);
+  for (size_t i = 0; i < tuple_size; ++i) {
+    if (sliced_elems[i] != nullptr) {
+      elements.push_back(instr_mapping[sliced_elems[i]]);
+      continue;
+    }
+    auto* gte = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(instr_mapping[hero], i));
+    if (hero->shape().tuple_shapes(i).IsTuple()) {
+      instr_mapping[gte] = gte;
+      TF_RETURN_IF_ERROR(CreateRootTuple(gte, builder, {}, instr_mapping));
       elements.push_back(builder.last_added_instruction());
     } else {
-      elements.push_back(builder.AddInstruction(
-          HloInstruction::CreateGetTupleElement(root, i)));
+      elements.push_back(gte);
     }
   }
-  builder.AddInstruction(HloInstruction::CreateTuple(elements));
+  if (elements.size() > 1)
+    builder.AddInstruction(HloInstruction::CreateTuple(elements));
+
+  return absl::OkStatus();
 }
 
 absl::StatusOr<HloComputation*> CreateFusionBody(
-    HloModule* module, absl::Span<HloInstruction* const> matched,
-    absl::Span<HloInstruction* const> captures) {
+    HloModule* module, DataflowPathView sliced_operand_paths,
+    DataflowPathsView sliced_user_paths, DataflowPathView captures) {
   HloComputation::Builder builder("address-computation");
 
   // A mapping from original instructions to instructions in the fusion body.
@@ -249,25 +347,34 @@ absl::StatusOr<HloComputation*> CreateFusionBody(
   }
 
   // Instructions in the pattern are already topologically sorted, as we visited
-  // them following use-def chain, then reverse the list.
-  for (HloInstruction* instr : matched) {
+  // them following use-def path, then reverse the list.
+  HloInstruction* hero;
+  for (HloInstruction* instr : sliced_operand_paths) {
     instr_mapping[instr] = builder.AddInstruction(
         instr->CloneWithNewOperands(instr->shape(), mapped_operands(instr)));
+    hero = instr;
+  }
+
+  for (auto& sliced_user_path : sliced_user_paths) {
+    for (HloInstruction* instr : sliced_user_path) {
+      instr_mapping[instr] = builder.AddInstruction(
+          instr->CloneWithNewOperands(instr->shape(), mapped_operands(instr)));
+    }
   }
 
-  HloInstruction* root = builder.last_added_instruction();
-  // Create a root tuple if the root is a tuple to make sure there's a buffer
+  // Create a tuple if the hero is a tuple to make sure there's a buffer
   // assigned for each of the elements. Make sure the tuple is not nil first.
-  if (root->shape().IsTuple() && root->shape().tuple_shapes_size() > 0) {
-    CreateRootTuple(root, builder);
+  if (hero->shape().IsTuple() && hero->shape().tuple_shapes_size() > 0) {
+    TF_RETURN_IF_ERROR(
+        CreateRootTuple(hero, builder, sliced_user_paths, instr_mapping));
   }
 
   return module->AddComputationAndUnifyNamesAndIds(builder.Build(), false);
 }
 
 absl::StatusOr<HloInstruction*> CreateFusionInstruction(
-    HloModule* module, HloInstruction* orig,
-    absl::Span<HloInstruction* const> captures, HloComputation* body) {
+    HloModule* module, HloInstruction* orig, DataflowPathView captures,
+    HloComputation* body, bool dynamic) {
   HloComputation* parent = orig->parent();
 
   // Add a fusion operation calling outlined fusion computation.
@@ -285,7 +392,8 @@ absl::StatusOr<HloInstruction*> CreateFusionInstruction(
       *gpu_config.mutable_fusion_backend_config();
   backend_config.set_kind("__custom_fusion");
   CustomFusionConfig config;
-  config.set_name("address_computation");
+  config.set_name(dynamic ? "dynamic_address_computation"
+                          : "address_computation");
   *backend_config.mutable_custom_fusion_config() = config;
   TF_RETURN_IF_ERROR(fusion->set_backend_config(std::move(gpu_config)));
 
@@ -297,57 +405,110 @@ absl::StatusOr<HloInstruction*> CreateFusionInstruction(
 absl::StatusOr<bool> AddressComputationFusionRewriter::Run(
     HloModule* module,
     const absl::flat_hash_set<absl::string_view>& execution_threads) {
-  if (!module->has_schedule()) return Internal("module is not scheduled");
-  bool changed = false;
-
-  absl::flat_hash_map<HloInstruction*, absl::InlinedVector<HloInstruction*, 8>>
+  absl::flat_hash_map<HloInstruction*,
+                      std::pair<UseDefDataflowPaths, DefUseDataflowPaths>>
       matches;
 
   // Collect all potential custom call matches in the non-fusion computations.
   for (HloComputation* computation : module->computations()) {
     if (computation->IsFusionComputation()) continue;
     for (HloInstruction* instr : computation->instructions()) {
-      if (IsLegacyCublasMatmul(*instr) || IsCustomCall(instr, platform_name_)) {
-        auto sliced_operand_chains = GetSlicedOperandChains(instr);
-        if (!(sliced_operand_chains.size() == 1 &&
-              sliced_operand_chains.front() == instr)) {
-          matches[instr] = std::move(sliced_operand_chains);
+      if (IsLegacyCublasMatmul(*instr) ||
+          (IsCustomCall(instr, platform_name_))) {
+        UseDefDataflowPaths sliced_operand_paths = GetSlicedOperandPaths(instr);
+        bool has_sliced_operand_paths = sliced_operand_paths.size() > 1;
+
+        DefUseDataflowPaths sliced_user_paths = GetSlicedUserPaths(instr);
+        bool has_sliced_user_paths = absl::c_any_of(
+            sliced_user_paths,
+            [&](auto& sliced_user_path) { return !sliced_user_path.empty(); });
+
+        if (absl::c_any_of(sliced_user_paths, [&](auto& sliced_user_path) {
+              return DynCast<HloDynamicUpdateSliceInstruction>(
+                         sliced_user_path.back()) == nullptr;
+            })) {
+          return absl::InternalError(
+              "Expect sliced user path to end with a DUS.");
+        }
+
+        if (has_sliced_operand_paths || has_sliced_user_paths) {
+          matches[instr] = std::make_pair(std::move(sliced_operand_paths),
+                                          std::move(sliced_user_paths));
         }
       }
     }
   }
 
-  HloSchedule& schedule = module->schedule();
-  for (auto& kv : matches) {
-    auto captures = GetPatternCaptures(kv.second);
-    auto sorted = GetSortedMatched(kv.second);
+  if (matches.empty()) return false;
 
-    TF_ASSIGN_OR_RETURN(HloComputation * fusion_body,
-                        CreateFusionBody(module, sorted, captures));
-    TF_ASSIGN_OR_RETURN(
-        HloInstruction * fusion,
-        CreateFusionInstruction(module, kv.first, captures, fusion_body));
+  for (auto& [hero, paths] : matches) {
+    auto& [sliced_operand_paths, sliced_user_paths] = paths;
+    std::vector<HloInstruction*> matched_instrs;
+    absl::c_copy(sliced_operand_paths, std::back_inserter(matched_instrs));
 
-    // As we are running after scheduling we have to keep it valid.
-    HloComputation* parent = kv.first->parent();
+    std::vector<DataflowPathView> sliced_user_paths_view;
+    for (auto& sliced_user_path : sliced_user_paths) {
+      absl::c_copy(sliced_user_path, std::back_inserter(matched_instrs));
+      DataflowPathView sliced_user_path_view{&sliced_user_path.front(),
+                                             sliced_user_path.size()};
+      sliced_user_paths_view.push_back(std::move(sliced_user_path_view));
+    }
 
-    // Update schedule to replace the custom call instruction with the fusion
-    // instruction.
-    // Removal of the rest of the instructions in the sequence is handled by
-    // schedule update below.
-    HloInstructionSequence& sequence = schedule.GetOrCreateSequence(parent);
-    sequence.replace_instruction(kv.first, fusion);
+    auto captures = GetPatternCaptures(matched_instrs);
 
-    // TODO(vuson): handle control dependencies
-    TF_RETURN_IF_ERROR(parent->ReplaceInstruction(kv.first, fusion));
-    changed = true;
-  }
+    TF_ASSIGN_OR_RETURN(
+        HloComputation * fusion_body,
+        CreateFusionBody(module, sliced_operand_paths,
+                         DataflowPathsView(sliced_user_paths_view), captures));
 
-  if (changed) {
-    TF_RETURN_IF_ERROR(module->schedule().Update());
+    bool has_dynamic_slices = absl::c_any_of(matched_instrs, [&](auto* instr) {
+      return DynCast<HloDynamicIndexInstruction>(instr) != nullptr;
+    });
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * fusion,
+        CreateFusionInstruction(module, hero, captures, fusion_body,
+                                has_dynamic_slices));
+
+    HloComputation* parent = hero->parent();
+    if (fusion->shape().IsTuple()) {
+      TF_RETURN_IF_ERROR(parent->ReplaceInstructionWithDifferentShape(
+          const_cast<HloInstruction*>(hero), fusion));
+      for (auto& sliced_user_path : sliced_user_paths) {
+        auto old_gte =
+            Cast<HloGetTupleElementInstruction>(sliced_user_path.front());
+        HloInstruction* gte =
+            parent->AddInstruction(HloInstruction::CreateGetTupleElement(
+                fusion, old_gte->tuple_index()));
+        TF_RETURN_IF_ERROR(
+            parent->ReplaceInstruction(sliced_user_path.back(), gte));
+      }
+    } else {
+      auto* instr_to_be_replaced = const_cast<HloInstruction*>(hero);
+      if (sliced_user_paths.empty()) {
+        // The only case where a tuple-shaped original hero op is fused into a
+        // non-tuple-shaped fusion is there's only one element of the original
+        // tuple being used. In that case, we need to replace that single
+        // get-tuple-element (instead of the hero op) with the fusion
+        // instruction.
+        if (hero->shape().IsTuple()) {
+          if (hero->user_count() != 1 ||
+              !DynCast<HloGetTupleElementInstruction>(hero->users().front())) {
+            return absl::InternalError(
+                "Expect a single get-tuple-element user of the original "
+                "tuple-shaped hero op when address computation fusion does "
+                "not return a tuple");
+          }
+          instr_to_be_replaced = hero->users().front();
+        }
+      } else {
+        instr_to_be_replaced = sliced_user_paths.front().back();
+      }
+      TF_RETURN_IF_ERROR(
+          parent->ReplaceInstruction(instr_to_be_replaced, fusion));
+    }
   }
 
-  return changed;
+  return true;
 }
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/service/gpu/address_computation_fusion_rewriter_test.cc b/third_party/xla/xla/service/gpu/address_computation_fusion_rewriter_test.cc
index 0bf2040567d333..4d14024115a621 100644
--- a/third_party/xla/xla/service/gpu/address_computation_fusion_rewriter_test.cc
+++ b/third_party/xla/xla/service/gpu/address_computation_fusion_rewriter_test.cc
@@ -51,7 +51,7 @@ class AddressComputationFusionRewriterTest : public HloTestBase {};
 
 TEST_F(AddressComputationFusionRewriterTest, SimpleGemm) {
   const char* hlo = R"(
-    HloModule test, is_scheduled=true
+    HloModule test
 
     ENTRY %main.9 {
       %p0 = f16[2,8,8]{2,1,0} parameter(0)
@@ -107,15 +107,12 @@ TEST_F(AddressComputationFusionRewriterTest, SimpleGemm) {
 
   auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
   RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
-                            expected, [](HloModule* module) {
-                              EXPECT_TRUE(module->has_schedule());
-                              TF_CHECK_OK(module->schedule().Verify());
-                            });
+                            expected);
 }
 
 TEST_F(AddressComputationFusionRewriterTest, SimpleGemmWithWorkspace) {
   const char* hlo = R"(
-    HloModule test, is_scheduled=true
+    HloModule test
 
     ENTRY %main.9 {
       %p0 = f16[2,8,8]{2,1,0} parameter(0)
@@ -175,15 +172,79 @@ TEST_F(AddressComputationFusionRewriterTest, SimpleGemmWithWorkspace) {
 
   auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
   RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
-                            expected, [](HloModule* module) {
-                              EXPECT_TRUE(module->has_schedule());
-                              TF_CHECK_OK(module->schedule().Verify());
-                            });
+                            expected);
+}
+
+TEST_F(AddressComputationFusionRewriterTest, SimpleGemmWorkspaceIgnored) {
+  const char* hlo = R"(
+    HloModule test
+
+    ENTRY %main.9 {
+      %p0 = f16[2,8,8]{2,1,0} parameter(0)
+      %p1 = f16[2,8,8]{2,1,0} parameter(1)
+      %slice.13 = f16[1,8,8]{2,1,0} slice(%p0), slice={[1:2], [0:8], [0:8]}
+      %bitcast.41 = f16[8,8]{1,0} bitcast(%slice.13)
+      %slice.14 = f16[1,8,8]{2,1,0} slice(%p1), slice={[1:2], [0:8], [0:8]}
+      %bitcast.42 = f16[8,8]{1,0} bitcast(%slice.14)
+
+      %custom-call.1 = (f16[8,8]{1,0}, s8[256]{0}) custom-call(%bitcast.41, %bitcast.42),
+        custom_call_target="__cublas$gemm",
+        backend_config={"gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"64",
+          "rhs_stride":"64",
+          "grad_x":false,
+          "grad_y":false
+        }}
+      ROOT %get-tuple-element.0 = f16[8,8]{1,0} get-tuple-element(%custom-call.1), index=0
+    }
+  )";
+
+  const char* expected = R"(
+    ; CHECK:     %address-computation {{.*}} {
+    ; CHECK-DAG:   [[P0:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(0)
+    ; CHECK-DAG:   [[P1:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(1)
+    ; CHECK-DAG:   [[S0:%[^ ]+]] = f16[1,8,8]{2,1,0} slice([[P0]]), slice={[1:2], [0:8], [0:8]}
+    ; CHECK-DAG:   [[B0:%[^ ]+]] = f16[8,8]{1,0} bitcast([[S0]])
+    ; CHECK-DAG:   [[S1:%[^ ]+]] = f16[1,8,8]{2,1,0} slice([[P1]]), slice={[1:2], [0:8], [0:8]}
+    ; CHECK-DAG:   [[B1:%[^ ]+]] = f16[8,8]{1,0} bitcast([[S1]])
+    ; CHECK:       [[CC:%[^ ]+]] = (f16[8,8]{1,0}, s8[256]{0}) custom-call([[B0]], [[B1]]),
+    ; CHECK:              custom_call_target="__cublas$gemm"
+    ; CHECK:       [[DOT:%[^ ]+]] = f16[8,8]{1,0} get-tuple-element([[CC]]), index=0
+    ; CHECK:       [[WORKSPACE:%[^ ]+]] = s8[256]{0} get-tuple-element([[CC]]), index=1
+    ; CHECK:       ROOT [[TUPLE:%[^ ]+]] = (f16[8,8]{1,0}, s8[256]{0})
+    ; CHECK:              tuple([[DOT]], [[WORKSPACE]])
+    ; CHECK:     }
+
+    ; CHECK:     ENTRY %main{{.*}} {
+    ; CHECK:       [[FUSION:%[^ ]+]] = (f16[8,8]{1,0}, s8[256]{0}) fusion
+    ; CHECK:         kind=kCustom, calls=%address-computation,
+    ; CHECK:         backend_config={
+    ; CHECK:           "kind":"__custom_fusion",
+    ; CHECK:           "custom_fusion_config":{"name":"address_computation"}
+    ; CHECK:         }
+    ; CHECK:       ROOT [[DOT_MAIN:%[^ ]+]] = f16[8,8]{1,0} get-tuple-element([[FUSION]]), index=0
+    ; CHECK:     }
+  )";
+
+  auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
+  RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
+                            expected);
 }
 
 TEST_F(AddressComputationFusionRewriterTest, SimpleGemmNotRoot) {
   const char* hlo = R"(
-    HloModule test, is_scheduled=true
+    HloModule test
 
     ENTRY %main.9 {
       %p0 = f16[2,8,8]{2,1,0} parameter(0)
@@ -241,23 +302,20 @@ TEST_F(AddressComputationFusionRewriterTest, SimpleGemmNotRoot) {
 
   auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
   RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
-                            expected, [](HloModule* module) {
-                              EXPECT_TRUE(module->has_schedule());
-                              TF_CHECK_OK(module->schedule().Verify());
-                            });
+                            expected);
 }
 
 TEST_F(AddressComputationFusionRewriterTest,
        SimpleGemmOperandHasMultipleUsers) {
   const char* hlo = R"(
-    HloModule test, is_scheduled=true
+    HloModule test
 
     ENTRY %main.9 {
       %p0 = f16[2,8,8]{2,1,0} parameter(0)
-      %p1 = f16[2,8,8]{2,1,0} parameter(1)
+      %p1 = f16[4,8,8]{2,1,0} parameter(1)
       %slice.13 = f16[1,8,8]{2,1,0} slice(%p0), slice={[1:2], [0:8], [0:8]}
       %bitcast.41 = f16[8,8]{1,0} bitcast(%slice.13)
-      %slice.14 = f16[1,8,8]{2,1,0} slice(%p1), slice={[1:2], [0:8], [0:8]}
+      %slice.14 = f16[1,8,8]{2,1,0} slice(%p1), slice={[2:3], [0:8], [0:8]}
       %bitcast.42 = f16[8,8]{1,0} bitcast(%slice.14)
 
       %custom-call.1 = f16[8,8]{1,0} custom-call(%bitcast.41, %bitcast.42),
@@ -285,41 +343,38 @@ TEST_F(AddressComputationFusionRewriterTest,
 
   const char* expected = R"(
     ; CHECK:     %address-computation {{.*}} {
-    ; CHECK-DAG:   [[P0:%[^ ]+]] = f16[8,8]{1,0} parameter(0)
-    ; CHECK-DAG:   [[P1:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(1)
-    ; CHECK-DAG:   [[S1:%[^ ]+]] = f16[1,8,8]{2,1,0} slice([[P1]]), slice={[1:2], [0:8], [0:8]}
+    ; CHECK-DAG:   [[P0:%[^ ]+]] = f16[8,8]{1,0} parameter(1)
+    ; CHECK-DAG:   [[P1:%[^ ]+]] = f16[4,8,8]{2,1,0} parameter(0)
+    ; CHECK-DAG:   [[S1:%[^ ]+]] = f16[1,8,8]{2,1,0} slice([[P1]]), slice={[2:3], [0:8], [0:8]}
     ; CHECK-DAG:   [[B1:%[^ ]+]] = f16[8,8]{1,0} bitcast([[S1]])
     ; CHECK:       ROOT [[CC:%[^ ]+]] = f16[8,8]{1,0} custom-call([[P0]], [[B1]]),
     ; CHECK:              custom_call_target="__cublas$gemm"
     ; CHECK:     }
 
     ; CHECK:     ENTRY %main{{.*}} {
-    ; CHECK-DAG:   [[P0:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(0)
-    ; CHECK-DAG:   [[S0:%[^ ]+]] = f16[1,8,8]{2,1,0} slice([[P0]]), slice={[1:2], [0:8], [0:8]}
-    ; CHECK-DAG:   [[B0:%[^ ]+]] = f16[8,8]{1,0} bitcast([[S0]])
-    ; CHECK-DAG:   [[P1:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(1)
-    ; CHECK:       [[FUSION:%[^ ]+]] = f16[8,8]{1,0} fusion([[B0]], [[P1]])
+    ; CHECK-DAG:   [[P1:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(0)
+    ; CHECK-DAG:   [[S1:%[^ ]+]] = f16[1,8,8]{2,1,0} slice([[P1]]), slice={[1:2], [0:8], [0:8]}
+    ; CHECK-DAG:   [[B1:%[^ ]+]] = f16[8,8]{1,0} bitcast([[S1]])
+    ; CHECK-DAG:   [[P0:%[^ ]+]] = f16[4,8,8]{2,1,0} parameter(1)
+    ; CHECK:       [[FUSION:%[^ ]+]] = f16[8,8]{1,0} fusion([[P0]], [[B1]])
     ; CHECK:         kind=kCustom, calls=%address-computation,
     ; CHECK:         backend_config={
     ; CHECK:           "kind":"__custom_fusion",
     ; CHECK:           "custom_fusion_config":{"name":"address_computation"}
     ; CHECK:         }
-    ; CHECK:       ROOT {{.*}} = f16[8,8]{1,0} add([[FUSION]], [[B0]])
+    ; CHECK:       ROOT {{.*}} = f16[8,8]{1,0} add([[FUSION]], [[B1]])
     ; CHECK:     }
   )";
 
   auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
   RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
-                            expected, [](HloModule* module) {
-                              EXPECT_TRUE(module->has_schedule());
-                              TF_CHECK_OK(module->schedule().Verify());
-                            });
+                            expected);
 }
 
 TEST_F(AddressComputationFusionRewriterTest,
        SimpleGemmOperandsHaveMultipleUsers) {
   const char* hlo = R"(
-    HloModule test, is_scheduled=true
+    HloModule test
 
     ENTRY %main.9 {
       %p0 = f16[2,8,8]{2,1,0} parameter(0)
@@ -349,7 +404,7 @@ TEST_F(AddressComputationFusionRewriterTest,
           "grad_y":false
         }}
 
-      ROOT %custom-call.1 = f16[8,8]{1,0} custom-call(%bitcast.41, %bitcast.42),
+      ROOT %custom-call.1 = f16[8,8]{1,0} custom-call(%bitcast.42, %bitcast.41),
         custom_call_target="__cublas$gemm",
         backend_config={"gemm_backend_config":{
           "alpha_real":1,
@@ -378,7 +433,7 @@ TEST_F(AddressComputationFusionRewriterTest,
 
 TEST_F(AddressComputationFusionRewriterTest, SimpleGemmSlicingNotParameter) {
   const char* hlo = R"(
-    HloModule test, is_scheduled=true
+    HloModule test
 
     ENTRY %main.9 {
       %p0 = f16[4,8,8]{2,1,0} parameter(0)
@@ -440,15 +495,12 @@ TEST_F(AddressComputationFusionRewriterTest, SimpleGemmSlicingNotParameter) {
 
   auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
   RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
-                            expected, [](HloModule* module) {
-                              EXPECT_TRUE(module->has_schedule());
-                              TF_CHECK_OK(module->schedule().Verify());
-                            });
+                            expected);
 }
 
 TEST_F(AddressComputationFusionRewriterTest, SimpleGemmNotContiguousSlice) {
   const char* hlo = R"(
-    HloModule test, is_scheduled=true
+    HloModule test
 
     ENTRY %main.9 {
       %p0 = f16[2,8,8]{2,1,0} parameter(0)
@@ -487,7 +539,7 @@ TEST_F(AddressComputationFusionRewriterTest, SimpleGemmNotContiguousSlice) {
 
 TEST_F(AddressComputationFusionRewriterTest, SimpleGemmNonNoOpInSliceChain) {
   const char* hlo = R"(
-    HloModule test, is_scheduled=true
+    HloModule test
 
     ENTRY %main.9 {
       %p0 = f16[2,8,8]{2,1,0} parameter(0)
@@ -530,7 +582,7 @@ TEST_F(AddressComputationFusionRewriterTest, SimpleGemmNonNoOpInSliceChain) {
 
 TEST_F(AddressComputationFusionRewriterTest, SimpleGemmDuplicateOperand) {
   const char* hlo = R"(
-    HloModule test, is_scheduled=true
+    HloModule test
 
     ENTRY %main {
       %p0 = (f32[100,100]{1,0}, f32[100,100]{1,0}) parameter(0)
@@ -604,15 +656,12 @@ TEST_F(AddressComputationFusionRewriterTest, SimpleGemmDuplicateOperand) {
 
   auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
   RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
-                            expected, [](HloModule* module) {
-                              EXPECT_TRUE(module->has_schedule());
-                              TF_CHECK_OK(module->schedule().Verify());
-                            });
+                            expected);
 }
 
 TEST_F(AddressComputationFusionRewriterTest, SimpleGemmReverseOperandOrder) {
   const char* hlo = R"(
-    HloModule test, is_scheduled=true
+    HloModule test
 
     ENTRY %main.9 {
       %p0 = f16[2,8,8]{2,1,0} parameter(1)
@@ -670,15 +719,12 @@ TEST_F(AddressComputationFusionRewriterTest, SimpleGemmReverseOperandOrder) {
 
   auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
   RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
-                            expected, [](HloModule* module) {
-                              EXPECT_TRUE(module->has_schedule());
-                              TF_CHECK_OK(module->schedule().Verify());
-                            });
+                            expected);
 }
 
 TEST_F(AddressComputationFusionRewriterTest, SimpleGemmReverseOperandOrder2) {
   const char* hlo = R"(
-    HloModule test, is_scheduled=true
+    HloModule test
 
     ENTRY %main.9 {
       %p0 = f16[2,8,8]{2,1,0} parameter(0)
@@ -736,15 +782,12 @@ TEST_F(AddressComputationFusionRewriterTest, SimpleGemmReverseOperandOrder2) {
 
   auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
   RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
-                            expected, [](HloModule* module) {
-                              EXPECT_TRUE(module->has_schedule());
-                              TF_CHECK_OK(module->schedule().Verify());
-                            });
+                            expected);
 }
 
 TEST_F(AddressComputationFusionRewriterTest, SimpleGemmOperandAliasingOutput) {
   const char* hlo = R"(
-    HloModule test, is_scheduled=true
+    HloModule test
 
     ENTRY %main.9 {
       %p0 = (f32[100,100]{1,0}, f32[100,100]{1,0}) parameter(0)
@@ -778,11 +821,11 @@ TEST_F(AddressComputationFusionRewriterTest, SimpleGemmOperandAliasingOutput) {
 
   const char* expected = R"(
     ; CHECK:     %address-computation {{.*}} {
-    ; CHECK-DAG:   [[P0:%[^ ]+]] = f32[100,100]{1,0} parameter(0)
+    ; CHECK-DAG:   [[P2:%[^ ]+]] = f32[100,100]{1,0} parameter(2)
     ; CHECK-DAG:   [[P1:%[^ ]+]] = f32[100,100]{1,0} parameter(1)
-    ; CHECK-DAG:   [[P2:%[^ ]+]] = f32[200,100]{1,0} parameter(2)
-    ; CHECK-DAG:   [[S1:%[^ ]+]] = f32[100,100]{1,0} slice([[P2]]), slice={[16:116], [0:100]}
-    ; CHECK:       [[CC:%[^ ]+]] = (f32[100,100]{1,0}, s8[120000]{0}) custom-call([[P0]], [[S1]], [[P1]]),
+    ; CHECK-DAG:   [[P0:%[^ ]+]] = f32[200,100]{1,0} parameter(0)
+    ; CHECK-DAG:   [[S1:%[^ ]+]] = f32[100,100]{1,0} slice([[P0]]), slice={[16:116], [0:100]}
+    ; CHECK:       [[CC:%[^ ]+]] = (f32[100,100]{1,0}, s8[120000]{0}) custom-call([[P1]], [[S1]], [[P2]]),
     ; CHECK:         custom_call_target="__cublas$gemm"
     ; CHECK:     }
 
@@ -792,7 +835,7 @@ TEST_F(AddressComputationFusionRewriterTest, SimpleGemmOperandAliasingOutput) {
     ; CHECK:       [[GTE1:%[^ ]+]] = f32[100,100]{1,0} get-tuple-element([[P]]), index=1
     ; CHECK:       [[CONCAT:%[^ ]+]] = f32[200,100]{1,0} concatenate([[GTE0]], [[GTE1]]), dimensions={0}
     ; CHECK:       [[S:%[^ ]+]] = f32[100,100]{1,0} slice([[CONCAT]]), slice={[99:199], [0:100]}
-    ; CHECK:       ROOT [[FUSION:%[^ ]+]] = (f32[100,100]{1,0}, s8[120000]{0}) fusion([[GTE0]], [[S]], [[CONCAT]])
+    ; CHECK:       ROOT [[FUSION:%[^ ]+]] = (f32[100,100]{1,0}, s8[120000]{0}) fusion([[CONCAT]], [[GTE0]], [[S]])
     ; CHECK:         kind=kCustom, calls=%address-computation,
     ; CHECK:         backend_config={
     ; CHECK:           "kind":"__custom_fusion",
@@ -803,15 +846,12 @@ TEST_F(AddressComputationFusionRewriterTest, SimpleGemmOperandAliasingOutput) {
 
   auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
   RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
-                            expected, [](HloModule* module) {
-                              EXPECT_TRUE(module->has_schedule());
-                              TF_CHECK_OK(module->schedule().Verify());
-                            });
+                            expected);
 }
 
 TEST_F(AddressComputationFusionRewriterTest, SimpleGemmOperandsFromSameSlice) {
   const char* hlo = R"(
-    HloModule test, is_scheduled=true
+    HloModule test
 
     ENTRY %main.9 {
       %p0 = f16[2,8,8]{2,1,0} parameter(0)
@@ -864,10 +904,7 @@ TEST_F(AddressComputationFusionRewriterTest, SimpleGemmOperandsFromSameSlice) {
 
   auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
   RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
-                            expected, [](HloModule* module) {
-                              EXPECT_TRUE(module->has_schedule());
-                              TF_CHECK_OK(module->schedule().Verify());
-                            });
+                            expected);
 }
 
 static absl::Status Memcpy(se::Stream* stream, ffi::BufferBase src,
@@ -907,12 +944,12 @@ TEST_F(AddressComputationFusionRewriterTest, SimpleCustomCall) {
   hlo_config.set_debug_options(debug_options);
   TF_ASSERT_OK_AND_ASSIGN(auto hlo, xla::HloModule::CreateFromProto(
                                         computation.proto(), hlo_config));
-  TF_ASSERT_OK_AND_ASSIGN(
-      HloSchedule schedule,
-      ScheduleModule(hlo.get(), [](const BufferValue& buffer) {
-        return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
-      }));
-  TF_CHECK_OK(hlo->set_schedule(std::move(schedule)));
+  // TF_ASSERT_OK_AND_ASSIGN(
+  //     HloSchedule schedule,
+  //     ScheduleModule(hlo.get(), [](const BufferValue& buffer) {
+  //       return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
+  //     }));
+  // TF_CHECK_OK(hlo->set_schedule(std::move(schedule)));
 
   const char* expected = R"(
     ; CHECK:     %address-computation {{.*}} {
@@ -936,12 +973,8 @@ TEST_F(AddressComputationFusionRewriterTest, SimpleCustomCall) {
   )";
 
   auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
-  RunAndFilecheckHloRewrite(hlo->ToString(),
-                            AddressComputationFusionRewriter(PLATFORM),
-                            expected, [](HloModule* module) {
-                              EXPECT_TRUE(module->has_schedule());
-                              TF_CHECK_OK(module->schedule().Verify());
-                            });
+  RunAndFilecheckHloRewrite(
+      hlo->ToString(), AddressComputationFusionRewriter(PLATFORM), expected);
 }
 
 void Callback_Void(se::gpu::GpuStreamHandle stream, void** buffers,
@@ -965,12 +998,12 @@ TEST_F(AddressComputationFusionRewriterTest, SimpleCustomCallLegacy) {
   hlo_config.set_debug_options(debug_options);
   TF_ASSERT_OK_AND_ASSIGN(auto hlo, xla::HloModule::CreateFromProto(
                                         computation.proto(), hlo_config));
-  TF_ASSERT_OK_AND_ASSIGN(
-      HloSchedule schedule,
-      ScheduleModule(hlo.get(), [](const BufferValue& buffer) {
-        return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
-      }));
-  TF_CHECK_OK(hlo->set_schedule(std::move(schedule)));
+  // TF_ASSERT_OK_AND_ASSIGN(
+  //     HloSchedule schedule,
+  //     ScheduleModule(hlo.get(), [](const BufferValue& buffer) {
+  //       return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
+  //     }));
+  // TF_CHECK_OK(hlo->set_schedule(std::move(schedule)));
 
   const char* expected = R"(
     ; CHECK:     %address-computation {{.*}} {
@@ -993,12 +1026,153 @@ TEST_F(AddressComputationFusionRewriterTest, SimpleCustomCallLegacy) {
   )";
 
   auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
-  RunAndFilecheckHloRewrite(hlo->ToString(),
-                            AddressComputationFusionRewriter(PLATFORM),
-                            expected, [](HloModule* module) {
-                              EXPECT_TRUE(module->has_schedule());
-                              TF_CHECK_OK(module->schedule().Verify());
-                            });
+  RunAndFilecheckHloRewrite(
+      hlo->ToString(), AddressComputationFusionRewriter(PLATFORM), expected);
+}
+
+TEST_F(AddressComputationFusionRewriterTest, TupleSliceCustomCallLegacy) {
+  XlaBuilder b(TestName());
+  CustomCall(
+      &b, "Callback_Void",
+      /*operands=*/
+      {
+          Tuple(&b,
+                {
+                    Slice(Broadcast(ConstantR0WithType(&b, F32, 5), {8, 8}),
+                          {0, 0}, {4, 8}, {1, 1}),
+                    Broadcast(ConstantR0WithType(&b, F32, 2), {256}),
+                }),
+          Tuple(&b,
+                {
+                    Broadcast(ConstantR0WithType(&b, F32, 3), {1024}),
+                    Broadcast(ConstantR0WithType(&b, F32, 4), {8}),
+                }),
+      },
+      ShapeUtil::MakeShape(F32, {128}), /*opaque=*/"");
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, b.Build());
+  xla::HloModuleConfig hlo_config(
+      xla::ProgramShape(computation.proto().host_program_shape()),
+      /*ignore_layouts=*/false);
+  DebugOptions debug_options = GetDebugOptionsForTest();
+  debug_options.set_xla_gpu_enable_address_computation_fusion(false);
+  hlo_config.set_debug_options(debug_options);
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo, xla::HloModule::CreateFromProto(
+                                        computation.proto(), hlo_config));
+  // TF_ASSERT_OK_AND_ASSIGN(
+  //     HloSchedule schedule,
+  //     ScheduleModule(hlo.get(), [](const BufferValue& buffer) {
+  //       return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
+  //     }));
+  // TF_CHECK_OK(hlo->set_schedule(std::move(schedule)));
+
+  const char* expected = R"(
+    ; CHECK:     %address-computation {{.*}} {
+    ; CHECK-DAG:   [[P0:%[^ ]+]] = f32[8,8]{1,0} parameter(0)
+    ; CHECK-DAG:   [[S0:%[^ ]+]] = f32[4,8]{1,0} slice([[P0]]), slice={[0:4], [0:8]}
+    ; CHECK-DAG:   [[P1:%[^ ]+]] = f32[256]{0} parameter(1)
+    ; CHECK-DAG:   [[T0:%[^ ]+]] = (f32[4,8]{1,0}, f32[256]{0}) tuple([[S0]], [[P1]])
+    ; CHECK-DAG:   [[P2:%[^ ]+]] = (f32[1024]{0}, f32[8]{0}) parameter(2)
+    ; CHECK:       ROOT [[CC:%[^ ]+]] = f32[128]{0} custom-call([[T0]], [[P2]]),
+    ; CHECK:              custom_call_target="Callback_Void"
+    ; CHECK:     }
+
+    ; CHECK:     ENTRY %{{.*}} {
+    ; CHECK:       ROOT [[FUSION:%[^ ]+]] = f32[128]{0} fusion(
+    ; CHECK:         kind=kCustom, calls=%address-computation,
+    ; CHECK:         backend_config={
+    ; CHECK:           "kind":"__custom_fusion",
+    ; CHECK:           "custom_fusion_config":{"name":"address_computation"}
+    ; CHECK:         }
+    ; CHECK:     }
+  )";
+
+  auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
+  RunAndFilecheckHloRewrite(
+      hlo->ToString(), AddressComputationFusionRewriter(PLATFORM), expected);
+}
+
+TEST_F(AddressComputationFusionRewriterTest, TupledOutputCustomCallLegacy) {
+  XlaBuilder b(TestName());
+  auto custom_call = CustomCall(
+      &b, "Callback_Void",
+      /*operands=*/
+      {
+          Tuple(&b,
+                {
+                    Slice(Broadcast(ConstantR0WithType(&b, F32, 5), {8, 8}),
+                          {0, 0}, {4, 8}, {1, 1}),
+                    Broadcast(ConstantR0WithType(&b, F32, 2), {256}),
+                }),
+          Tuple(&b,
+                {
+                    Broadcast(ConstantR0WithType(&b, F32, 3), {1024}),
+                    Broadcast(ConstantR0WithType(&b, F32, 4), {8}),
+                }),
+      },
+      ShapeUtil::MakeTupleShape({
+          ShapeUtil::MakeShape(F32, {8}),
+          ShapeUtil::MakeTupleShape({
+              ShapeUtil::MakeShape(F32, {128}),
+              ShapeUtil::MakeShape(F32, {256}),
+          }),
+          ShapeUtil::MakeShape(F32, {1024}),
+          ShapeUtil::MakeShape(F32, {4, 8}),
+      }),
+      /*opaque=*/"");
+  Tuple(&b, {GetTupleElement(GetTupleElement(custom_call, 1), 0),
+             GetTupleElement(custom_call, 2)});
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, b.Build());
+  xla::HloModuleConfig hlo_config(
+      xla::ProgramShape(computation.proto().host_program_shape()),
+      /*ignore_layouts=*/false);
+  DebugOptions debug_options = GetDebugOptionsForTest();
+  debug_options.set_xla_gpu_enable_address_computation_fusion(false);
+  hlo_config.set_debug_options(debug_options);
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo, xla::HloModule::CreateFromProto(
+                                        computation.proto(), hlo_config));
+  // TF_ASSERT_OK_AND_ASSIGN(
+  //     HloSchedule schedule,
+  //     ScheduleModule(hlo.get(), [](const BufferValue& buffer) {
+  //       return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
+  //     }));
+  // TF_CHECK_OK(hlo->set_schedule(std::move(schedule)));
+
+  const char* expected = R"(
+    ; CHECK:     %address-computation {{.*}} {
+    ; CHECK-DAG:   [[P2:%[^ ]+]] = (f32[1024]{0}, f32[8]{0}) parameter(2)
+    ; CHECK-DAG:   [[P1:%[^ ]+]] = f32[256]{0} parameter(1)
+    ; CHECK-DAG:   [[P0:%[^ ]+]] = f32[8,8]{1,0} parameter(0)
+    ; CHECK-DAG:   [[S0:%[^ ]+]] = f32[4,8]{1,0} slice([[P0]]), slice={[0:4], [0:8]}
+    ; CHECK-DAG:   [[T0:%[^ ]+]] = (f32[4,8]{1,0}, f32[256]{0}) tuple([[S0]], [[P1]])
+    ; CHECK:       [[CC:%[^ ]+]] = (f32[8]{0}, (f32[128]{0}, f32[256]{0}), f32[1024]{0}, f32[4,8]{1,0}) custom-call([[T0]], [[P2]]),
+    ; CHECK:              custom_call_target="Callback_Void"
+    ; CHECK-DAG:   [[GTE0:%[^ ]+]] = f32[8]{0} get-tuple-element([[CC]]), index=0
+    ; CHECK-DAG:   [[GTE1:%[^ ]+]] = (f32[128]{0}, f32[256]{0}) get-tuple-element([[CC]]), index=1
+    ; CHECK-DAG:   [[GTE2:%[^ ]+]] = f32[128]{0} get-tuple-element([[GTE1]]), index=0
+    ; CHECK-DAG:   [[GTE3:%[^ ]+]] = f32[256]{0} get-tuple-element([[GTE1]]), index=1
+    ; CHECK-DAG:   [[T1:%[^ ]+]] = (f32[128]{0}, f32[256]{0}) tuple([[GTE2]], [[GTE3]])
+    ; CHECK-DAG:   [[GTE4:%[^ ]+]] = f32[1024]{0} get-tuple-element([[CC]]), index=2
+    ; CHECK-DAG:   [[GTE5:%[^ ]+]] = f32[4,8]{1,0} get-tuple-element([[CC]]), index=3
+    ; CHECK:       ROOT {{.*}} = (f32[8]{0}, (f32[128]{0}, f32[256]{0}), f32[1024]{0}, f32[4,8]{1,0}) tuple([[GTE0]], [[T1]], [[GTE4]], [[GTE5]])
+    ; CHECK:     }
+
+    ; CHECK:     ENTRY %{{.*}} {
+    ; CHECK:       [[FUSION:%[^ ]+]] = (f32[8]{0}, (f32[128]{0}, f32[256]{0}), f32[1024]{0}, f32[4,8]{1,0}) fusion
+    ; CHECK:         kind=kCustom, calls=%address-computation,
+    ; CHECK:         backend_config={
+    ; CHECK:           "kind":"__custom_fusion",
+    ; CHECK:           "custom_fusion_config":{"name":"address_computation"}
+    ; CHECK:         }
+    ; CHECK-DAG:   [[GTE6:%[^ ]+]] = f32[1024]{0} get-tuple-element([[FUSION]]), index=2
+    ; CHECK-DAG:   [[GTE7:%[^ ]+]] = (f32[128]{0}, f32[256]{0}) get-tuple-element([[FUSION]]), index=1
+    ; CHECK-DAG:   [[GTE8:%[^ ]+]] = f32[128]{0} get-tuple-element([[GTE7]]), index=0
+    ; CHECK:       ROOT {{.*}} = (f32[128]{0}, f32[1024]{0}) tuple([[GTE8]], [[GTE6]])
+    ; CHECK:     }
+  )";
+
+  auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
+  RunAndFilecheckHloRewrite(
+      hlo->ToString(), AddressComputationFusionRewriter(PLATFORM), expected);
 }
 
 TEST_F(AddressComputationFusionRewriterTest, UnalignedSlice) {
@@ -1017,12 +1191,12 @@ TEST_F(AddressComputationFusionRewriterTest, UnalignedSlice) {
   hlo_config.set_debug_options(debug_options);
   TF_ASSERT_OK_AND_ASSIGN(auto hlo, xla::HloModule::CreateFromProto(
                                         computation.proto(), hlo_config));
-  TF_ASSERT_OK_AND_ASSIGN(
-      HloSchedule schedule,
-      ScheduleModule(hlo.get(), [](const BufferValue& buffer) {
-        return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
-      }));
-  TF_CHECK_OK(hlo->set_schedule(std::move(schedule)));
+  // TF_ASSERT_OK_AND_ASSIGN(
+  //     HloSchedule schedule,
+  //     ScheduleModule(hlo.get(), [](const BufferValue& buffer) {
+  //       return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
+  //     }));
+  // TF_CHECK_OK(hlo->set_schedule(std::move(schedule)));
 
   auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
   RunAndFilecheckHloRewrite(hlo->ToString(),
@@ -1030,4 +1204,566 @@ TEST_F(AddressComputationFusionRewriterTest, UnalignedSlice) {
                             std::nullopt);
 }
 
+TEST_F(AddressComputationFusionRewriterTest, DynamicSimpleGemm) {
+  const char* hlo = R"(
+    HloModule test
+
+    ENTRY main.9 {
+      p0 = f16[2,8,8]{2,1,0} parameter(0)
+      p1 = f16[2,8,8]{2,1,0} parameter(1)
+      c1_s32 = s32[] constant(1)
+      c0_s32 = s32[] constant(0)
+      slice.13 = f16[1,8,8]{2,1,0} dynamic-slice(p0, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+      bitcast.41 = f16[8,8]{1,0} bitcast(slice.13)
+      slice.14 = f16[1,8,8]{2,1,0} dynamic-slice(p1, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+      bitcast.42 = f16[8,8]{1,0} bitcast(slice.14)
+
+      ROOT custom-call.1 = f16[8,8]{1,0} custom-call(bitcast.41, bitcast.42),
+        custom_call_target="__cublas$gemm",
+        backend_config={"gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"64",
+          "rhs_stride":"64",
+          "grad_x":false,
+          "grad_y":false
+        }}
+    }
+  )";
+
+  const char* expected = R"(
+    ; CHECK:     address-computation {{.*}} {
+    ; CHECK-DAG:   [[P0:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(0)
+    ; CHECK-DAG:   [[P1:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(3)
+    ; CHECK-DAG:   [[C1:%[^ ]+]] = s32[] parameter(1)
+    ; CHECK-DAG:   [[C0:%[^ ]+]] = s32[] parameter(2)
+    ; CHECK-DAG:   [[S0:%[^ ]+]] = f16[1,8,8]{2,1,0} dynamic-slice([[P0]], [[C1]], [[C0]], [[C0]]), dynamic_slice_sizes={1,8,8}
+    ; CHECK-DAG:   [[B0:%[^ ]+]] = f16[8,8]{1,0} bitcast([[S0]])
+    ; CHECK-DAG:   [[S1:%[^ ]+]] = f16[1,8,8]{2,1,0} dynamic-slice([[P1]], [[C1]], [[C0]], [[C0]]), dynamic_slice_sizes={1,8,8}
+    ; CHECK-DAG:   [[B1:%[^ ]+]] = f16[8,8]{1,0} bitcast([[S1]])
+    ; CHECK:       ROOT [[CC:%[^ ]+]] = f16[8,8]{1,0} custom-call([[B0]], [[B1]]),
+    ; CHECK:              custom_call_target="__cublas$gemm"
+    ; CHECK:     }
+
+    ; CHECK:     ENTRY %main{{.*}} {
+    ; CHECK:       ROOT [[FUSION:%[^ ]+]] = f16[8,8]{1,0} fusion
+    ; CHECK:         kind=kCustom, calls=%address-computation,
+    ; CHECK:         backend_config={
+    ; CHECK:           "kind":"__custom_fusion",
+    ; CHECK:           "custom_fusion_config":{"name":"dynamic_address_computation"}
+    ; CHECK:         }
+    ; CHECK:     }
+  )";
+
+  auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
+  RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
+                            expected);
+}
+
+TEST_F(AddressComputationFusionRewriterTest, DynamicSimpleGemmWithWorkspace) {
+  const char* hlo = R"(
+    HloModule test
+
+    ENTRY main.9 {
+      p0 = f16[2,8,8]{2,1,0} parameter(0)
+      p1 = f16[2,8,8]{2,1,0} parameter(1)
+      c1_s32 = s32[] constant(1)
+      c0_s32 = s32[] constant(0)
+      slice.13 = f16[1,8,8]{2,1,0} dynamic-slice(p0, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+      bitcast.41 = f16[8,8]{1,0} bitcast(slice.13)
+      slice.14 = f16[1,8,8]{2,1,0} dynamic-slice(p1, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+      bitcast.42 = f16[8,8]{1,0} bitcast(slice.14)
+
+      ROOT custom-call.1 = (f16[8,8]{1,0}, s8[256]{0}) custom-call(bitcast.41, bitcast.42),
+        custom_call_target="__cublas$gemm",
+        backend_config={"gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"64",
+          "rhs_stride":"64",
+          "grad_x":false,
+          "grad_y":false
+        }}
+    }
+  )";
+
+  const char* expected = R"(
+    ; CHECK:     address-computation {{.*}} {
+    ; CHECK-DAG:   [[P0:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(0)
+    ; CHECK-DAG:   [[P1:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(3)
+    ; CHECK-DAG:   [[C1:%[^ ]+]] = s32[] parameter(1)
+    ; CHECK-DAG:   [[C0:%[^ ]+]] = s32[] parameter(2)
+    ; CHECK-DAG:   [[S0:%[^ ]+]] = f16[1,8,8]{2,1,0} dynamic-slice([[P0]], [[C1]], [[C0]], [[C0]]), dynamic_slice_sizes={1,8,8}
+    ; CHECK-DAG:   [[B0:%[^ ]+]] = f16[8,8]{1,0} bitcast([[S0]])
+    ; CHECK-DAG:   [[S1:%[^ ]+]] = f16[1,8,8]{2,1,0} dynamic-slice([[P1]], [[C1]], [[C0]], [[C0]]), dynamic_slice_sizes={1,8,8}
+    ; CHECK-DAG:   [[B1:%[^ ]+]] = f16[8,8]{1,0} bitcast([[S1]])
+    ; CHECK:       [[CC:%[^ ]+]] = (f16[8,8]{1,0}, s8[256]{0}) custom-call([[B0]], [[B1]]),
+    ; CHECK:              custom_call_target="__cublas$gemm"
+    ; CHECK:       [[DOT:%[^ ]+]] = f16[8,8]{1,0} get-tuple-element([[CC]]), index=0
+    ; CHECK:       [[WORKSPACE:%[^ ]+]] = s8[256]{0} get-tuple-element([[CC]]), index=1
+    ; CHECK:       ROOT [[TUPLE:%[^ ]+]] = (f16[8,8]{1,0}, s8[256]{0})
+    ; CHECK:              tuple([[DOT]], [[WORKSPACE]])
+    ; CHECK:     }
+
+
+    ; CHECK:     ENTRY %main{{.*}} {
+    ; CHECK:       ROOT [[FUSION:%[^ ]+]] = (f16[8,8]{1,0}, s8[256]{0}) fusion
+    ; CHECK:         kind=kCustom, calls=%address-computation,
+    ; CHECK:         backend_config={
+    ; CHECK:           "kind":"__custom_fusion",
+    ; CHECK:           "custom_fusion_config":{"name":"dynamic_address_computation"}
+    ; CHECK:         }
+    ; CHECK:     }
+  )";
+
+  auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
+  RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
+                            expected);
+}
+
+TEST_F(AddressComputationFusionRewriterTest,
+       DynamicSimpleGemmWorkspaceIgnored) {
+  const char* hlo = R"(
+    HloModule test
+
+    ENTRY main.9 {
+      p0 = f16[2,8,8]{2,1,0} parameter(0)
+      p1 = f16[2,8,8]{2,1,0} parameter(1)
+      c1_s32 = s32[] constant(1)
+      c0_s32 = s32[] constant(0)
+      slice.13 = f16[1,8,8]{2,1,0} dynamic-slice(p0, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+      bitcast.41 = f16[8,8]{1,0} bitcast(slice.13)
+      slice.14 = f16[1,8,8]{2,1,0} dynamic-slice(p1, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+      bitcast.42 = f16[8,8]{1,0} bitcast(slice.14)
+
+      custom-call.1 = (f16[8,8]{1,0}, s8[256]{0}) custom-call(bitcast.41, bitcast.42),
+        custom_call_target="__cublas$gemm",
+        backend_config={"gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"64",
+          "rhs_stride":"64",
+          "grad_x":false,
+          "grad_y":false
+        }}
+      ROOT get-tuple-element.0 = f16[8,8]{1,0} get-tuple-element(custom-call.1), index=0
+    }
+  )";
+
+  const char* expected = R"(
+    ; CHECK:     address-computation {{.*}} {
+    ; CHECK-DAG:   [[P0:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(0)
+    ; CHECK-DAG:   [[P1:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(3)
+    ; CHECK-DAG:   [[C1:%[^ ]+]] = s32[] parameter(1)
+    ; CHECK-DAG:   [[C0:%[^ ]+]] = s32[] parameter(2)
+    ; CHECK-DAG:   [[S0:%[^ ]+]] = f16[1,8,8]{2,1,0} dynamic-slice([[P0]], [[C1]], [[C0]], [[C0]]), dynamic_slice_sizes={1,8,8}
+    ; CHECK-DAG:   [[B0:%[^ ]+]] = f16[8,8]{1,0} bitcast([[S0]])
+    ; CHECK-DAG:   [[S1:%[^ ]+]] = f16[1,8,8]{2,1,0} dynamic-slice([[P1]], [[C1]], [[C0]], [[C0]]), dynamic_slice_sizes={1,8,8}
+    ; CHECK-DAG:   [[B1:%[^ ]+]] = f16[8,8]{1,0} bitcast([[S1]])
+    ; CHECK:       [[CC:%[^ ]+]] = (f16[8,8]{1,0}, s8[256]{0}) custom-call([[B0]], [[B1]]),
+    ; CHECK:              custom_call_target="__cublas$gemm"
+    ; CHECK:       [[DOT:%[^ ]+]] = f16[8,8]{1,0} get-tuple-element([[CC]]), index=0
+    ; CHECK:       [[WORKSPACE:%[^ ]+]] = s8[256]{0} get-tuple-element([[CC]]), index=1
+    ; CHECK:       ROOT [[TUPLE:%[^ ]+]] = (f16[8,8]{1,0}, s8[256]{0})
+    ; CHECK:              tuple([[DOT]], [[WORKSPACE]])
+    ; CHECK:     }
+
+    ; CHECK:     ENTRY %main{{.*}} {
+    ; CHECK:       [[FUSION:%[^ ]+]] = (f16[8,8]{1,0}, s8[256]{0}) fusion
+    ; CHECK:         kind=kCustom, calls=%address-computation,
+    ; CHECK:         backend_config={
+    ; CHECK:           "kind":"__custom_fusion",
+    ; CHECK:           "custom_fusion_config":{"name":"dynamic_address_computation"}
+    ; CHECK:         }
+    ; CHECK:       ROOT [[DOT_MAIN:%[^ ]+]] = f16[8,8]{1,0} get-tuple-element([[FUSION]]), index=0
+    ; CHECK:     }
+  )";
+
+  auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
+  RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
+                            expected);
+}
+
+TEST_F(AddressComputationFusionRewriterTest, DynamicSimpleGemmNotRoot) {
+  const char* hlo = R"(
+    HloModule test
+
+    ENTRY main.9 {
+      p0 = f16[2,8,8]{2,1,0} parameter(0)
+      p1 = f16[2,8,8]{2,1,0} parameter(1)
+      c1_s32 = s32[] constant(1)
+      c0_s32 = s32[] constant(0)
+      slice.13 = f16[1,8,8]{2,1,0} dynamic-slice(p0, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+      bitcast.41 = f16[8,8]{1,0} bitcast(slice.13)
+      slice.14 = f16[1,8,8]{2,1,0} dynamic-slice(p1, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+      bitcast.42 = f16[8,8]{1,0} bitcast(slice.14)
+
+      custom-call.1 = f16[8,8]{1,0} custom-call(bitcast.41, bitcast.42),
+        custom_call_target="__cublas$gemm",
+        backend_config={"gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"64",
+          "rhs_stride":"64",
+          "grad_x":false,
+          "grad_y":false
+        }}
+      ROOT res = f16[8,8]{1,0} add(custom-call.1, custom-call.1)
+    }
+  )";
+
+  const char* expected = R"(
+    ; CHECK:     address-computation {{.*}} {
+    ; CHECK-DAG:   [[P0:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(0)
+    ; CHECK-DAG:   [[P1:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(3)
+    ; CHECK-DAG:   [[C1:%[^ ]+]] = s32[] parameter(1)
+    ; CHECK-DAG:   [[C0:%[^ ]+]] = s32[] parameter(2)
+    ; CHECK-DAG:   [[S0:%[^ ]+]] = f16[1,8,8]{2,1,0} dynamic-slice([[P0]], [[C1]], [[C0]], [[C0]]), dynamic_slice_sizes={1,8,8}
+    ; CHECK-DAG:   [[B0:%[^ ]+]] = f16[8,8]{1,0} bitcast([[S0]])
+    ; CHECK-DAG:   [[S1:%[^ ]+]] = f16[1,8,8]{2,1,0} dynamic-slice([[P1]], [[C1]], [[C0]], [[C0]]), dynamic_slice_sizes={1,8,8}
+    ; CHECK-DAG:   [[B1:%[^ ]+]] = f16[8,8]{1,0} bitcast([[S1]])
+    ; CHECK:       ROOT [[CC:%[^ ]+]] = f16[8,8]{1,0} custom-call([[B0]], [[B1]]),
+    ; CHECK:              custom_call_target="__cublas$gemm"
+    ; CHECK:     }
+
+    ; CHECK:     ENTRY %main{{.*}} {
+    ; CHECK:       [[FUSION:%[^ ]+]] = f16[8,8]{1,0} fusion
+    ; CHECK:         kind=kCustom, calls=%address-computation,
+    ; CHECK:         backend_config={
+    ; CHECK:           "kind":"__custom_fusion",
+    ; CHECK:           "custom_fusion_config":{"name":"dynamic_address_computation"}
+    ; CHECK:         }
+    ; CHECK:       ROOT {{.*}} = f16[8,8]{1,0} add([[FUSION]], [[FUSION]])
+    ; CHECK:     }
+  )";
+
+  auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
+  RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
+                            expected);
+}
+
+TEST_F(AddressComputationFusionRewriterTest, DUSSimpleGemm) {
+  const char* hlo = R"(
+    HloModule test
+
+    ENTRY main.9 {
+      p0 = f16[1,8,8]{2,1,0} parameter(0)
+      p1 = f16[1,8,8]{2,1,0} parameter(1)
+      p2 = f16[4,8,8]{2,1,0} parameter(2)
+      c1_s32 = s32[] constant(1)
+      c0_s32 = s32[] constant(0)
+      bitcast.41 = f16[8,8]{1,0} bitcast(p0)
+      bitcast.42 = f16[8,8]{1,0} bitcast(p1)
+
+      custom-call.1 = f16[8,8]{1,0} custom-call(bitcast.41, bitcast.42),
+        custom_call_target="__cublas$gemm",
+        backend_config={"gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"64",
+          "rhs_stride":"64",
+          "grad_x":false,
+          "grad_y":false
+        }}
+      bitcast.43 = f16[1,8,8]{2,1,0} bitcast(custom-call.1)
+      ROOT dus = f16[4,8,8]{2,1,0} dynamic-update-slice(p2, bitcast.43, c1_s32, c0_s32, c0_s32)
+    }
+  )";
+
+  const char* expected = R"(
+    ; CHECK-DAG:   [[P0:%[^ ]+]] = f16[8,8]{1,0} parameter(0)
+    ; CHECK-DAG:   [[P1:%[^ ]+]] = f16[8,8]{1,0} parameter(1)
+    ; CHECK-DAG:   [[P2:%[^ ]+]] = f16[4,8,8]{2,1,0} parameter(2)
+    ; CHECK-DAG:   [[C1:%[^ ]+]] = s32[] parameter(3)
+    ; CHECK-DAG:   [[C0:%[^ ]+]] = s32[] parameter(4)
+    ; CHECK-DAG:   [[CC:%[^ ]+]] = f16[8,8]{1,0} custom-call([[P0]], [[P1]]),
+    ; CHECK-DAG:          custom_call_target="__cublas$gemm"
+    ; CHECK-DAG:   [[BC:%[^ ]+]] = f16[1,8,8]{2,1,0} bitcast([[CC]])
+    ; CHECK:       ROOT {{.*}} = f16[4,8,8]{2,1,0} dynamic-update-slice([[P2]], [[BC]], [[C1]], [[C0]], [[C0]])
+    ; CHECK:     }
+
+    ; CHECK:     ENTRY %main{{.*}} {
+    ; CHECK:       ROOT [[FUSION:%[^ ]+]] = f16[4,8,8]{2,1,0} fusion
+    ; CHECK:         kind=kCustom, calls=%address-computation,
+    ; CHECK:         backend_config={
+    ; CHECK:           "kind":"__custom_fusion",
+    ; CHECK:           "custom_fusion_config":{"name":"dynamic_address_computation"}
+    ; CHECK:         }
+    ; CHECK:     }
+  )";
+
+  auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
+  RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
+                            expected);
+}
+
+TEST_F(AddressComputationFusionRewriterTest, DUSSimpleGemmNotRoot) {
+  const char* hlo = R"(
+    HloModule test
+
+    ENTRY main.9 {
+      p0 = f16[2,8,8]{2,1,0} parameter(0)
+      p1 = f16[2,8,8]{2,1,0} parameter(1)
+      p2 = f16[4,8,8]{2,1,0} parameter(2)
+      c1_s32 = s32[] constant(1)
+      c0_s32 = s32[] constant(0)
+      slice.13 = f16[1,8,8]{2,1,0} dynamic-slice(p0, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+      bitcast.41 = f16[8,8]{1,0} bitcast(slice.13)
+      slice.14 = f16[1,8,8]{2,1,0} dynamic-slice(p1, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+      bitcast.42 = f16[8,8]{1,0} bitcast(slice.14)
+
+      custom-call.1 = f16[8,8]{1,0} custom-call(bitcast.41, bitcast.42),
+        custom_call_target="__cublas$gemm",
+        backend_config={"gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"64",
+          "rhs_stride":"64",
+          "grad_x":false,
+          "grad_y":false
+        }}
+      bitcast.43 = f16[1,8,8]{2,1,0} bitcast(custom-call.1)
+      dus = f16[4,8,8]{2,1,0} dynamic-update-slice(p2, bitcast.43, c1_s32, c0_s32, c0_s32)
+      ROOT res = f16[4,8,8]{2,1,0} log(dus)
+    }
+  )";
+
+  const char* expected = R"(
+    ; CHECK:     address-computation {{.*}} {
+    ; CHECK-DAG:   [[P0:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(0)
+    ; CHECK-DAG:   [[P1:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(3)
+    ; CHECK-DAG:   [[P2:%[^ ]+]] = f16[4,8,8]{2,1,0} parameter(4)
+    ; CHECK-DAG:   [[C1:%[^ ]+]] = s32[] parameter(1)
+    ; CHECK-DAG:   [[C0:%[^ ]+]] = s32[] parameter(2)
+    ; CHECK-DAG:   [[S0:%[^ ]+]] = f16[1,8,8]{2,1,0} dynamic-slice([[P0]], [[C1]], [[C0]], [[C0]]), dynamic_slice_sizes={1,8,8}
+    ; CHECK-DAG:   [[B0:%[^ ]+]] = f16[8,8]{1,0} bitcast([[S0]])
+    ; CHECK-DAG:   [[S1:%[^ ]+]] = f16[1,8,8]{2,1,0} dynamic-slice([[P1]], [[C1]], [[C0]], [[C0]]), dynamic_slice_sizes={1,8,8}
+    ; CHECK-DAG:   [[B1:%[^ ]+]] = f16[8,8]{1,0} bitcast([[S1]])
+    ; CHECK-DAG:   [[CC:%[^ ]+]] = f16[8,8]{1,0} custom-call([[B0]], [[B1]]),
+    ; CHECK-DAG:          custom_call_target="__cublas$gemm"
+    ; CHECK-DAG:   [[BC:%[^ ]+]] = f16[1,8,8]{2,1,0} bitcast([[CC]])
+    ; CHECK:       ROOT {{.*}} = f16[4,8,8]{2,1,0} dynamic-update-slice([[P2]], [[BC]], [[C1]], [[C0]], [[C0]])
+    ; CHECK:     }
+
+    ; CHECK:     ENTRY %main{{.*}} {
+    ; CHECK:       [[FUSION:%[^ ]+]] = f16[4,8,8]{2,1,0} fusion
+    ; CHECK:         kind=kCustom, calls=%address-computation,
+    ; CHECK:         backend_config={
+    ; CHECK:           "kind":"__custom_fusion",
+    ; CHECK:           "custom_fusion_config":{"name":"dynamic_address_computation"}
+    ; CHECK:         }
+    ; CHECK:       ROOT {{.*}} = f16[4,8,8]{2,1,0} log([[FUSION]])
+    ; CHECK:     }
+  )";
+
+  auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
+  RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
+                            expected);
+}
+
+TEST_F(AddressComputationFusionRewriterTest, DUSSimpleGemmWithWorkspace) {
+  const char* hlo = R"(
+    HloModule test
+
+    ENTRY main.9 {
+      p0 = f16[2,8,8]{2,1,0} parameter(0)
+      p1 = f16[2,8,8]{2,1,0} parameter(1)
+      p2 = f16[4,8,8]{2,1,0} parameter(2)
+      c1_s32 = s32[] constant(1)
+      c0_s32 = s32[] constant(0)
+      slice.13 = f16[1,8,8]{2,1,0} dynamic-slice(p0, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+      bitcast.41 = f16[8,8]{1,0} bitcast(slice.13)
+      slice.14 = f16[1,8,8]{2,1,0} dynamic-slice(p1, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+      bitcast.42 = f16[8,8]{1,0} bitcast(slice.14)
+
+      custom-call.1 = (f16[8,8]{1,0}, s8[256]{0}) custom-call(bitcast.41, bitcast.42),
+        custom_call_target="__cublas$gemm",
+        backend_config={"gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"64",
+          "rhs_stride":"64",
+          "grad_x":false,
+          "grad_y":false
+        }}
+
+    get-tuple-element.0 = f16[8,8]{1,0} get-tuple-element(custom-call.1), index=0
+    bitcast.43 = f16[1,8,8]{2,1,0} bitcast(get-tuple-element.0)
+    dus = f16[4,8,8]{2,1,0} dynamic-update-slice(p2, bitcast.43, c1_s32, c0_s32, c0_s32)
+    get-tuple-element.1 = s8[256]{0} get-tuple-element(custom-call.1), index=1
+    ROOT tuple = (f16[4,8,8]{2,1,0}, s8[256]{0}) tuple(dus, get-tuple-element.1)
+    }
+  )";
+
+  const char* expected = R"(
+    ; CHECK:     address-computation {{.*}} {
+    ; CHECK-DAG:   [[P0:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(0)
+    ; CHECK-DAG:   [[P1:%[^ ]+]] = f16[2,8,8]{2,1,0} parameter(3)
+    ; CHECK-DAG:   [[P2:%[^ ]+]] = f16[4,8,8]{2,1,0} parameter(4)
+    ; CHECK-DAG:   [[C1:%[^ ]+]] = s32[] parameter(1)
+    ; CHECK-DAG:   [[C0:%[^ ]+]] = s32[] parameter(2)
+    ; CHECK-DAG:   [[S0:%[^ ]+]] = f16[1,8,8]{2,1,0} dynamic-slice([[P0]], [[C1]], [[C0]], [[C0]]), dynamic_slice_sizes={1,8,8}
+    ; CHECK-DAG:   [[B0:%[^ ]+]] = f16[8,8]{1,0} bitcast([[S0]])
+    ; CHECK-DAG:   [[S1:%[^ ]+]] = f16[1,8,8]{2,1,0} dynamic-slice([[P1]], [[C1]], [[C0]], [[C0]]), dynamic_slice_sizes={1,8,8}
+    ; CHECK-DAG:   [[B1:%[^ ]+]] = f16[8,8]{1,0} bitcast([[S1]])
+    ; CHECK:       [[CC:%[^ ]+]] = (f16[8,8]{1,0}, s8[256]{0}) custom-call([[B0]], [[B1]]),
+    ; CHECK:              custom_call_target="__cublas$gemm"
+    ; CHECK:       [[DOT:%[^ ]+]] = f16[8,8]{1,0} get-tuple-element([[CC]]), index=0
+    ; CHECK:       [[BC:%[^ ]+]] = f16[1,8,8]{2,1,0} bitcast([[DOT]])
+    ; CHECK:       [[DUS:%[^ ]+]] = f16[4,8,8]{2,1,0} dynamic-update-slice([[P2]], [[BC]], [[C1]], [[C0]], [[C0]])
+    ; CHECK:       [[WORKSPACE:%[^ ]+]] = s8[256]{0} get-tuple-element([[CC]]), index=1
+    ; CHECK:       ROOT [[TUPLE:%[^ ]+]] = (f16[4,8,8]{2,1,0}, s8[256]{0})
+    ; CHECK:              tuple([[DUS]], [[WORKSPACE]])
+    ; CHECK:     }
+
+    ; CHECK:     ENTRY %main{{.*}} {
+    ; CHECK:       [[FUSION:%[^ ]+]] = (f16[4,8,8]{2,1,0}, s8[256]{0}) fusion
+    ; CHECK:         kind=kCustom, calls=%address-computation,
+    ; CHECK:         backend_config={
+    ; CHECK:           "kind":"__custom_fusion",
+    ; CHECK:           "custom_fusion_config":{"name":"dynamic_address_computation"}
+    ; CHECK:         }
+    ; CHECK:       [[DUS_MAIN:%[^ ]+]] = f16[4,8,8]{2,1,0} get-tuple-element([[FUSION]]), index=0
+    ; CHECK:       [[WORKSPACE_MAIN:%[^ ]+]] = s8[256]{0} get-tuple-element([[FUSION]]), index=1
+    ; CHECK:       ROOT {{.*}} = (f16[4,8,8]{2,1,0}, s8[256]{0})
+    ; CHECK:              tuple([[DUS_MAIN]], [[WORKSPACE_MAIN]])
+    ; CHECK:     }
+  )";
+
+  auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
+  RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
+                            expected);
+}
+
+TEST_F(AddressComputationFusionRewriterTest, DUSSimpleGemmWorkspaceIgnored) {
+  const char* hlo = R"(
+    HloModule test
+
+    ENTRY %main.9 {
+      %p0 = f16[8,8]{1,0} parameter(0)
+      %p1 = f16[8,8]{1,0} parameter(1)
+      %p2 = f16[4,8,8]{2,1,0} parameter(2)
+      %c1_s32 = s32[] constant(1)
+      %c0_s32 = s32[] constant(0)
+
+      %custom-call.1 = (f16[8,8]{1,0}, s8[256]{0}) custom-call(%p0, %p1),
+        custom_call_target="__cublas$gemm",
+        backend_config={"gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"64",
+          "rhs_stride":"64",
+          "grad_x":false,
+          "grad_y":false
+        }}
+      %get-tuple-element.0 = f16[8,8]{1,0} get-tuple-element(%custom-call.1), index=0
+      %bitcast.43 = f16[1,8,8]{2,1,0} bitcast(%get-tuple-element.0)
+      ROOT %dus = f16[4,8,8]{2,1,0} dynamic-update-slice(%p2, %bitcast.43, %c1_s32, %c0_s32, %c0_s32)
+    })";
+
+  const char* expected = R"(
+    ; CHECK:     address-computation {{.*}} {
+    ; CHECK-DAG:   [[P0:%[^ ]+]] = f16[8,8]{1,0} parameter(0)
+    ; CHECK-DAG:   [[P1:%[^ ]+]] = f16[8,8]{1,0} parameter(1)
+    ; CHECK-DAG:   [[P2:%[^ ]+]] = f16[4,8,8]{2,1,0} parameter(2)
+    ; CHECK-DAG:   [[C1:%[^ ]+]] = s32[] parameter(3)
+    ; CHECK-DAG:   [[C0:%[^ ]+]] = s32[] parameter(4)
+    ; CHECK-DAG:   [[CC:%[^ ]+]] = (f16[8,8]{1,0}, s8[256]{0}) custom-call([[P0]], [[P1]]),
+    ; CHECK-DAG:          custom_call_target="__cublas$gemm"
+    ; CHECK-DAG:   [[DOT:%[^ ]+]] = f16[8,8]{1,0} get-tuple-element([[CC]]), index=0
+    ; CHECK-DAG:   [[BC:%[^ ]+]] = f16[1,8,8]{2,1,0} bitcast([[DOT]])
+    ; CHECK-DAG:   [[DUS:%[^ ]+]] = f16[4,8,8]{2,1,0} dynamic-update-slice([[P2]], [[BC]], [[C1]], [[C0]], [[C0]])
+    ; CHECK-DAG:   [[WORKSPACE:%[^ ]+]] = s8[256]{0} get-tuple-element([[CC]]), index=1
+    ; CHECK:       ROOT [[TUPLE:%[^ ]+]] = (f16[4,8,8]{2,1,0}, s8[256]{0})
+    ; CHECK:              tuple([[DUS]], [[WORKSPACE]])
+    ; CHECK:     }
+
+    ; CHECK:     ENTRY %main{{.*}} {
+    ; CHECK:       [[FUSION:%[^ ]+]] = (f16[4,8,8]{2,1,0}, s8[256]{0}) fusion
+    ; CHECK:         kind=kCustom, calls=%address-computation,
+    ; CHECK:         backend_config={
+    ; CHECK:           "kind":"__custom_fusion",
+    ; CHECK:           "custom_fusion_config":{"name":"dynamic_address_computation"}
+    ; CHECK:         }
+    ; CHECK:       ROOT [[DOT_MAIN:%[^ ]+]] = f16[4,8,8]{2,1,0} get-tuple-element([[FUSION]]), index=0
+    ; CHECK:     }
+  )";
+
+  auto device = TestGpuDeviceInfo::RTXA6000DeviceInfo();
+  RunAndFilecheckHloRewrite(hlo, AddressComputationFusionRewriter(PLATFORM),
+                            expected);
+}
+
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/amdgpu_compiler.cc b/third_party/xla/xla/service/gpu/amdgpu_compiler.cc
index 723585608b5c55..f429e20f27d1ac 100644
--- a/third_party/xla/xla/service/gpu/amdgpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/amdgpu_compiler.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "xla/service/call_inliner.h"
 #include "xla/service/convert_mover.h"
 #include "xla/service/dot_dimension_merger.h"
+#include "xla/service/float_normalization.h"
 #include "xla/service/gpu/autotuner_util.h"
 #include "xla/service/gpu/conv_algorithm_picker.h"
 #include "xla/service/gpu/cublas_pad_for_gemms.h"
@@ -61,6 +62,34 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+namespace {
+
+struct ConvBfloat16Support : public FloatSupport {
+  explicit ConvBfloat16Support(const se::RocmComputeCapability& rocm)
+      : FloatSupport(BF16),
+        // TODO: MIOpen does not support bf16 convolutions yet
+        is_conv_bf16_supported_(rocm.has_bf16_dtype_support()) {}
+
+  bool SupportsLowPrecisionOperand(const HloInstruction& hlo,
+                                   int64_t operand_index) const override {
+    return (hlo.opcode() != HloOpcode::kConvolution) || is_conv_bf16_supported_;
+  }
+
+  bool SupportsLowPrecisionOutput(const HloInstruction& hlo) const override {
+    return (hlo.opcode() != HloOpcode::kConvolution) || is_conv_bf16_supported_;
+  }
+
+  bool SupportsMixedPrecisions(const HloInstruction& hlo) const override {
+    // Skip all HLOs other than convolutions.
+    return (hlo.opcode() != HloOpcode::kConvolution);
+  }
+
+ private:
+  bool is_conv_bf16_supported_;
+};
+
+}  // namespace
+
 absl::Status AMDGPUCompiler::OptimizeHloConvolutionCanonicalization(
     HloModule* hlo_module, se::GpuComputeCapability gpu_version,
     se::dnn::VersionInfo dnn_version,
@@ -71,6 +100,12 @@ absl::Status AMDGPUCompiler::OptimizeHloConvolutionCanonicalization(
   pipeline.AddInvariantCheckerDebug<HloVerifier>(
       /*layout_sensitive=*/false,
       /*allow_mixed_precision=*/false);
+
+  // Convert unsupported bf16 convolutions to f32.
+  ConvBfloat16Support conv_bf16_support(
+      std::get<se::RocmComputeCapability>(gpu_version));
+  pipeline.AddPass<FloatNormalization>(&conv_bf16_support);
+
   pipeline.AddPass<GpusolverRewriter>();
   pipeline.AddPass<GpuConvRewriter>();
   pipeline.AddPass<GpuConvPaddingLegalization>();
diff --git a/third_party/xla/xla/service/gpu/auto_sharding_gpu_compiler_test.cc b/third_party/xla/xla/service/gpu/auto_sharding_gpu_compiler_test.cc
index 06928aa44a08b1..eab4b0d48e5dbb 100644
--- a/third_party/xla/xla/service/gpu/auto_sharding_gpu_compiler_test.cc
+++ b/third_party/xla/xla/service/gpu/auto_sharding_gpu_compiler_test.cc
@@ -17,10 +17,12 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/hlo_module_config.h"
 #include "xla/service/pattern_matcher.h"
 #include "xla/service/pattern_matcher_gmock.h"
+#include "xla/shape_util.h"
 #include "xla/tests/hlo_test_base.h"
 #include "tsl/platform/logging.h"
 
@@ -30,6 +32,8 @@ namespace {
 
 namespace m = ::xla::match;
 
+using ::testing::Conditional;
+
 class AutoShardingTest : public HloTestBase {
  protected:
   const char* const dot_hlo_string_ = R"(
@@ -60,14 +64,35 @@ ENTRY matmul {
 };
 
 TEST_F(AutoShardingTest, MatMulWithAutosharding) {
-  auto compiled_module = CompileMatMul(true, 4);
-  auto* instruction =
+  std::unique_ptr<HloModule> compiled_module = CompileMatMul(true, 4);
+  const HloInstruction* parameter1 =
       compiled_module->entry_computation()->parameter_instruction(0);
-  VLOG(2) << instruction->ToString();
+  const HloInstruction* parameter2 =
+      compiled_module->entry_computation()->parameter_instruction(1);
+  bool is_parameter1_replicated = ShapeUtil::Equal(
+      parameter1->shape(), ShapeUtil::MakeShape(PrimitiveType::F32, {32, 64}));
+  bool is_parameter2_replicated = ShapeUtil::Equal(
+      parameter2->shape(), ShapeUtil::MakeShape(PrimitiveType::F32, {64, 128}));
+
+  // Check that at least one of the parameters is sharded, thereby telling us
+  // that the dot is as well.
+  VLOG(2) << parameter1->ToString();
+  EXPECT_THAT(
+      parameter1,
+      Conditional(
+          is_parameter2_replicated,
+          AnyOf(GmockMatch(m::Op().WithShape(PrimitiveType::F32, {8, 64})),
+                GmockMatch(m::Op().WithShape(PrimitiveType::F32, {32, 16}))),
+          GmockMatch(m::Op().WithShape(PrimitiveType::F32, {32, 64}))));
+
+  VLOG(2) << parameter2->ToString();
   EXPECT_THAT(
-      instruction,
-      AnyOf(GmockMatch(m::Op().WithSharding("{devices=[1,4]0,1,2,3}")),
-            GmockMatch(m::Op().WithSharding("{devices=[4,1]0,1,2,3}"))));
+      parameter2,
+      Conditional(
+          is_parameter1_replicated,
+          AnyOf(GmockMatch(m::Op().WithShape(PrimitiveType::F32, {16, 128})),
+                GmockMatch(m::Op().WithShape(PrimitiveType::F32, {64, 32}))),
+          GmockMatch(m::Op().WithShape(PrimitiveType::F32, {64, 128}))));
 }
 
 TEST_F(AutoShardingTest, MatMulWithoutAutosharding) {
diff --git a/third_party/xla/xla/service/gpu/autotuner_util_test.cc b/third_party/xla/xla/service/gpu/autotuner_util_test.cc
index b755334876cc26..28ec27c64e8da0 100644
--- a/third_party/xla/xla/service/gpu/autotuner_util_test.cc
+++ b/third_party/xla/xla/service/gpu/autotuner_util_test.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tsl/platform/env.h"
 #include "tsl/platform/logging.h"   // IWYU pragma: keep
 #include "tsl/platform/protobuf.h"  // IWYU pragma: keep
+#include "tsl/platform/status_matchers.h"
 
 namespace xla {
 namespace gpu {
@@ -45,7 +46,7 @@ using ::testing::HasSubstr;
 using ::testing::IsEmpty;
 using ::testing::Not;
 using ::testing::TempDir;
-using ::testing::status::StatusIs;
+using ::tsl::testing::StatusIs;
 
 class AutotunerUtilTest : public HloTestBase {
  protected:
diff --git a/third_party/xla/xla/service/gpu/backend_configs.proto b/third_party/xla/xla/service/gpu/backend_configs.proto
index 39d1afcaee03fb..61335f07f0497a 100644
--- a/third_party/xla/xla/service/gpu/backend_configs.proto
+++ b/third_party/xla/xla/service/gpu/backend_configs.proto
@@ -151,7 +151,6 @@ message CustomFusionConfig {
 
 message CuDnnFusionConfig {
   int64 plan_id = 1;
-  optional string serialized_graph = 2;
 }
 
 message FusionBackendConfig {
@@ -260,4 +259,16 @@ message GpuBackendConfig {
 
     CudnnfMHABackendConfig cudnn_fmha_backend_config = 9;
   }
+
+  // This attribute instructs the latency-hiding scheduler to
+  // schedule this particular instruction to the earliest position.
+  // Note that setting this to true will make this instruction scheduled
+  // at the very beginning of the parent computation before
+  // every other nodes.
+  // An example use case would be deciding to schedule between collective
+  // or an async compute. LHS might put either one at the first place
+  // depending on the cost, but it'd be more beneficial if the collective
+  // is always scheduled first as it's not SM-heavy.
+  // In this case we can use this flag to enforce the ordering.
+  bool force_earliest_schedule = 10;
 }
diff --git a/third_party/xla/xla/service/gpu/backend_configs_test.cc b/third_party/xla/xla/service/gpu/backend_configs_test.cc
index 8992257bb8a6bb..7a02490907055c 100644
--- a/third_party/xla/xla/service/gpu/backend_configs_test.cc
+++ b/third_party/xla/xla/service/gpu/backend_configs_test.cc
@@ -133,7 +133,8 @@ TEST_F(BackendConfigsTest, DefaultGpuBackendConfigSetOpQueue) {
   gpu_backend_config.set_operation_queue_id(2);
   EXPECT_THAT(add->set_backend_config(gpu_backend_config), IsOk());
   EXPECT_EQ(add->raw_backend_config_string(),
-            "{\"operation_queue_id\":\"2\",\"wait_on_operation_queues\":[]}");
+            "{\"operation_queue_id\":\"2\",\"wait_on_operation_queues\":[],"
+            "\"force_earliest_schedule\":false}");
 }
 
 TEST_F(BackendConfigsTest, DefaultGpuBackendConfigSetWaitOnQueue) {
@@ -160,7 +161,7 @@ TEST_F(BackendConfigsTest, DefaultGpuBackendConfigSetWaitOnQueue) {
   EXPECT_THAT(add->set_backend_config(gpu_backend_config), IsOk());
   EXPECT_EQ(add->raw_backend_config_string(),
             "{\"operation_queue_id\":\"0\",\"wait_on_operation_queues\":[\"0\","
-            "\"1\"]}");
+            "\"1\"],\"force_earliest_schedule\":false}");
   TF_ASSERT_OK_AND_ASSIGN(GpuBackendConfig config,
                           add->backend_config<GpuBackendConfig>());
 }
diff --git a/third_party/xla/xla/service/gpu/collective_permute_cycle_decomposer.cc b/third_party/xla/xla/service/gpu/collective_permute_cycle_decomposer.cc
index 57168b9c8e897d..8edf682481baf9 100644
--- a/third_party/xla/xla/service/gpu/collective_permute_cycle_decomposer.cc
+++ b/third_party/xla/xla/service/gpu/collective_permute_cycle_decomposer.cc
@@ -16,11 +16,14 @@ limitations under the License.
 #include "xla/service/gpu/collective_permute_cycle_decomposer.h"
 
 #include <cstdint>
+#include <string>
 #include <utility>
 #include <vector>
 
 #include "absl/container/flat_hash_set.h"
 #include "absl/log/check.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "absl/strings/string_view.h"
 #include "xla/comparison_util.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
@@ -30,10 +33,13 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/utils/hlo_query.h"
 #include "xla/literal_util.h"
+#include "xla/service/collective_ops_utils.h"
 #include "xla/service/gpu/backend_configs.pb.h"
+#include "xla/service/hlo_parser.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/status.h"
+#include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/errors.h"
 
@@ -118,6 +124,57 @@ CycleType ShouldDecomposeWithCycleType(
                                   : CycleType::kUnknown;
 }
 
+// Constructs the frontend attributes for the two decomposed CollectivePermute
+// instructions.
+Status GetFrontendAttributes(HloCollectivePermuteInstruction* cp,
+                             CycleType cycle_type,
+                             xla::FrontendAttributes& cp1_attr,
+                             xla::FrontendAttributes& cp2_attr) {
+  cp1_attr = cp->frontend_attributes();
+  cp2_attr = cp->frontend_attributes();
+  auto validation_it =
+      cp->frontend_attributes().map().find(kSendRecvValidationAttr);
+  if (validation_it == cp->frontend_attributes().map().end() ||
+      validation_it->second == "invalid") {
+    return OkStatus();
+  }
+
+  auto statusor_bounds = ParseReplicaGroupsOnly(validation_it->second);
+  if (!statusor_bounds.ok()) {
+    return statusor_bounds.status();
+  }
+  const std::vector<ReplicaGroup>& bounds = statusor_bounds.value();
+  if (bounds.size() < 2) {
+    return Internal("Invalid number of replica groups");
+  }
+
+  int64_t num_pairs = bounds.size();
+  // A forward cycle has its backedge at the end while a backward cycle has its
+  // backedge at the beginning.
+  auto backedge_start = cycle_type == CycleType::kBackward
+                            ? bounds.begin()
+                            : bounds.begin() + num_pairs - 1;
+  auto other_edges_start =
+      cycle_type == CycleType::kBackward ? bounds.begin() + 1 : bounds.begin();
+  std::vector<ReplicaGroup> cp1_bounds(backedge_start, backedge_start + 1);
+  std::vector<ReplicaGroup> cp2_bounds(other_edges_start,
+                                       other_edges_start + num_pairs - 1);
+  auto bounds_to_string = [](const std::vector<ReplicaGroup> groups) {
+    return "{" +
+           absl::StrJoin(groups, ",",
+                         [](std::string* out, const ReplicaGroup& value) {
+                           absl::StrAppend(out, "{", value.replica_ids(0), ",",
+                                           value.replica_ids(1), "}");
+                         }) +
+           "}";
+  };
+  std::string cp1_validation_str = bounds_to_string(cp1_bounds);
+  std::string cp2_validation_str = bounds_to_string(cp2_bounds);
+  (*cp1_attr.mutable_map())[kSendRecvValidationAttr] = cp1_validation_str;
+  (*cp2_attr.mutable_map())[kSendRecvValidationAttr] = cp2_validation_str;
+  return OkStatus();
+}
+
 // Decomposes a CollectivePermute instruction with a cycle in its source-target
 // pairs into two CollectivePermute instructions.
 Status DecomposeCollectivePermuteCycle(HloCollectivePermuteInstruction* cp,
@@ -139,6 +196,8 @@ Status DecomposeCollectivePermuteCycle(HloCollectivePermuteInstruction* cp,
   SourceTargetPairs other_edges(other_edges_start,
                                 other_edges_start + num_pairs - 1);
   const OpMetadata& metadata = cp->metadata();
+  xla::FrontendAttributes cp1_attr, cp2_attr;
+  TF_RETURN_IF_ERROR(GetFrontendAttributes(cp, cycle_type, cp1_attr, cp2_attr));
 
   // Create the CollectivePermute instruction for the communication represented
   // by the backedge.
@@ -147,6 +206,7 @@ Status DecomposeCollectivePermuteCycle(HloCollectivePermuteInstruction* cp,
           cp->shape(), cp->mutable_operand(0), backedge,
           cp->channel_id().value()));
   cp1->set_metadata(metadata);
+  cp1->set_frontend_attributes(cp1_attr);
   int64_t cp1_receiver = backedge.back().second;
 
   // Create the CollectivePermute instruction for the communication represented
@@ -155,6 +215,7 @@ Status DecomposeCollectivePermuteCycle(HloCollectivePermuteInstruction* cp,
       computation->AddInstruction(HloInstruction::CreateCollectivePermute(
           cp->shape(), cp->mutable_operand(0), other_edges, next_channel_id));
   cp2->set_metadata(metadata);
+  cp2->set_frontend_attributes(cp2_attr);
 
   // Calculate the received data as follows:
   //   partition = u32[] partition-id()
diff --git a/third_party/xla/xla/service/gpu/collective_permute_cycle_decomposer_test.cc b/third_party/xla/xla/service/gpu/collective_permute_cycle_decomposer_test.cc
index c12e50216e9e7d..da687711f8ff70 100644
--- a/third_party/xla/xla/service/gpu/collective_permute_cycle_decomposer_test.cc
+++ b/third_party/xla/xla/service/gpu/collective_permute_cycle_decomposer_test.cc
@@ -96,6 +96,7 @@ TEST_F(CollectivePermuteCycleDecomposerTest, ForwardCycle) {
         p = u32[] partition-id()
         ROOT start = u32[3,2] collective-permute(p), channel_id=1,
           source_target_pairs={{0,1},{1,2},{2,3},{3,0}},
+          frontend_attributes={_xla_send_recv_validation="{{0,7},{1,8},{2,9},{3,10}}"},
           metadata={op_name="op1/op2/add" source_file="foo/bar/mysource.py" source_line=35}
       }
     )";
@@ -123,8 +124,12 @@ TEST_F(CollectivePermuteCycleDecomposerTest, ForwardCycle) {
   EXPECT_EQ(cp1->operand(0), cp2->operand(0));
   EXPECT_GT(cp2->channel_id().value(), cp1->channel_id().value());
   EXPECT_THAT(cp1->ToString(), HasSubstr("source_target_pairs={{3,0}}"));
+  EXPECT_THAT(cp1->ToString(),
+              HasSubstr("_xla_send_recv_validation=\"{{3,10}}\""));
   EXPECT_THAT(cp2->ToString(),
               HasSubstr("source_target_pairs={{0,1},{1,2},{2,3}}"));
+  EXPECT_THAT(cp2->ToString(),
+              HasSubstr("_xla_send_recv_validation=\"{{0,7},{1,8},{2,9}}\""));
   check_metadata(cp1);
   check_metadata(cp2);
 }
@@ -136,6 +141,7 @@ TEST_F(CollectivePermuteCycleDecomposerTest, BackwardCycle) {
         p = u32[] partition-id()
         ROOT start = u32[] collective-permute(p), channel_id=1,
           source_target_pairs={{0,3},{1,0},{2,1},{3,2}},
+          frontend_attributes={_xla_send_recv_validation="{{0,7},{1,8},{2,9},{3,10}}"},
           metadata={op_name="op1/op2/add" source_file="foo/bar/mysource.py" source_line=35}
       }
     )";
@@ -162,8 +168,12 @@ TEST_F(CollectivePermuteCycleDecomposerTest, BackwardCycle) {
   EXPECT_EQ(cp1->operand(0), cp2->operand(0));
   EXPECT_GT(cp2->channel_id().value(), cp1->channel_id().value());
   EXPECT_THAT(cp1->ToString(), HasSubstr("source_target_pairs={{0,3}}"));
+  EXPECT_THAT(cp1->ToString(),
+              HasSubstr("_xla_send_recv_validation=\"{{0,7}}\""));
   EXPECT_THAT(cp2->ToString(),
               HasSubstr("source_target_pairs={{1,0},{2,1},{3,2}}"));
+  EXPECT_THAT(cp2->ToString(),
+              HasSubstr("_xla_send_recv_validation=\"{{1,8},{2,9},{3,10}}\""));
   check_metadata(cp1);
   check_metadata(cp2);
 }
diff --git a/third_party/xla/xla/service/gpu/command_buffer_scheduling.cc b/third_party/xla/xla/service/gpu/command_buffer_scheduling.cc
index b2270173c70819..a9ed89cbf91ea8 100644
--- a/third_party/xla/xla/service/gpu/command_buffer_scheduling.cc
+++ b/third_party/xla/xla/service/gpu/command_buffer_scheduling.cc
@@ -19,9 +19,7 @@ limitations under the License.
 #include <cstddef>
 #include <cstdint>
 #include <iterator>
-#include <optional>
 #include <utility>
-#include <variant>
 #include <vector>
 
 #include "absl/algorithm/container.h"
@@ -30,9 +28,9 @@ limitations under the License.
 #include "absl/container/inlined_vector.h"
 #include "absl/status/status.h"
 #include "absl/strings/match.h"
-#include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
+#include "xla/ffi/ffi_api.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_clone_context.h"
 #include "xla/hlo/ir/hlo_computation.h"
@@ -123,20 +121,29 @@ bool IsCommand<HloOpcode::kConditional>(const HloInstruction* hlo,
 
 static bool IsCommand(const HloCustomCallInstruction* hlo,
                       const CommandBufferConfig& config) {
+  // cuBLAS gemms represented in the HLO as custom call instructions.
   if (config.enabled_commands.contains(DebugOptions::CUBLAS) &&
       IsLegacyCublasMatmul(*hlo)) {
     return true;
   }
 
-  if (config.enabled_commands.contains(DebugOptions::CUSTOM_CALL) &&
-      hlo->custom_call_target() == "triton_kernel_call" &&
+  if (!config.enabled_commands.contains(DebugOptions::CUSTOM_CALL)) {
+    return false;
+  }
+
+  // A special case for jax-triton kernel while it is not ported to FFI.
+  if (hlo->custom_call_target() == "triton_kernel_call" &&
       // TODO(b/327718087): This is an ugly hack to prevent capturing triton
       // custom calls that might do autotuning at run time.
       !absl::StrContains(hlo->metadata().op_name(), "Autotuner")) {
     return true;
   }
 
-  return false;
+  // Check if FFI handler is compatible with command buffers.
+  auto registration = ffi::FindHandler(hlo->custom_call_target(), "gpu");
+  return registration.ok()
+             ? ffi::IsCommandBufferCompatible(registration->traits)
+             : false;
 }
 
 static bool IsCommand(const HloInstruction* hlo,
@@ -160,6 +167,9 @@ static bool IsCommand(const HloInstruction* hlo,
           &custom_call_adaptor->instruction());
       return IsCommand(custom_call, config);
     }
+    if (custom_config.name() == "dynamic_address_computation") {
+      return false;
+    }
     return config.enabled_commands.contains(DebugOptions::FUSION);
   }
 
diff --git a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
index 8734cde6f969fd..0eed8983af4060 100644
--- a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
+++ b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.cc
@@ -53,8 +53,8 @@ limitations under the License.
 #include "xla/service/gpu/metrics.h"
 #include "xla/service/gpu/runtime/conditional_thunk.h"
 #include "xla/service/gpu/runtime/sequential_thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/gpu/runtime/while_thunk.h"
-#include "xla/service/gpu/thunk.h"
 #include "xla/service/hlo_dataflow_analysis.h"
 #include "xla/service/hlo_ordering.h"
 #include "xla/service/logical_buffer.h"
diff --git a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.h b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.h
index 431b68cdcb29d5..4dee30414094a9 100644
--- a/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.h
+++ b/third_party/xla/xla/service/gpu/compile_module_to_llvm_ir.h
@@ -30,7 +30,7 @@ limitations under the License.
 #include "xla/service/buffer_value.h"
 #include "xla/service/gpu/executable.pb.h"
 #include "xla/service/gpu/gpu_executable.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/service/hlo_dataflow_analysis.h"
 #include "xla/shape.h"
diff --git a/third_party/xla/xla/service/gpu/conv_algorithm_picker.cc b/third_party/xla/xla/service/gpu/conv_algorithm_picker.cc
index 6e7ee372173397..e0f95e2cdba32f 100644
--- a/third_party/xla/xla/service/gpu/conv_algorithm_picker.cc
+++ b/third_party/xla/xla/service/gpu/conv_algorithm_picker.cc
@@ -64,14 +64,14 @@ limitations under the License.
 #include "xla/stream_executor/scratch_allocator.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "xla/tsl/util/proto/proto_utils.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/numbers.h"
 #include "tsl/platform/statusor.h"
-#include "tsl/util/proto/proto_utils.h"
-#include "tsl/util/env_var.h"
+#include "xla/tsl/util/env_var.h"
 
 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA)
 #include "third_party/gpus/cudnn/cudnn.h"  // IWYU pragma: keep
@@ -621,6 +621,7 @@ absl::StatusOr<AutotuneResult> GpuConvAlgorithmPicker::AutotuneOneConvRunner(
   // Use assignment instead of brace-list to make GCC 4.9 happy.
   RunConvOptions options;
   options.runner_cache = runner;
+  options.profile_result = &profile_result;
   // The following plan timing code is based on
   // https://github.com/NVIDIA/cudnn-frontend/blob/60496f42fdc7a4ccc059f5934e306e728a756755/include/cudnn_frontend_find_plan.h
   float max_time = 0;
@@ -633,20 +634,15 @@ absl::StatusOr<AutotuneResult> GpuConvAlgorithmPicker::AutotuneOneConvRunner(
   // Dry-run to warmup the plan.
   launch_status = RunGpuConv(config, operand_buffers, result_buffers,
                              scratch_memory, stream, options);
-  // It is intentional that the warm-up run does not have a profile result.
-  // This avoids a timeout and error message if lazy module loading is enabled
-  // by ensuring that lazy loading happens outside the GpuTimer region.
-  options.profile_result = &profile_result;
   constexpr int kMaxIter = 10;
   // Iterate until the new measurement is within kThreshold of the current
   // minimum.
   int num_iters = 0;
-  for (; num_iters < kMaxIter && launch_status.ok(); ++num_iters) {
+  for (;
+       num_iters < kMaxIter && launch_status.ok() && profile_result.is_valid();
+       num_iters++) {
     launch_status = RunGpuConv(config, operand_buffers, result_buffers,
                                scratch_memory, stream, options);
-    if (!profile_result.is_valid()) {
-      break;
-    }
     float old_min_time = min_time;
     min_time = std::min(min_time, profile_result.elapsed_time_in_ms());
     max_time = std::max(max_time, profile_result.elapsed_time_in_ms());
diff --git a/third_party/xla/xla/service/gpu/cublas_cudnn.cc b/third_party/xla/xla/service/gpu/cublas_cudnn.cc
index 906780efb2d0a7..cab9e2c54a3f15 100644
--- a/third_party/xla/xla/service/gpu/cublas_cudnn.cc
+++ b/third_party/xla/xla/service/gpu/cublas_cudnn.cc
@@ -30,7 +30,8 @@ namespace xla {
 namespace gpu {
 
 bool IsCublasGemm(const HloInstruction& hlo) {
-  return IsLegacyCublasMatmul(hlo) || IsCublasLtMatmul(hlo);
+  return IsLegacyCublasMatmul(hlo) || IsCublasLtMatmul(hlo) ||
+         IsCublasLtMatmulF8(hlo);
 }
 
 bool IsLegacyCublasMatmul(const HloInstruction& hlo) {
diff --git a/third_party/xla/xla/service/gpu/cublas_pad_for_gemms.cc b/third_party/xla/xla/service/gpu/cublas_pad_for_gemms.cc
index 42a0b55b8bc1ac..050f219d12b6c8 100644
--- a/third_party/xla/xla/service/gpu/cublas_pad_for_gemms.cc
+++ b/third_party/xla/xla/service/gpu/cublas_pad_for_gemms.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "xla/literal_util.h"
 #include "xla/service/gpu/gemm_fusion.h"
 #include "xla/service/gpu/ir_emission_utils.h"
+#include "xla/service/gpu/triton_support.h"
 #include "xla/shape.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/util.h"
@@ -179,7 +180,7 @@ static std::vector<HloDotInstruction*> GetRelevantDots(
                 ->config()
                 .debug_options()
                 .xla_gpu_enable_triton_gemm() &&
-            CanTritonHandleGEMM(*dot, gpu_compute_capability) &&
+            IsTritonSupportedInstruction(*dot, gpu_compute_capability) &&
             ShouldTritonHandleGEMM(*dot, gpu_compute_capability))) {
         gemms.push_back(dot);
       }
diff --git a/third_party/xla/xla/service/gpu/cudnn_fused_mha_rewriter.cc b/third_party/xla/xla/service/gpu/cudnn_fused_mha_rewriter.cc
index c8fa817cf806eb..632eb42f2b5bbb 100644
--- a/third_party/xla/xla/service/gpu/cudnn_fused_mha_rewriter.cc
+++ b/third_party/xla/xla/service/gpu/cudnn_fused_mha_rewriter.cc
@@ -315,19 +315,6 @@ bool IsSupportedPrimitiveType(const HloInstruction* bmm) {
   return dtype == BF16 || dtype == F16;
 }
 
-bool IsContractingDimSupported(absl::Span<const int64_t> contracting_dims) {
-  return absl::c_all_of(contracting_dims,
-                        [](int64_t dim) { return dim == 64; });
-}
-
-bool IsNonContractingDimSupported(
-    const std::vector<int64_t>& non_contracting_dims, bool is_training) {
-  // For training, cuDNN require non_contracting_dim to be Divisible by 64
-  return absl::c_all_of(non_contracting_dims, [&](int64_t dim) {
-    return dim <= 512 && (!is_training || dim % 64 == 0);
-  });
-}
-
 std::vector<int64_t> GetDimensionVector(absl::Span<const int64_t> dimensions,
                                         absl::Span<const int64_t> dim_nums) {
   std::vector<int64_t> vec(dim_nums.size());
@@ -337,150 +324,163 @@ std::vector<int64_t> GetDimensionVector(absl::Span<const int64_t> dimensions,
   return vec;
 }
 
-absl::StatusOr<bool> IsSupportedBMM1(const HloInstruction* bmm_1,
-                                     bool is_training) {
-  const DotDimensionNumbers& dot_dims_bmm1 = bmm_1->dot_dimension_numbers();
+struct QKVLayout {
+  int64_t batch;
+  int64_t num_heads;
+  int64_t seqlen_q;
+  int64_t seqlen_kv;
+  int64_t hidden_dim;
+};
+
+absl::StatusOr<std::optional<QKVLayout>> GetQKVLayout(
+    HloInstruction* bmm_1, HloInstruction* bmm_2, bool need_canonicalization) {
+  // get layout from bmm1
+  const DotDimensionNumbers& bmm1_dnums = bmm_1->dot_dimension_numbers();
   TF_ASSIGN_OR_RETURN(
-      std::vector<int64_t> lhs_non_contracting_dim_nums_bmm1,
+      std::vector<int64_t> bmm1_s_q_dims,
       GetNonContractingDims(bmm_1->operand(0)->shape(),
-                            dot_dims_bmm1.lhs_batch_dimensions(),
-                            dot_dims_bmm1.lhs_contracting_dimensions()));
+                            bmm1_dnums.lhs_batch_dimensions(),
+                            bmm1_dnums.lhs_contracting_dimensions()));
+
   TF_ASSIGN_OR_RETURN(
-      std::vector<int64_t> rhs_non_contracting_dim_nums_bmm1,
+      std::vector<int64_t> bmm1_s_kv_dims,
       GetNonContractingDims(bmm_1->operand(1)->shape(),
-                            dot_dims_bmm1.rhs_batch_dimensions(),
-                            dot_dims_bmm1.rhs_contracting_dimensions()));
-  std::vector<int64_t> lhs_non_contracting_dims_bmm1 =
+                            bmm1_dnums.rhs_batch_dimensions(),
+                            bmm1_dnums.rhs_contracting_dimensions()));
+
+  std::vector<int64_t> bmm1_bh =
       GetDimensionVector(bmm_1->operand(0)->shape().dimensions(),
-                         lhs_non_contracting_dim_nums_bmm1);
-  std::vector<int64_t> rhs_non_contracting_dims_bmm1 =
-      GetDimensionVector(bmm_1->operand(1)->shape().dimensions(),
-                         rhs_non_contracting_dim_nums_bmm1);
-  // The non contracting dimensions for BMM1 need to be less than or equal to
-  // 512.
-  if (!IsNonContractingDimSupported(lhs_non_contracting_dims_bmm1,
-                                    is_training) ||
-      !IsNonContractingDimSupported(rhs_non_contracting_dims_bmm1,
-                                    is_training)) {
-    if (VLOG_IS_ON(2)) {
-      VLOG(2) << "BMM1 lhs_non_contracting_dims: "
-              << absl::StrJoin(lhs_non_contracting_dims_bmm1, ",")
-              << " BMM1 rhs_non_contracting_dims: "
-              << absl::StrJoin(rhs_non_contracting_dims_bmm1, ",")
-              << " are not supported. The non-contracting dims should be less "
-                 "than 512. This is a criteria for current cuDNN 8.8 support.";
-    }
-    return false;
-  }
+                         bmm1_dnums.lhs_batch_dimensions());
+
+  std::vector<int64_t> bmm1_s_q = GetDimensionVector(
+      bmm_1->operand(0)->shape().dimensions(), bmm1_s_q_dims);
+
+  std::vector<int64_t> bmm1_s_kv = GetDimensionVector(
+      bmm_1->operand(1)->shape().dimensions(), bmm1_s_kv_dims);
 
-  std::vector<int64_t> lhs_contracting_dims_bmm1 =
+  std::vector<int64_t> bmm1_d =
       GetDimensionVector(bmm_1->operand(0)->shape().dimensions(),
-                         dot_dims_bmm1.lhs_contracting_dimensions());
-  std::vector<int64_t> rhs_contracting_dims_bmm1 =
-      GetDimensionVector(bmm_1->operand(1)->shape().dimensions(),
-                         dot_dims_bmm1.rhs_contracting_dimensions());
-
-  // The contracting dimensions for BMM1 need to be 64.
-  if (!IsContractingDimSupported(lhs_contracting_dims_bmm1) ||
-      !IsContractingDimSupported(rhs_contracting_dims_bmm1)) {
-    if (VLOG_IS_ON(2)) {
-      VLOG(2) << "BMM1 lhs_contracting_dims: "
-              << absl::StrJoin(lhs_contracting_dims_bmm1, ",")
-              << " BMM1 rhs_contracting_dims: "
-              << absl::StrJoin(rhs_contracting_dims_bmm1, ",")
-              << " are not supported.";
-    }
-    return false;
-  }
-  return true;
-}
+                         bmm1_dnums.lhs_contracting_dimensions());
 
-absl::StatusOr<bool> IsSupportedBMM2(const HloInstruction* bmm_2,
-                                     bool need_canonicalization) {
-  const DotDimensionNumbers& dot_dims_bmm2 = bmm_2->dot_dimension_numbers();
-  // need swap lhs and rhs for bmm2 if canonicalization is needed
-  int operand_index = need_canonicalization ? 0 : 1;
-  auto batch_dim = need_canonicalization ? dot_dims_bmm2.lhs_batch_dimensions()
-                                         : dot_dims_bmm2.rhs_batch_dimensions();
-  auto contracting_dim = need_canonicalization
-                             ? dot_dims_bmm2.lhs_contracting_dimensions()
-                             : dot_dims_bmm2.rhs_contracting_dimensions();
+  TF_RET_CHECK(bmm1_bh.size() == 2);
+  TF_RET_CHECK(bmm1_s_q.size() == 1);
+  TF_RET_CHECK(bmm1_s_kv.size() == 1);
+  TF_RET_CHECK(bmm1_d.size() == 1);
 
+  // get layout from bmm2
+  const DotDimensionNumbers& bmm2_dnums = bmm_2->dot_dimension_numbers();
   TF_ASSIGN_OR_RETURN(
-      std::vector<int64_t> non_contracting_dim_nums_bmm2,
-      GetNonContractingDims(bmm_2->operand(operand_index)->shape(), batch_dim,
-                            contracting_dim));
-
-  std::vector<int64_t> non_contracting_dims_bmm2 =
-      GetDimensionVector(bmm_2->operand(operand_index)->shape().dimensions(),
-                         non_contracting_dim_nums_bmm2);
-  // The non contracting dimension for BMM2 needs to be 64 for the input matrix.
-  // The input matrix is the second argument to BMM2 i.e, rhs.
-  if (!absl::c_all_of(non_contracting_dims_bmm2,
-                      [](int64_t dim) { return dim == 64; })) {
-    if (VLOG_IS_ON(2)) {
-      VLOG(2) << " BMM2 rhs_non_contracting_dims: "
-              << absl::StrJoin(non_contracting_dims_bmm2, ",")
-              << " are not supported.";
-    }
-    return false;
+      std::vector<int64_t> bmm2_lhs_non_contracting_dims,
+      GetNonContractingDims(bmm_2->operand(0)->shape(),
+                            bmm2_dnums.lhs_batch_dimensions(),
+                            bmm2_dnums.lhs_contracting_dimensions()));
+
+  TF_ASSIGN_OR_RETURN(
+      std::vector<int64_t> bmm2_rhs_non_contracting_dims,
+      GetNonContractingDims(bmm_2->operand(1)->shape(),
+                            bmm2_dnums.rhs_batch_dimensions(),
+                            bmm2_dnums.rhs_contracting_dimensions()));
+
+  std::vector<int64_t> bmm2_bh =
+      GetDimensionVector(bmm_2->operand(0)->shape().dimensions(),
+                         bmm2_dnums.lhs_batch_dimensions());
+
+  std::vector<int64_t> bmm2_s_kv =
+      GetDimensionVector(bmm_2->operand(0)->shape().dimensions(),
+                         bmm2_dnums.lhs_contracting_dimensions());
+
+  std::vector<int64_t> bmm2_s_q =
+      need_canonicalization
+          ? GetDimensionVector(bmm_2->operand(1)->shape().dimensions(),
+                               bmm2_rhs_non_contracting_dims)
+          : GetDimensionVector(bmm_2->operand(0)->shape().dimensions(),
+                               bmm2_lhs_non_contracting_dims);
+
+  std::vector<int64_t> bmm2_d =
+      need_canonicalization
+          ? GetDimensionVector(bmm_2->operand(0)->shape().dimensions(),
+                               bmm2_lhs_non_contracting_dims)
+          : GetDimensionVector(bmm_2->operand(1)->shape().dimensions(),
+                               bmm2_rhs_non_contracting_dims);
+
+  TF_RET_CHECK(bmm2_bh.size() == 2);
+  TF_RET_CHECK(bmm2_s_q.size() == 1);
+  TF_RET_CHECK(bmm2_s_kv.size() == 1);
+  TF_RET_CHECK(bmm2_d.size() == 1);
+
+  // check if bhsd is correct between bmm1 and bmm2
+  if (bmm1_bh[0] != bmm2_bh[0] || bmm1_bh[1] != bmm2_bh[1] ||
+      bmm1_s_q[0] != bmm2_s_q[0] || bmm1_s_kv[0] != bmm2_s_kv[0] ||
+      bmm1_d[0] != bmm2_d[0]) {
+    return std::nullopt;
   }
-  return true;
+
+  QKVLayout qkv_layout;
+  qkv_layout.batch = bmm1_bh[0];
+  qkv_layout.num_heads = bmm1_bh[1];
+  qkv_layout.seqlen_q = bmm1_s_q[0];
+  qkv_layout.seqlen_kv = bmm1_s_kv[0];
+  qkv_layout.hidden_dim = bmm1_d[0];
+  return qkv_layout;
 }
 
-absl::StatusOr<bool> IsFlashAttention(
-    HloInstruction* bmm_1, bool is_causal_mask,
-    absl::string_view custom_call_name,
+absl::StatusOr<bool> IsFusedAttention(
+    QKVLayout qkv_layout, bool is_training,
     stream_executor::CudaComputeCapability cc,
     stream_executor::dnn::VersionInfo cudnn_version) {
-  const DotDimensionNumbers& dnums = bmm_1->dot_dimension_numbers();
-  TF_ASSIGN_OR_RETURN(
-      std::vector<int64_t> seq_q_dims,
-      GetNonContractingDims(bmm_1->operand(0)->shape(),
-                            dnums.lhs_batch_dimensions(),
-                            dnums.lhs_contracting_dimensions()));
-
-  TF_ASSIGN_OR_RETURN(
-      std::vector<int64_t> seq_k_dims,
-      GetNonContractingDims(bmm_1->operand(1)->shape(),
-                            dnums.rhs_batch_dimensions(),
-                            dnums.rhs_contracting_dimensions()));
-
-  std::vector<int64_t> seq_q =
-      GetDimensionVector(bmm_1->operand(0)->shape().dimensions(), seq_q_dims);
+  // otherwise check if it is supported by regular attention
+  int64_t s_q = qkv_layout.seqlen_q;
+  int64_t s_kv = qkv_layout.seqlen_kv;
+  int64_t hidden_dim = qkv_layout.hidden_dim;
+  bool is_seqlen_supported =
+      (s_q <= 512 && s_kv <= 512) &&
+      (!is_training || (s_q % 64 == 0 && s_kv % 64 == 0));
+  bool is_hidden_dim_supported = hidden_dim == 64;
+  bool is_fused_attention = is_seqlen_supported && is_hidden_dim_supported;
+  return is_fused_attention;
+}
 
-  std::vector<int64_t> seq_k =
-      GetDimensionVector(bmm_1->operand(1)->shape().dimensions(), seq_k_dims);
+absl::StatusOr<bool> IsFlashAttention(
+    QKVLayout qkv_layout, bool is_training,
+    stream_executor::CudaComputeCapability cc,
+    stream_executor::dnn::VersionInfo cudnn_version) {
+  int64_t s_q = qkv_layout.seqlen_q;
+  int64_t s_kv = qkv_layout.seqlen_kv;
+  int64_t hidden_dim = qkv_layout.hidden_dim;
+  // start with most relaxed constraint
+  bool is_seqlen_supported = (s_q > 512 || s_kv > 512) &&
+                             (!is_training || (s_q % 2 == 0 && s_kv % 2 == 0));
+  bool is_hidden_dim_supported = hidden_dim <= 128 && hidden_dim % 8 == 0;
+  bool is_flash_attention = is_seqlen_supported && is_hidden_dim_supported;
+  if (!is_flash_attention) return false;
+  // going backwards to check compatibility
+  if ((is_training && (s_q < 64 || s_kv < 64)) &&
+      !IsComputeCapabilityAndCudnnSupported(
+          cc, cudnn_version, stream_executor::dnn::VersionInfo(9, 0, 0))) {
+    VLOG(2) << "Flash attention training with seq < 64 not supported cuDNN < "
+               "9.0.0.";
+    return false;
+  }
 
-  std::vector<int64_t> hidden_dim =
-      GetDimensionVector(bmm_1->operand(0)->shape().dimensions(),
-                         dnums.lhs_contracting_dimensions());
-  // for now, seq_q and seq_k should be equal for flash attention to work
-  // flash attention only supports fixed topology so we check if custom call is
-  // such topology by checking custom_call_name
-  TF_RET_CHECK(seq_q.size() == 1);
-  TF_RET_CHECK(seq_k.size() == 1);
-  TF_RET_CHECK(hidden_dim.size() == 1);
-
-  auto is_seqlen_supported = seq_q[0] > 512 && seq_k[0] > 512 &&
-                             seq_q[0] % 64 == 0 && seq_k[0] % 64 == 0;
-  auto is_hidden_dim_supported = hidden_dim[0] == 64 || hidden_dim[0] == 128;
-  auto is_flash_attention = is_seqlen_supported && is_hidden_dim_supported;
-  auto is_cross_attention = seq_q[0] != seq_k[0];
-
-  // flash attention requires cuDNN 8.9.3 to run non-fused QKV
-  // once we have fused QKV support, we can relax this contraint
-  if (is_flash_attention &&
+  if ((hidden_dim != 64 && hidden_dim != 128) &&
       !IsComputeCapabilityAndCudnnSupported(
-          cc, cudnn_version, stream_executor::dnn::VersionInfo(8, 9, 3))) {
-    VLOG(2) << "Require cuDNN 8.9.3 to run flash attention.";
+          cc, cudnn_version, stream_executor::dnn::VersionInfo(8, 9, 6))) {
+    VLOG(2) << "Flash attention head dim != 64 or 128 not supported with cuDNN "
+               "< 8.9.6.";
     return false;
   }
-  // flash attention cross attention requires cuDNN 8.9.4 to run
-  if (is_cross_attention &&
+
+  if ((is_training && s_kv % 64 != 0) &&
       !IsComputeCapabilityAndCudnnSupported(
+          cc, cudnn_version, stream_executor::dnn::VersionInfo(8, 9, 5))) {
+    VLOG(2) << "Flash attention training with seq kv % 64 != 0 not supported "
+               "with cuDNN < 8.9.5.";
+    return false;
+  }
+
+  if (!IsComputeCapabilityAndCudnnSupported(
           cc, cudnn_version, stream_executor::dnn::VersionInfo(8, 9, 4))) {
-    VLOG(2) << "Require cuDNN 8.9.4 to run flash cross attention.";
+    VLOG(2) << "Require cuDNN 8.9.4 to run flash attention.";
     return false;
   }
   return is_flash_attention;
@@ -1239,10 +1239,19 @@ absl::StatusOr<bool> IsMHABlockSupported(
     return false;
   }
 
+  // get batch/num heads/sequence length/hidden dim from bmm1 and bmm2
+  // also make sure they are the same between bmm1 and bmm2
+  TF_ASSIGN_OR_RETURN(std::optional<QKVLayout> qkv_layout,
+                      GetQKVLayout(bmm_1, bmm_2, need_canonicalization));
+  if (!qkv_layout.has_value()) {
+    VLOG(2) << "bmm1 and bmm2 have different qkv layout.";
+    return false;
+  }
+
   // check if matched attention block is supported by cuDNN flash attention.
-  TF_ASSIGN_OR_RETURN(is_flash_attention,
-                      IsFlashAttention(bmm_1, is_causal_mask, custom_call_name,
-                                       cc, cudnn_version));
+  TF_ASSIGN_OR_RETURN(
+      is_flash_attention,
+      IsFlashAttention(qkv_layout.value(), is_training, cc, cudnn_version));
   if (is_flash_attention) {
     if (is_causal_mask) {
       // if bias is causal mask, needs to remove bias from name
@@ -1259,14 +1268,11 @@ absl::StatusOr<bool> IsMHABlockSupported(
     }
     return true;
   }
-  // otherwise check if it is supported by regular attention
-  TF_ASSIGN_OR_RETURN(bool is_bmm1_supported,
-                      IsSupportedBMM1(bmm_1, is_training));
-  if (!is_bmm1_supported) return false;
-  TF_ASSIGN_OR_RETURN(bool is_bmm2_supported,
-                      IsSupportedBMM2(bmm_2, need_canonicalization));
-  if (!is_bmm2_supported) return false;
-  return true;
+  // check if matched attention block is supported by cuDNN fused attention.
+  TF_ASSIGN_OR_RETURN(
+      bool is_fused_attention,
+      IsFusedAttention(qkv_layout.value(), is_training, cc, cudnn_version));
+  return is_fused_attention;
 }
 
 absl::StatusOr<HloInstruction*> CanonicalizeBatchedGemmForcuDNNFMHA(
diff --git a/third_party/xla/xla/service/gpu/cudnn_fused_mha_rewriter_test.cc b/third_party/xla/xla/service/gpu/cudnn_fused_mha_rewriter_test.cc
index d9dea364d70482..b52fa5be2c598d 100644
--- a/third_party/xla/xla/service/gpu/cudnn_fused_mha_rewriter_test.cc
+++ b/third_party/xla/xla/service/gpu/cudnn_fused_mha_rewriter_test.cc
@@ -90,13 +90,6 @@ class CudnnFusedMhaRewriterTestHloTest : public HloTestBase {
   }
 
   se::dnn::VersionInfo GetCudnnVersionWithFlashAttentionSupport() {
-    // Fake a supported compute capability to run tests,
-    // we don't run any kernels in these tests so they should be safe
-    // to run anywhere.
-    return se::dnn::VersionInfo(8, 9, 3);
-  }
-
-  se::dnn::VersionInfo GetCudnnVersionWithFlashCrossAttentionSupport() {
     // Fake a supported compute capability to run tests,
     // we don't run any kernels in these tests so they should be safe
     // to run anywhere.
@@ -754,7 +747,7 @@ ENTRY main.41 {
   convert.40 = bf16[16,16,256,256]{3,2,1,0} convert(add.15)
   constant.4 = bf16[] constant(0)
   broadcast.5 = bf16[16,16,256,256]{3,2,1,0} broadcast(constant.4), dimensions={}
-  compare = pred[16,16,256,256]{3,2,1,0} compare(convert.40, broadcast.5), direction=GT 
+  compare = pred[16,16,256,256]{3,2,1,0} compare(convert.40, broadcast.5), direction=GT
   select.13 = bf16[16,16,256,256]{3,2,1,0} select(compare, convert.40, broadcast.5)
   convert.36 = f32[16,16,256,256]{3,2,1,0} convert(select.13)
   constant.9 = f32[] constant(-inf)
@@ -4361,73 +4354,6 @@ ENTRY main.82 {
   EXPECT_EQ(config.is_causal_mask(), false);
 }
 
-TEST_F(CudnnFusedMhaRewriterTestHloTest,
-       FlashAttentionF16Bmm1BiasSoftmaxBmm2PatternCrossAttention) {
-  if (skip_reason_) GTEST_SKIP() << *skip_reason_;
-  const char* module_str = R"(
-HloModule jit__unnamed_wrapped_function_, entry_computation_layout={(f16[2,6,2048,64]{3,2,1,0},f16[2,6,64,1024]{3,2,1,0},f16[2,6,1024,64]{3,2,1,0},f16[2,6,2048,1024]{3,2,1,0})->f16[2,6,2048,64]{3,2,1,0}}
-
-region_0.7 {
-  Arg_0.8 = f16[] parameter(0)
-  Arg_1.9 = f16[] parameter(1)
-  ROOT maximum = f16[] maximum(Arg_0.8, Arg_1.9)
-}
-
-region_1.19 {
-  Arg_0.20 = f32[] parameter(0)
-  Arg_1.21 = f32[] parameter(1)
-  ROOT add = f32[] add(Arg_0.20, Arg_1.21)
-}
-
-ENTRY main.31 {
-  Arg_0.1 = f16[2,6,2048,64]{3,2,1,0} parameter(0), sharding={replicated}
-  Arg_1.2 = f16[2,6,64,1024]{3,2,1,0} parameter(1), sharding={replicated}
-  dot = f16[2,6,2048,1024]{3,2,1,0} dot(Arg_0.1, Arg_1.2), lhs_contracting_dims={3}, rhs_contracting_dims={2}, lhs_batch_dims={0,1}, rhs_batch_dims={0,1}
-  Arg_3.4 = f16[2,6,2048,1024]{3,2,1,0} parameter(3), sharding={replicated}
-  add.1 = f16[2,6,2048,1024]{3,2,1,0} add(dot, Arg_3.4)
-  constant = f16[] constant(-inf)
-  reduce.11 = f16[2,6,2048]{2,1,0} reduce(add.1, constant), dimensions={3}, to_apply=region_0.7
-  broadcast.3 = f16[2,6,2048,1024]{3,2,1,0} broadcast(reduce.11), dimensions={0,1,2}
-  subtract.1 = f16[2,6,2048,1024]{3,2,1,0} subtract(add.1, broadcast.3)
-  exponential.1 = f16[2,6,2048,1024]{3,2,1,0} exponential(subtract.1)
-  convert.1 = f32[2,6,2048,1024]{3,2,1,0} convert(exponential.1)
-  constant.1 = f32[] constant(0)
-  reduce.23 = f32[2,6,2048]{2,1,0} reduce(convert.1, constant.1), dimensions={3}, to_apply=region_1.19
-  convert.2 = f16[2,6,2048]{2,1,0} convert(reduce.23)
-  broadcast.4 = f16[2,6,2048,1024]{3,2,1,0} broadcast(convert.2), dimensions={0,1,2}
-  divide = f16[2,6,2048,1024]{3,2,1,0} divide(exponential.1, broadcast.4)
-  Arg_2.3 = f16[2,6,1024,64]{3,2,1,0} parameter(2), sharding={replicated}
-  ROOT dot.1 = f16[2,6,2048,64]{3,2,1,0} dot(divide, Arg_2.3), lhs_contracting_dims={3}, rhs_contracting_dims={2}, lhs_batch_dims={0,1}, rhs_batch_dims={0,1}
-})";
-
-  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(module_str));
-  CudnnFusedMHARewriter fusedMhaRewriter{
-      GetCudaComputeCapability(), GetCudnnVersionWithFlashAttentionSupport()};
-  TF_ASSERT_OK(RunHloPass(&fusedMhaRewriter, m.get()).status());
-
-  SCOPED_TRACE(m->ToString());
-  EXPECT_THAT(m->entry_computation()->root_instruction(), GmockMatch(m::Dot()));
-
-  CudnnFusedMHARewriter fusedMhaRewriterWithCrossAttention{
-      GetCudaComputeCapability(),
-      GetCudnnVersionWithFlashCrossAttentionSupport()};
-  TF_ASSERT_OK(
-      RunHloPass(&fusedMhaRewriterWithCrossAttention, m.get()).status());
-  SCOPED_TRACE(m->ToString());
-  const HloInstruction* fmha;
-  EXPECT_THAT(
-      m->entry_computation()->root_instruction(),
-      GmockMatch(
-          m::GetTupleElement(
-              m::CustomCall(&fmha, {kCudnnfMHAScaleBiasSoftmaxCallTarget}), 0)
-              .WithShape(F16, {2, 6, 2048, 64})));
-  TF_ASSERT_OK_AND_ASSIGN(auto gpu_config,
-                          fmha->backend_config<GpuBackendConfig>());
-  const CudnnfMHABackendConfig& config = gpu_config.cudnn_fmha_backend_config();
-  EXPECT_EQ(config.is_flash_attention(), true);
-  EXPECT_EQ(config.is_causal_mask(), false);
-}
-
 // GPT3 pattern
 TEST_F(CudnnFusedMhaRewriterTestHloTest, FlashAttentionBF16TrainingGPT3_5B) {
   if (skip_reason_) GTEST_SKIP() << *skip_reason_;
@@ -5194,6 +5120,118 @@ ENTRY main.164_spmd {
               }))))));
 }
 
+constexpr absl::string_view hlo_should_lower_to_flash_attention = R"(
+HloModule fmha_test, entry_computation_layout={(bf16[16,16,128,64]{3,2,1,0},bf16[16,16,1024,64]{3,2,1,0},bf16[16,16,1024,64]{3,2,1,0})->bf16[16,16,128,64]{3,2,1,0}}
+ENTRY main.6 {
+  Arg_0.1 = bf16[16,16,128,64]{3,2,1,0} parameter(0)
+  Arg_1.2 = bf16[16,16,1024,64]{3,2,1,0} parameter(1)
+  Arg_2.3 = bf16[16,16,1024,64]{3,2,1,0} parameter(2)
+  dot.0 = bf16[16,16,128,1024]{3,2,1,0} dot(Arg_0.1, Arg_1.2), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={3}, metadata={}
+  ROOT dot.1 = bf16[16,16,128,64]{3,2,1,0} dot(dot.0, Arg_2.3), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}, metadata={}
+})";
+
+TEST_F(CudnnFusedMhaRewriterTestHloTest, ShouldLowerToFlashAttention) {
+  if (skip_reason_) GTEST_SKIP() << *skip_reason_;
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto m, ParseAndReturnVerifiedModule(hlo_should_lower_to_flash_attention,
+                                           GetModuleConfig()));
+  CudnnFusedMHARewriter fusedMhaRewriter{
+      GetCudaComputeCapability(), GetCudnnVersionWithFlashAttentionSupport()};
+  TF_ASSERT_OK(RunHloPass(&fusedMhaRewriter, m.get()).status());
+  const HloInstruction* fmha;
+
+  SCOPED_TRACE(m->ToString());
+  EXPECT_THAT(
+      m->entry_computation()->root_instruction(),
+      GmockMatch(m::GetTupleElement(
+                     m::CustomCall(&fmha, {kCudnnfMHABmmBmmCallTarget}), 0)
+                     .WithShape(BF16, {16, 16, 128, 64})));
+  TF_ASSERT_OK_AND_ASSIGN(auto gpu_config,
+                          fmha->backend_config<GpuBackendConfig>());
+  const CudnnfMHABackendConfig& config = gpu_config.cudnn_fmha_backend_config();
+  EXPECT_EQ(config.fmha_scale(), 1.0);
+  EXPECT_EQ(config.dropout_rate(), 0.0);
+  EXPECT_EQ(config.is_flash_attention(), true);
+}
+
+constexpr absl::string_view hlo_head_dim_not_multiple_of_64 = R"(
+HloModule jit__reference, entry_computation_layout={(f16[4,48,1024,16]{3,2,1,0}, f16[4,48,1024,16]{3,2,1,0}, f16[4,48,1024,16]{3,2,1,0})->f16[4,48,1024,16]{3,2,1,0}}
+
+region_0.26 {
+  Arg_0.27 = f32[] parameter(0)
+  Arg_1.28 = f32[] parameter(1)
+  ROOT maximum = f32[] maximum(Arg_0.27, Arg_1.28)
+}
+
+region_1.37 {
+  Arg_0.38 = f32[] parameter(0)
+  Arg_1.39 = f32[] parameter(1)
+  ROOT add = f32[] add(Arg_0.38, Arg_1.39)
+}
+
+ENTRY main.49 {
+  iota.2 = s32[1024,1024]{1,0} iota(), iota_dimension=0
+  iota.3 = s32[1024,1024]{1,0} iota(), iota_dimension=1
+  compare = pred[1024,1024]{1,0} compare(iota.2, iota.3), direction=GE
+  broadcast.4 = pred[4,48,1024,1024]{3,2,1,0} broadcast(compare), dimensions={2,3}
+  Arg_0.1 = f16[4,48,1024,16]{3,2,1,0} parameter(0)
+  Arg_1.2 = f16[4,48,1024,16]{3,2,1,0} parameter(1)
+  dot.9 = f16[4,48,1024,1024]{3,2,1,0} dot(Arg_0.1, Arg_1.2), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={3}
+  constant.4 = f16[] constant(0.5)
+  broadcast.6 = f16[4,48,1024,1024]{3,2,1,0} broadcast(constant.4), dimensions={}
+  multiply = f16[4,48,1024,1024]{3,2,1,0} multiply(dot.9, broadcast.6)
+  constant = f16[] constant(-inf)
+  broadcast.7 = f16[4,48,1024,1024]{3,2,1,0} broadcast(constant), dimensions={}
+  select.1 = f16[4,48,1024,1024]{3,2,1,0} select(broadcast.4, multiply, broadcast.7)
+  convert.1 = f32[4,48,1024,1024]{3,2,1,0} convert(select.1)
+  constant.7 = f32[] constant(-inf)
+  reduce.30 = f32[4,48,1024]{2,1,0} reduce(convert.1, constant.7), dimensions={3}, to_apply=region_0.26
+  broadcast.8 = f32[4,48,1024,1024]{3,2,1,0} broadcast(reduce.30), dimensions={0,1,2}
+  subtract = f32[4,48,1024,1024]{3,2,1,0} subtract(convert.1, broadcast.8)
+  exponential = f32[4,48,1024,1024]{3,2,1,0} exponential(subtract)
+  constant.6 = f32[] constant(0)
+  reduce.41 = f32[4,48,1024]{2,1,0} reduce(exponential, constant.6), dimensions={3}, to_apply=region_1.37
+  broadcast.9 = f32[4,48,1024,1024]{3,2,1,0} broadcast(reduce.41), dimensions={0,1,2}
+  divide = f32[4,48,1024,1024]{3,2,1,0} divide(exponential, broadcast.9)
+  convert.2 = f16[4,48,1024,1024]{3,2,1,0} convert(divide)
+  Arg_2.3 = f16[4,48,1024,16]{3,2,1,0} parameter(2)
+  ROOT dot.48 = f16[4,48,1024,16]{3,2,1,0} dot(convert.2, Arg_2.3), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
+} // main.49
+)";
+
+TEST_F(CudnnFusedMhaRewriterTestHloTest, HeadDimNotMultipleOf64) {
+  if (skip_reason_) GTEST_SKIP() << *skip_reason_;
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto m, ParseAndReturnVerifiedModule(hlo_head_dim_not_multiple_of_64,
+                                           GetModuleConfig()));
+  CudnnFusedMHARewriter fusedMhaRewriter{
+      GetCudaComputeCapability(), GetCudnnVersionWithFlashAttentionSupport()};
+  TF_ASSERT_OK(RunHloPass(&fusedMhaRewriter, m.get()).status());
+
+  // head dim not a multiple of 64 should not be lowered with cuDNN < 8.9.6
+  SCOPED_TRACE(m->ToString());
+  EXPECT_THAT(m->entry_computation()->root_instruction(), GmockMatch(m::Dot()));
+
+  // should be lowered with cuDNN >= 8.9.6
+  CudnnFusedMHARewriter fusedMhaRewriterWithcuDNN8907{
+      GetCudaComputeCapability(), se::dnn::VersionInfo(8, 9, 7)};
+  TF_ASSERT_OK(RunHloPass(&fusedMhaRewriterWithcuDNN8907, m.get()).status());
+  const HloInstruction* fmha;
+
+  SCOPED_TRACE(m->ToString());
+  EXPECT_THAT(
+      m->entry_computation()->root_instruction(),
+      GmockMatch(
+          m::GetTupleElement(
+              m::CustomCall(&fmha, {kCudnnfMHAScaleMaskSoftmaxCallTarget}), 0)
+              .WithShape(F16, {4, 48, 1024, 16})));
+  TF_ASSERT_OK_AND_ASSIGN(auto gpu_config,
+                          fmha->backend_config<GpuBackendConfig>());
+  const CudnnfMHABackendConfig& config = gpu_config.cudnn_fmha_backend_config();
+  EXPECT_EQ(config.fmha_scale(), 0.5);
+  EXPECT_EQ(config.dropout_rate(), 0.0);
+  EXPECT_EQ(config.is_flash_attention(), true);
+}
 }  // anonymous namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/cudnn_fusion_compiler.cc b/third_party/xla/xla/service/gpu/cudnn_fusion_compiler.cc
index 3b4a5c4cc5b825..da304e5ebdf1f6 100644
--- a/third_party/xla/xla/service/gpu/cudnn_fusion_compiler.cc
+++ b/third_party/xla/xla/service/gpu/cudnn_fusion_compiler.cc
@@ -30,8 +30,8 @@ limitations under the License.
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
-#include "absl/strings/escaping.h"
 #include "absl/strings/string_view.h"
+#include "third_party/gpus/cudnn/cudnn_version.h"
 #include "xla/hlo/ir/dfs_hlo_visitor_with_default.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_clone_context.h"
@@ -42,7 +42,6 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/utils/hlo_query.h"
 #include "xla/primitive_util.h"
-#include "xla/service/gpu/autotuner_util.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/kernel_reuse_cache.h"
@@ -50,7 +49,6 @@ limitations under the License.
 #include "xla/service/gpu/triton_fusion_analysis.h"
 #include "xla/stream_executor/cuda/cuda_dnn.h"
 #include "xla/stream_executor/cuda/cudnn_frontend_helpers.h"
-#include "xla/stream_executor/stream_executor.h"
 #include "xla/util.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/statusor.h"
@@ -68,18 +66,62 @@ inline std::optional<fe::PointwiseMode_t> GetElementwiseMode(
   const HloOpcode opcode = instruction.opcode();
   using m = fe::PointwiseMode_t;
   switch (opcode) {
+    case HloOpcode::kAbs:
+      return m::ABS;
     case HloOpcode::kAdd:
       return m::ADD;
+    case HloOpcode::kCompare:
+      switch (instruction.comparison_direction()) {
+        case Comparison::Direction::kEq:
+          return m::CMP_EQ;
+        case Comparison::Direction::kNe:
+          return m::CMP_NEQ;
+        case Comparison::Direction::kGe:
+          return m::CMP_GE;
+        case Comparison::Direction::kGt:
+          return m::CMP_GT;
+        case Comparison::Direction::kLe:
+          return m::CMP_LE;
+        case Comparison::Direction::kLt:
+          return m::CMP_LT;
+      }
+      break;
     case HloOpcode::kConvert:
       return m::IDENTITY;
+    case HloOpcode::kCos:
+      return m::COS;
     case HloOpcode::kDivide:
       return m::DIV;
+    case HloOpcode::kExp:
+      return m::EXP;
+    case HloOpcode::kLog:
+      return m::LOG;
+    case HloOpcode::kMaximum:
+      return m::MAX;
+    case HloOpcode::kMinimum:
+      return m::MIN;
     case HloOpcode::kMultiply:
       return m::MUL;
     case HloOpcode::kNegate:
       return m::NEG;
+    case HloOpcode::kPower:
+      return m::POW;
+    case HloOpcode::kRsqrt:
+      return m::RSQRT;
+#if CUDNN_VERSION >= 90100
+    case HloOpcode::kSelect:
+      return m::BINARY_SELECT;
+#endif  // CUDNN_VERSION
+    case HloOpcode::kSin:
+      return m::SIN;
+    case HloOpcode::kSqrt:
+      return m::SQRT;
     case HloOpcode::kSubtract:
       return m::SUB;
+    case HloOpcode::kTan:
+      return m::TAN;
+    case HloOpcode::kTanh:
+      return m::TANH_FWD;
     default:
       return std::nullopt;
   }
@@ -98,17 +140,26 @@ inline std::optional<fe::DataType_t> ToCudnnDataType(const PrimitiveType type) {
       return t::INT32;
     case PrimitiveType::S8:
       return t::INT8;
+    case PrimitiveType::PRED:
+      return t::INT8;
     default:
       return std::nullopt;
   }
 }
 
+int FusionLevel(const HloInstruction& hlo) {
+  return hlo.GetModule()
+      ->config()
+      .debug_options()
+      .xla_gpu_cudnn_gemm_fusion_level();
+};
+
 // Extracts dimensions and strides from HLO tensors in the format expected by
 // cuDNN.
 class GemmDimensionAdapter {
   explicit GemmDimensionAdapter(const HloDotInstruction& dot,
                                 TritonFusionAnalysis analysis)
-      : analysis_(std::move(analysis)), dot_(dot){};
+      : analysis_(std::move(analysis)), dot_(dot) {};
 
  public:
   const TritonFusionAnalysis analysis_;
@@ -139,17 +190,21 @@ class GemmDimensionAdapter {
                             std::vector<int64_t>& strides) {
     const DotDimensionNumbers& dims = dot_.dot_dimension_numbers();
     // GEMM fusions require a specific canonical order of dimensions.
+    constexpr int kBatchDimensionIndex = 0;
+    constexpr int kOutputLHSNonContractingDimensionIndex = 1;
     std::vector<int64_t> dim_indices;
+    int lhs_noncontracting_index = -1;
     switch (scope) {
       case TritonFusionAnalysis::Scope::LHS:
-        dim_indices = {dims.lhs_batch_dimensions().empty()
-                           ? -1
-                           : dims.lhs_batch_dimensions(0),
-                       GetNonContractingDims(dot_.operand(0)->shape(),
-                                             dims.lhs_batch_dimensions(),
-                                             dims.lhs_contracting_dimensions())
-                           .value()[0],
-                       dims.lhs_contracting_dimensions(0)};
+        lhs_noncontracting_index =
+            GetNonContractingDims(dot_.operand(0)->shape(),
+                                  dims.lhs_batch_dimensions(),
+                                  dims.lhs_contracting_dimensions())
+                .value()[0];
+        dim_indices = {
+            dims.lhs_batch_dimensions().empty() ? -1
+                                                : dims.lhs_batch_dimensions(0),
+            lhs_noncontracting_index, dims.lhs_contracting_dimensions(0)};
         break;
       case TritonFusionAnalysis::Scope::RHS:
         dim_indices = {dims.rhs_batch_dimensions().empty()
@@ -162,8 +217,9 @@ class GemmDimensionAdapter {
                            .value()[0]};
         break;
       case TritonFusionAnalysis::Scope::OUTPUT:
+        lhs_noncontracting_index = dot_.shape().rank() - 2;
         dim_indices = {dims.lhs_batch_dimensions().empty() ? -1 : 0,
-                       dot_.shape().rank() - 2, dot_.shape().rank() - 1};
+                       lhs_noncontracting_index, dot_.shape().rank() - 1};
         break;
       case TritonFusionAnalysis::Scope::META:
         LOG(FATAL) << "Unsupported scope.";
@@ -177,17 +233,67 @@ class GemmDimensionAdapter {
         strides.push_back(strides.empty() ? 1 : strides.back());
         continue;
       } else {
-        if (spec->size() != 1) {
+        if (spec->size() == 1) {
+          // The dimension is not split, nothing to do.
+        } else if (spec->size() == 2) {
+          if (FusionLevel(hlo) < 3) {
+            return false;
+          }
+          if (!dims.lhs_batch_dimensions().empty()) {
+            VLOG(8) << "Noncontracting dimension split is not compatible with "
+                       "batch dimensions.";
+            return false;
+          }
+          if (index != lhs_noncontracting_index) {
+            VLOG(8) << "Only LHS noncontracting dimension can be split.";
+            return false;
+          }
+          switch (scope) {
+            case TritonFusionAnalysis::Scope::LHS:
+              lhs_noncontracting_split = spec->back().count;
+              break;
+            case TritonFusionAnalysis::Scope::OUTPUT:
+              if (lhs_noncontracting_split != spec->back().count) {
+                VLOG(8) << "Output non-contracting dimension has to be split "
+                           "the same way as the LHS input one if it is split.";
+                return false;
+              }
+              break;
+            default:
+              VLOG(8) << "Only LHS noncontracting dimension can be split.";
+              return false;
+          }
+          // Assign the major part of the noncontracting dimension to the
+          // unused batch one.
+          CHECK_EQ(dimensions[kBatchDimensionIndex], 1);
+          dimensions[kBatchDimensionIndex] = spec->back().count;
+          strides[kBatchDimensionIndex] = spec->back().stride;
+        } else {
+          VLOG(8) << "The dimension is split multiple times.";
           return false;
         }
         dimensions.push_back(spec->front().count);
         strides.push_back(spec->front().stride);
       }
     }
+    if (lhs_noncontracting_split > 1 &&
+        scope == TritonFusionAnalysis::Scope::OUTPUT &&
+        dimensions[kBatchDimensionIndex] == 1) {
+      // LHS input noncontracting dimension is split but the corresponding
+      // output one is not. Assign part of the output one to the unused batch
+      // dimension.
+      dimensions[kBatchDimensionIndex] = lhs_noncontracting_split;
+      dimensions[kOutputLHSNonContractingDimensionIndex] /=
+          lhs_noncontracting_split;
+      strides[kBatchDimensionIndex] =
+          strides[kOutputLHSNonContractingDimensionIndex] *
+          dimensions[kOutputLHSNonContractingDimensionIndex];
+    }
     return true;
   }
 
  private:
+  int64_t lhs_noncontracting_split = 1;
   const HloDotInstruction& dot_;
 };
 
@@ -254,7 +360,9 @@ absl::StatusOr<std::optional<se::gpu::CudnnGraph>> HloFusionToCuDnnGraph(
     } else if (hlo->opcode() == HloOpcode::kReshape ||
                hlo->opcode() == HloOpcode::kBitcast ||
                hlo->opcode() == HloOpcode::kTranspose ||
-               hlo->opcode() == HloOpcode::kCopy) {
+               hlo->opcode() == HloOpcode::kCopy ||
+               (FusionLevel(fusion) >= 2 &&
+                hlo->opcode() == HloOpcode::kBroadcast)) {
       // All these are accounted for separately as transformations of strides.
       hlo_to_cudnn[hlo] = operand(0);
     } else if (hlo->IsElementwise()) {
@@ -275,8 +383,13 @@ absl::StatusOr<std::optional<se::gpu::CudnnGraph>> HloFusionToCuDnnGraph(
       } else if (hlo->operand_count() == 2) {
         hlo_to_cudnn[hlo] = graph.pointwise(operand(0), operand(1), attrs);
       } else if (hlo->operand_count() == 3) {
+        if (hlo->opcode() != HloOpcode::kSelect) {
+          VLOG(3) << "Unexpected ternary operation: " << hlo->ToString();
+          return std::nullopt;
+        }
+        // Operand order for select differs between HLO and cuDNN.
         hlo_to_cudnn[hlo] =
-            graph.pointwise(operand(0), operand(1), operand(2), attrs);
+            graph.pointwise(operand(1), operand(2), operand(0), attrs);
       } else {
         VLOG(3) << "Unimplemented elementwise operation.";
         return std::nullopt;
@@ -330,13 +443,13 @@ absl::StatusOr<std::optional<se::gpu::CudnnGraph>> HloFusionToCuDnnGraph(
 
 // Creates a cuDNN graph, queries cuDNN whether it is supported.
 absl::StatusOr<se::gpu::CudnnGraph> PrepareGraph(
-    const HloFusionInstruction& hlo, se::Stream& stream) {
+    se::dnn::DnnSupport& dnn_support, const HloFusionInstruction& hlo) {
   TF_ASSIGN_OR_RETURN(std::optional<se::gpu::CudnnGraph> graph,
                       HloFusionToCuDnnGraph(hlo));
   if (!graph.has_value()) {
     return absl::InternalError("Construction of cuDNN graph failed.");
   }
-  TF_ASSIGN_OR_RETURN(bool supported, graph->Prepare());
+  TF_ASSIGN_OR_RETURN(bool supported, graph->Prepare(dnn_support));
   if (!supported) {
     return absl::InternalError("cuDNN graph is not supported.");
   }
@@ -345,7 +458,10 @@ absl::StatusOr<se::gpu::CudnnGraph> PrepareGraph(
 
 class CuDnnFusionVisitor : public DfsHloRewriteVisitor {
  public:
-  explicit CuDnnFusionVisitor(const AutotuneConfig& config) : config_(config) {}
+  explicit CuDnnFusionVisitor(
+      se::dnn::DnnSupport& dnn_support,
+      CuDnnFusionCompiler::BinaryMap& compilation_results)
+      : dnn_support_(dnn_support), compilation_results_(compilation_results) {}
 
   absl::Status HandleFusion(HloInstruction* hlo) override {
     TF_ASSIGN_OR_RETURN(auto gpu_config,
@@ -356,10 +472,6 @@ class CuDnnFusionVisitor : public DfsHloRewriteVisitor {
     }
     int64_t plan_id = -1;
     if (fusion_backend_config.has_cudnn_fusion_config()) {
-      if (fusion_backend_config.cudnn_fusion_config().has_serialized_graph()) {
-        VLOG(4) << "Skipping already serialized " << hlo->ToShortString();
-        return absl::OkStatus();
-      }
       plan_id = fusion_backend_config.cudnn_fusion_config().plan_id();
     }
 
@@ -368,27 +480,25 @@ class CuDnnFusionVisitor : public DfsHloRewriteVisitor {
 
     const std::string cache_key =
         GetComputationFingerprint(hlo->fused_instructions_computation(), {});
-    std::string& cache_entry = compilation_cache_[cache_key];
+    std::string& cache_entry = compilation_results_[cache_key];
     if (cache_entry.empty()) {
-      TF_ASSIGN_OR_RETURN(se::Stream * stream, config_.GetStream());
-
       TF_ASSIGN_OR_RETURN(
           se::gpu::CudnnGraph graph,
-          PrepareGraph(*DynCast<HloFusionInstruction>(hlo), *stream));
+          PrepareGraph(dnn_support_, *DynCast<HloFusionInstruction>(hlo)));
 
       if (plan_id >= 0) {
         // Build single plan with given ID.
         if (plan_id >= graph.Graph().get_execution_plan_count()) {
           return absl::InternalError("cuDNN graph plan does not exist.");
         }
-        TF_RETURN_IF_ERROR(graph.Build(plan_id));
+        TF_RETURN_IF_ERROR(graph.Build(dnn_support_, plan_id));
       } else {
         // Build plans one by one till first successful when no plan_id was
         // provided.
         for (plan_id = 0; plan_id < graph.Graph().get_execution_plan_count();
              ++plan_id) {
           VLOG(7) << "Trying plan ID " << plan_id;
-          if (graph.Build(plan_id).ok()) {
+          if (graph.Build(dnn_support_, plan_id).ok()) {
             VLOG(7) << "Successfully built plan ID " << plan_id;
             break;
           }
@@ -405,16 +515,15 @@ class CuDnnFusionVisitor : public DfsHloRewriteVisitor {
 
       std::vector<uint8_t> serialized_graph;
       RETURN_IF_CUDNN_FRONTEND_ERROR(graph.Graph().serialize(serialized_graph));
-      cache_entry = absl::CEscape(
-          absl::string_view(reinterpret_cast<char*>(serialized_graph.data()),
-                            serialized_graph.size()));
+      cache_entry =
+          std::string(reinterpret_cast<char*>(serialized_graph.data()),
+                      serialized_graph.size());
     } else {
       VLOG(4) << "Cache hit.";
     }
     auto cudnn_config = gpu_config.mutable_fusion_backend_config()
                             ->mutable_cudnn_fusion_config();
     cudnn_config->set_plan_id(plan_id);
-    cudnn_config->set_serialized_graph(cache_entry);
     TF_RETURN_IF_ERROR(hlo->set_backend_config(gpu_config));
 
     MarkAsChanged();
@@ -422,9 +531,9 @@ class CuDnnFusionVisitor : public DfsHloRewriteVisitor {
   }
 
  private:
-  AutotuneConfig config_;
+  se::dnn::DnnSupport& dnn_support_;
   // <HLO computation fingerprint, serialized compiled cuDNN graph>.
-  absl::flat_hash_map<std::string, std::string> compilation_cache_;
+  CuDnnFusionCompiler::BinaryMap& compilation_results_;
 };
 
 }  // namespace
@@ -433,13 +542,13 @@ absl::StatusOr<bool> CuDnnFusionCompiler::Run(
     HloModule* module,
     const absl::flat_hash_set<absl::string_view>& execution_threads) {
   XLA_SCOPED_LOGGING_TIMER("cuDNN fusion compiler");
-  return CuDnnFusionVisitor(config_).RunOnModule(module, execution_threads);
+  return CuDnnFusionVisitor(dnn_support_, compilation_results_)
+      .RunOnModule(module, execution_threads);
 }
 
 int CuDnnFusionCompiler::GetAvailablePlanCount(
-    const HloFusionInstruction& hlo) const {
-  se::Stream& stream = *config_.GetStream().value();
-  auto graph = PrepareGraph(hlo, stream);
+    se::StreamExecutor& stream_exec, const HloFusionInstruction& hlo) {
+  auto graph = PrepareGraph(*stream_exec.AsDnn(), hlo);
   if (!graph.ok()) {
     return 0;
   }
diff --git a/third_party/xla/xla/service/gpu/cudnn_fusion_compiler.h b/third_party/xla/xla/service/gpu/cudnn_fusion_compiler.h
index ffd512159ecbb5..e5ce4ddefa7b6d 100644
--- a/third_party/xla/xla/service/gpu/cudnn_fusion_compiler.h
+++ b/third_party/xla/xla/service/gpu/cudnn_fusion_compiler.h
@@ -21,19 +21,24 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
-#include "xla/service/gpu/autotuner_util.h"
 #include "xla/service/hlo_pass_interface.h"
+#include "xla/stream_executor/dnn.h"
+#include "xla/stream_executor/stream_executor.h"
 
 namespace xla {
 namespace gpu {
 
 // Converts HLO fusions with cuDNN backend config to cuDNN graphs,
-// compiles them using a cuDNN handle and stores them in the
-// backend config in serialized form.
+// compiles them using a cuDNN handle and serializes them.
 class CuDnnFusionCompiler : public HloModulePass {
  public:
-  explicit CuDnnFusionCompiler(const AutotuneConfig& config)
-      : config_(config) {}
+  // <HLO computation fingerprint, serialized compiled cuDNN graph>.
+  using BinaryMap = absl::flat_hash_map<std::string, std::string>;
+
+  explicit CuDnnFusionCompiler(se::StreamExecutor& stream_exec,
+                               BinaryMap& compilation_results)
+      : dnn_support_(*stream_exec.AsDnn()),
+        compilation_results_(compilation_results) {}
 
   absl::string_view name() const override { return "cudnn-fusion-compiler"; }
 
@@ -42,10 +47,12 @@ class CuDnnFusionCompiler : public HloModulePass {
       HloModule* module,
       const absl::flat_hash_set<absl::string_view>& execution_threads) override;
 
-  int GetAvailablePlanCount(const HloFusionInstruction& hlo) const;
+  static int GetAvailablePlanCount(se::StreamExecutor& stream_exec,
+                                   const HloFusionInstruction& hlo);
 
  private:
-  AutotuneConfig config_;
+  se::dnn::DnnSupport& dnn_support_;
+  BinaryMap& compilation_results_;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/service/gpu/cudnn_norm_rewriter.cc b/third_party/xla/xla/service/gpu/cudnn_norm_rewriter.cc
index 732ed968a405ea..ec966e18085b11 100644
--- a/third_party/xla/xla/service/gpu/cudnn_norm_rewriter.cc
+++ b/third_party/xla/xla/service/gpu/cudnn_norm_rewriter.cc
@@ -98,8 +98,9 @@ struct NormMetadata {
   // cuDNN. Nullptr if no transposes were inserted.
   HloInstruction *x_transpose, *y_transpose;
   // The reduction and non-reduction dimensions of the input into the forward
-  // layer norm before the potential application of transposes.
-  std::vector<int64_t> norm_dims, non_norm_dims;
+  // layer norm before the potential application of transposes and adjusted for
+  // the removal of any degenerate dimensions in the input to the norm.
+  std::vector<int64_t> norm_dims_adjusted, non_norm_dims_adjusted;
 };
 
 // Map from the instruction pointer of a layer norm Custom Call to its metadata.
@@ -171,6 +172,45 @@ bool CompatibleElementType(const HloInstruction* instr) {
   return element_type == BF16 || element_type == F16 || element_type == F32;
 }
 
+// Returns the dimensions associated with shape, adjusted for the removal of any
+// degenerate dimensions in shape. Specifically, for each dimension d in
+// dimensions, returns the new index of d if all dimensions of size 1 are
+// removed from shape. If d has size 1, it is not included in the returned
+// vector.
+std::vector<int64_t> AdjustedDimensions(const Shape& shape,
+                                        absl::Span<const int64_t> dimensions) {
+  absl::flat_hash_map<int64_t, int64_t> dimension_map;
+  for (int64_t dimension = 0, non_degen_dimension = 0; dimension < shape.rank();
+       ++dimension) {
+    if (shape.dimensions(dimension) > 1) {
+      dimension_map.insert({dimension, non_degen_dimension});
+      non_degen_dimension++;
+    }
+  }
+  std::vector<int64_t> adjusted_dimensions;
+  for (int64_t dimension : dimensions) {
+    auto non_degenerate_dimension = dimension_map.find(dimension);
+    if (non_degenerate_dimension != dimension_map.end()) {
+      adjusted_dimensions.emplace_back(non_degenerate_dimension->second);
+    }
+  }
+  return adjusted_dimensions;
+}
+
+// Returns the dimensions of broadcast or reduction instructions, adjusted for
+// the removal of any degenerate dimensions in the output or input.
+std::vector<int64_t> AdjustedDimensions(const HloInstruction* instr) {
+  Shape shape;
+  if (instr->opcode() == HloOpcode::kBroadcast) {
+    shape = instr->shape();
+  } else if (instr->opcode() == HloOpcode::kReduce) {
+    shape = instr->operand(0)->shape();
+  } else {
+    return {};
+  }
+  return AdjustedDimensions(shape, instr->dimensions());
+}
+
 // Returns whether the HLO Computation applied by instr calculates the sum of
 // the elements. When provided, compares reduce_dims to the dimensions of the
 // reduction.
@@ -179,12 +219,9 @@ bool AppliesAddReduce(const HloInstruction* instr,
   if (instr->opcode() != HloOpcode::kReduce) {
     return false;
   }
-  if (ShapeUtil::HasDegenerateDimensions(instr->operand(0)->shape())) {
-    VLOG(1) << "Reduction input must not have degenerate dimensions.";
-    return false;
-  }
+
   // Verify the dimensions of the reduction.
-  if (!reduce_dims.empty() && instr->dimensions() != reduce_dims) {
+  if (!reduce_dims.empty() && AdjustedDimensions(instr) != reduce_dims) {
     return false;
   }
 
@@ -275,12 +312,6 @@ bool FindTarget(const HloInstruction* custom_call, const HloInstruction* instr,
 std::vector<int64_t> MapDimensions(const Shape& original_shape,
                                    const Shape& reshaped_shape,
                                    const absl::Span<const int64_t> dimensions) {
-  // The original and reshaped shape must not have degenerate dimensions.
-  if (ShapeUtil::HasDegenerateDimensions(original_shape) ||
-      ShapeUtil::HasDegenerateDimensions(reshaped_shape)) {
-    return {};
-  }
-
   auto dimension_product =
       [](const Shape& shape,
          absl::Span<const int64_t> product_dimensions) -> int64_t {
@@ -296,8 +327,9 @@ std::vector<int64_t> MapDimensions(const Shape& original_shape,
   for (int64_t original_dimension = 0, reshaped_dimension = 0;
        original_dimension < original_shape.rank(); ++original_dimension) {
     original_dimensions.emplace_back(original_dimension);
-    while (dimension_product(reshaped_shape, reshaped_dimensions) <
-               dimension_product(original_shape, original_dimensions) &&
+    while ((reshaped_dimensions.empty() ||
+            dimension_product(reshaped_shape, reshaped_dimensions) <
+                dimension_product(original_shape, original_dimensions)) &&
            reshaped_dimension < reshaped_shape.rank()) {
       reshaped_dimensions.emplace_back(reshaped_dimension++);
     }
@@ -554,12 +586,12 @@ auto Expectation(Pattern pattern) {
 // Expected value, or mean, with optional broadcast.
 template <typename Pattern>
 auto Expectation(UniqueHloInstruction* expectation, Pattern pattern) {
-  auto shared_subpattern =
-      MultiplyAnyOrder(m::Broadcast(m::ConstantScalar()), AddReduce(pattern))
+  auto shared_subpattern = OptionalSupportedTransform(
+      m::MultiplyAnyOrder(m::Broadcast(m::ConstantScalar()), AddReduce(pattern))
           .WithPredicate([](const HloInstruction* instr) {
             return CalculatesExpectation(instr);
           })
-          .WithPredicate(expectation->capture_or_verify);
+          .WithPredicate(expectation->capture_or_verify));
   return m::AnyOf<HloInstruction>(m::Broadcast(shared_subpattern),
                                   shared_subpattern);
 }
@@ -568,12 +600,13 @@ auto Expectation(UniqueHloInstruction* expectation, Pattern pattern) {
 template <typename Pattern>
 auto Expectation(UniqueHloInstruction* expectation, HloInstruction** reduce,
                  Pattern pattern) {
-  auto shared_subpattern = MultiplyAnyOrder(m::Broadcast(m::ConstantScalar()),
-                                            AddReduce(reduce, pattern))
-                               .WithPredicate([](const HloInstruction* instr) {
-                                 return CalculatesExpectation(instr);
-                               })
-                               .WithPredicate(expectation->capture_or_verify);
+  auto shared_subpattern = OptionalSupportedTransform(
+      m::MultiplyAnyOrder(m::Broadcast(m::ConstantScalar()),
+                          AddReduce(reduce, pattern))
+          .WithPredicate([](const HloInstruction* instr) {
+            return CalculatesExpectation(instr);
+          })
+          .WithPredicate(expectation->capture_or_verify));
   return m::AnyOf<HloInstruction>(m::Broadcast(shared_subpattern),
                                   shared_subpattern);
 }
@@ -583,14 +616,19 @@ auto Expectation(UniqueHloInstruction* expectation, HloInstruction** reduce,
 auto Variance(UniqueHloInstruction* variance, UniqueHloInstruction* expectation,
               UniqueHloInstruction* x) {
   return m::AnyOf<HloInstruction>(
-      Subtract(Expectation(Square(m::Op().WithPredicate(x->capture_or_verify))),
-               Square(Expectation(expectation,
-                                  m::Op().WithPredicate(x->capture_or_verify))))
+      Subtract(Expectation(Square(OptionalSupportedTransform(
+                   m::Op().WithPredicate(x->capture_or_verify)))),
+               Square(Expectation(expectation, OptionalSupportedTransform(
+                                                   m::Op().WithPredicate(
+                                                       x->capture_or_verify)))))
           .WithPredicate(variance->capture_or_verify),
       Expectation(
-          Square(Subtract(m::Op().WithPredicate(x->capture_or_verify),
-                          Expectation(expectation, m::Op().WithPredicate(
-                                                       x->capture_or_verify)))))
+          Square(Subtract(
+              OptionalSupportedTransform(
+                  m::Op().WithPredicate(x->capture_or_verify)),
+              Expectation(expectation,
+                          OptionalSupportedTransform(
+                              m::Op().WithPredicate(x->capture_or_verify))))))
           .WithPredicate(variance->capture_or_verify));
 }
 
@@ -727,8 +765,8 @@ auto XCenter(UniqueHloInstruction* x_center, UniqueHloInstruction* x,
              UniqueHloInstruction* fused_expectation,
              UniqueHloInstruction* custom_call,
              const NormMetadataMap& norm_metadata) {
-  auto capture_or_verify_x = [x, x_center, custom_call, &norm_metadata](
-                                 const HloInstruction* instr) -> bool {
+  auto capture_or_verify_x =
+      [x, custom_call, &norm_metadata](const HloInstruction* instr) -> bool {
     return x->CaptureOrVerify(
         FindTarget(custom_call->Instr(), instr->operand(0),
                    custom_call->Instr()->operand(0), norm_metadata)
@@ -854,9 +892,11 @@ class CudnnNormRewriterVisitor : public DfsHloRewriteVisitor {
     if (Match(
             instr,
             SubtractMultiplyAddAnyOrder(
-                m::Op().WithPredicate(x.capture_or_verify),
+                OptionalSupportedTransform(
+                    m::Op().WithPredicate(x.capture_or_verify)),
                 Expectation(&expectation, &reduce,
-                            m::Op().WithPredicate(x.capture_or_verify)),
+                            OptionalSupportedTransform(
+                                m::Op().WithPredicate(x.capture_or_verify))),
                 NormFactor(&norm_factor, &x, &variance, &expectation, &epsilon),
                 m::Broadcast(&broadcast_scale, m::Op(&scale)),
                 m::Broadcast(&broadcast_bias, m::Op(&bias))))) {
@@ -889,11 +929,6 @@ class CudnnNormRewriterVisitor : public DfsHloRewriteVisitor {
         return absl::OkStatus();
       }
 
-      // Skip initial convert, if present.
-      if (x.Instr()->opcode() == HloOpcode::kConvert) {
-        x.SetInstr(x.Instr()->mutable_operand(0));
-      }
-
       // Verify the input and output layouts.
       // TODO(philipphack): Consider supporting more general cases.
       if (!LayoutUtil::IsMonotonicWithDim0Major(x.Instr()->shape().layout()) ||
@@ -914,10 +949,15 @@ class CudnnNormRewriterVisitor : public DfsHloRewriteVisitor {
       }
 
       // Verify that the shapes of scale and bias are compatible with the
-      // operation.
+      // operation. The adjusted norm dimensions are the dimensions of the
+      // reduction after removing any degenerate dimensions from the input of
+      // the reduction.
       std::vector<int64_t> norm_dims(reduce->dimensions().begin(),
                                      reduce->dimensions().end());
-      if (norm_dims.size() != scale->shape().dimensions_size()) {
+      std::vector<int64_t> norm_dims_adjusted = AdjustedDimensions(reduce);
+      if (norm_dims_adjusted.size() !=
+          ShapeUtil::DropDegenerateDimensions(scale->shape())
+              .dimensions_size()) {
         VLOG(1) << "Layer norm input dimensions not supported.";
         return absl::OkStatus();
       }
@@ -930,12 +970,14 @@ class CudnnNormRewriterVisitor : public DfsHloRewriteVisitor {
       }
 
       // Verify the broadcasts of scale and bias.
-      if (!ShapeUtil::EqualIgnoringElementType(reduce->operand(0)->shape(),
-                                               broadcast_scale->shape()) ||
-          !ShapeUtil::EqualIgnoringElementType(reduce->operand(0)->shape(),
-                                               broadcast_bias->shape()) ||
-          reduce->dimensions() != broadcast_scale->dimensions() ||
-          reduce->dimensions() != broadcast_bias->dimensions()) {
+      if (!ShapeUtil::EqualIgnoringElementType(
+              ShapeUtil::DropDegenerateDimensions(reduce->operand(0)->shape()),
+              ShapeUtil::DropDegenerateDimensions(broadcast_scale->shape())) ||
+          !ShapeUtil::EqualIgnoringElementType(
+              ShapeUtil::DropDegenerateDimensions(reduce->operand(0)->shape()),
+              ShapeUtil::DropDegenerateDimensions(broadcast_bias->shape())) ||
+          norm_dims_adjusted != AdjustedDimensions(broadcast_scale) ||
+          norm_dims_adjusted != AdjustedDimensions(broadcast_bias)) {
         VLOG(1) << "Layer norm operand broadcast not supported.";
         return absl::OkStatus();
       }
@@ -949,6 +991,9 @@ class CudnnNormRewriterVisitor : public DfsHloRewriteVisitor {
           non_norm_dims.emplace_back(x_dim);
         }
       }
+      std::vector<int64_t> non_norm_dims_adjusted =
+          AdjustedDimensions(x.Instr()->shape(), non_norm_dims);
+
       std::vector<int64_t> x_transpose_order = non_norm_dims;
       x_transpose_order.insert(x_transpose_order.end(), norm_dims.begin(),
                                norm_dims.end());
@@ -1052,9 +1097,10 @@ class CudnnNormRewriterVisitor : public DfsHloRewriteVisitor {
 
       // Store metadata for potential use in the backward graph.
       norm_metadata_.insert(
-          {custom_call, NormMetadata({x_transpose.value_or(nullptr),
-                                      y_transpose.value_or(nullptr), norm_dims,
-                                      non_norm_dims})});
+          {custom_call,
+           NormMetadata({x_transpose.value_or(nullptr),
+                         y_transpose.value_or(nullptr), norm_dims_adjusted,
+                         non_norm_dims_adjusted})});
 
       VLOG(1) << "Layer norm rewritten into Custom Call.";
 
@@ -1255,10 +1301,14 @@ class CudnnNormRewriterVisitor : public DfsHloRewriteVisitor {
       }
 
       // Verify the dimensions of reductions in the backward graph.
-      if (reduce0->dimensions() != norm_metadata->second.norm_dims ||
-          reduce1->dimensions() != norm_metadata->second.norm_dims ||
-          reduce2->dimensions() != norm_metadata->second.norm_dims ||
-          reduce3->dimensions() != norm_metadata->second.norm_dims) {
+      if (AdjustedDimensions(reduce0) !=
+              norm_metadata->second.norm_dims_adjusted ||
+          AdjustedDimensions(reduce1) !=
+              norm_metadata->second.norm_dims_adjusted ||
+          AdjustedDimensions(reduce2) !=
+              norm_metadata->second.norm_dims_adjusted ||
+          AdjustedDimensions(reduce3) !=
+              norm_metadata->second.norm_dims_adjusted) {
         VLOG(1) << "Unexpected reductions dimensions in layer norm gradient.";
         return absl::OkStatus();
       }
@@ -1303,8 +1353,9 @@ class CudnnNormRewriterVisitor : public DfsHloRewriteVisitor {
                                        m::Op().Is(factor1.Instr()))))) {
               // Dscale is an addition-reduction of the product.
               for (HloInstruction* multiply_user : user->users()) {
-                if (AppliesAddReduce(multiply_user,
-                                     norm_metadata->second.non_norm_dims)) {
+                if (AppliesAddReduce(
+                        multiply_user,
+                        norm_metadata->second.non_norm_dims_adjusted)) {
                   return multiply_user;
                 }
               }
@@ -1321,7 +1372,8 @@ class CudnnNormRewriterVisitor : public DfsHloRewriteVisitor {
 
       // Find Dbias, i.e. an addition-reduction of DY, starting from DY.
       // Rewriting proceeds without fusing Dbias if unsuccessful.
-      dbias = FindAddReduce(dy.Instr(), norm_metadata->second.non_norm_dims);
+      dbias = FindAddReduce(dy.Instr(),
+                            norm_metadata->second.non_norm_dims_adjusted);
 
       // Verify the input and output layouts.
       // TODO(philipphack): Consider supporting more general cases.
diff --git a/third_party/xla/xla/service/gpu/cudnn_norm_rewriter_test.cc b/third_party/xla/xla/service/gpu/cudnn_norm_rewriter_test.cc
index 193b9f19d76250..f598e46da9accb 100644
--- a/third_party/xla/xla/service/gpu/cudnn_norm_rewriter_test.cc
+++ b/third_party/xla/xla/service/gpu/cudnn_norm_rewriter_test.cc
@@ -105,7 +105,7 @@ TEST_F(CudnnNormRewriterTest, LayerNorm2D1) {
 
   const char* optimized_hlo = R"(
 
-; CHECK-LABEL: ENTRY %test (input: f32[2,4], scale: f32[4], bias: f32[4]) -> f32[2,4] {
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4], {{.*}}: f32[4], {{.*}}: f32[4]) -> f32[2,4] {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4]{1,0} parameter(0)
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[2,4,1,1]{3,2,1,0} bitcast([[P0]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4]{0} parameter(1)
@@ -174,7 +174,7 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D3) {
 
   const char* optimized_hlo = R"(
 
-; CHECK-LABEL: ENTRY %test (input: f32[2,4,6,8], scale: f32[8], bias: f32[8]) -> f32[2,4,6,8] {
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[8], {{.*}}: f32[8]) -> f32[2,4,6,8] {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[48,8,1,1]{3,2,1,0} bitcast([[P0]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[8]{0} parameter(1)
@@ -193,6 +193,75 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D3) {
   TestNorm(hlo_text, optimized_hlo);
 }
 
+TEST_F(CudnnNormRewriterTest, LayerNorm4D3Degenerate0) {
+#if (CUDA_VERSION < 12000 || CUDNN_VERSION < 8905)
+  GTEST_SKIP() << "Layer norm kernels require CUDA 12 and cuDNN 8.9.5.";
+#endif
+  if (!(GetCudaComputeCapability().major ==
+        se::CudaComputeCapability::AMPERE) &&
+      !(GetCudaComputeCapability().major ==
+        se::CudaComputeCapability::HOPPER)) {
+    GTEST_SKIP()
+        << "Layer norm kernels require Ampere or Hopper architectures.";
+  }
+  const char* hlo_text = R"(
+    HloModule test
+
+    apply {
+      a = f32[] parameter(0)
+      b = f32[] parameter(1)
+      ROOT c = f32[] add(a,b)
+    }
+
+    ENTRY test {
+        input = f32[1,4,6,8] parameter(0)
+        input_square = f32[1,4,6,8] multiply(input, input)
+        c0 = f32[] constant(0)
+        input_square_sum = f32[1,4,6] reduce(input_square, c0), dimensions={3}, to_apply=apply
+        r_nelems = f32[] constant(0.125)
+        r_nelems_bcast = f32[1,4,6] broadcast(r_nelems), dimensions={}
+        input_square_mean = f32[1,4,6] multiply(input_square_sum, r_nelems_bcast)
+        input_sum = f32[1,4,6] reduce(input, c0), dimensions={3}, to_apply=apply
+        input_mean = f32[1,4,6] multiply(input_sum, r_nelems_bcast)
+        input_mean_square = f32[1,4,6] multiply(input_mean, input_mean)
+        variance = f32[1,4,6] subtract(input_square_mean, input_mean_square)
+        epsilon = f32[] constant(0.001)
+        epsilon_bcast = f32[1,4,6] broadcast(epsilon), dimensions={}
+        variance_plus_epsilon = f32[1,4,6] add(variance, epsilon_bcast)
+        norm_factor = f32[1,4,6] rsqrt(variance_plus_epsilon)
+        norm_factor_bcast = f32[1,4,6,8] broadcast(norm_factor), dimensions={0,1,2}
+        input_mean_bcast = f32[1,4,6,8] broadcast(input_mean), dimensions={0,1,2}
+        input_center = f32[1,4,6,8] subtract(input, input_mean_bcast)
+        norm = f32[1,4,6,8] multiply(norm_factor_bcast, input_center)
+        scale = f32[8] parameter(1)
+        scale_bcast = f32[1,4,6,8] broadcast(scale), dimensions={3}
+        norm_scale = f32[1,4,6,8] multiply(norm, scale_bcast)
+        bias = f32[8] parameter(2)
+        bias_bcast = f32[1,4,6,8] broadcast(bias), dimensions={3}
+        ROOT out = f32[1,4,6,8] add(norm_scale, bias_bcast)
+    })";
+
+  const char* optimized_hlo = R"(
+
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[1,4,6,8], {{.*}}: f32[8], {{.*}}: f32[8]) -> f32[1,4,6,8] {
+; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[1,4,6,8]{3,2,1,0} parameter(0)
+; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[24,8,1,1]{3,2,1,0} bitcast([[P0]])
+; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[8]{0} parameter(1)
+; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[8,1,1,1]{3,2,1,0} bitcast([[P1]])
+; CHECK-NEXT:    [[P2:%[^ ]+]] = f32[8]{0} parameter(2)
+; CHECK-NEXT:    [[P2_BITCAST:%[^ ]+]] = f32[8,1,1,1]{3,2,1,0} bitcast([[P2]])
+; CHECK-NEXT:    [[CC:%[^ ]+]] = (f32[24,8,1,1]{3,2,1,0}, u8[{{.*}}]{0}) custom-call([[P0_BITCAST]], [[P1_BITCAST]], [[P2_BITCAST]]),
+; CHECK:           custom_call_target="__cudnn$norm",
+; CHECK:           backend_config={
+; CHECK-DAG:         "epsilon":0.001
+; CHECK:           }
+; CHECK-NEXT:    [[GTE:%[^ ]+]] = f32[24,8,1,1]{3,2,1,0} get-tuple-element([[CC]]), index=0
+; CHECK-NEXT:  ROOT [[GTE_BITCAST:%[^ ]+]] = f32[1,4,6,8]{3,2,1,0} bitcast([[GTE]])
+  )";
+
+  TestNorm(hlo_text, optimized_hlo);
+}
+
 TEST_F(CudnnNormRewriterTest, LayerNorm4D2) {
 #if (CUDA_VERSION < 12000 || CUDNN_VERSION < 8905)
   GTEST_SKIP() << "Layer norm kernels require CUDA 12 and cuDNN 8.9.5.";
@@ -243,7 +312,7 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D2) {
 
   const char* optimized_hlo = R"(
 
-; CHECK-LABEL: ENTRY %test (input: f32[2,4,6,8], scale: f32[6], bias: f32[6]) -> f32[2,4,6,8] {
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[6], {{.*}}: f32[6]) -> f32[2,4,6,8] {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
 ; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,4,8,6]{3,2,1,0} transpose([[P0]]), dimensions={0,1,3,2}
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[64,6,1,1]{3,2,1,0} bitcast([[TRANSPOSE]])
@@ -263,6 +332,76 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D2) {
   TestNorm(hlo_text, optimized_hlo);
 }
 
+TEST_F(CudnnNormRewriterTest, LayerNorm4D2Degenerate1) {
+#if (CUDA_VERSION < 12000 || CUDNN_VERSION < 8905)
+  GTEST_SKIP() << "Layer norm kernels require CUDA 12 and cuDNN 8.9.5.";
+#endif
+  if (!(GetCudaComputeCapability().major ==
+        se::CudaComputeCapability::AMPERE) &&
+      !(GetCudaComputeCapability().major ==
+        se::CudaComputeCapability::HOPPER)) {
+    GTEST_SKIP()
+        << "Layer norm kernels require Ampere or Hopper architectures.";
+  }
+  const char* hlo_text = R"(
+    HloModule test
+
+    apply {
+      a = f32[] parameter(0)
+      b = f32[] parameter(1)
+      ROOT c = f32[] add(a,b)
+    }
+
+    ENTRY test {
+        input = f32[2,1,6,8] parameter(0)
+        input_square = f32[2,1,6,8] multiply(input, input)
+        c0 = f32[] constant(0)
+        input_square_sum = f32[2,1,8] reduce(input_square, c0), dimensions={2}, to_apply=apply
+        r_nelems = f32[] constant(0.166667)
+        r_nelems_bcast = f32[2,1,8] broadcast(r_nelems), dimensions={}
+        input_square_mean = f32[2,1,8] multiply(input_square_sum, r_nelems_bcast)
+        reduce = f32[2,1,8] reduce(input, c0), dimensions={2}, to_apply=apply
+        input_mean = f32[2,1,8] multiply(reduce, r_nelems_bcast)
+        input_mean_square = f32[2,1,8] multiply(input_mean, input_mean)
+        variance = f32[2,1,8] subtract(input_square_mean, input_mean_square)
+        epsilon = f32[] constant(0.001)
+        epsilon_bcast = f32[2,1,8] broadcast(epsilon), dimensions={}
+        variance_plus_epsilon = f32[2,1,8] add(variance, epsilon_bcast)
+        norm_factor = f32[2,1,8] rsqrt(variance_plus_epsilon)
+        norm_factor_bcast = f32[2,1,6,8] broadcast(norm_factor), dimensions={0,1,3}
+        input_mean_bcast = f32[2,1,6,8] broadcast(input_mean), dimensions={0,1,3}
+        input_center = f32[2,1,6,8] subtract(input, input_mean_bcast)
+        norm = f32[2,1,6,8] multiply(norm_factor_bcast, input_center)
+        scale = f32[6] parameter(1)
+        scale_bcast = f32[2,1,6,8] broadcast(scale), dimensions={2}
+        norm_scale = f32[2,1,6,8] multiply(norm, scale_bcast)
+        bias = f32[6] parameter(2)
+        bias_broadcast = f32[2,1,6,8] broadcast(bias), dimensions={2}
+        ROOT out = f32[2,1,6,8] add(norm_scale, bias_broadcast)
+    })";
+
+  const char* optimized_hlo = R"(
+
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,1,6,8], {{.*}}: f32[6], {{.*}}: f32[6]) -> f32[2,1,6,8] {
+; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,1,6,8]{3,2,1,0} parameter(0)
+; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[1,2,8,6]{3,2,1,0} transpose([[P0]]), dimensions={1,0,3,2}
+; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,6,1,1]{3,2,1,0} bitcast([[TRANSPOSE]])
+; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[6]{0} parameter(1)
+; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[6,1,1,1]{3,2,1,0} bitcast([[P1]])
+; CHECK-NEXT:    [[P2:%[^ ]+]] = f32[6]{0} parameter(2)
+; CHECK-NEXT:    [[P2_BITCAST:%[^ ]+]] = f32[6,1,1,1]{3,2,1,0} bitcast([[P2]])
+; CHECK-NEXT:    [[CC:%[^ ]+]] = (f32[16,6,1,1]{3,2,1,0}, u8[{{.*}}]{0}) custom-call([[P0_BITCAST]], [[P1_BITCAST]], [[P2_BITCAST]]),
+; CHECK:           custom_call_target="__cudnn$norm",
+; CHECK:           backend_config={
+; CHECK-DAG:         "epsilon":0.001
+; CHECK:           }
+; CHECK-NEXT:    [[GTE:%[^ ]+]] = f32[16,6,1,1]{3,2,1,0} get-tuple-element([[CC]]), index=0
+; CHECK-NEXT:  ROOT [[FUSION:%[^ ]+]] = f32[2,1,6,8]{3,2,1,0} fusion([[GTE]]), kind=kLoop, calls=[[FUSED_COMPUTATION:%[^ ]+]]
+  )";
+
+  TestNorm(hlo_text, optimized_hlo);
+}
+
 TEST_F(CudnnNormRewriterTest, LayerNorm4D12) {
 #if (CUDA_VERSION < 12000 || CUDNN_VERSION < 8905)
   GTEST_SKIP() << "Layer norm kernels require CUDA 12 and cuDNN 8.9.5.";
@@ -313,7 +452,7 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D12) {
 
   const char* optimized_hlo = R"(
 
-; CHECK-LABEL: ENTRY %test (input: f32[2,4,6,8], scale: f32[4,6], bias: f32[4,6]) -> f32[2,4,6,8] {
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[4,6], {{.*}}: f32[4,6]) -> f32[2,4,6,8] {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
 ; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,8,4,6]{3,2,1,0} transpose([[P0]]), dimensions={0,3,1,2}
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,6,1]{3,2,1,0} bitcast([[TRANSPOSE]])
@@ -333,6 +472,76 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D12) {
   TestNorm(hlo_text, optimized_hlo);
 }
 
+TEST_F(CudnnNormRewriterTest, LayerNorm4D12Degenerate2) {
+#if (CUDA_VERSION < 12000 || CUDNN_VERSION < 8905)
+  GTEST_SKIP() << "Layer norm kernels require CUDA 12 and cuDNN 8.9.5.";
+#endif
+  if (!(GetCudaComputeCapability().major ==
+        se::CudaComputeCapability::AMPERE) &&
+      !(GetCudaComputeCapability().major ==
+        se::CudaComputeCapability::HOPPER)) {
+    GTEST_SKIP()
+        << "Layer norm kernels require Ampere or Hopper architectures.";
+  }
+  const char* hlo_text = R"(
+    HloModule test
+
+    apply {
+      a = f32[] parameter(0)
+      b = f32[] parameter(1)
+      ROOT c = f32[] add(a,b)
+    }
+
+    ENTRY test {
+        input = f32[2,4,1,8] parameter(0)
+        input_square = f32[2,4,1,8] multiply(input, input)
+        c0 = f32[] constant(0)
+        input_square_sum = f32[2,8] reduce(input_square, c0), dimensions={1,2}, to_apply=apply
+        r_nelems = f32[] constant(0.25)
+        r_nelems_bcast = f32[2,8] broadcast(r_nelems), dimensions={}
+        input_square_mean = f32[2,8] multiply(input_square_sum, r_nelems_bcast)
+        reduce = f32[2,8] reduce(input, c0), dimensions={1,2}, to_apply=apply
+        input_mean = f32[2,8] multiply(reduce, r_nelems_bcast)
+        input_mean_square = f32[2,8] multiply(input_mean, input_mean)
+        variance = f32[2,8] subtract(input_square_mean, input_mean_square)
+        epsilon = f32[] constant(0.001)
+        epsilon_bcast = f32[2,8] broadcast(epsilon), dimensions={}
+        variance_plus_epsilon = f32[2,8] add(variance, epsilon_bcast)
+        norm_factor = f32[2,8] rsqrt(variance_plus_epsilon)
+        norm_factor_bcast = f32[2,4,1,8] broadcast(norm_factor), dimensions={0,3}
+        input_mean_bcast = f32[2,4,1,8] broadcast(input_mean), dimensions={0,3}
+        input_center = f32[2,4,1,8] subtract(input, input_mean_bcast)
+        norm = f32[2,4,1,8] multiply(norm_factor_bcast, input_center)
+        scale = f32[4,1] parameter(1)
+        scale_bcast = f32[2,4,1,8] broadcast(scale), dimensions={1,2}
+        norm_scale = f32[2,4,1,8] multiply(norm, scale_bcast)
+        bias = f32[4,1] parameter(2)
+        bias_broadcast = f32[2,4,1,8] broadcast(bias), dimensions={1,2}
+        ROOT out = f32[2,4,1,8] add(norm_scale, bias_broadcast)
+    })";
+
+  const char* optimized_hlo = R"(
+
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,1,8], {{.*}}: f32[4,1], {{.*}}: f32[4,1]) -> f32[2,4,1,8] {
+; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,1,8]{3,2,1,0} parameter(0)
+; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[1,2,8,4]{3,2,1,0} transpose([[P0]]), dimensions={2,0,3,1}
+; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,1,1]{3,2,1,0} bitcast([[TRANSPOSE]])
+; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4,1]{1,0} parameter(1)
+; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[4,1,1,1]{3,2,1,0} bitcast([[P1]])
+; CHECK-NEXT:    [[P2:%[^ ]+]] = f32[4,1]{1,0} parameter(2)
+; CHECK-NEXT:    [[P2_BITCAST:%[^ ]+]] = f32[4,1,1,1]{3,2,1,0} bitcast([[P2]])
+; CHECK-NEXT:    [[CC:%[^ ]+]] = (f32[16,4,1,1]{3,2,1,0}, u8[{{.*}}]{0}) custom-call([[P0_BITCAST]], [[P1_BITCAST]], [[P2_BITCAST]]),
+; CHECK:           custom_call_target="__cudnn$norm",
+; CHECK:           backend_config={
+; CHECK-DAG:         "epsilon":0.001
+; CHECK:           }
+; CHECK-NEXT:    [[GTE:%[^ ]+]] = f32[16,4,1,1]{3,2,1,0} get-tuple-element([[CC]]), index=0
+; CHECK-NEXT:  ROOT  [[FUSION:%[^ ]+]] = f32[2,4,1,8]{3,2,1,0} fusion([[GTE]]), kind=kLoop, calls=[[FUSED_COMPUTATION:%[^ ]+]]
+  )";
+
+  TestNorm(hlo_text, optimized_hlo);
+}
+
 TEST_F(CudnnNormRewriterTest, LayerNorm4D3IncorrectScaleBroadcast) {
 #if (CUDA_VERSION < 12000 || CUDNN_VERSION < 8905)
   GTEST_SKIP() << "Layer norm kernels require CUDA 12 and cuDNN 8.9.5.";
@@ -383,7 +592,7 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D3IncorrectScaleBroadcast) {
 
   const char* optimized_hlo = R"(
 
-; CHECK-LABEL: ENTRY %test (input: f32[2,2,2,2], scale: f32[2], bias: f32[2]) -> f32[2,2,2,2] {
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,2,2,2], {{.*}}: f32[2], {{.*}}: f32[2]) -> f32[2,2,2,2] {
 ; CHECK-NOT:           custom_call_target="__cudnn$norm"
   )";
 
@@ -442,7 +651,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrain2D1) {
 
   const char* optimized_hlo = R"(
 
-; CHECK-LABEL: ENTRY %test (input: f32[2,4], scale: f32[4], bias: f32[4]) -> (f32[2,4], f32[2], f32[2], f32[2]) {
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4], {{.*}}: f32[4], {{.*}}: f32[4]) -> (f32[2,4], f32[2], f32[2], f32[2]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4]{1,0} parameter(0)
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[2,4,1,1]{3,2,1,0} bitcast([[P0]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4]{0} parameter(1)
@@ -519,7 +728,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrain4D3) {
 
   const char* optimized_hlo = R"(
 
-; CHECK-LABEL: ENTRY %test (input: f32[2,4,6,8], scale: f32[8], bias: f32[8]) -> (f32[2,4,6,8], f32[2,4,6], f32[2,4,6], f32[2,4,6]) {
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[8], {{.*}}: f32[8]) -> (f32[2,4,6,8], f32[2,4,6], f32[2,4,6], f32[2,4,6]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[48,8,1,1]{3,2,1,0} bitcast([[P0]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[8]{0} parameter(1)
@@ -596,7 +805,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrain4D12) {
 
   const char* optimized_hlo = R"(
 
-; CHECK-LABEL: ENTRY %test (input: f32[2,4,6,8], scale: f32[4,6], bias: f32[4,6]) -> (f32[2,4,6,8], f32[2,8], f32[2,8], f32[2,8]) {
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[4,6], {{.*}}: f32[4,6]) -> (f32[2,4,6,8], f32[2,8], f32[2,8], f32[2,8]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
 ; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,8,4,6]{3,2,1,0} transpose([[P0]]), dimensions={0,3,1,2}
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,6,1]{3,2,1,0} bitcast([[TRANSPOSE]])
@@ -622,6 +831,84 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrain4D12) {
   TestNorm(hlo_text, optimized_hlo);
 }
 
+TEST_F(CudnnNormRewriterTest, LayerNormTrain4D12Degenerate2) {
+#if (CUDA_VERSION < 12000 || CUDNN_VERSION < 8905)
+  GTEST_SKIP() << "Layer norm kernels require CUDA 12 and cuDNN 8.9.5.";
+#endif
+  if (!(GetCudaComputeCapability().major ==
+        se::CudaComputeCapability::AMPERE) &&
+      !(GetCudaComputeCapability().major ==
+        se::CudaComputeCapability::HOPPER)) {
+    GTEST_SKIP()
+        << "Layer norm kernels require Ampere or Hopper architectures.";
+  }
+  const char* hlo_text = R"(
+    HloModule test
+
+    apply {
+      a = f32[] parameter(0)
+      b = f32[] parameter(1)
+      ROOT c = f32[] add(a,b)
+    }
+
+    ENTRY test {
+        input = f32[2,4,1,8] parameter(0)
+        input_square = f32[2,4,1,8] multiply(input, input)
+        c0 = f32[] constant(0)
+        input_square_sum = f32[2,8] reduce(input_square, c0), dimensions={1,2}, to_apply=apply
+        r_nelems = f32[] constant(0.25)
+        r_nelems_bcast = f32[2,8] broadcast(r_nelems), dimensions={}
+        input_square_mean = f32[2,8] multiply(input_square_sum, r_nelems_bcast)
+        reduce = f32[2,8] reduce(input, c0), dimensions={1,2}, to_apply=apply
+        input_mean = f32[2,8] multiply(reduce, r_nelems_bcast)
+        input_mean_square = f32[2,8] multiply(input_mean, input_mean)
+        variance = f32[2,8] subtract(input_square_mean, input_mean_square)
+        epsilon = f32[] constant(0.001)
+        epsilon_bcast = f32[2,8] broadcast(epsilon), dimensions={}
+        variance_plus_epsilon = f32[2,8] add(variance, epsilon_bcast)
+        norm_factor = f32[2,8] rsqrt(variance_plus_epsilon)
+        norm_factor_bcast = f32[2,4,1,8] broadcast(norm_factor), dimensions={0,3}
+        input_mean_bcast = f32[2,4,1,8] broadcast(input_mean), dimensions={0,3}
+        input_center = f32[2,4,1,8] subtract(input, input_mean_bcast)
+        norm = f32[2,4,1,8] multiply(norm_factor_bcast, input_center)
+        scale = f32[4,1] parameter(1)
+        scale_bcast = f32[2,4,1,8] broadcast(scale), dimensions={1,2}
+        norm_scale = f32[2,4,1,8] multiply(norm, scale_bcast)
+        bias = f32[4,1] parameter(2)
+        bias_broadcast = f32[2,4,1,8] broadcast(bias), dimensions={1,2}
+        norm_scale_bias = f32[2,4,1,8] add(norm_scale, bias_broadcast)
+        norm_factor_cube = f32[2,8] divide(norm_factor, variance_plus_epsilon)
+        ROOT out = (f32[2,4,1,8], f32[2,8], f32[2,8], f32[2,8]) tuple(norm_scale_bias, input_mean, norm_factor, norm_factor_cube)
+    })";
+
+  const char* optimized_hlo = R"(
+
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,1,8], {{.*}}: f32[4,1], {{.*}}: f32[4,1]) -> (f32[2,4,1,8], f32[2,8], f32[2,8], f32[2,8]) {
+; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,1,8]{3,2,1,0} parameter(0)
+; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[1,2,8,4]{3,2,1,0} transpose([[P0]]), dimensions={2,0,3,1}
+; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,1,1]{3,2,1,0} bitcast([[TRANSPOSE]])
+; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4,1]{1,0} parameter(1)
+; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[4,1,1,1]{3,2,1,0} bitcast([[P1]])
+; CHECK-NEXT:    [[P2:%[^ ]+]] = f32[4,1]{1,0} parameter(2)
+; CHECK-NEXT:    [[P2_BITCAST:%[^ ]+]] = f32[4,1,1,1]{3,2,1,0} bitcast([[P2]])
+; CHECK-NEXT:    [[CC:%[^ ]+]] = (f32[16,4,1,1]{3,2,1,0}, f32[16,1,1,1]{3,2,1,0}, f32[16,1,1,1]{3,2,1,0}, u8[{{.*}}]{0}) custom-call([[P0_BITCAST]], [[P1_BITCAST]], [[P2_BITCAST]]),
+; CHECK:           custom_call_target="__cudnn$norm",
+; CHECK:           backend_config={
+; CHECK-DAG:         "epsilon":0.001
+; CHECK:           }
+; CHECK-NEXT:    [[GTE0:%[^ ]+]] = f32[16,4,1,1]{3,2,1,0} get-tuple-element([[CC]]), index=0
+; CHECK-NEXT:    [[FUSION0:%[^ ]+]] = f32[2,4,1,8]{3,2,1,0} fusion([[GTE0]]), kind=kLoop, calls=[[FUSED_COMPUTATION0:%[^ ]+]]
+; CHECK-NEXT:    [[GTE1:%[^ ]+]] = f32[16,1,1,1]{3,2,1,0} get-tuple-element([[CC]]), index=1
+; CHECK-NEXT:    [[GTE1_BITCAST:%[^ ]+]] = f32[2,8]{1,0} bitcast([[GTE1]])
+; CHECK-NEXT:    [[GTE2:%[^ ]+]] = f32[16,1,1,1]{3,2,1,0} get-tuple-element([[CC]]), index=2
+; CHECK-NEXT:    [[GTE2_BITCAST:%[^ ]+]] = f32[2,8]{1,0} bitcast([[GTE2]])
+; CHECK-NEXT:    [[FUSION1:%[^ ]+]] = f32[2,8]{1,0} fusion([[GTE2]]), kind=kLoop, calls=[[FUSED_COMPUTATION1:%[^ ]+]]
+; CHECK-NEXT:  ROOT [[OUT:%[^ ]+]] = (f32[2,4,1,8]{3,2,1,0}, f32[2,8]{1,0}, f32[2,8]{1,0}, f32[2,8]{1,0}) tuple([[FUSION0]], [[GTE1_BITCAST]], [[GTE2_BITCAST]], [[FUSION1]])
+  )";
+
+  TestNorm(hlo_text, optimized_hlo);
+}
+
 TEST_F(CudnnNormRewriterTest, LayerNormTrainBackward2D1) {
 #if (CUDA_VERSION < 12000 || CUDNN_VERSION < 8905)
   GTEST_SKIP() << "Layer norm kernels require CUDA 12 and cuDNN 8.9.5.";
@@ -700,7 +987,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrainBackward2D1) {
 
   const char* optimized_hlo = R"(
 
-; CHECK-LABEL: ENTRY %test (input: f32[2,4], scale: f32[4], bias: f32[4], doutput: f32[2,4]) -> (f32[2,4], f32[2,4], f32[4], f32[4]) {
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4], {{.*}}: f32[4], {{.*}}: f32[4], {{.*}}: f32[2,4]) -> (f32[2,4], f32[2,4], f32[4], f32[4]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4]{1,0} parameter(0)
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[2,4,1,1]{3,2,1,0} bitcast([[P0]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4]{0} parameter(1)
@@ -815,7 +1102,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrainBackward4D3) {
 
   const char* optimized_hlo = R"(
 
-; CHECK-LABEL: ENTRY %test (input: f32[2,4,6,8], scale: f32[8], bias: f32[8], doutput: f32[2,4,6,8]) -> (f32[2,4,6,8], f32[2,4,6,8], f32[8], f32[8]) {
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[8], {{.*}}: f32[8], {{.*}}: f32[2,4,6,8]) -> (f32[2,4,6,8], f32[2,4,6,8], f32[8], f32[8]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[48,8,1,1]{3,2,1,0} bitcast([[P0]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[8]{0} parameter(1)
@@ -930,7 +1217,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrainBackward4D2) {
 
   const char* optimized_hlo = R"(
 
-; CHECK-LABEL: ENTRY %test (input: f32[2,4,6,8], scale: f32[6], bias: f32[6], doutput: f32[2,4,6,8]) -> (f32[2,4,6,8], f32[2,4,6,8], f32[6], f32[6]) {
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[6], {{.*}}: f32[6], {{.*}}: f32[2,4,6,8]) -> (f32[2,4,6,8], f32[2,4,6,8], f32[6], f32[6]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
 ; CHECK-NEXT:    [[TRANSPOSE0:%[^ ]+]] = f32[2,4,8,6]{3,2,1,0} transpose([[P0]]), dimensions={0,1,3,2}
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[64,6,1,1]{3,2,1,0} bitcast([[TRANSPOSE0]])
@@ -1048,7 +1335,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrainBackward4D12) {
 
   const char* optimized_hlo = R"(
 
-; CHECK-LABEL: ENTRY %test (input: f32[2,4,6,8], scale: f32[4,6], bias: f32[4,6], doutput: f32[2,4,6,8]) -> (f32[2,4,6,8], f32[2,4,6,8], f32[4,6], f32[4,6]) {
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[4,6], {{.*}}: f32[4,6], {{.*}}: f32[2,4,6,8]) -> (f32[2,4,6,8], f32[2,4,6,8], f32[4,6], f32[4,6]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
 ; CHECK-NEXT:    [[TRANSPOSE0:%[^ ]+]] = f32[2,8,4,6]{3,2,1,0} transpose([[P0]]), dimensions={0,3,1,2}
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,6,1]{3,2,1,0} bitcast([[TRANSPOSE0]])
@@ -1088,6 +1375,124 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrainBackward4D12) {
   TestNorm(hlo_text, optimized_hlo);
 }
 
+TEST_F(CudnnNormRewriterTest, LayerNormTrainBackward4D12Degenerate2) {
+#if (CUDA_VERSION < 12000 || CUDNN_VERSION < 8905)
+  GTEST_SKIP() << "Layer norm kernels require CUDA 12 and cuDNN 8.9.5.";
+#endif
+  if (!(GetCudaComputeCapability().major ==
+        se::CudaComputeCapability::AMPERE) &&
+      !(GetCudaComputeCapability().major ==
+        se::CudaComputeCapability::HOPPER)) {
+    GTEST_SKIP()
+        << "Layer norm kernels require Ampere or Hopper architectures.";
+  }
+  const char* hlo_text = R"(
+    HloModule test
+
+    apply {
+      a = f32[] parameter(0)
+      b = f32[] parameter(1)
+      ROOT c = f32[] add(a,b)
+    }
+
+    ENTRY test {
+        input = f32[2,4,1,8] parameter(0)
+        input_square = f32[2,4,1,8] multiply(input, input)
+        c0 = f32[] constant(0)
+        input_square_sum = f32[2,8] reduce(input_square, c0), dimensions={1,2}, to_apply=apply
+        reduce = f32[2,8] reduce(input, c0), dimensions={1,2}, to_apply=apply
+        r_nelems = f32[] constant(0.25)
+        r_nelems_bcast = f32[2,8] broadcast(r_nelems), dimensions={}
+        input_square_mean = f32[2,8] multiply(input_square_sum,r_nelems_bcast)
+        input_mean = f32[2,8] multiply(reduce, r_nelems_bcast)
+        input_mean_square = f32[2,8] multiply(input_mean,input_mean)
+        variance = f32[2,8] subtract(input_square_mean,input_mean_square)
+        epsilon = f32[] constant(0.001)
+        epsilon_bcast = f32[2,8] broadcast(epsilon), dimensions={}
+        variance_plus_epsilon = f32[2,8] add(variance, epsilon_bcast)
+        norm_factor = f32[2,8] rsqrt(variance_plus_epsilon)
+        norm_factor_bcast = f32[2,4,1,8] broadcast(norm_factor), dimensions={0,3}
+        input_mean_bcast = f32[2,4,1,8] broadcast(input_mean), dimensions={0,3}
+        input_center = f32[2,4,1,8] subtract(input, input_mean_bcast)
+        norm = f32[2,4,1,8] multiply(input_center, norm_factor_bcast)
+        scale = f32[4,1] parameter(1)
+        scale_bcast = f32[2,4,1,8] broadcast(scale), dimensions={1,2}
+        norm_scale = f32[2,4,1,8] multiply(norm, scale_bcast)
+        bias = f32[4,1] parameter(2)
+        bias_bcast = f32[2,4,1,8] broadcast(bias), dimensions={1,2}
+        norm_scale_bias = f32[2,4,1,8] add(norm_scale, bias_bcast)
+        doutput = f32[2,4,1,8] parameter(3)
+        dbias = f32[4,1] reduce(doutput, c0), dimensions={0,3}, to_apply=apply
+        norm_doutput = f32[2,4,1,8] multiply(norm, doutput)
+        dscale = f32[4,1] reduce(norm_doutput, c0), dimensions={0,3}, to_apply=apply
+        scale_doutput = f32[2,4,1,8] multiply(scale_bcast, doutput)
+        input_center_scale_doutput = f32[2,4,1,8] multiply(input_center, scale_doutput)
+        f0 = f32[2,8] reduce(input_center_scale_doutput, c0), dimensions={1,2}, to_apply=apply
+        norm_factor_cube = f32[2,8] divide(norm_factor, variance_plus_epsilon)
+        c1 = f32[] constant(-0.5)
+        c1_bcast = f32[2,8] broadcast(c1), dimensions={}
+        dnorm_factor = f32[2,8] multiply(norm_factor_cube, c1_bcast)
+        f0_dnorm_factor = f32[2,8] multiply(f0, dnorm_factor)
+        c2 = f32[] constant(0.5)
+        c2_bcast = f32[2,8] broadcast(c2), dimensions={}
+        f0_dnorm_factor_scaled = f32[2,8] multiply(f0_dnorm_factor, c2_bcast)
+        f0_dnorm_factor_scaled_bcast = f32[2,4,1,8] broadcast(f0_dnorm_factor_scaled), dimensions={0,3}
+        f1 = f32[2,4,1,8] multiply(input_center, f0_dnorm_factor_scaled_bcast)
+        minus_f1 = f32[2,4,1,8] negate(f1)
+        minus_f1_sum = f32[2,8] reduce(minus_f1, c0), dimensions={1,2}, to_apply=apply
+        f2 = f32[2,4,1,8] multiply(norm_factor_bcast, scale_doutput)
+        minus_f2 = f32[2,4,1,8] negate(f2)
+        minus_f2_sum = f32[2,8] reduce(minus_f2, c0), dimensions={1,2}, to_apply=apply
+        minus_f1_f2_sum = f32[2,8] add(minus_f1_sum, minus_f2_sum)
+        minus_f1_f2_sum_scaled = f32[2,8] multiply(minus_f1_f2_sum, r_nelems_bcast)
+        minus_f1_f2_sum_scaled_bcast = f32[2,4,1,8] broadcast(minus_f1_f2_sum_scaled), dimensions={0,3}
+        f1_f2 = f32[2,4,1,8] add(f1, f2)
+        dinput = f32[2,4,1,8] add(f1_f2, minus_f1_f2_sum_scaled_bcast)
+        ROOT out = (f32[2,4,1,8], f32[2,4,1,8], f32[4,1], f32[4,1]) tuple(norm_scale_bias, dinput, dscale, dbias)
+    })";
+
+  const char* optimized_hlo = R"(
+
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,1,8], {{.*}}: f32[4,1], {{.*}}: f32[4,1], {{.*}}: f32[2,4,1,8]) -> (f32[2,4,1,8], f32[2,4,1,8], f32[4,1], f32[4,1]) {
+; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,1,8]{3,2,1,0} parameter(0)
+; CHECK-NEXT:    [[TRANSPOSE0:%[^ ]+]] = f32[1,2,8,4]{3,2,1,0} transpose([[P0]]), dimensions={2,0,3,1}
+; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,1,1]{3,2,1,0} bitcast([[TRANSPOSE0]])
+; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4,1]{1,0} parameter(1)
+; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[4,1,1,1]{3,2,1,0} bitcast([[P1]])
+; CHECK-NEXT:    [[P2:%[^ ]+]] = f32[4,1]{1,0} parameter(2)
+; CHECK-NEXT:    [[P2_BITCAST:%[^ ]+]] = f32[4,1,1,1]{3,2,1,0} bitcast([[P2]])
+; CHECK-NEXT:    [[CC0:%[^ ]+]] = (f32[16,4,1,1]{3,2,1,0}, f32[16,1,1,1]{3,2,1,0}, f32[16,1,1,1]{3,2,1,0}, u8[{{.*}}]{0}) custom-call([[P0_BITCAST]], [[P1_BITCAST]], [[P2_BITCAST]]),
+; CHECK:           custom_call_target="__cudnn$norm",
+; CHECK:           backend_config={
+; CHECK-DAG:         "epsilon":0.001
+; CHECK-DAG:         "kind":"LAYER_FWD_TRAIN"
+; CHECK:           }
+; CHECK-DAG:     [[GTE0:%[^ ]+]] = f32[16,4,1,1]{3,2,1,0} get-tuple-element([[CC0]]), index=0
+; CHECK-DAG:     [[P3:%[^ ]+]] = f32[2,4,1,8]{3,2,1,0} parameter(3)
+; CHECK-NEXT:    [[TRANSPOSE1:%[^ ]+]] = f32[1,2,8,4]{3,2,1,0} transpose([[P3]]), dimensions={2,0,3,1}
+; CHECK-DAG:     [[P3_BITCAST:%[^ ]+]] = f32[16,4,1,1]{3,2,1,0} bitcast([[TRANSPOSE1]])
+; CHECK-DAG:     [[GTE1:%[^ ]+]] = f32[16,1,1,1]{3,2,1,0} get-tuple-element([[CC0]]), index=1
+; CHECK-DAG:     [[GTE2:%[^ ]+]] = f32[16,1,1,1]{3,2,1,0} get-tuple-element([[CC0]]), index=2
+; CHECK-NEXT:    [[CC1:%[^ ]+]] = (f32[16,4,1,1]{3,2,1,0}, f32[4,1,1,1]{3,2,1,0}, f32[4,1,1,1]{3,2,1,0}, u8[{{.*}}]{0}) custom-call([[P0_BITCAST]], [[P1_BITCAST]], [[P3_BITCAST]], [[GTE1]], [[GTE2]]),
+; CHECK:           custom_call_target="__cudnn$norm",
+; CHECK:           backend_config={
+; CHECK-DAG:         "epsilon":0
+; CHECK-DAG:         "kind":"LAYER_BWD"
+; CHECK:           }
+; CHECK-DAG:     [[GTE3:%[^ ]+]] = f32[16,4,1,1]{3,2,1,0} get-tuple-element([[CC1]]), index=0
+; CHECK-DAG:     [[FUSION0:%[^ ]+]] = (f32[2,4,1,8]{3,2,1,0}, f32[2,4,1,8]{3,2,1,0}) fusion([[GTE0]], [[GTE3]]), kind=kLoop, calls=[[FUSED_COMPUTATION0:%[^ ]+]]
+; CHECK-DAG:     [[GTEF0:%[^ ]+]] = f32[2,4,1,8]{3,2,1,0} get-tuple-element([[FUSION0]]), index=0
+; CHECK-DAG:     [[GTEF1:%[^ ]+]] = f32[2,4,1,8]{3,2,1,0} get-tuple-element([[FUSION0]]), index=1
+; CHECK-DAG:     [[GTE4:%[^ ]+]] = f32[4,1,1,1]{3,2,1,0} get-tuple-element([[CC1]]), index=1
+; CHECK-DAG:     [[GTE4_BITCAST:%[^ ]+]] = f32[4,1]{1,0} bitcast([[GTE4]])
+; CHECK-DAG:     [[GTE5:%[^ ]+]] = f32[4,1,1,1]{3,2,1,0} get-tuple-element([[CC1]]), index=2
+; CHECK-DAG:     [[GTE5_BITCAST:%[^ ]+]] = f32[4,1]{1,0} bitcast([[GTE5]])
+; CHECK-DAG:  ROOT [[OUT:%[^ ]+]] = (f32[2,4,1,8]{3,2,1,0}, f32[2,4,1,8]{3,2,1,0}, f32[4,1]{1,0}, f32[4,1]{1,0}) tuple([[GTEF0]], [[GTEF1]], [[GTE4_BITCAST]], [[GTE5_BITCAST]])
+  )";
+
+  TestNorm(hlo_text, optimized_hlo);
+}
+
 TEST_F(CudnnNormRewriterTest, LayerNormTrainBackward4D1DoutputReshapeSplit) {
 #if (CUDA_VERSION < 12000 || CUDNN_VERSION < 8905)
   GTEST_SKIP() << "Layer norm kernels require CUDA 12 and cuDNN 8.9.5.";
@@ -1167,7 +1572,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrainBackward4D1DoutputReshapeSplit) {
 
   const char* optimized_hlo = R"(
 
-; CHECK-LABEL: ENTRY %test (input: f32[2,4,6,8], scale: f32[4], bias: f32[4], doutput: f32[2,4,48]) -> (f32[2,4,6,8], f32[2,4,6,8], f32[4], f32[4]) {
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[4], {{.*}}: f32[4], {{.*}}: f32[2,4,48]) -> (f32[2,4,6,8], f32[2,4,6,8], f32[4], f32[4]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
 ; CHECK-NEXT:    [[TRANSPOSE0:%[^ ]+]] = f32[2,6,8,4]{3,2,1,0} transpose([[P0]]), dimensions={0,2,3,1}
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[96,4,1,1]{3,2,1,0} bitcast([[TRANSPOSE0]])
@@ -1286,7 +1691,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrainBackward4D1DoutputReshapeCombine) {
 
   const char* optimized_hlo = R"(
 
-; CHECK-LABEL: ENTRY %test (input: f32[2,4,6,8], scale: f32[4], bias: f32[4], doutput: f32[2,4,6,2,2,2]) -> (f32[2,4,6,8], f32[2,4,6,8], f32[4], f32[4]) {
+; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[4], {{.*}}: f32[4], {{.*}}: f32[2,4,6,2,2,2]) -> (f32[2,4,6,8], f32[2,4,6,8], f32[4], f32[4]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
 ; CHECK-NEXT:    [[TRANSPOSE0:%[^ ]+]] = f32[2,6,8,4]{3,2,1,0} transpose([[P0]]), dimensions={0,2,3,1}
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[96,4,1,1]{3,2,1,0} bitcast([[TRANSPOSE0]])
diff --git a/third_party/xla/xla/service/gpu/custom_call_test.cc b/third_party/xla/xla/service/gpu/custom_call_test.cc
index aad23d161985b6..e9fe12d6a7c4c5 100644
--- a/third_party/xla/xla/service/gpu/custom_call_test.cc
+++ b/third_party/xla/xla/service/gpu/custom_call_test.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
+#include "absl/types/span.h"
 #include "xla/client/lib/constants.h"
 #include "xla/client/xla_builder.h"
 #include "xla/ffi/ffi.h"
@@ -614,6 +615,41 @@ TEST_F(CustomCallTest, ExportedFfiWithStatusSucceeded) {
   TF_ASSERT_OK(Execute(&b, {}).status());
 }
 
+//===----------------------------------------------------------------------===//
+// XLA:FFI handler for testing attributes decoding
+//===----------------------------------------------------------------------===//
+
+static absl::Status FfiAttributes(ffi::BufferBase,
+                                  absl::Span<const int32_t> i32_arr) {
+  if (i32_arr.size() != 4)
+    return absl::InternalError("i32_arr size does not match");
+
+  if (i32_arr[0] != 1 || i32_arr[1] != 2 || i32_arr[2] != 3 || i32_arr[3] != 4)
+    return absl::InternalError("i32_arr values do not match");
+
+  return absl::OkStatus();
+}
+
+XLA_FFI_DEFINE_HANDLER(kFfiAttributes, FfiAttributes,
+                       ffi::Ffi::Bind()
+                           .Arg<ffi::BufferBase>()
+                           .Attr<absl::Span<const int32_t>>("i32_arr"));
+
+XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "xla.gpu.ffi_attributes",
+                         PLATFORM, kFfiAttributes);
+
+TEST_F(CustomCallTest, FfiAttributes) {
+  XlaBuilder b(TestName());
+  CustomCall(&b, "xla.gpu.ffi_attributes", /*operands=*/{},
+             ShapeUtil::MakeShape(F32, {}),
+             /*opaque=*/"{ i32_arr = array<i32: 1, 2, 3, 4> }",
+             /*has_side_effect=*/false,
+             /*output_operand_aliasing=*/{}, /*literal=*/nullptr,
+             /*schedule=*/CustomCallSchedule::SCHEDULE_NONE,
+             /*api_version=*/CustomCallApiVersion::API_VERSION_TYPED_FFI);
+  TF_ASSERT_OK(Execute(&b, {}).status());
+}
+
 //===----------------------------------------------------------------------===//
 // XLA:FFI handler with attached HloComputation
 //===----------------------------------------------------------------------===//
@@ -631,6 +667,10 @@ static absl::Status MemcpyWithCalledComputation(
   if (!DynCast<HloParameterInstruction>(called_computation->root_instruction()))
     return absl::InternalError("ROOT must be a paremeter");
 
+  // Check that scratch allocator is working.
+  auto scratch = scratch_allocator.AllocateBytes(1024);
+  if (!scratch.ok()) return scratch.status();
+
   return Memcpy(stream, src, dst);
 }
 
diff --git a/third_party/xla/xla/service/gpu/executable.proto b/third_party/xla/xla/service/gpu/executable.proto
index 970d57f0604dd0..e66c48c4762b2d 100644
--- a/third_party/xla/xla/service/gpu/executable.proto
+++ b/third_party/xla/xla/service/gpu/executable.proto
@@ -44,4 +44,5 @@ message CompilationResultProto {
   BufferAssignmentProto buffer_assignment = 2;
   string asm_text = 3;
   bytes binary = 4;
+  map<string, string> dnn_compiled_graphs = 5;
 }
diff --git a/third_party/xla/xla/service/gpu/fusion_merger_triton.cc b/third_party/xla/xla/service/gpu/fusion_merger_triton.cc
deleted file mode 100644
index 9c8c4264ad6f57..00000000000000
--- a/third_party/xla/xla/service/gpu/fusion_merger_triton.cc
+++ /dev/null
@@ -1,286 +0,0 @@
-/* Copyright 2023 The OpenXLA Authors.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/service/gpu/fusion_merger_triton.h"
-
-#include <memory>
-#include <optional>
-#include <utility>
-
-#include "absl/container/flat_hash_set.h"
-#include "absl/log/check.h"
-#include "absl/log/log.h"
-#include "absl/strings/string_view.h"
-#include "xla/hlo/ir/hlo_casting_utils.h"
-#include "xla/hlo/ir/hlo_computation.h"
-#include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/hlo/ir/hlo_instructions.h"
-#include "xla/hlo/ir/hlo_module.h"
-#include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/service/gpu/backend_configs.pb.h"
-#include "xla/service/gpu/gpu_fusible.h"
-#include "xla/service/gpu/ir_emission_utils.h"
-#include "xla/service/gpu/triton_fusion_analysis.h"
-#include "xla/status.h"
-#include "xla/statusor.h"
-#include "xla/util.h"
-#include "xla/xla_data.pb.h"
-#include "tsl/platform/logging.h"
-#include "tsl/platform/status.h"
-
-namespace xla::gpu {
-namespace {
-
-// Taking in a producer HloFusionInstruction, tries to merge into consumer
-// triton softmax fusion.
-// The following is assumed:
-//  * The producer is an HloFusionInstruction
-//  * The (sole) consumer of the producer is a triton softmax fusion
-//
-// Returns std::optional<HloFusionInstruction*>, pointing to the new (fused)
-// triton softmax instruction if the producer was successfully merged into the
-// consumer. If the merge was unsuccessful, the original computation remains
-// unchanged and a nullopt is returned.
-std::optional<HloFusionInstruction*>
-TryMergeFusionProducerIntoTritonSoftmaxConsumer(
-    HloFusionInstruction* producer) {
-  // TODO(b/313026024): Add support for multiple users
-  CHECK_EQ(producer->user_count(), 1);
-
-  HloComputation* computation = producer->parent();
-  HloModule* parent_module = computation->parent();
-  HloInstruction* original_softmax_instruction = producer->users().front();
-  CHECK_EQ(original_softmax_instruction->opcode(), HloOpcode::kFusion);
-
-  std::unique_ptr<HloInstruction> candidate =
-      original_softmax_instruction->Clone();
-  HloInstruction* candidate_fusion = candidate.get();
-
-  // Try to merge the producer into candidate fusion.
-  candidate_fusion->MergeFusionInstruction(producer);
-
-  HloComputation* fused_computation =
-      candidate_fusion->called_computations().front();
-
-  const auto analysis = TritonFusionAnalysis::Execute(*fused_computation);
-
-  if (!analysis.ok()) {
-    return std::nullopt;
-  }
-
-  computation->AddInstruction(std::move(candidate));
-
-  if (original_softmax_instruction->IsRoot()) {
-    computation->set_root_instruction(candidate_fusion);
-  }
-
-  TF_CHECK_OK(
-      original_softmax_instruction->ReplaceAllUsesWith(candidate_fusion));
-
-  HloComputation* original_softmax_computation =
-      original_softmax_instruction->fused_instructions_computation();
-  TF_CHECK_OK(computation->RemoveInstruction(original_softmax_instruction));
-  TF_CHECK_OK(
-      parent_module->RemoveEmbeddedComputation(original_softmax_computation));
-
-  CHECK_EQ(0, producer->user_count()) << producer->ToString();
-  HloComputation* original_producer_computation =
-      producer->fused_instructions_computation();
-  TF_CHECK_OK(computation->RemoveInstruction(producer));
-  TF_CHECK_OK(
-      parent_module->RemoveEmbeddedComputation(original_producer_computation));
-
-  return Cast<HloFusionInstruction>(candidate_fusion);
-}
-
-// Taking in a consumer HloFusionInstruction and a HloInstruction for a triton
-// softmax fusion, tries to merge the consumer fusion into the softmax fusion.
-// The following is assumed:
-//  * The consumer is an HloFusionInstruction
-//  * consumer->shape().IsArray(), i.e. not a multi-output consumer
-//  * The original_softmax_instr is a triton softmax fusion
-//  * The consumer is the sole user of original_softmax_instr
-//
-// Returns std::optional<HloFusionInstruction*>, pointing to the new (fused)
-// triton softmax instruction if the consumer was successfully merged into the
-// producer. If the merge was unsuccessful, the original computation remains
-// unchanged and a nullopt is returned.
-std::optional<HloFusionInstruction*>
-TryMergeFusionConsumerIntoTritonSoftmaxProducer(
-    HloFusionInstruction* consumer,
-    HloFusionInstruction* original_softmax_instr) {
-  CHECK_EQ(original_softmax_instr->opcode(), HloOpcode::kFusion);
-  CHECK_EQ(original_softmax_instr->user_count(), 1);
-  CHECK_EQ(original_softmax_instr->users().front(), consumer);
-  CHECK(consumer->shape().IsArray());
-  CHECK_OK(original_softmax_instr->backend_config<GpuBackendConfig>());
-  CHECK_EQ(original_softmax_instr->backend_config<GpuBackendConfig>()
-               ->fusion_backend_config()
-               .kind(),
-           kTritonSoftmaxFusionKind);
-  HloComputation* parent_computation = consumer->parent();
-  HloModule* parent_module = parent_computation->parent();
-
-  // We clone the consumer to generate a candidate that we fuse into.
-  std::unique_ptr<HloInstruction> candidate_instr_ptr = consumer->Clone();
-  HloInstruction* consumer_candidate_instr = candidate_instr_ptr.get();
-
-  // Try to merge the producer into candidate fusion.
-  consumer_candidate_instr->MergeFusionInstruction(original_softmax_instr);
-  HloComputation* fused_computation =
-      consumer_candidate_instr->fused_instructions_computation();
-
-  const auto analysis = TritonFusionAnalysis::Execute(*fused_computation);
-
-  if (!analysis.ok()) {
-    return std::nullopt;
-  }
-
-  // We want our joined fusion to have the correct fusion_kind, backend_config,
-  // etc for a triton fusion. So we assemble a new instruction rather than
-  // using consumer_candidate_instr, which would not get triton codegen'd.
-  std::unique_ptr<HloInstruction> new_softmax_instr_ptr =
-      HloInstruction::CreateFusion(
-          /*shape=*/consumer_candidate_instr->shape(),
-          /*fusion_kind=*/original_softmax_instr->fusion_kind(),
-          /*operands=*/consumer_candidate_instr->operands(),
-          /*fusion_computation=*/fused_computation,
-          /*prefix=*/"triton_softmax_");
-
-  HloInstruction* new_softmax_instr = new_softmax_instr_ptr.get();
-
-  new_softmax_instr->CopyBackendConfigFrom(original_softmax_instr);
-
-  // Now, we incorporate new_softmax_instr into our module.
-  parent_computation->AddInstruction(std::move(new_softmax_instr_ptr));
-
-  if (consumer->IsRoot()) {
-    parent_computation->set_root_instruction(new_softmax_instr);
-  }
-
-  TF_CHECK_OK(consumer->ReplaceAllUsesWith(new_softmax_instr));
-
-  // Remove the replaced instructions and computations from the module.
-  HloComputation* original_consumer_computation =
-      consumer->fused_instructions_computation();
-  TF_CHECK_OK(parent_computation->RemoveInstruction(consumer));
-  TF_CHECK_OK(
-      parent_module->RemoveEmbeddedComputation(original_consumer_computation));
-
-  CHECK_EQ(0, original_softmax_instr->user_count());
-
-  // Keep a ptr to the original computation so we can remove it from the module.
-  HloComputation* original_softmax_computation =
-      original_softmax_instr->fused_instructions_computation();
-
-  TF_CHECK_OK(parent_computation->RemoveInstruction(original_softmax_instr));
-  TF_CHECK_OK(
-      parent_module->RemoveEmbeddedComputation(original_softmax_computation));
-
-  return Cast<HloFusionInstruction>(new_softmax_instr);
-}
-
-bool TryMergeProducerAndConsumerFusionsIntoTritonSoftmax(
-    HloFusionInstruction* softmax_fusion) {
-  // The softmax_fusion should come directly from the matcher. They might have
-  // more than a single operand, in this case attempt to fuse into the first
-  // operand only.
-  if (softmax_fusion->operand_count() > 1) {
-    LOG(INFO) << "More than one parameter detected. Will attempt to merge "
-                 "fusions only for operand 0 (diamond producer).";
-  }
-
-  // TODO(b/313026024): Add support for multiple users
-  bool should_try_merging_producer =
-      softmax_fusion->operand(0)->user_count() == 1 &&
-      softmax_fusion->operand(0)->opcode() == HloOpcode::kFusion;
-  // TODO(b/315040476): generalize for multiple users and multi-output
-  bool should_try_merging_consumer =
-      softmax_fusion->user_count() == 1 &&
-      softmax_fusion->users().front()->opcode() == HloOpcode::kFusion &&
-      softmax_fusion->users().front()->shape().IsArray();
-
-  bool changed = false;
-  if (should_try_merging_producer) {
-    HloFusionInstruction* producer =
-        Cast<HloFusionInstruction>(softmax_fusion->mutable_operand(0));
-
-    VLOG(6) << "Fusing producer " << producer->ToShortString() << " into "
-            << softmax_fusion->ToShortString();
-
-    std::optional<HloFusionInstruction*> result =
-        TryMergeFusionProducerIntoTritonSoftmaxConsumer(producer);
-
-    if (!result.has_value()) {
-      VLOG(6) << "Did not fuse producer into "
-              << softmax_fusion->ToShortString();
-    } else {
-      softmax_fusion = result.value();
-      changed = true;
-    }
-  }
-
-  if (should_try_merging_consumer) {
-    HloFusionInstruction* consumer =
-        Cast<HloFusionInstruction>(softmax_fusion->users().front());
-
-    VLOG(6) << "Fusing consumer " << consumer->ToShortString() << " into "
-            << softmax_fusion->ToShortString();
-
-    std::optional<HloFusionInstruction*> result =
-        TryMergeFusionConsumerIntoTritonSoftmaxProducer(consumer,
-                                                        softmax_fusion);
-
-    if (!result.has_value()) {
-      VLOG(6) << "Did not fuse consumer into "
-              << softmax_fusion->ToShortString();
-    } else {
-      softmax_fusion = result.value();
-      changed = true;
-    }
-  }
-  return changed;
-}
-
-}  // anonymous namespace
-
-absl::StatusOr<bool> FusionMergerTriton::Run(
-    HloModule* module,
-    const absl::flat_hash_set<absl::string_view>& execution_threads) {
-  int fused_comps = 0;
-  for (HloComputation* comp :
-       module->MakeNonfusionComputations(execution_threads)) {
-    if (comp->IsCustomCallComputation()) {
-      continue;
-    }
-
-    for (HloInstruction* instr : comp->MakeInstructionPostOrder()) {
-      if (!IsTritonSoftmaxFusion(*instr)) continue;
-
-      VLOG(6) << "Matched triton_softmax fusion: " << instr->ToShortString();
-
-      HloFusionInstruction* softmax = Cast<HloFusionInstruction>(instr);
-
-      bool result =
-          TryMergeProducerAndConsumerFusionsIntoTritonSoftmax(softmax);
-
-      if (!result) {
-        VLOG(6) << "Did not fuse producer or consumer into "
-                << instr->ToShortString();
-      } else {
-        ++fused_comps;
-      }
-    }
-  }
-  return fused_comps > 0;
-}
-}  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/fusion_merger_triton_test.cc b/third_party/xla/xla/service/gpu/fusion_merger_triton_test.cc
deleted file mode 100644
index 20a21807e95997..00000000000000
--- a/third_party/xla/xla/service/gpu/fusion_merger_triton_test.cc
+++ /dev/null
@@ -1,548 +0,0 @@
-/* Copyright 2023 The OpenXLA Authors.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/service/gpu/fusion_merger_triton.h"
-
-#include <memory>
-#include <string>
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "absl/log/log.h"
-#include "xla/autotune_results.pb.h"
-#include "xla/service/pattern_matcher.h"
-#include "xla/service/pattern_matcher_gmock.h"
-#include "xla/tests/hlo_test_base.h"
-#include "tsl/platform/status_matchers.h"
-
-using ::tsl::testing::IsOk;
-using ::tsl::testing::IsOkAndHolds;
-
-namespace xla {
-namespace gpu {
-namespace {
-
-namespace m = ::xla::match;
-using FusionMergerTritonTest = HloTestBase;
-
-TEST_F(FusionMergerTritonTest,
-       CanMergeTritonFusionWithSingleParameterProducer) {
-  const std::string kHloText = R"(
-HloModule t
-add {
-  Arg_0 = f32[] parameter(0)
-  Arg_1 = f32[] parameter(1)
-  ROOT add = f32[] add(Arg_0, Arg_1)
-}
-
-auxiliary_computation {
-  parameter_0 = f32[125]{0} parameter(0)
-  ROOT broadcast = f32[125,127]{1,0} broadcast(parameter_0), dimensions={0}
-}
-
-triton_softmax_computation {
-  parameter_0 = f32[125,127]{1,0} parameter(0)
-  multiply_0 = f32[125,127]{1,0} multiply(parameter_0, parameter_0)
-  constant_0 = f32[] constant(0)
-  reduce_0 = f32[125]{0} reduce(multiply_0, constant_0), dimensions={1}, to_apply=add
-  broadcast_4 = f32[125,127]{1,0} broadcast(reduce_0), dimensions={0}
-  ROOT multiply = f32[125,127]{1,0} multiply(multiply_0, broadcast_4)
-}
-
-ENTRY main {
-  param_0 = f32[125]{0} parameter(0)
-  auxiliary_fusion = f32[125,127]{1,0} fusion(param_0), kind=kLoop, calls=auxiliary_computation
-  ROOT triton_softmax = f32[125,127]{1,0} fusion(auxiliary_fusion), kind=kCustom, calls=triton_softmax_computation, backend_config={"fusion_backend_config": {"kind":"__triton_softmax"}}
-})";
-  auto module = ParseAndReturnVerifiedModule(kHloText).value();
-  FusionMergerTriton fusion_merger;
-  EXPECT_THAT(fusion_merger.Run(module.get()), IsOkAndHolds(true));
-  EXPECT_THAT(verifier().Run(module.get()), IsOk());
-  VLOG(2) << module->ToString();
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              GmockMatch(m::Fusion(m::Parameter())));
-}
-
-TEST_F(FusionMergerTritonTest, CanMergeWithTwoParameterConsumer) {
-  const std::string kHloText = R"(
-HloModule t
-add {
-  Arg_0 = f32[] parameter(0)
-  Arg_1 = f32[] parameter(1)
-  ROOT add = f32[] add(Arg_0, Arg_1)
-}
-
-consumer_computation {
-  parameter_0 = f32[125]{0} parameter(0)
-  parameter_1 = f32[125,127]{1,0} parameter(1)
-  broadcast = f32[125,127]{1,0} broadcast(parameter_0), dimensions={0}
-  ROOT multiply = f32[125,127]{1,0} multiply(parameter_1, broadcast)
-}
-
-triton_softmax_computation {
-  parameter_0 = f32[125,127]{1,0} parameter(0)
-  multiply_0 = f32[125,127]{1,0} multiply(parameter_0, parameter_0)
-  constant_0 = f32[] constant(0)
-  reduce_0 = f32[125]{0} reduce(multiply_0, constant_0), dimensions={1}, to_apply=add
-  broadcast_4 = f32[125,127]{1,0} broadcast(reduce_0), dimensions={0}
-  ROOT multiply = f32[125,127]{1,0} multiply(multiply_0, broadcast_4)
-}
-
-ENTRY main {
-  param_0 = f32[125,127]{1,0} parameter(0)
-  param_1 = f32[125]{0} parameter(1)
-  triton_softmax = f32[125,127]{1,0} fusion(param_0), kind=kCustom, calls=triton_softmax_computation, backend_config={"fusion_backend_config": {"kind":"__triton_softmax"}}
-  ROOT consumer_fusion = f32[125,127]{1,0} fusion(param_1, triton_softmax), kind=kLoop, calls=consumer_computation
-})";
-  auto module = ParseAndReturnVerifiedModule(kHloText).value();
-  FusionMergerTriton fusion_merger{};
-  EXPECT_TRUE(fusion_merger.Run(module.get()).value());
-  EXPECT_TRUE(verifier().Run(module.get()).status().ok());
-  VLOG(2) << module->ToString();
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              GmockMatch(m::Fusion(m::Parameter(), m::Parameter())));
-}
-
-TEST_F(
-    FusionMergerTritonTest,
-    CanMergeProducerFusionIntoTritonSoftmaxConsumerWhenTheConsumerIsNotRoot) {
-  const std::string kHloText = R"(
-HloModule t
-add {
-  Arg_0 = f32[] parameter(0)
-  Arg_1 = f32[] parameter(1)
-  ROOT add = f32[] add(Arg_0, Arg_1)
-}
-
-auxiliary_computation {
-  parameter_0 = f32[125]{0} parameter(0)
-  ROOT broadcast = f32[125,127]{1,0} broadcast(parameter_0), dimensions={0}
-}
-
-triton_softmax_computation {
-  parameter_0 = f32[125,127]{1,0} parameter(0)
-  multiply_0 = f32[125,127]{1,0} multiply(parameter_0, parameter_0)
-  constant_0 = f32[] constant(0)
-  reduce_0 = f32[125]{0} reduce(multiply_0, constant_0), dimensions={1}, to_apply=add
-  broadcast_4 = f32[125,127]{1,0} broadcast(reduce_0), dimensions={0}
-  ROOT multiply = f32[125,127]{1,0} multiply(multiply_0, broadcast_4)
-}
-
-ENTRY main {
-  param_0 = f32[125]{0} parameter(0)
-  auxiliary_fusion = f32[125,127]{1,0} fusion(param_0), kind=kLoop, calls=auxiliary_computation
-  triton_softmax = f32[125,127]{1,0} fusion(auxiliary_fusion), kind=kCustom, calls=triton_softmax_computation, backend_config={"fusion_backend_config": {"kind":"__triton_softmax"}}
-  ROOT broadcast = f32[10,125,127]{2,1,0} broadcast(triton_softmax), dimensions={1,2}
-})";
-  auto module = ParseAndReturnVerifiedModule(kHloText).value();
-  FusionMergerTriton fusion_merger;
-  EXPECT_THAT(fusion_merger.Run(module.get()), IsOkAndHolds(true));
-  EXPECT_THAT(verifier().Run(module.get()), IsOk());
-  VLOG(2) << module->ToString();
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              GmockMatch(m::Broadcast(m::Fusion(m::Parameter()))));
-}
-
-TEST_F(FusionMergerTritonTest,
-       CanMergeTritonFusionWithMultipleParameterProducer) {
-  const std::string kHloText = R"(
-HloModule t
-add {
-  Arg_0 = f32[] parameter(0)
-  Arg_1 = f32[] parameter(1)
-  ROOT add = f32[] add(Arg_0, Arg_1)
-}
-
-auxiliary_computation {
-  parameter_0 = f32[125]{0} parameter(0)
-  parameter_1 = f32[125,127]{1,0} parameter(1)
-  broadcast = f32[125,127]{1,0} broadcast(parameter_0), dimensions={0}
-  ROOT multiply = f32[125,127]{1,0} multiply(parameter_1, broadcast)
-}
-
-triton_softmax_computation {
-  parameter_0 = f32[125,127]{1,0} parameter(0)
-  multiply_0 = f32[125,127]{1,0} multiply(parameter_0, parameter_0)
-  constant_0 = f32[] constant(0)
-  reduce_0 = f32[125]{0} reduce(multiply_0, constant_0), dimensions={1}, to_apply=add
-  broadcast_4 = f32[125,127]{1,0} broadcast(reduce_0), dimensions={0}
-  ROOT multiply = f32[125,127]{1,0} multiply(multiply_0, broadcast_4)
-}
-
-ENTRY main {
-  param_0 = f32[125]{0} parameter(0)
-  param_1 = f32[125,127]{1,0} parameter(1)
-  auxiliary_fusion = f32[125,127]{1,0} fusion(param_0, param_1), kind=kLoop, calls=auxiliary_computation
-  ROOT triton_softmax = f32[125,127]{1,0} fusion(auxiliary_fusion), kind=kCustom, calls=triton_softmax_computation, backend_config={"fusion_backend_config": {"kind":"__triton_softmax"}}
-})";
-  auto module = ParseAndReturnVerifiedModule(kHloText).value();
-  FusionMergerTriton fusion_merger;
-  EXPECT_TRUE(fusion_merger.Run(module.get()).value());
-  EXPECT_TRUE(verifier().Run(module.get()).status().ok());
-  VLOG(2) << module->ToString();
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              GmockMatch(m::Fusion(m::Parameter(), m::Parameter())));
-}
-
-TEST_F(FusionMergerTritonTest, CanMergeTritonFusionWithTransposeProducer) {
-  const std::string kHloText = R"(
-HloModule t
-add {
-  Arg_0 = f32[] parameter(0)
-  Arg_1 = f32[] parameter(1)
-  ROOT add = f32[] add(Arg_0, Arg_1)
-}
-
-auxiliary_computation {
-  parameter_0 = f32[125]{0} parameter(0)
-  parameter_1 = f32[127,125]{1,0} parameter(1)
-  transpose = f32[125,127]{1,0} transpose(parameter_1), dimensions={1,0}
-  broadcast = f32[125,127]{1,0} broadcast(parameter_0), dimensions={0}
-  ROOT multiply = f32[125,127]{1,0} multiply(transpose, broadcast)
-}
-
-triton_softmax_computation {
-  parameter_0 = f32[125,127]{1,0} parameter(0)
-  multiply_0 = f32[125,127]{1,0} multiply(parameter_0, parameter_0)
-  constant_0 = f32[] constant(0)
-  reduce_0 = f32[125]{0} reduce(multiply_0, constant_0), dimensions={1}, to_apply=add
-  broadcast_4 = f32[125,127]{1,0} broadcast(reduce_0), dimensions={0}
-  ROOT multiply = f32[125,127]{1,0} multiply(multiply_0, broadcast_4)
-}
-
-ENTRY main {
-  param_0 = f32[125]{0} parameter(0)
-  param_1 = f32[127,125]{1,0} parameter(1)
-  auxiliary_fusion = f32[125,127]{1,0} fusion(param_0, param_1), kind=kLoop, calls=auxiliary_computation
-  ROOT triton_softmax = f32[125,127]{1,0} fusion(auxiliary_fusion), kind=kCustom, calls=triton_softmax_computation, backend_config={"fusion_backend_config": {"kind":"__triton_softmax"}}
-})";
-  auto module = ParseAndReturnVerifiedModule(kHloText).value();
-  FusionMergerTriton fusion_merger;
-  EXPECT_TRUE(fusion_merger.Run(module.get()).value());
-  EXPECT_TRUE(verifier().Run(module.get()).status().ok());
-  VLOG(2) << module->ToString();
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              GmockMatch(m::Fusion(m::Parameter(), m::Parameter())));
-}
-
-TEST_F(FusionMergerTritonTest,
-       DoesNotMergeTritonFusionWithProducerContainingUntileableOp) {
-  // Right now, concatenate is not tileable.
-  const std::string kHloText = R"(
-HloModule t
-add {
-  Arg_0 = f32[] parameter(0)
-  Arg_1 = f32[] parameter(1)
-  ROOT add = f32[] add(Arg_0, Arg_1)
-}
-
-auxiliary_computation {
-  parameter_0 = f32[125,63]{1,0} parameter(0)
-  parameter_1 = f32[125,64]{1,0} parameter(1)
-  ROOT concatenate = f32[125,127]{1,0} concatenate(parameter_0, parameter_1), dimensions={1}
-}
-
-triton_softmax_computation {
-  parameter_0 = f32[125,127]{1,0} parameter(0)
-  multiply_0 = f32[125,127]{1,0} multiply(parameter_0, parameter_0)
-  constant_0 = f32[] constant(0)
-  reduce_0 = f32[125]{0} reduce(multiply_0, constant_0), dimensions={1}, to_apply=add
-  broadcast_4 = f32[125,127]{1,0} broadcast(reduce_0), dimensions={0}
-  ROOT multiply = f32[125,127]{1,0} multiply(multiply_0, broadcast_4)
-}
-
-ENTRY main {
-  param_0 = f32[125,63]{1,0} parameter(0)
-  param_1 = f32[125,64]{1,0} parameter(1)
-  auxiliary_fusion = f32[125,127]{1,0} fusion(param_0, param_1), kind=kLoop, calls=auxiliary_computation
-  ROOT triton_softmax = f32[125,127]{1,0} fusion(auxiliary_fusion), kind=kCustom, calls=triton_softmax_computation, backend_config={"fusion_backend_config": {"kind":"__triton_softmax"}}
-})";
-  auto module = ParseAndReturnVerifiedModule(kHloText).value();
-  FusionMergerTriton fusion_merger;
-  EXPECT_FALSE(fusion_merger.Run(module.get()).value());
-  EXPECT_TRUE(verifier().Run(module.get()).status().ok());
-  VLOG(2) << module->ToString();
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              GmockMatch(m::Fusion(m::Fusion(m::Parameter(), m::Parameter()))));
-}
-
-TEST_F(FusionMergerTritonTest, CanMergeTritonFusionWithElementwiseProducer) {
-  const std::string kHloText = R"(
-HloModule layernorm
-
-add_f32 {
-  Arg_0 = f32[] parameter(0)
-  Arg_1 = f32[] parameter(1)
-  ROOT add_6 = f32[] add(Arg_0, Arg_1)
-}
-
-auxiliary_fusion {
-  parameter_0 = f32[125,127]{1,0} parameter(0)
-  parameter_1 = f32[125,127]{1,0} parameter(1)
-  ROOT multiply_1 = f32[125,127]{1,0} multiply(parameter_0, parameter_1)
-}
-
-triton_softmax_computation {
-  parameter_0 = f32[125,127]{1,0} parameter(0)
-  constant_0 = f32[] constant(0)
-  reduce = f32[125]{0} reduce(parameter_0, constant_0), dimensions={1}, to_apply=add_f32
-  broadcast = f32[125,127]{1,0} broadcast(reduce), dimensions={0}
-  ROOT multiply_result = f32[125,127]{1,0} multiply(parameter_0, broadcast)
-}
-
-ENTRY main {
-  param_0 = f32[125,127]{1,0} parameter(0)
-  param_1 = f32[125,127]{1,0} parameter(1)
-  auxiliary_fusion = f32[125,127]{1,0} fusion(param_0, param_1), kind=kCustom, calls=auxiliary_fusion
-  ROOT triton_softmax = f32[125,127]{1,0} fusion(auxiliary_fusion), kind=kCustom, calls=triton_softmax_computation, backend_config={"fusion_backend_config": {"kind":"__triton_softmax"}}
-}
-
-)";
-  auto module = ParseAndReturnVerifiedModule(kHloText).value();
-  FusionMergerTriton fusion_merger;
-  EXPECT_TRUE(fusion_merger.Run(module.get()).value());
-  EXPECT_TRUE(verifier().Run(module.get()).status().ok());
-  VLOG(2) << module->ToString();
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              GmockMatch(m::Fusion(m::Parameter(), m::Parameter())));
-}
-
-TEST_F(FusionMergerTritonTest,
-       DoesNotMergeSoftmaxWithParamBroadcastedAlongBatchAndReduceDimensions) {
-  const std::string kHloText = R"(
-HloModule t
-
-add {
-  Arg_0 = f32[] parameter(0)
-  Arg_1 = f32[] parameter(1)
-  ROOT add = f32[] add(Arg_0, Arg_1)
-}
-
-auxiliary_computation {
-  param_0 = f32[10,125,127]{2,1,0} parameter(0)
-  param_1 = f32[10]{0} parameter(1)
-  broadcast_0 = f32[10,125,127]{2,1,0} broadcast(param_1), dimensions={0}
-  ROOT multiply_0 = f32[10,125,127]{2,1,0} multiply(param_0, broadcast_0)
-}
-
-triton_softmax_computation {
-  param_0 = f32[10,125,127]{2,1,0} parameter(0)
-  multiply = f32[10,125,127]{2,1,0} multiply(param_0, param_0)
-  constant = f32[] constant(0)
-  reduce = f32[10,125]{1,0} reduce(multiply, constant), dimensions={2}, to_apply=add
-  broadcast = f32[10,125,127]{2,1,0} broadcast(reduce), dimensions={0,1}
-  ROOT multiply_out = f32[10,125,127]{2,1,0} multiply(param_0, broadcast)
-}
-
-ENTRY main {
-  param_0 = f32[10,125,127]{2,1,0} parameter(0)
-  param_1 = f32[10]{0} parameter(1)
-  auxiliary_fusion = f32[10,125,127]{2,1,0} fusion(param_0, param_1), kind=kCustom, calls=auxiliary_computation
-  ROOT triton_softmax = f32[10,125,127]{2,1,0} fusion(auxiliary_fusion), kind=kCustom, calls=triton_softmax_computation, backend_config={"fusion_backend_config": {"kind":"__triton_softmax"}}
-}
-)";
-  auto module = ParseAndReturnVerifiedModule(kHloText).value();
-  FusionMergerTriton fusion_merger;
-  EXPECT_FALSE(fusion_merger.Run(module.get()).value());
-  VLOG(2) << module->ToString();
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              GmockMatch(m::Fusion(m::Fusion())));
-}
-
-TEST_F(FusionMergerTritonTest, CanMergeWithBothProducerAndConsumerFusions) {
-  const std::string kHloText = R"(
-HloModule t
-add {
-  Arg_0 = f32[] parameter(0)
-  Arg_1 = f32[] parameter(1)
-  ROOT add = f32[] add(Arg_0, Arg_1)
-}
-
-producer_computation {
-  parameter_0 = f32[125]{0} parameter(0)
-  ROOT broadcast = f32[125,127]{1,0} broadcast(parameter_0), dimensions={0}
-}
-
-consumer_computation {
-  parameter_0 = f32[125,127]{1,0} parameter(0)
-  parameter_1 = f32[125,127]{1,0} parameter(1)
-  ROOT multiply = f32[125,127]{1,0} multiply(parameter_1, parameter_0)
-}
-
-triton_softmax_computation {
-  parameter_0 = f32[125,127]{1,0} parameter(0)
-  multiply_0 = f32[125,127]{1,0} multiply(parameter_0, parameter_0)
-  constant_0 = f32[] constant(0)
-  reduce_0 = f32[125]{0} reduce(multiply_0, constant_0), dimensions={1}, to_apply=add
-  broadcast_4 = f32[125,127]{1,0} broadcast(reduce_0), dimensions={0}
-  ROOT multiply = f32[125,127]{1,0} multiply(multiply_0, broadcast_4)
-}
-
-ENTRY main {
-  param_0 = f32[125]{0} parameter(0)
-  param_1 = f32[125,127]{1,0} parameter(1)
-  producer_fusion = f32[125,127]{1,0} fusion(param_0), kind=kLoop, calls=producer_computation
-  triton_softmax = f32[125,127]{1,0} fusion(producer_fusion), kind=kCustom, calls=triton_softmax_computation, backend_config={"fusion_backend_config": {"kind":"__triton_softmax"}}
-  ROOT consumer_fusion = f32[125,127]{1,0} fusion(param_1, triton_softmax), kind=kLoop, calls=consumer_computation
-})";
-  auto module = ParseAndReturnVerifiedModule(kHloText).value();
-  FusionMergerTriton fusion_merger{};
-  EXPECT_TRUE(fusion_merger.Run(module.get()).value());
-  EXPECT_TRUE(verifier().Run(module.get()).status().ok());
-  VLOG(2) << module->ToString();
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              GmockMatch(m::Fusion(m::Parameter(), m::Parameter())));
-}
-
-TEST_F(FusionMergerTritonTest,
-       CanMergeWithMultiInputProducerAndConsumerFusions) {
-  const std::string kHloText = R"(
-HloModule t
-add {
-  Arg_0 = f32[] parameter(0)
-  Arg_1 = f32[] parameter(1)
-  ROOT add = f32[] add(Arg_0, Arg_1)
-}
-
-producer_computation {
-  parameter_0 = f32[125]{0} parameter(0)
-  parameter_1 = f32[125,127]{1,0} parameter(1)
-  broadcast = f32[125,127]{1,0} broadcast(parameter_0), dimensions={0}
-  ROOT add = f32[125,127]{1,0} add(parameter_1, broadcast)
-}
-
-consumer_computation {
-  parameter_0 = f32[125,127]{1,0} parameter(0)
-  parameter_1 = f32[125,127]{1,0} parameter(1)
-  ROOT multiply = f32[125,127]{1,0} multiply(parameter_1, parameter_0)
-}
-
-triton_softmax_computation {
-  parameter_0 = f32[125,127]{1,0} parameter(0)
-  multiply_0 = f32[125,127]{1,0} multiply(parameter_0, parameter_0)
-  constant_0 = f32[] constant(0)
-  reduce_0 = f32[125]{0} reduce(multiply_0, constant_0), dimensions={1}, to_apply=add
-  broadcast_4 = f32[125,127]{1,0} broadcast(reduce_0), dimensions={0}
-  ROOT multiply = f32[125,127]{1,0} multiply(multiply_0, broadcast_4)
-}
-
-ENTRY main {
-  param_0 = f32[125]{0} parameter(0)
-  param_1 = f32[125,127]{1,0} parameter(1)
-  param_2 = f32[125,127]{1,0} parameter(2)
-  producer_fusion = f32[125,127]{1,0} fusion(param_0, param_1), kind=kLoop, calls=producer_computation
-  triton_softmax = f32[125,127]{1,0} fusion(producer_fusion), kind=kCustom, calls=triton_softmax_computation, backend_config={"fusion_backend_config": {"kind":"__triton_softmax"}}
-  ROOT consumer_fusion = f32[125,127]{1,0} fusion(param_2, triton_softmax), kind=kLoop, calls=consumer_computation
-})";
-  auto module = ParseAndReturnVerifiedModule(kHloText).value();
-  FusionMergerTriton fusion_merger{};
-  EXPECT_TRUE(fusion_merger.Run(module.get()).value());
-  EXPECT_TRUE(verifier().Run(module.get()).status().ok());
-  VLOG(2) << module->ToString();
-  EXPECT_THAT(
-      module->entry_computation()->root_instruction(),
-      GmockMatch(m::Fusion(m::Parameter(), m::Parameter(), m::Parameter())));
-}
-
-TEST_F(FusionMergerTritonTest,
-       CanMergeWithBothProducerAndConsumerFusionsSharingParameter) {
-  const std::string kHloText = R"(
-HloModule t
-add {
-  Arg_0 = f32[] parameter(0)
-  Arg_1 = f32[] parameter(1)
-  ROOT add = f32[] add(Arg_0, Arg_1)
-}
-
-producer_computation {
-  parameter_0 = f32[125]{0} parameter(0)
-  ROOT broadcast = f32[125,127]{1,0} broadcast(parameter_0), dimensions={0}
-}
-
-consumer_computation {
-  parameter_0 = f32[125]{0} parameter(0)
-  parameter_1 = f32[125,127]{1,0} parameter(1)
-  broadcast = f32[125,127]{1,0} broadcast(parameter_0), dimensions={0}
-  ROOT multiply = f32[125,127]{1,0} multiply(parameter_1, broadcast)
-}
-
-triton_softmax_computation {
-  parameter_0 = f32[125,127]{1,0} parameter(0)
-  multiply_0 = f32[125,127]{1,0} multiply(parameter_0, parameter_0)
-  constant_0 = f32[] constant(0)
-  reduce_0 = f32[125]{0} reduce(multiply_0, constant_0), dimensions={1}, to_apply=add
-  broadcast_4 = f32[125,127]{1,0} broadcast(reduce_0), dimensions={0}
-  ROOT multiply = f32[125,127]{1,0} multiply(multiply_0, broadcast_4)
-}
-
-ENTRY main {
-  param_0 = f32[125]{0} parameter(0)
-  producer_fusion = f32[125,127]{1,0} fusion(param_0), kind=kLoop, calls=producer_computation
-  triton_softmax = f32[125,127]{1,0} fusion(producer_fusion), kind=kCustom, calls=triton_softmax_computation, backend_config={"fusion_backend_config": {"kind":"__triton_softmax"}}
-  ROOT consumer_fusion = f32[125,127]{1,0} fusion(param_0, triton_softmax), kind=kLoop, calls=consumer_computation
-})";
-  auto module = ParseAndReturnVerifiedModule(kHloText).value();
-  FusionMergerTriton fusion_merger{};
-  EXPECT_TRUE(fusion_merger.Run(module.get()).value());
-  EXPECT_TRUE(verifier().Run(module.get()).status().ok());
-  VLOG(2) << module->ToString();
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              GmockMatch(m::Fusion(m::Parameter())));
-}
-
-TEST_F(FusionMergerTritonTest, DoesNotMergeSoftmaxWithMultiOutputConsumer) {
-  const std::string kHloText = R"(
-HloModule t
-add {
-  Arg_0 = f32[] parameter(0)
-  Arg_1 = f32[] parameter(1)
-  ROOT add = f32[] add(Arg_0, Arg_1)
-}
-
-producer_computation {
-  parameter_0 = f32[125]{0} parameter(0)
-  ROOT broadcast = f32[125,127]{1,0} broadcast(parameter_0), dimensions={0}
-}
-
-consumer_computation {
-  parameter_0 = f32[125,127]{1,0} parameter(0)
-  parameter_1 = f32[125,127]{1,0} parameter(1)
-  add = f32[125,127]{1,0} add(parameter_1, parameter_0)
-  multiply = f32[125,127]{1,0} multiply(parameter_1, parameter_0)
-  ROOT tuple = (f32[125,127]{1,0}, f32[125,127]{1,0}) tuple(add, multiply)
-}
-
-triton_softmax_computation {
-  parameter_0 = f32[125,127]{1,0} parameter(0)
-  multiply_0 = f32[125,127]{1,0} multiply(parameter_0, parameter_0)
-  constant_0 = f32[] constant(0)
-  reduce_0 = f32[125]{0} reduce(multiply_0, constant_0), dimensions={1}, to_apply=add
-  broadcast_4 = f32[125,127]{1,0} broadcast(reduce_0), dimensions={0}
-  ROOT multiply = f32[125,127]{1,0} multiply(multiply_0, broadcast_4)
-}
-
-ENTRY main {
-  param_0 = f32[125,127]{1,0} parameter(0)
-  triton_softmax = f32[125,127]{1,0} fusion(param_0), kind=kCustom, calls=triton_softmax_computation, backend_config={"fusion_backend_config": {"kind":"__triton_softmax"}}
-  ROOT consumer_fusion = (f32[125,127]{1,0}, f32[125,127]{1,0}) fusion(param_0, triton_softmax), kind=kLoop, calls=consumer_computation
-})";
-  auto module = ParseAndReturnVerifiedModule(kHloText).value();
-  FusionMergerTriton fusion_merger;
-  EXPECT_FALSE(fusion_merger.Run(module.get()).value());
-  VLOG(2) << module->ToString();
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              GmockMatch(m::Fusion(m::Parameter(), m::Fusion())));
-}
-
-}  // namespace
-}  // namespace gpu
-}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/fusions/BUILD b/third_party/xla/xla/service/gpu/fusions/BUILD
index 94e10f18bc4d55..2fbbcd17d47da1 100644
--- a/third_party/xla/xla/service/gpu/fusions/BUILD
+++ b/third_party/xla/xla/service/gpu/fusions/BUILD
@@ -1,8 +1,8 @@
-load("//xla/tests:build_defs.bzl", "xla_test")
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
 load("@local_tsl//tsl/platform:build_config_root.bzl", "tf_cuda_tests_tags")
 load("@local_tsl//tsl/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured")
+load("//xla:xla.bzl", "xla_cc_test")
+load("//xla/tests:build_defs.bzl", "xla_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -32,6 +32,69 @@ cc_library(
     ],
 )
 
+xla_cc_test(
+    name = "in_place_dynamic_update_slice_test",
+    srcs = ["in_place_dynamic_update_slice_test.cc"],
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":fusions",
+        ":in_place_dynamic_update_slice",
+        "//xla/service/gpu:gpu_device_info_for_tests",
+        "//xla/service/gpu:hlo_fusion_analysis",
+        "//xla/service/gpu/model:affine_map_printer",
+        "//xla/service/gpu/model:indexing_test_utils",
+        "//xla/stream_executor:device_description",
+        "//xla/tests:hlo_test_base",
+        "//xla/tests:xla_internal_test_main",
+        "@com_google_googletest//:gtest",
+        "@llvm-project//mlir:IR",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
+cc_library(
+    name = "in_place_dynamic_update_slice_mlir",
+    srcs = ["in_place_dynamic_update_slice_mlir.cc"],
+    hdrs = ["in_place_dynamic_update_slice_mlir.h"],
+    deps = [
+        "//xla:shape_util",
+        "//xla:status",
+        "//xla:xla_data_proto_cc",
+        "//xla/hlo/ir:hlo",
+        "//xla/service/gpu:hlo_fusion_analysis",
+        "//xla/service/gpu:launch_dimensions",
+        "//xla/service/gpu/fusions/mlir:computation_partitioner",
+        "//xla/service/gpu/fusions/mlir:elemental_hlo_to_mlir",
+        "//xla/service/gpu/fusions/mlir:mlir_fusion_emitter",
+        "//xla/service/gpu/model:indexing_analysis",
+        "//xla/service/gpu/model:indexing_map",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/status",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:ArithDialect",
+        "@llvm-project//mlir:DataLayoutInterfaces",
+        "@llvm-project//mlir:FuncDialect",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:TensorDialect",
+    ],
+)
+
+xla_cc_test(
+    name = "in_place_dynamic_update_slice_mlir_test",
+    srcs = ["in_place_dynamic_update_slice_mlir_test.cc"],
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":in_place_dynamic_update_slice_mlir",
+        ":mlir_emitter_test_base",
+        "//xla:error_spec",
+        "//xla/service:gpu_plugin",
+        "//xla/service/gpu/model:indexing_test_utils",
+        "//xla/tests:xla_internal_test_main",
+        "@com_google_googletest//:gtest",
+        "@local_tsl//tsl/lib/core:status_test_util",
+    ],
+)
+
 cc_library(
     name = "copy",
     srcs = ["copy.cc"],
@@ -42,8 +105,8 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service:buffer_assignment",
         "//xla/service/gpu:ir_emitter_context",
-        "//xla/service/gpu:thunk",
         "//xla/service/gpu/runtime:copy_thunk",
+        "//xla/service/gpu/runtime:thunk",
         "@llvm-project//mlir:IR",
     ],
 )
@@ -73,15 +136,18 @@ cc_library(
         "//xla/service/gpu:ir_emitter_context",
         "//xla/service/gpu:kernel_arguments",
         "//xla/service/gpu:matmul_utils",
-        "//xla/service/gpu:thunk",
         "//xla/service/gpu/kernels:custom_kernel",
         "//xla/service/gpu/kernels:custom_kernel_fusion",
+        "//xla/service/gpu/runtime:address_computation_thunk",
         "//xla/service/gpu/runtime:custom_call_thunk",
         "//xla/service/gpu/runtime:gemm_thunk",
         "//xla/service/gpu/runtime:kernel_thunk",
+        "//xla/service/gpu/runtime:thunk",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:AsmParser",
         "@llvm-project//mlir:IR",
@@ -107,6 +173,7 @@ xla_test(
         "//xla/service:custom_call_target_registry",
         "//xla/service:executable",
         "//xla/service:hlo_module_config",
+        "//xla/service/gpu:address_computation_fusion_rewriter",
         "//xla/stream_executor",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor/gpu:gpu_types_header",
@@ -140,10 +207,10 @@ cc_library(
         "//xla/service/gpu:kernel_reuse_cache",
         "//xla/service/gpu:launch_dimensions",
         "//xla/service/gpu:target_util",
-        "//xla/service/gpu:thunk",
         "//xla/service/gpu/model:indexing_analysis",
         "//xla/service/gpu/model:indexing_map",
         "//xla/service/gpu/runtime:kernel_thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/service/llvm_ir:ir_array",
         "//xla/service/llvm_ir:llvm_util",
         "//xla/stream_executor:device_description",
@@ -173,6 +240,7 @@ cc_library(
         ":custom",
         ":fusion_emitter",
         ":in_place_dynamic_update_slice",
+        ":in_place_dynamic_update_slice_mlir",
         ":input_slices",
         ":input_slices_mlir",
         ":loop",
@@ -283,6 +351,7 @@ cc_library(
         "//xla/service/gpu/fusions/mlir:elemental_hlo_to_mlir",
         "//xla/service/gpu/fusions/mlir:mlir_fusion_emitter",
         "//xla/service/gpu/fusions/mlir/ir:xla_gpu",
+        "//xla/service/gpu/model:indexing_analysis",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
@@ -328,6 +397,7 @@ cc_library(
         "//xla/service/gpu/fusions/mlir:elemental_hlo_to_mlir",
         "//xla/service/gpu/fusions/mlir:mlir_fusion_emitter",
         "//xla/service/gpu/fusions/mlir/ir:xla_gpu",
+        "//xla/service/gpu/model:indexing_analysis",
         "//xla/service/gpu/model:indexing_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/log",
@@ -335,6 +405,7 @@ cc_library(
         "@com_google_absl//absl/status",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:ArithDialect",
+        "@llvm-project//mlir:DataLayoutInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:SCFDialect",
@@ -442,9 +513,9 @@ cc_library(
     hdrs = ["scatter.h"],
     deps = [
         ":fusion_emitter",
+        ":loop",
         "//xla:shape_util",
         "//xla:status",
-        "//xla:statusor",
         "//xla/hlo/ir:hlo",
         "//xla/service/gpu:hlo_fusion_analysis",
         "//xla/service/gpu:ir_emission_utils",
@@ -452,6 +523,7 @@ cc_library(
         "//xla/service/gpu:ir_emitter_context",
         "//xla/service/gpu:launch_dimensions",
         "//xla/service/gpu:parallel_loop_emitter",
+        "//xla/service/gpu/model:indexing_analysis",
         "//xla/service/llvm_ir:fused_ir_emitter",
         "//xla/service/llvm_ir:ir_array",
         "//xla/service/llvm_ir:llvm_util",
@@ -460,7 +532,6 @@ cc_library(
         "@com_google_absl//absl/types:span",
         "@llvm-project//llvm:Support",
         "@llvm-project//llvm:ir_headers",
-        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
@@ -472,6 +543,8 @@ xla_cc_test(
         ":scatter",
         "//xla/service/gpu:gpu_device_info_for_tests",
         "//xla/service/gpu:hlo_fusion_analysis",
+        "//xla/service/gpu/model:affine_map_printer",
+        "//xla/service/gpu/model:indexing_test_utils",
         "//xla/stream_executor:device_description",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",
@@ -560,7 +633,6 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service/gpu:hlo_fusion_analysis",
         "//xla/service/gpu/runtime:cudnn_thunk",
-        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -574,10 +646,16 @@ xla_test(
         "gpu",
     ],
     deps = [
+        "//xla/hlo/ir:hlo",
+        "//xla/service:executable",
         "//xla/service/gpu:stream_executor_util",
         "//xla/service/gpu/tests:gpu_codegen_test",
+        "//xla/tests:filecheck",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest_main",
-        "@local_tsl//tsl/platform:test",
+        "@local_tsl//tsl/lib/core:status_test_util",
+        "@local_tsl//tsl/platform:statusor",
         "@local_tsl//tsl/platform:test_main",
     ],
 )
@@ -593,8 +671,8 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service:buffer_assignment",
         "//xla/service/gpu:ir_emitter_context",
-        "//xla/service/gpu:thunk",
         "//xla/service/gpu/runtime:memset_thunk",
+        "//xla/service/gpu/runtime:thunk",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/types:span",
@@ -630,10 +708,10 @@ cc_library(
         "//xla/service/gpu:parallel_loop_emitter",
         "//xla/service/gpu:reduction_utils",
         "//xla/service/gpu:target_util",
-        "//xla/service/gpu:thunk",
         "//xla/service/gpu/model:indexing_analysis",
         "//xla/service/gpu/model:indexing_map",
         "//xla/service/gpu/runtime:kernel_thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/service/llvm_ir:fused_ir_emitter",
         "//xla/service/llvm_ir:ir_array",
         "//xla/service/llvm_ir:kernel_support_library",
@@ -745,6 +823,8 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:BufferizationDialect",
+        "@llvm-project//mlir:BufferizationInterfaces",
+        "@llvm-project//mlir:DataLayoutInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:GPUDialect",
         "@llvm-project//mlir:IR",
@@ -800,6 +880,25 @@ cc_library(
     ],
 )
 
+xla_cc_test(
+    name = "concatenate_test",
+    srcs = ["concatenate_test.cc"],
+    deps = [
+        ":concatenate",
+        ":fusions",
+        "//xla/service/gpu:gpu_device_info_for_tests",
+        "//xla/service/gpu:hlo_fusion_analysis",
+        "//xla/service/gpu/model:affine_map_printer",
+        "//xla/service/gpu/model:indexing_test_utils",
+        "//xla/stream_executor:device_description",
+        "//xla/tests:hlo_test_base",
+        "//xla/tests:xla_internal_test_main",
+        "@com_google_googletest//:gtest",
+        "@llvm-project//mlir:IR",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
 cc_library(
     name = "concatenate_mlir",
     srcs = ["concatenate_mlir.cc"],
@@ -820,6 +919,7 @@ cc_library(
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:DataLayoutInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:TensorDialect",
@@ -832,14 +932,14 @@ xla_cc_test(
     srcs = ["concatenate_mlir_test.cc"],
     tags = tf_cuda_tests_tags(),
     deps = [
+        ":concatenate",
         ":concatenate_mlir",
         ":mlir_emitter_test_base",
         "//xla:error_spec",
-        "//xla/service/gpu:hlo_fusion_analysis",
+        "//xla/service/gpu/model:indexing_test_utils",
         "//xla/tests:xla_internal_test_main",
         "@com_google_googletest//:gtest",
         "@local_tsl//tsl/lib/core:status_test_util",
-        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
diff --git a/third_party/xla/xla/service/gpu/fusions/address_computation_fusion_test.cc b/third_party/xla/xla/service/gpu/fusions/address_computation_fusion_test.cc
index 6fb3b14f9f77b0..2374f5abe6e2fd 100644
--- a/third_party/xla/xla/service/gpu/fusions/address_computation_fusion_test.cc
+++ b/third_party/xla/xla/service/gpu/fusions/address_computation_fusion_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "xla/ffi/ffi_api.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/custom_call_target_registry.h"
+#include "xla/service/gpu/address_computation_fusion_rewriter.h"
 #include "xla/service/hlo_module_config.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/shape.h"
@@ -69,17 +70,9 @@ namespace {
 
 class AddressComputationFusionTest : public HloTestBase {
  public:
-  HloModuleConfig GetRefModuleConfig() {
+  HloModuleConfig GetModuleConfigWithoutCommandBuffer() {
     DebugOptions debug_options = GetDebugOptionsForTest();
-    debug_options.set_xla_gpu_enable_address_computation_fusion(false);
-    HloModuleConfig config;
-    config.set_debug_options(debug_options);
-    return config;
-  }
-
-  HloModuleConfig GetOptModuleConfig() {
-    DebugOptions debug_options = GetDebugOptionsForTest();
-    debug_options.set_xla_gpu_enable_address_computation_fusion(true);
+    debug_options.clear_xla_gpu_enable_command_buffer();
     HloModuleConfig config;
     config.set_debug_options(debug_options);
     return config;
@@ -157,11 +150,10 @@ TEST_F(AddressComputationFusionTest, CublasGemmSimple) {
     %p0 = bf16[2,8,8]{2,1,0} parameter(0), sharding={replicated}
     %p1 = bf16[2,8,8]{2,1,0} parameter(1), sharding={replicated}
     ROOT %fusion.2 = bf16[8,8]{1,0} fusion(%p0, %p1), kind=kCustom, calls=%fused_computation,
-        backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"address_computation"}}}
+        backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}}}
   })";
 
-  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, GetRefModuleConfig(),
-                                      GetOptModuleConfig(), error_spec,
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
                                       /*run_hlo_passes=*/false));
 }
 
@@ -239,11 +231,10 @@ TEST_F(AddressComputationFusionTest, CublasGemmWithWorkspace) {
     %p0 = f16[2,8,8]{2,1,0} parameter(0), sharding={replicated}
     %p1 = f16[2,8,8]{2,1,0} parameter(1), sharding={replicated}
     ROOT %fusion.2 = (f16[8,8]{1,0}, s8[256]{0}) fusion(%p0, %p1), kind=kCustom, calls=%fused_computation,
-        backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"address_computation"}}}
+        backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}}}
   })";
 
-  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, GetRefModuleConfig(),
-                                      GetOptModuleConfig(), error_spec,
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
                                       /*run_hlo_passes=*/false));
 }
 
@@ -318,11 +309,10 @@ TEST_F(AddressComputationFusionTest, ContiguousSlice) {
     %p0 = bf16[2,8,8]{2,1,0} parameter(0), sharding={replicated}
     %p1 = bf16[8,8,10,8]{3,2,1,0} parameter(1), sharding={replicated}
     ROOT %fusion.2 = bf16[4,8]{1,0} fusion(%p0, %p1), kind=kCustom, calls=%fused_computation,
-        backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"address_computation"}}}
+        backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}}}
   })";
 
-  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, GetRefModuleConfig(),
-                                      GetOptModuleConfig(), error_spec,
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
                                       /*run_hlo_passes=*/false));
 }
 
@@ -397,11 +387,10 @@ TEST_F(AddressComputationFusionTest, ContiguousSliceNonDefaultLayout) {
     %p0 = bf16[2,8,8]{1,2,0} parameter(0), sharding={replicated}
     %p1 = bf16[8,8,10,8]{1,2,3,0} parameter(1), sharding={replicated}
     ROOT %fusion.2 = bf16[4,8]{1,0} fusion(%p0, %p1), kind=kCustom, calls=%fused_computation,
-        backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"address_computation"}}}
+        backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}}}
   })";
 
-  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, GetRefModuleConfig(),
-                                      GetOptModuleConfig(), error_spec,
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
                                       /*run_hlo_passes=*/false));
 }
 
@@ -527,13 +516,12 @@ TEST_F(AddressComputationFusionTest, OperandIsSlicedGetTupleElement) {
       calls=%address-computation,
       backend_config={
         "fusion_backend_config":{
-          "kind":"__custom_fusion","custom_fusion_config":{"name":"address_computation"}
+          "kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}
         }
       }
   })";
 
-  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, GetRefModuleConfig(),
-                                      GetOptModuleConfig(), error_spec,
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
                                       /*run_hlo_passes=*/false));
 }
 
@@ -613,13 +601,12 @@ TEST_F(AddressComputationFusionTest, ReversedOperandOrder) {
       calls=%address-computation,
       backend_config={
         "fusion_backend_config":{
-          "kind":"__custom_fusion","custom_fusion_config":{"name":"address_computation"}
+          "kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}
         }
       }
   })";
 
-  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, GetRefModuleConfig(),
-                                      GetOptModuleConfig(), error_spec,
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
                                       /*run_hlo_passes=*/false));
 }
 
@@ -744,13 +731,12 @@ TEST_F(AddressComputationFusionTest, SingleOperandComputation) {
       calls=%address-computation,
       backend_config={
         "fusion_backend_config":{
-          "kind":"__custom_fusion","custom_fusion_config":{"name":"address_computation"}
+          "kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}
         }
       }
   })";
 
-  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, GetRefModuleConfig(),
-                                      GetOptModuleConfig(), error_spec,
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
                                       /*run_hlo_passes=*/false));
 }
 
@@ -835,13 +821,12 @@ TEST_F(AddressComputationFusionTest, SlicedOperandAliasingOutput) {
       output_to_operand_aliasing={{0}: (1, {})},
       backend_config={
         "fusion_backend_config":{
-          "kind":"__custom_fusion","custom_fusion_config":{"name":"address_computation"}
+          "kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}
         }
       }
   })";
 
-  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, GetRefModuleConfig(),
-                                      GetOptModuleConfig(), error_spec,
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
                                       /*run_hlo_passes=*/false));
 }
 
@@ -885,33 +870,40 @@ TEST_F(AddressComputationFusionTest, CustomCallSimple) {
   TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto(
                                             computation.proto(), hlo_config));
 
-  debug_options.set_xla_gpu_enable_address_computation_fusion(true);
-  hlo_config.set_debug_options(debug_options);
   TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto(
                                             computation.proto(), hlo_config));
+  AddressComputationFusionRewriter pass(PLATFORM);
+  TF_ASSERT_OK_AND_ASSIGN(auto changed, this->RunHloPass(&pass, hlo_opt.get()));
+  EXPECT_TRUE(changed);
 
   EXPECT_TRUE(RunAndCompareTwoModules(std::move(hlo_ref), std::move(hlo_opt),
-                                      error_spec,
-                                      /*run_hlo_passes=*/false));
+                                      error_spec, /*run_hlo_passes=*/false));
 }
 
 static absl::Status SubBuffers(se::Stream* stream, ffi::BufferBase src0,
                                ffi::BufferBase src1, ffi::BufferBase src2,
                                ffi::BufferBase src3, ffi::BufferBase src4,
-                               ffi::BufferBase dst0, ffi::BufferBase dst1,
-                               ffi::BufferBase dst2, ffi::BufferBase dst3,
-                               ffi::BufferBase dst4) {
+                               ffi::BufferBase src5, ffi::BufferBase src6,
+                               ffi::BufferBase src7, ffi::BufferBase dst0,
+                               ffi::BufferBase dst1, ffi::BufferBase dst2,
+                               ffi::BufferBase dst3, ffi::BufferBase dst4,
+                               ffi::BufferBase dst5, ffi::BufferBase dst6) {
   //  src0:  param 0 at tuple index {0}, shape f32[128]
   //  src1:  param 0 at tuple index {1}, shape f32[256]
   //  src2:  param 1 at tuple index {0}, shape f32[1024]
   //  src3:  param 1 at tuple index {1}, shape f32[8]
   //  src4:  param 2, shape f32[4,8]
+  //  src5:  param 3 at tuple index {0, 0}, shape f32[32]
+  //  src6:  param 3 at tuple index {0, 1}, shape f32[64]
+  //  src7:  param 3 at tuple index {1}, shape f32[3,128]
   //
   //  dst0:  result at tuple index {0}, shape f32[8]
   //  dst1:  result at tuple index {1, 0}, shape f32[128]
   //  dst2:  result at tuple index {1, 1}, shape f32[256]
   //  dst3:  result at tuple index {2}, shape f32[1024]
   //  dst4:  result at tuple index {3}, shape f32[4,8]
+  //  dst5:  result at tuple index {4}, shape f32[3,128]
+  //  dst6:  result at tuple index {5}, shape f32[96]
 
   TF_RETURN_IF_ERROR(
       stream->MemcpyD2D(&dst0.data, src3.data, 8 * sizeof(float)));
@@ -923,6 +915,13 @@ static absl::Status SubBuffers(se::Stream* stream, ffi::BufferBase src0,
       stream->MemcpyD2D(&dst3.data, src2.data, 1024 * sizeof(float)));
   TF_RETURN_IF_ERROR(
       stream->MemcpyD2D(&dst4.data, src4.data, 4 * 8 * sizeof(float)));
+  TF_RETURN_IF_ERROR(
+      stream->MemcpyD2D(&dst5.data, src7.data, 3 * 128 * sizeof(float)));
+  TF_RETURN_IF_ERROR(
+      stream->MemcpyD2D(&dst6.data, src6.data, 64 * sizeof(float)));
+  stream_executor::DeviceMemoryBase slice =
+      dst6.data.GetByteSlice(64 * sizeof(float), 32 * sizeof(float));
+  TF_RETURN_IF_ERROR(stream->MemcpyD2D(&slice, src6.data, 32 * sizeof(float)));
   return absl::OkStatus();
 }
 
@@ -934,52 +933,71 @@ XLA_FFI_DEFINE_HANDLER(kSubBuffers, SubBuffers,
                            .Arg<ffi::BufferBase>()  // src2
                            .Arg<ffi::BufferBase>()  // src3
                            .Arg<ffi::BufferBase>()  // src4
+                           .Arg<ffi::BufferBase>()  // src5
+                           .Arg<ffi::BufferBase>()  // src6
+                           .Arg<ffi::BufferBase>()  // src7
                            .Arg<ffi::BufferBase>()  // dst0
                            .Arg<ffi::BufferBase>()  // dst1
                            .Arg<ffi::BufferBase>()  // dst2
                            .Arg<ffi::BufferBase>()  // dst3
                            .Arg<ffi::BufferBase>()  // dst4
+                           .Arg<ffi::BufferBase>()  // dst5
+                           .Arg<ffi::BufferBase>()  // dst6
 );
 XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "__xla_test$$subbuffers",
                          PLATFORM, kSubBuffers);
 
 TEST_F(AddressComputationFusionTest, CustomCallWithTuple) {
   XlaBuilder b(TestName());
-  CustomCall(&b, "__xla_test$$subbuffers", /*operands=*/
-             {
-                 Tuple(&b,
-                       {
-                           Broadcast(ConstantR0WithType(&b, F32, 1), {128}),
-                           Broadcast(ConstantR0WithType(&b, F32, 2), {256}),
-                       }),
-                 Tuple(&b,
-                       {
-                           Broadcast(ConstantR0WithType(&b, F32, 3), {1024}),
-                           Broadcast(ConstantR0WithType(&b, F32, 4), {8}),
-                       }),
-                 Slice(Broadcast(ConstantR0WithType(&b, F32, 5), {8, 8}),
-                       {0, 0}, {4, 8}, {1, 1}),
-             },
-             ShapeUtil::MakeTupleShape({
-                 ShapeUtil::MakeShape(F32, {8}),
-                 ShapeUtil::MakeTupleShape({
-                     ShapeUtil::MakeShape(F32, {128}),
-                     ShapeUtil::MakeShape(F32, {256}),
-                 }),
-                 ShapeUtil::MakeShape(F32, {1024}),
-                 ShapeUtil::MakeShape(F32, {4, 8}),
-             }),
-             /*opaque=*/"",
-             /*has_side_effect=*/false,
-             /*output_operand_aliasing=*/{}, /*literal=*/nullptr,
-             /*schedule=*/CustomCallSchedule::SCHEDULE_NONE,
-             /*api_version=*/CustomCallApiVersion::API_VERSION_TYPED_FFI);
+  CustomCall(
+      &b, "__xla_test$$subbuffers", /*operands=*/
+      {
+          Tuple(&b,
+                {
+                    Broadcast(ConstantR0WithType(&b, F32, 1), {128}),
+                    Broadcast(ConstantR0WithType(&b, F32, 2), {256}),
+                }),
+          Tuple(&b,
+                {
+                    Broadcast(ConstantR0WithType(&b, F32, 3), {1024}),
+                    Broadcast(ConstantR0WithType(&b, F32, 4), {8}),
+                }),
+          Slice(Broadcast(ConstantR0WithType(&b, F32, 5), {8, 8}), {0, 0},
+                {4, 8}, {1, 1}),
+          Tuple(&b,
+                {
+                    Tuple(&b,
+                          {
+                              Broadcast(ConstantR0WithType(&b, F32, 6), {32}),
+                              Broadcast(ConstantR0WithType(&b, F32, 7), {64}),
+                          }),
+                    Slice(Parameter(&b, 0, ShapeUtil::MakeShape(S32, {4, 128}),
+                                    "p0"),
+                          {1, 0}, {4, 128}, {1, 1}),
+                }),
+      },
+      ShapeUtil::MakeTupleShape({
+          ShapeUtil::MakeShape(F32, {8}),
+          ShapeUtil::MakeTupleShape({
+              ShapeUtil::MakeShape(F32, {128}),
+              ShapeUtil::MakeShape(F32, {256}),
+          }),
+          ShapeUtil::MakeShape(F32, {1024}),
+          ShapeUtil::MakeShape(F32, {4, 8}),
+          ShapeUtil::MakeShape(F32, {3, 128}),
+          ShapeUtil::MakeShape(F32, {32 + 64}),
+      }),
+      /*opaque=*/"",
+      /*has_side_effect=*/false,
+      /*output_operand_aliasing=*/{}, /*literal=*/nullptr,
+      /*schedule=*/CustomCallSchedule::SCHEDULE_NONE,
+      /*api_version=*/CustomCallApiVersion::API_VERSION_TYPED_FFI);
   ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
 
   TF_ASSERT_OK_AND_ASSIGN(auto computation, b.Build());
   xla::HloModuleConfig hlo_config(
       xla::ProgramShape(computation.proto().host_program_shape()),
-      /*ignore_layouts=*/false);
+      /*ignore_layouts=*/true);
   DebugOptions debug_options = GetDebugOptionsForTest();
   debug_options.set_xla_gpu_enable_address_computation_fusion(false);
   hlo_config.set_debug_options(debug_options);
@@ -991,9 +1009,12 @@ TEST_F(AddressComputationFusionTest, CustomCallWithTuple) {
   TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto(
                                             computation.proto(), hlo_config));
 
+  AddressComputationFusionRewriter pass(PLATFORM);
+  TF_ASSERT_OK_AND_ASSIGN(auto changed, this->RunHloPass(&pass, hlo_opt.get()));
+  EXPECT_TRUE(changed);
+
   EXPECT_TRUE(RunAndCompareTwoModules(std::move(hlo_ref), std::move(hlo_opt),
-                                      error_spec,
-                                      /*run_hlo_passes=*/false));
+                                      error_spec, /*run_hlo_passes=*/false));
 }
 
 static absl::Status NoOp(se::Stream* stream, ffi::BufferBase operand) {
@@ -1037,6 +1058,10 @@ TEST_F(AddressComputationFusionTest, NilTuple) {
   TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto(
                                             computation.proto(), hlo_config));
 
+  AddressComputationFusionRewriter pass(PLATFORM);
+  TF_ASSERT_OK_AND_ASSIGN(auto changed, this->RunHloPass(&pass, hlo_opt.get()));
+  EXPECT_TRUE(changed);
+
   EXPECT_TRUE(RunAndCompareTwoModules(std::move(hlo_ref), std::move(hlo_opt),
                                       error_spec,
                                       /*run_hlo_passes=*/false));
@@ -1046,7 +1071,7 @@ void Callback_Memcpy(se::gpu::GpuStreamHandle stream, void** buffers,
                      const char* /*opaque*/, size_t /*opaque_len*/) {
   void* src = buffers[0];
   void* dst = buffers[1];
-  auto err = gpuMemcpyAsync(dst, src, /*count=*/sizeof(float) * 128,
+  auto err = gpuMemcpyAsync(dst, src, /*count=*/sizeof(float) * 3 * 128,
                             gpuMemcpyDeviceToDevice, stream);
   ASSERT_EQ(err, gpuSuccess);
 }
@@ -1057,9 +1082,9 @@ TEST_F(AddressComputationFusionTest, CustomCallLegacyAPI) {
   XlaBuilder b(TestName());
   CustomCall(&b, "Callback_Memcpy",
              /*operands=*/
-             {Slice(Broadcast(ConstantR0WithType(&b, F32, 42.0), {256}), {0},
-                    {128}, {1})},
-             ShapeUtil::MakeShape(F32, {128}), /*opaque=*/"");
+             {Slice(Broadcast(ConstantR0WithType(&b, F32, 42.0), {512}), {128},
+                    {4 * 128}, {1})},
+             ShapeUtil::MakeShape(F32, {3 * 128}), /*opaque=*/"");
   ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
 
   TF_ASSERT_OK_AND_ASSIGN(auto computation, b.Build());
@@ -1077,6 +1102,10 @@ TEST_F(AddressComputationFusionTest, CustomCallLegacyAPI) {
   TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto(
                                             computation.proto(), hlo_config));
 
+  AddressComputationFusionRewriter pass(PLATFORM);
+  TF_ASSERT_OK_AND_ASSIGN(auto changed, this->RunHloPass(&pass, hlo_opt.get()));
+  EXPECT_TRUE(changed);
+
   EXPECT_TRUE(RunAndCompareTwoModules(std::move(hlo_ref), std::move(hlo_opt),
                                       error_spec,
                                       /*run_hlo_passes=*/false));
@@ -1111,6 +1140,1597 @@ TEST_F(AddressComputationFusionTest, NilTupleLegacyAPI) {
   TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto(
                                             computation.proto(), hlo_config));
 
+  AddressComputationFusionRewriter pass(PLATFORM);
+  TF_ASSERT_OK_AND_ASSIGN(auto changed, this->RunHloPass(&pass, hlo_opt.get()));
+  EXPECT_TRUE(changed);
+
+  EXPECT_TRUE(RunAndCompareTwoModules(std::move(hlo_ref), std::move(hlo_opt),
+                                      error_spec,
+                                      /*run_hlo_passes=*/false));
+}
+
+TEST_F(AddressComputationFusionTest, CublasGemmDynamic) {
+  ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
+
+  const char* hlo_ref = R"(
+  HloModule jit_slice
+
+  ENTRY main.9 {
+    p0 = bf16[2,8,8]{2,1,0} parameter(0)
+    p1 = bf16[2,8,8]{2,1,0} parameter(1)
+    c1_s32 = s32[] constant(1)
+    c0_s32 = s32[] constant(0)
+    slice.13 = bf16[1,8,8]{2,1,0} dynamic-slice(p0, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+    bitcast.41 = bf16[8,8]{1,0} bitcast(slice.13)
+    slice.14 = bf16[1,8,8]{2,1,0} dynamic-slice(p1, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+    bitcast.42 = bf16[8,8]{1,0} bitcast(slice.14)
+
+    ROOT custom-call.1 = bf16[8,8]{1,0} custom-call(bitcast.41, bitcast.42),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+  })";
+
+  const char* hlo_opt = R"(
+  HloModule jit_slice
+
+  fused_computation {
+    p0 = bf16[2,8,8]{2,1,0} parameter(0)
+    p1 = bf16[2,8,8]{2,1,0} parameter(1)
+    c1_s32 = s32[] parameter(2)
+    c0_s32 = s32[] parameter(3)
+    slice.13 = bf16[1,8,8]{2,1,0} dynamic-slice(p0, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+    bitcast.41 = bf16[8,8]{1,0} bitcast(slice.13)
+    slice.14 = bf16[1,8,8]{2,1,0} dynamic-slice(p1, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+    bitcast.42 = bf16[8,8]{1,0} bitcast(slice.14)
+
+    ROOT custom-call.1 = bf16[8,8]{1,0} custom-call(bitcast.41, bitcast.42),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+  }
+
+  ENTRY main.9 {
+    p0 = bf16[2,8,8]{2,1,0} parameter(0)
+    p1 = bf16[2,8,8]{2,1,0} parameter(1)
+    c1_s32 = s32[] constant(1)
+    c0_s32 = s32[] constant(0)
+    ROOT fusion.2 = bf16[8,8]{1,0} fusion(p0, p1, c1_s32, c0_s32), kind=kCustom, calls=fused_computation,
+        backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}}}
+  })";
+
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
+                                      /*run_hlo_passes=*/false));
+}
+
+TEST_F(AddressComputationFusionTest, CublasGemmDynamicWithWorkspace) {
+  ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
+
+  const char* hlo_ref = R"(
+  HloModule jit_slice
+
+  ENTRY %main.9 {
+    %p0 = f16[2,8,8]{2,1,0} parameter(0), sharding={replicated}
+    %p1 = f16[2,8,8]{2,1,0} parameter(1), sharding={replicated}
+    %c1_s32 = s32[] constant(1)
+    %c0_s32 = s32[] constant(0)
+    %slice.13 = f16[1,8,8]{2,1,0} dynamic-slice(%p0, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.41 = f16[8,8]{1,0} bitcast(%slice.13)
+    %slice.14 = f16[1,8,8]{2,1,0} dynamic-slice(%p1, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.42 = f16[8,8]{1,0} bitcast(%slice.14)
+
+    ROOT %custom-call.1 = (f16[8,8]{1,0}, s8[256]{0}) custom-call(%bitcast.41, %bitcast.42),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+  })";
+
+  const char* hlo_opt = R"(
+  HloModule jit_slice
+
+  %fused_computation {
+    %p0 = f16[2,8,8]{2,1,0} parameter(0)
+    %p1 = f16[2,8,8]{2,1,0} parameter(1)
+    %c1_s32 = s32[] parameter(2)
+    %c0_s32 = s32[] parameter(3)
+    %slice.13 = f16[1,8,8]{2,1,0} dynamic-slice(%p0, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.41 = f16[8,8]{1,0} bitcast(%slice.13)
+    %slice.14 = f16[1,8,8]{2,1,0} dynamic-slice(%p1, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.42 = f16[8,8]{1,0} bitcast(%slice.14)
+
+    %custom-call.1 = (f16[8,8]{1,0}, s8[256]{0}) custom-call(%bitcast.41, %bitcast.42),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+    %get-tuple-element.0 = f16[8,8]{1,0} get-tuple-element(%custom-call.1), index=0
+    %get-tuple-element.1 = s8[256]{0} get-tuple-element(%custom-call.1), index=1
+    ROOT %tuple = (f16[8,8]{1,0}, s8[256]{0}) tuple(%get-tuple-element.0, %get-tuple-element.1)
+  }
+
+  ENTRY %main.9 {
+    %p0 = f16[2,8,8]{2,1,0} parameter(0), sharding={replicated}
+    %p1 = f16[2,8,8]{2,1,0} parameter(1), sharding={replicated}
+    %c1_s32 = s32[] constant(1)
+    %c0_s32 = s32[] constant(0)
+    ROOT %fusion.2 = (f16[8,8]{1,0}, s8[256]{0}) fusion(%p0, %p1, %c1_s32, %c0_s32), kind=kCustom, calls=%fused_computation,
+        backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}}}
+  })";
+
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
+                                      /*run_hlo_passes=*/false));
+}
+
+TEST_F(AddressComputationFusionTest, DynamicContiguousSlice) {
+  ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
+
+  const char* hlo_ref = R"(
+  HloModule jit_slice
+
+  ENTRY %main.9 {
+    %p0 = bf16[2,8,8]{2,1,0} parameter(0), sharding={replicated}
+    %p1 = bf16[8,8,10,8]{3,2,1,0} parameter(1), sharding={replicated}
+    %c1_s32 = s32[] constant(1)
+    %c0_s32 = s32[] constant(0)
+    %c2_s32 = s32[] constant(2)
+    %c5_s32 = s32[] constant(5)
+    %slice.13 = bf16[1,4,8]{2,1,0} dynamic-slice(%p0, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,4,8}
+    %bitcast.41 = bf16[4,8]{1,0} bitcast(%slice.13)
+    %slice.14 = bf16[1,1,8,8]{3,2,1,0} dynamic-slice(%p1, %c1_s32, %c5_s32, %c2_s32, %c0_s32), dynamic_slice_sizes={1,1,8,8}
+    %bitcast.42 = bf16[8,8]{1,0} bitcast(%slice.14)
+    ROOT %custom-call.1 = bf16[4,8]{1,0} custom-call(%bitcast.41, %bitcast.42),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+  })";
+
+  const char* hlo_opt = R"(
+  HloModule jit_slice
+
+  %fused_computation {
+    %p0 = bf16[2,8,8]{2,1,0} parameter(0)
+    %p1 = bf16[8,8,10,8]{3,2,1,0} parameter(1)
+    %c1_s32 = s32[] parameter(2)
+    %c0_s32 = s32[] parameter(3)
+    %c2_s32 = s32[] parameter(4)
+    %c5_s32 = s32[] parameter(5)
+    %slice.13 = bf16[1,4,8]{2,1,0} dynamic-slice(%p0, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,4,8}
+    %bitcast.41 = bf16[4,8]{1,0} bitcast(%slice.13)
+    %slice.14 = bf16[1,1,8,8]{3,2,1,0} dynamic-slice(%p1, %c1_s32, %c5_s32, %c2_s32, %c0_s32), dynamic_slice_sizes={1,1,8,8}
+    %bitcast.42 = bf16[8,8]{1,0} bitcast(%slice.14)
+
+    ROOT %custom-call.1 = bf16[4,8]{1,0} custom-call(%bitcast.41, %bitcast.42),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+  }
+
+  ENTRY %main.9 {
+    %p0 = bf16[2,8,8]{2,1,0} parameter(0), sharding={replicated}
+    %p1 = bf16[8,8,10,8]{3,2,1,0} parameter(1), sharding={replicated}
+    %c1_s32 = s32[] constant(1)
+    %c0_s32 = s32[] constant(0)
+    %c2_s32 = s32[] constant(2)
+    %c5_s32 = s32[] constant(5)
+    ROOT %fusion.2 = bf16[4,8]{1,0} fusion(%p0, %p1, %c1_s32, %c0_s32, %c2_s32, %c5_s32), kind=kCustom,
+    calls=%fused_computation,
+        backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}}}
+  })";
+
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
+                                      /*run_hlo_passes=*/false));
+}
+
+TEST_F(AddressComputationFusionTest, DynamicContiguousSliceNonDefaultLayout) {
+  ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
+
+  const char* hlo_ref = R"(
+  HloModule jit_slice
+
+  ENTRY %main.9 {
+    %p0 = bf16[2,8,8]{1,2,0} parameter(0), sharding={replicated}
+    %p1 = bf16[8,8,10,8]{1,2,3,0} parameter(1), sharding={replicated}
+    %c1_s32 = s32[] constant(1)
+    %c0_s32 = s32[] constant(0)
+    %c2_s32 = s32[] constant(2)
+    %c5_s32 = s32[] constant(5)
+    %slice.13 = bf16[1,8,4]{1,2,0} dynamic-slice(%p0, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,4}
+    %bitcast.41 = bf16[4,8]{1,0} bitcast(%slice.13)
+    %slice.14 = bf16[1,8,8,1]{1,2,3,0} dynamic-slice(%p1, %c0_s32, %c0_s32, %c2_s32, %c5_s32), dynamic_slice_sizes={1,8,8,1}
+    %bitcast.42 = bf16[8,8]{1,0} bitcast(%slice.14)
+
+    ROOT %custom-call.1 = bf16[4,8]{1,0} custom-call(%bitcast.41, %bitcast.42),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+  })";
+
+  const char* hlo_opt = R"(
+  HloModule jit_slice
+
+  %fused_computation {
+    %p0 = bf16[2,8,8]{1,2,0} parameter(0)
+    %p1 = bf16[8,8,10,8]{1,2,3,0} parameter(1)
+    %c1_s32 = s32[] parameter(2)
+    %c0_s32 = s32[] parameter(3)
+    %c2_s32 = s32[] parameter(4)
+    %c5_s32 = s32[] parameter(5)
+    %slice.13 = bf16[1,8,4]{1,2,0} dynamic-slice(%p0, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,4}
+    %bitcast.41 = bf16[4,8]{1,0} bitcast(%slice.13)
+    %slice.14 = bf16[1,8,8,1]{1,2,3,0} dynamic-slice(%p1, %c0_s32, %c0_s32, %c2_s32, %c5_s32), dynamic_slice_sizes={1,8,8,1}
+    %bitcast.42 = bf16[8,8]{1,0} bitcast(%slice.14)
+
+    ROOT %custom-call.1 = bf16[4,8]{1,0} custom-call(%bitcast.41, %bitcast.42),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+  }
+
+  ENTRY %main.9 {
+    %p0 = bf16[2,8,8]{1,2,0} parameter(0), sharding={replicated}
+    %p1 = bf16[8,8,10,8]{1,2,3,0} parameter(1), sharding={replicated}
+    %c1_s32 = s32[] constant(1)
+    %c0_s32 = s32[] constant(0)
+    %c2_s32 = s32[] constant(2)
+    %c5_s32 = s32[] constant(5)
+    ROOT %fusion.2 = bf16[4,8]{1,0} fusion(%p0, %p1, %c1_s32, %c0_s32, %c2_s32, %c5_s32), kind=kCustom,
+    calls=%fused_computation,
+        backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}}}
+  })";
+
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
+                                      /*run_hlo_passes=*/false));
+}
+
+TEST_F(AddressComputationFusionTest, DynamicOperandIsSlicedGetTupleElement) {
+  ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
+
+  const char* hlo_ref = R"(
+  HloModule jit_slice
+
+  ENTRY %main {
+    %p0 = (f32[100,100]{1,0}, f32[100,100]{1,0}) parameter(0)
+    %c0_s32 = s32[] constant(0)
+    %get-tuple-element.240 = f32[100,100]{1,0} get-tuple-element(%p0), index=0
+    %get-tuple-element.241 = f32[100,100]{1,0} get-tuple-element(%p0), index=1
+    %concatenate.10 = f32[200,100]{1,0} concatenate(%get-tuple-element.240, %get-tuple-element.241), dimensions={0}
+    %custom-call.16 = (f32[200,100]{1,0}, s8[120000]{0}) custom-call(%concatenate.10, %get-tuple-element.240),
+      custom_call_target="__cublas$gemm",
+      backend_config={
+        "gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["HIGHEST","HIGHEST"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"20000",
+          "rhs_stride":"10000",
+          "grad_x":false,
+          "grad_y":false
+        }
+      }
+    %get-tuple-element.97 = f32[200,100]{1,0} get-tuple-element(%custom-call.16), index=0
+    %slice.26 = f32[100,100]{1,0} dynamic-slice(%get-tuple-element.97, %c0_s32, %c0_s32), dynamic_slice_sizes={100,100}
+    ROOT %custom-call.17 = (f32[100,100]{1,0}, s8[80000]{0}) custom-call(%slice.26, %get-tuple-element.240),
+      custom_call_target="__cublas$gemm",
+      backend_config={
+        "gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["HIGHEST","HIGHEST"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"10000",
+          "rhs_stride":"10000",
+          "grad_x":false,
+          "grad_y":false
+        }
+      }
+  })";
+
+  const char* hlo_opt = R"(
+  HloModule jit_slice
+
+  %address-computation {
+    %p0.3 = f32[200,100]{1,0} parameter(0)
+    %p1.3 = f32[100,100]{1,0} parameter(1)
+    %c0_s32 = s32[] parameter(2)
+    %slice.56 = f32[100,100]{1,0} dynamic-slice(%p0.3, %c0_s32, %c0_s32), dynamic_slice_sizes={100,100}
+    %cublas-gemm.23 = (f32[100,100]{1,0}, s8[80000]{0}) custom-call(%slice.56, %p1.3),
+      custom_call_target="__cublas$gemm",
+      backend_config={
+        "gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["HIGHEST","HIGHEST"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"10000",
+          "rhs_stride":"10000",
+          "grad_x":false,
+          "grad_y":false
+        }
+      }
+    %get-tuple-element.221 = f32[100,100]{1,0} get-tuple-element(%cublas-gemm.23), index=0
+    %get-tuple-element.222 = s8[80000]{0} get-tuple-element(%cublas-gemm.23), index=1
+    ROOT %tuple.58 = (f32[100,100]{1,0}, s8[80000]{0}) tuple(%get-tuple-element.221, %get-tuple-element.222)
+  }
+
+  ENTRY %main {
+    %p0 = (f32[100,100]{1,0}, f32[100,100]{1,0}) parameter(0)
+    %c0_s32 = s32[] constant(0)
+    %get-tuple-element.240 = f32[100,100]{1,0} get-tuple-element(%p0), index=0
+    %get-tuple-element.241 = f32[100,100]{1,0} get-tuple-element(%p0), index=1
+    %concatenate.10 = f32[200,100]{1,0} concatenate(%get-tuple-element.240, %get-tuple-element.241), dimensions={0}
+    %custom-call.16 = (f32[200,100]{1,0}, s8[120000]{0}) custom-call(%concatenate.10, %get-tuple-element.240),
+      custom_call_target="__cublas$gemm",
+      backend_config={
+        "gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["HIGHEST","HIGHEST"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"20000",
+          "rhs_stride":"10000",
+          "grad_x":false,
+          "grad_y":false
+        }
+      }
+    %get-tuple-element.97 = f32[200,100]{1,0} get-tuple-element(%custom-call.16), index=0
+    ROOT %address_computation.6 = (f32[100,100]{1,0}, s8[80000]{0}) fusion(%get-tuple-element.97, %get-tuple-element.240, %c0_s32),
+      kind=kCustom,
+      calls=%address-computation,
+      backend_config={
+        "fusion_backend_config":{
+          "kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}
+        }
+      }
+  })";
+
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
+                                      /*run_hlo_passes=*/false));
+}
+
+TEST_F(AddressComputationFusionTest, DynamicReversedOperandOrder) {
+  ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
+
+  const char* hlo_ref = R"(
+  HloModule jit_slice
+
+  ENTRY %main.9 {
+    %p0 = f16[2,8,8]{2,1,0} parameter(0)
+    %c0_s32 = s32[] constant(0)
+    %c1_s32 = s32[] constant(1)
+    %slice.13 = f16[1,8,8]{2,1,0} dynamic-slice(%p0, %c0_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.41 = f16[8,8]{1,0} bitcast(%slice.13)
+    %p1 = f16[2,8,8]{2,1,0} parameter(1)
+    %slice.14 = f16[1,8,8]{2,1,0} dynamic-slice(%p1, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.42 = f16[8,8]{1,0} bitcast(%slice.14)
+
+    ROOT %custom-call.1 = f16[8,8]{1,0} custom-call(%bitcast.42, %bitcast.41),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+  })";
+
+  const char* hlo_opt = R"(
+  HloModule jit_slice
+
+  %address-computation {
+    %p0.1 = f16[2,8,8]{2,1,0} parameter(0)
+    %p1.1 = f16[2,8,8]{2,1,0} parameter(1)
+    %c0_s32 = s32[] parameter(2)
+    %c1_s32 = s32[] parameter(3)
+    %slice.1 = f16[1,8,8]{2,1,0} dynamic-slice(%p0.1, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.1 = f16[8,8]{1,0} bitcast(%slice.1)
+    %slice.0 = f16[1,8,8]{2,1,0} dynamic-slice(%p1.1, %c0_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.0 = f16[8,8]{1,0} bitcast(%slice.0)
+    ROOT %custom-call.0 = f16[8,8]{1,0} custom-call(%bitcast.1, %bitcast.0),
+      custom_call_target="__cublas$gemm",
+      backend_config={
+        "gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"64",
+          "rhs_stride":"64",
+          "grad_x":false,
+          "grad_y":false
+        }
+      }
+  }
+
+  ENTRY %main {
+    %p0 = f16[2,8,8]{2,1,0} parameter(0)
+    %p1 = f16[2,8,8]{2,1,0} parameter(1)
+    %c0_s32 = s32[] constant(0)
+    %c1_s32 = s32[] constant(1)
+    ROOT %address_computation.6 = f16[8,8]{1,0} fusion(%p1, %p0, %c0_s32, %c1_s32),
+      kind=kCustom,
+      calls=%address-computation,
+      backend_config={
+        "fusion_backend_config":{
+          "kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}
+        }
+      }
+  })";
+
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
+                                      /*run_hlo_passes=*/false));
+}
+
+TEST_F(AddressComputationFusionTest, DynamicSingleOperandComputation) {
+  ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
+
+  const char* hlo_ref = R"(
+  HloModule jit_slice
+
+  ENTRY %main {
+    %p0 = (f32[100,100]{1,0}, f32[100,100]{1,0}) parameter(0)
+    %c0_s32 = s32[] constant(0)
+    %get-tuple-element.240 = f32[100,100]{1,0} get-tuple-element(%p0), index=0
+    %get-tuple-element.241 = f32[100,100]{1,0} get-tuple-element(%p0), index=1
+    %concatenate.10 = f32[200,100]{1,0} concatenate(%get-tuple-element.240, %get-tuple-element.241), dimensions={0}
+    %custom-call.16 = (f32[200,100]{1,0}, s8[120000]{0}) custom-call(%concatenate.10, %get-tuple-element.240),
+      custom_call_target="__cublas$gemm",
+      backend_config={
+        "gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["HIGHEST","HIGHEST"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"20000",
+          "rhs_stride":"10000",
+          "grad_x":false,
+          "grad_y":false
+        }
+      }
+    %get-tuple-element.97 = f32[200,100]{1,0} get-tuple-element(%custom-call.16), index=0
+    %slice.26 = f32[100,100]{1,0} dynamic-slice(%get-tuple-element.97, %c0_s32, %c0_s32), dynamic_slice_sizes={100,100}
+    ROOT %custom-call.17 = (f32[100,100]{1,0}, s8[80000]{0}) custom-call(%slice.26, %slice.26),
+      custom_call_target="__cublas$gemm",
+      backend_config={
+        "gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["HIGHEST","HIGHEST"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"10000",
+          "rhs_stride":"10000",
+          "grad_x":false,
+          "grad_y":false
+        }
+      }
+  })";
+
+  const char* hlo_opt = R"(
+  HloModule jit_slice
+
+  %address-computation {
+    %p0.3 = f32[200,100]{1,0} parameter(0)
+    %c0_s32 = s32[] parameter(1)
+    %slice.56 = f32[100,100]{1,0} dynamic-slice(%p0.3, %c0_s32, %c0_s32), dynamic_slice_sizes={100,100}
+    %cublas-gemm.23 = (f32[100,100]{1,0}, s8[80000]{0}) custom-call(%slice.56, %slice.56),
+      custom_call_target="__cublas$gemm",
+      backend_config={
+        "gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["HIGHEST","HIGHEST"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"10000",
+          "rhs_stride":"10000",
+          "grad_x":false,
+          "grad_y":false
+        }
+      }
+    %get-tuple-element.221 = f32[100,100]{1,0} get-tuple-element(%cublas-gemm.23), index=0
+    %get-tuple-element.222 = s8[80000]{0} get-tuple-element(%cublas-gemm.23), index=1
+    ROOT %tuple.58 = (f32[100,100]{1,0}, s8[80000]{0}) tuple(%get-tuple-element.221, %get-tuple-element.222)
+  }
+
+  ENTRY %main {
+    %p0 = (f32[100,100]{1,0}, f32[100,100]{1,0}) parameter(0)
+    %c0_s32 = s32[] constant(0)
+    %get-tuple-element.240 = f32[100,100]{1,0} get-tuple-element(%p0), index=0
+    %get-tuple-element.241 = f32[100,100]{1,0} get-tuple-element(%p0), index=1
+    %concatenate.10 = f32[200,100]{1,0} concatenate(%get-tuple-element.240, %get-tuple-element.241), dimensions={0}
+    %custom-call.16 = (f32[200,100]{1,0}, s8[120000]{0}) custom-call(%concatenate.10, %get-tuple-element.240),
+      custom_call_target="__cublas$gemm",
+      backend_config={
+        "gemm_backend_config":{
+          "alpha_real":1,
+          "beta":0,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["HIGHEST","HIGHEST"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"20000",
+          "rhs_stride":"10000",
+          "grad_x":false,
+          "grad_y":false
+        }
+      }
+    %get-tuple-element.97 = f32[200,100]{1,0} get-tuple-element(%custom-call.16), index=0
+    ROOT %address_computation.6 = (f32[100,100]{1,0}, s8[80000]{0}) fusion(%get-tuple-element.97, %c0_s32),
+      kind=kCustom,
+      calls=%address-computation,
+      backend_config={
+        "fusion_backend_config":{
+          "kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}
+        }
+      }
+  })";
+
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
+                                      /*run_hlo_passes=*/false));
+}
+
+TEST_F(AddressComputationFusionTest, DynamicSlicedOperandAliasingOutput) {
+  ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
+
+  const char* hlo_ref = R"(
+  HloModule jit_slice
+
+    ENTRY %main.9 {
+      %p0 = (f32[100,100]{1,0}, f32[100,100]{1,0}) parameter(0)
+      %c20_s32 = s32[] constant(20)
+      %c99_s32 = s32[] constant(99)
+      %c0_s32 = s32[] constant(0)
+      %get-tuple-element.287 = f32[100,100]{1,0} get-tuple-element(%p0), index=0
+      %get-tuple-element.288 = f32[100,100]{1,0} get-tuple-element(%p0), index=1
+      %concatenate.12 = f32[200,100]{1,0} concatenate(%get-tuple-element.287, %get-tuple-element.288), dimensions={0}
+      %slice.30 = f32[100,100]{1,0} dynamic-slice(%concatenate.12, %c20_s32, %c0_s32), dynamic_slice_sizes={100,100}
+      %slice.34 = f32[100,100]{1,0} dynamic-slice(%concatenate.12, %c99_s32, %c0_s32), dynamic_slice_sizes={100,100}
+      ROOT %cublas-gemm.15 = (f32[100,100]{1,0}, s8[120000]{0}) custom-call(%get-tuple-element.287, %slice.30, %slice.34),
+        custom_call_target="__cublas$gemm",
+        output_to_operand_aliasing={{0}: (2, {})},
+        backend_config={"gemm_backend_config":{
+          "alpha_real":1,
+          "beta":1,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["HIGHEST","HIGHEST"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"10000",
+          "rhs_stride":"10000",
+          "grad_x":false,
+          "grad_y":false
+        }}
+  })";
+
+  const char* hlo_opt = R"(
+  HloModule jit_slice
+
+  %address-computation {
+    %p0.1 = f32[100,100]{1,0} parameter(0)
+    %p1 = f32[100,100]{1,0} parameter(1)
+    %p2 = f32[200,100]{1,0} parameter(2)
+    %c0_s32 = s32[] parameter(3)
+    %c20_s32 = s32[] parameter(4)
+    %slice.0 = f32[100,100]{1,0} dynamic-slice(f32[200,100]{1,0} %p2, %c20_s32, %c0_s32), dynamic_slice_sizes={100,100}
+    %cublas-gemm.0 = (f32[100,100]{1,0}, s8[120000]{0}) custom-call(%p0.1, %slice.0, %p1),
+      custom_call_target="__cublas$gemm",
+      backend_config={
+        "gemm_backend_config":{
+          "alpha_real":1,
+          "beta":1,
+          "dot_dimension_numbers":{
+            "lhs_contracting_dimensions":["1"],
+            "rhs_contracting_dimensions":["0"],
+            "lhs_batch_dimensions":[],
+            "rhs_batch_dimensions":[]
+          },
+          "alpha_imag":0,
+          "precision_config":{"operand_precision":["HIGHEST","HIGHEST"]},
+          "epilogue":"DEFAULT",
+          "lhs_stride":"10000",
+          "rhs_stride":"10000",
+          "grad_x":false,
+          "grad_y":false
+        }
+      }
+    %get-tuple-element = f32[100,100]{1,0} get-tuple-element(%cublas-gemm.0), index=0
+    %get-tuple-element.1 = s8[120000]{0} get-tuple-element(%cublas-gemm.0), index=1
+    ROOT %tuple = (f32[100,100]{1,0}, s8[120000]{0}) tuple(%get-tuple-element, %get-tuple-element.1)
+  }
+
+  ENTRY %main {
+    %p0 = (f32[100,100]{1,0}, f32[100,100]{1,0}) parameter(0)
+    %c20_s32 = s32[] constant(20)
+    %c99_s32 = s32[] constant(99)
+    %c0_s32 = s32[] constant(0)
+    %get-tuple-element.287 = f32[100,100]{1,0} get-tuple-element(%p0), index=0
+    %get-tuple-element.288 = f32[100,100]{1,0} get-tuple-element(%p0), index=1
+    %concatenate.12 = f32[200,100]{1,0} concatenate(%get-tuple-element.287, %get-tuple-element.288), dimensions={0}
+    %slice.34 = f32[100,100]{1,0} dynamic-slice(%concatenate.12, %c99_s32, %c0_s32), dynamic_slice_sizes={100,100}
+    ROOT %address_computation.6 = (f32[100,100]{1,0}, s8[120000]{0}) fusion(%get-tuple-element.287, %slice.34, %concatenate.12, %c0_s32, %c20_s32),
+      kind=kCustom,
+      calls=%address-computation,
+      output_to_operand_aliasing={{0}: (1, {})},
+      backend_config={
+        "fusion_backend_config":{
+          "kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}
+        }
+      }
+  })";
+
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
+                                      /*run_hlo_passes=*/false));
+}
+
+TEST_F(AddressComputationFusionTest, CublasGemmDUS) {
+  ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
+
+  const char* hlo_ref = R"(
+  HloModule jit_slice
+
+  ENTRY main.9 {
+    p0 = bf16[2,8,8]{2,1,0} parameter(0)
+    p1 = bf16[2,8,8]{2,1,0} parameter(1)
+    p2 = bf16[4,8,8]{2,1,0} parameter(2)
+    c1_s32 = s32[] constant(1)
+    c0_s32 = s32[] constant(0)
+    slice.13 = bf16[1,8,8]{2,1,0} dynamic-slice(p0, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+    bitcast.41 = bf16[8,8]{1,0} bitcast(slice.13)
+    slice.14 = bf16[1,8,8]{2,1,0} dynamic-slice(p1, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+    bitcast.42 = bf16[8,8]{1,0} bitcast(slice.14)
+
+    custom-call.1 = bf16[8,8]{1,0} custom-call(bitcast.41, bitcast.42),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+    bitcast.43 = bf16[1,8,8]{2,1,0} bitcast(custom-call.1)
+    ROOT dus = bf16[4,8,8]{2,1,0} dynamic-update-slice(p2, bitcast.43, c1_s32, c0_s32, c0_s32)
+  })";
+
+  const char* hlo_opt = R"(
+  HloModule jit_slice
+
+  fused_computation {
+    p0 = bf16[2,8,8]{2,1,0} parameter(0)
+    p1 = bf16[2,8,8]{2,1,0} parameter(1)
+    p2 = bf16[4,8,8]{2,1,0} parameter(2)
+    c1_s32 = s32[] parameter(3)
+    c0_s32 = s32[] parameter(4)
+    slice.13 = bf16[1,8,8]{2,1,0} dynamic-slice(p0, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+    bitcast.41 = bf16[8,8]{1,0} bitcast(slice.13)
+    slice.14 = bf16[1,8,8]{2,1,0} dynamic-slice(p1, c1_s32, c0_s32, c0_s32), dynamic_slice_sizes={1,8,8}
+    bitcast.42 = bf16[8,8]{1,0} bitcast(slice.14)
+
+    custom-call.1 = bf16[8,8]{1,0} custom-call(bitcast.41, bitcast.42),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+    bitcast.43 = bf16[1,8,8]{2,1,0} bitcast(custom-call.1)
+    ROOT dus = bf16[4,8,8]{2,1,0} dynamic-update-slice(p2, bitcast.43, c1_s32, c0_s32, c0_s32)
+  }
+
+  ENTRY main.9 {
+    p0 = bf16[2,8,8]{2,1,0} parameter(0)
+    p1 = bf16[2,8,8]{2,1,0} parameter(1)
+    p2 = bf16[4,8,8]{2,1,0} parameter(2)
+    c1_s32 = s32[] constant(1)
+    c0_s32 = s32[] constant(0)
+    ROOT fusion.2 = bf16[4,8,8]{2,1,0} fusion(p0, p1, p2, c1_s32, c0_s32), kind=kCustom, calls=fused_computation,
+        backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}}}
+  })";
+
+  // The GEMM custom call does not have a workspace, shouldn't be run in command
+  // buffer.
+  EXPECT_TRUE(RunAndCompareTwoModules(
+      hlo_ref, hlo_opt, GetModuleConfigWithoutCommandBuffer(),
+      GetModuleConfigWithoutCommandBuffer(), error_spec,
+      /*run_hlo_passes=*/false));
+}
+
+TEST_F(AddressComputationFusionTest, CublasGemmDUSWithWorkspace) {
+  ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
+
+  const char* hlo_ref = R"(
+  HloModule jit_slice
+
+  ENTRY %main.9 {
+    %p0 = f16[2,8,8]{2,1,0} parameter(0)
+    %p1 = f16[2,8,8]{2,1,0} parameter(1)
+    %p2 = f16[4,8,8]{2,1,0} parameter(2)
+    %c1_s32 = s32[] constant(1)
+    %c0_s32 = s32[] constant(0)
+    %slice.13 = f16[1,8,8]{2,1,0} dynamic-slice(%p0, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.41 = f16[8,8]{1,0} bitcast(%slice.13)
+    %slice.14 = f16[1,8,8]{2,1,0} dynamic-slice(%p1, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.42 = f16[8,8]{1,0} bitcast(%slice.14)
+
+    %custom-call.1 = (f16[8,8]{1,0}, s8[256]{0}) custom-call(%bitcast.41, %bitcast.42),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+    %get-tuple-element.0 = f16[8,8]{1,0} get-tuple-element(%custom-call.1), index=0
+    %bitcast.43 = f16[1,8,8]{2,1,0} bitcast(%get-tuple-element.0)
+    %dus = f16[4,8,8]{2,1,0} dynamic-update-slice(%p2, %bitcast.43, %c1_s32, %c0_s32, %c0_s32)
+    %get-tuple-element.1 = s8[256]{0} get-tuple-element(%custom-call.1), index=1
+    ROOT %tuple = (f16[4,8,8]{2,1,0}, s8[256]{0}) tuple(%dus, %get-tuple-element.1)
+  })";
+
+  const char* hlo_opt = R"(
+  HloModule jit_slice
+
+  %fused_computation {
+    %p0 = f16[2,8,8]{2,1,0} parameter(0)
+    %p1 = f16[2,8,8]{2,1,0} parameter(1)
+    %p2 = f16[4,8,8]{2,1,0} parameter(2)
+    %c1_s32 = s32[] parameter(3)
+    %c0_s32 = s32[] parameter(4)
+    %slice.13 = f16[1,8,8]{2,1,0} dynamic-slice(%p0, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.41 = f16[8,8]{1,0} bitcast(%slice.13)
+    %slice.14 = f16[1,8,8]{2,1,0} dynamic-slice(%p1, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.42 = f16[8,8]{1,0} bitcast(%slice.14)
+
+    %custom-call.1 = (f16[8,8]{1,0}, s8[256]{0}) custom-call(%bitcast.41, %bitcast.42),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+    %get-tuple-element.0 = f16[8,8]{1,0} get-tuple-element(%custom-call.1), index=0
+    %bitcast.43 = f16[1,8,8]{2,1,0} bitcast(%get-tuple-element.0)
+    %dus = f16[4,8,8]{2,1,0} dynamic-update-slice(%p2, %bitcast.43, %c1_s32, %c0_s32, %c0_s32)
+    %get-tuple-element.1 = s8[256]{0} get-tuple-element(%custom-call.1), index=1
+    ROOT %tuple = (f16[4,8,8]{2,1,0}, s8[256]{0}) tuple(%dus, %get-tuple-element.1)
+  }
+
+  ENTRY %main.9 {
+    %p0 = f16[2,8,8]{2,1,0} parameter(0)
+    %p1 = f16[2,8,8]{2,1,0} parameter(1)
+    %p2 = f16[4,8,8]{2,1,0} parameter(2)
+    %c1_s32 = s32[] constant(1)
+    %c0_s32 = s32[] constant(0)
+    ROOT %fusion.2 = (f16[4,8,8]{2,1,0}, s8[256]{0}) fusion(%p0, %p1, %p2, %c1_s32, %c0_s32), kind=kCustom, calls=%fused_computation,
+        backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}}}
+  })";
+
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
+                                      /*run_hlo_passes=*/false));
+}
+
+TEST_F(AddressComputationFusionTest, CublasGemmDUSWorkspaceIgnored) {
+  ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
+
+  const char* hlo_ref = R"(
+  HloModule jit_slice
+
+  ENTRY %main.9 {
+    %p0 = f16[8,8]{1,0} parameter(0)
+    %p1 = f16[8,8]{1,0} parameter(1)
+    %p2 = f16[4,8,8]{2,1,0} parameter(2)
+    %c1_s32 = s32[] constant(1)
+    %c0_s32 = s32[] constant(0)
+
+    %custom-call.1 = (f16[8,8]{1,0}, s8[256]{0}) custom-call(%p0, %p1),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+    %get-tuple-element.0 = f16[8,8]{1,0} get-tuple-element(%custom-call.1), index=0
+    %bitcast.43 = f16[1,8,8]{2,1,0} bitcast(%get-tuple-element.0)
+    ROOT %dus = f16[4,8,8]{2,1,0} dynamic-update-slice(%p2, %bitcast.43, %c1_s32, %c0_s32, %c0_s32)
+  })";
+
+  const char* hlo_opt = R"(
+  HloModule jit_slice
+
+  %fused_computation {
+    %p0 = f16[8,8]{1,0} parameter(0)
+    %p1 = f16[8,8]{1,0} parameter(1)
+    %p2 = f16[4,8,8]{2,1,0} parameter(2)
+    %c1_s32 = s32[] parameter(3)
+    %c0_s32 = s32[] parameter(4)
+
+    %custom-call.1 = (f16[8,8]{1,0}, s8[256]{0}) custom-call(%p0, %p1),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+    %get-tuple-element.0 = f16[8,8]{1,0} get-tuple-element(%custom-call.1), index=0
+    %bitcast.43 = f16[1,8,8]{2,1,0} bitcast(%get-tuple-element.0)
+    ROOT %dus = f16[4,8,8]{2,1,0} dynamic-update-slice(%p2, %bitcast.43, %c1_s32, %c0_s32, %c0_s32)
+  }
+
+  ENTRY %main.9 {
+    %p0 = f16[8,8]{1,0} parameter(0)
+    %p1 = f16[8,8]{1,0} parameter(1)
+    %p2 = f16[4,8,8]{2,1,0} parameter(2)
+    %c1_s32 = s32[] constant(1)
+    %c0_s32 = s32[] constant(0)
+    ROOT %fusion.2 = f16[4,8,8]{2,1,0} fusion(%p0, %p1, %p2, %c1_s32, %c0_s32), kind=kCustom, calls=%fused_computation,
+        backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}}}
+  })";
+
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
+                                      /*run_hlo_passes=*/false));
+}
+
+TEST_F(AddressComputationFusionTest, CublasGemmDUSOffsetS32NotConstant) {
+  ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
+
+  const char* hlo_ref = R"(
+  HloModule jit_slice
+
+  ENTRY %main.9 {
+    %p0 = f16[2,8,8]{2,1,0} parameter(0)
+    %p1 = f16[2,8,8]{2,1,0} parameter(1)
+    %p2 = f16[4,8,8]{2,1,0} parameter(2)
+    %c1_s32 = s32[] parameter(3)
+    %c0_s32 = s32[] parameter(4)
+    %slice.13 = f16[1,8,8]{2,1,0} dynamic-slice(%p0, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.41 = f16[8,8]{1,0} bitcast(%slice.13)
+    %slice.14 = f16[1,8,8]{2,1,0} dynamic-slice(%p1, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.42 = f16[8,8]{1,0} bitcast(%slice.14)
+
+    %custom-call.1 = (f16[8,8]{1,0}, s8[256]{0}) custom-call(%bitcast.41, %bitcast.42),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+    %get-tuple-element.0 = f16[8,8]{1,0} get-tuple-element(%custom-call.1), index=0
+    %bitcast.43 = f16[1,8,8]{2,1,0} bitcast(%get-tuple-element.0)
+    %dus = f16[4,8,8]{2,1,0} dynamic-update-slice(%p2, %bitcast.43, %c1_s32, %c0_s32, %c0_s32)
+    %get-tuple-element.1 = s8[256]{0} get-tuple-element(%custom-call.1), index=1
+    ROOT %tuple = (f16[4,8,8]{2,1,0}, s8[256]{0}) tuple(%dus, %get-tuple-element.1)
+  })";
+
+  const char* hlo_opt = R"(
+  HloModule jit_slice
+
+  %fused_computation {
+    %p0 = f16[2,8,8]{2,1,0} parameter(0)
+    %p1 = f16[2,8,8]{2,1,0} parameter(1)
+    %p2 = f16[4,8,8]{2,1,0} parameter(2)
+    %c1_s32 = s32[] parameter(3)
+    %c0_s32 = s32[] parameter(4)
+    %slice.13 = f16[1,8,8]{2,1,0} dynamic-slice(%p0, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.41 = f16[8,8]{1,0} bitcast(%slice.13)
+    %slice.14 = f16[1,8,8]{2,1,0} dynamic-slice(%p1, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.42 = f16[8,8]{1,0} bitcast(%slice.14)
+
+    %custom-call.1 = (f16[8,8]{1,0}, s8[256]{0}) custom-call(%bitcast.41, %bitcast.42),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+    %get-tuple-element.0 = f16[8,8]{1,0} get-tuple-element(%custom-call.1), index=0
+    %bitcast.43 = f16[1,8,8]{2,1,0} bitcast(%get-tuple-element.0)
+    %dus = f16[4,8,8]{2,1,0} dynamic-update-slice(%p2, %bitcast.43, %c1_s32, %c0_s32, %c0_s32)
+    %get-tuple-element.1 = s8[256]{0} get-tuple-element(%custom-call.1), index=1
+    ROOT %tuple = (f16[4,8,8]{2,1,0}, s8[256]{0}) tuple(%dus, %get-tuple-element.1)
+  }
+
+  ENTRY %main.9 {
+    %p0 = f16[2,8,8]{2,1,0} parameter(0)
+    %p1 = f16[2,8,8]{2,1,0} parameter(1)
+    %p2 = f16[4,8,8]{2,1,0} parameter(2)
+    %c1_s32 = s32[] parameter(3)
+    %c0_s32 = s32[] parameter(4)
+    ROOT %fusion.2 = (f16[4,8,8]{2,1,0}, s8[256]{0}) fusion(%p0, %p1, %p2, %c1_s32, %c0_s32), kind=kCustom, calls=%fused_computation,
+        backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}}}
+  })";
+
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
+                                      /*run_hlo_passes=*/false));
+}
+
+TEST_F(AddressComputationFusionTest, CublasGemmDUSOffsetOOB) {
+  ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
+
+  const char* hlo_ref = R"(
+  HloModule jit_slice
+
+  ENTRY %main.9 {
+    %p0 = f16[2,8,8]{2,1,0} parameter(0)
+    %p1 = f16[2,8,8]{2,1,0} parameter(1)
+    %p2 = f16[4,8,8]{2,1,0} parameter(2)
+    %c1_s32 = s64[] constant(10)
+    %c0_s32 = s64[] constant(-1)
+    %slice.13 = f16[1,8,8]{2,1,0} dynamic-slice(%p0, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.41 = f16[8,8]{1,0} bitcast(%slice.13)
+    %slice.14 = f16[1,8,8]{2,1,0} dynamic-slice(%p1, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.42 = f16[8,8]{1,0} bitcast(%slice.14)
+
+    %custom-call.1 = (f16[8,8]{1,0}, s8[256]{0}) custom-call(%bitcast.41, %bitcast.42),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+    %get-tuple-element.0 = f16[8,8]{1,0} get-tuple-element(%custom-call.1), index=0
+    %bitcast.43 = f16[1,8,8]{2,1,0} bitcast(%get-tuple-element.0)
+    %dus = f16[4,8,8]{2,1,0} dynamic-update-slice(%p2, %bitcast.43, %c1_s32, %c0_s32, %c0_s32)
+    %get-tuple-element.1 = s8[256]{0} get-tuple-element(%custom-call.1), index=1
+    ROOT %tuple = (f16[4,8,8]{2,1,0}, s8[256]{0}) tuple(%dus, %get-tuple-element.1)
+  })";
+
+  const char* hlo_opt = R"(
+  HloModule jit_slice
+
+  %fused_computation {
+    %p0 = f16[2,8,8]{2,1,0} parameter(0)
+    %p1 = f16[2,8,8]{2,1,0} parameter(1)
+    %p2 = f16[4,8,8]{2,1,0} parameter(2)
+    %c1_s32 = s64[] parameter(3)
+    %c0_s32 = s64[] parameter(4)
+    %slice.13 = f16[1,8,8]{2,1,0} dynamic-slice(%p0, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.41 = f16[8,8]{1,0} bitcast(%slice.13)
+    %slice.14 = f16[1,8,8]{2,1,0} dynamic-slice(%p1, %c1_s32, %c0_s32, %c0_s32), dynamic_slice_sizes={1,8,8}
+    %bitcast.42 = f16[8,8]{1,0} bitcast(%slice.14)
+
+    %custom-call.1 = (f16[8,8]{1,0}, s8[256]{0}) custom-call(%bitcast.41, %bitcast.42),
+      custom_call_target="__cublas$gemm",
+      backend_config={"gemm_backend_config":{
+        "alpha_real":1,
+        "beta":0,
+        "dot_dimension_numbers":{
+          "lhs_contracting_dimensions":["1"],
+          "rhs_contracting_dimensions":["0"],
+          "lhs_batch_dimensions":[],
+          "rhs_batch_dimensions":[]
+        },
+        "alpha_imag":0,
+        "precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},
+        "epilogue":"DEFAULT",
+        "lhs_stride":"64",
+        "rhs_stride":"64",
+        "grad_x":false,
+        "grad_y":false
+      }}
+    %get-tuple-element.0 = f16[8,8]{1,0} get-tuple-element(%custom-call.1), index=0
+    %bitcast.43 = f16[1,8,8]{2,1,0} bitcast(%get-tuple-element.0)
+    %dus = f16[4,8,8]{2,1,0} dynamic-update-slice(%p2, %bitcast.43, %c1_s32, %c0_s32, %c0_s32)
+    %get-tuple-element.1 = s8[256]{0} get-tuple-element(%custom-call.1), index=1
+    ROOT %tuple = (f16[4,8,8]{2,1,0}, s8[256]{0}) tuple(%dus, %get-tuple-element.1)
+  }
+
+  ENTRY %main.9 {
+    %p0 = f16[2,8,8]{2,1,0} parameter(0)
+    %p1 = f16[2,8,8]{2,1,0} parameter(1)
+    %p2 = f16[4,8,8]{2,1,0} parameter(2)
+    %c1_s32 = s64[] constant(10)
+    %c0_s32 = s64[] constant(-1)
+    ROOT %fusion.2 = (f16[4,8,8]{2,1,0}, s8[256]{0}) fusion(%p0, %p1, %p2, %c1_s32, %c0_s32), kind=kCustom, calls=%fused_computation,
+        backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"dynamic_address_computation"}}}
+  })";
+
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_ref, hlo_opt, error_spec,
+                                      /*run_hlo_passes=*/false));
+}
+
+TEST_F(AddressComputationFusionTest, DynamicCustomCallSimple) {
+  XlaBuilder b(TestName());
+  CustomCall(
+      &b, "__xla_test$$memcpy",
+      /*operands=*/
+      {DynamicSlice(Parameter(&b, 0, ShapeUtil::MakeShape(S32, {4, 128}), "p0"),
+                    {Parameter(&b, 1, ShapeUtil::MakeShape(S32, {}), "start0"),
+                     Parameter(&b, 2, ShapeUtil::MakeShape(S32, {}), "start1")},
+                    {2, 128})},
+      ShapeUtil::MakeShape(F32, {2, 128}), /*opaque=*/"",
+      /*has_side_effect=*/false,
+      /*output_operand_aliasing=*/{}, /*literal=*/nullptr,
+      /*schedule=*/CustomCallSchedule::SCHEDULE_NONE,
+      /*api_version=*/CustomCallApiVersion::API_VERSION_TYPED_FFI);
+  ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
+
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, b.Build());
+  xla::HloModuleConfig hlo_config(
+      xla::ProgramShape(computation.proto().host_program_shape()),
+      /*ignore_layouts=*/false);
+  DebugOptions debug_options = GetDebugOptionsForTest();
+  debug_options.set_xla_gpu_enable_address_computation_fusion(false);
+  hlo_config.set_debug_options(debug_options);
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto(
+                                            computation.proto(), hlo_config));
+
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto(
+                                            computation.proto(), hlo_config));
+  AddressComputationFusionRewriter pass(PLATFORM);
+  TF_ASSERT_OK_AND_ASSIGN(auto changed, this->RunHloPass(&pass, hlo_opt.get()));
+  EXPECT_TRUE(changed);
+
+  EXPECT_TRUE(RunAndCompareTwoModules(std::move(hlo_ref), std::move(hlo_opt),
+                                      error_spec, /*run_hlo_passes=*/false));
+}
+
+TEST_F(AddressComputationFusionTest, DynamicCustomCallWithTuple) {
+  XlaBuilder b(TestName());
+  CustomCall(
+      &b, "__xla_test$$subbuffers", /*operands=*/
+      {
+          Tuple(&b,
+                {
+                    Broadcast(ConstantR0WithType(&b, F32, 1), {128}),
+                    Broadcast(ConstantR0WithType(&b, F32, 2), {256}),
+                }),
+          Tuple(&b,
+                {
+                    Broadcast(ConstantR0WithType(&b, F32, 3), {1024}),
+                    Broadcast(ConstantR0WithType(&b, F32, 4), {8}),
+                }),
+          Slice(Broadcast(ConstantR0WithType(&b, F32, 5), {8, 8}), {0, 0},
+                {4, 8}, {1, 1}),
+          Tuple(&b,
+                {
+                    Tuple(&b,
+                          {
+                              Broadcast(ConstantR0WithType(&b, F32, 6), {32}),
+                              Broadcast(ConstantR0WithType(&b, F32, 7), {64}),
+                          }),
+                    DynamicSlice(
+                        Parameter(&b, 0, ShapeUtil::MakeShape(S32, {4, 128}),
+                                  "p0"),
+                        {Parameter(&b, 1, ShapeUtil::MakeShape(S32, {}),
+                                   "start0"),
+                         Parameter(&b, 2, ShapeUtil::MakeShape(S32, {}),
+                                   "start1")},
+                        {3, 128}),
+                }),
+      },
+      ShapeUtil::MakeTupleShape({
+          ShapeUtil::MakeShape(F32, {8}),
+          ShapeUtil::MakeTupleShape({
+              ShapeUtil::MakeShape(F32, {128}),
+              ShapeUtil::MakeShape(F32, {256}),
+          }),
+          ShapeUtil::MakeShape(F32, {1024}),
+          ShapeUtil::MakeShape(F32, {4, 8}),
+          ShapeUtil::MakeShape(F32, {3, 128}),
+          ShapeUtil::MakeShape(F32, {32 + 64}),
+      }),
+      /*opaque=*/"",
+      /*has_side_effect=*/false,
+      /*output_operand_aliasing=*/{}, /*literal=*/nullptr,
+      /*schedule=*/CustomCallSchedule::SCHEDULE_NONE,
+      /*api_version=*/CustomCallApiVersion::API_VERSION_TYPED_FFI);
+  ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
+
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, b.Build());
+  xla::HloModuleConfig hlo_config(
+      xla::ProgramShape(computation.proto().host_program_shape()),
+      /*ignore_layouts=*/true);
+  DebugOptions debug_options = GetDebugOptionsForTest();
+  debug_options.set_xla_gpu_enable_address_computation_fusion(false);
+  hlo_config.set_debug_options(debug_options);
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto(
+                                            computation.proto(), hlo_config));
+
+  debug_options.set_xla_gpu_enable_address_computation_fusion(true);
+  hlo_config.set_debug_options(debug_options);
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto(
+                                            computation.proto(), hlo_config));
+
+  AddressComputationFusionRewriter pass(PLATFORM);
+  TF_ASSERT_OK_AND_ASSIGN(auto changed, this->RunHloPass(&pass, hlo_opt.get()));
+  EXPECT_TRUE(changed);
+
+  EXPECT_TRUE(RunAndCompareTwoModules(std::move(hlo_ref), std::move(hlo_opt),
+                                      error_spec, /*run_hlo_passes=*/false));
+}
+
+static absl::Status SubBuffers2(se::Stream* stream, ffi::BufferBase src0,
+                                ffi::BufferBase src1, ffi::BufferBase src2,
+                                ffi::BufferBase src3, ffi::BufferBase src4,
+                                ffi::BufferBase src5, ffi::BufferBase src6,
+                                ffi::BufferBase dst0, ffi::BufferBase dst1,
+                                ffi::BufferBase dst2, ffi::BufferBase dst3,
+                                ffi::BufferBase dst4, ffi::BufferBase dst5,
+                                ffi::BufferBase dst6) {
+  //  src0:  param 0 at tuple index {0}, shape f32[128]
+  //  src1:  param 0 at tuple index {1}, shape f32[256]
+  //  src2:  param 1 at tuple index {0}, shape f32[1024]
+  //  src3:  param 1 at tuple index {1}, shape f32[8]
+  //  src4:  param 2, shape f32[4,8]
+  //  src5:  param 3 at tuple index {0, 0}, shape f32[3,128]
+  //  src6:  param 3 at tuple index {0, 1}, shape f32[5,128]
+  //
+  //  dst0:  result at tuple index {0}, shape f32[8]
+  //  dst1:  result at tuple index {1, 0}, shape f32[128]
+  //  dst2:  result at tuple index {1, 1}, shape f32[256]
+  //  dst3:  result at tuple index {2}, shape f32[1024]
+  //  dst4:  result at tuple index {3}, shape f32[4,8]
+  //  dst5:  result at tuple index {4, 0}, shape f32[5,128]
+  //  dst6:  result at tuple index {4, 1}, shape f32[3,128]
+
+  TF_RETURN_IF_ERROR(
+      stream->MemcpyD2D(&dst0.data, src3.data, 8 * sizeof(float)));
+  TF_RETURN_IF_ERROR(
+      stream->MemcpyD2D(&dst1.data, src0.data, 128 * sizeof(float)));
+  TF_RETURN_IF_ERROR(
+      stream->MemcpyD2D(&dst2.data, src1.data, 256 * sizeof(float)));
+  TF_RETURN_IF_ERROR(
+      stream->MemcpyD2D(&dst3.data, src2.data, 1024 * sizeof(float)));
+  TF_RETURN_IF_ERROR(
+      stream->MemcpyD2D(&dst4.data, src4.data, 4 * 8 * sizeof(float)));
+  TF_RETURN_IF_ERROR(
+      stream->MemcpyD2D(&dst5.data, src6.data, 5 * 128 * sizeof(float)));
+  TF_RETURN_IF_ERROR(
+      stream->MemcpyD2D(&dst6.data, src5.data, 3 * 128 * sizeof(float)));
+  return absl::OkStatus();
+}
+
+XLA_FFI_DEFINE_HANDLER(kSubBuffers2, SubBuffers2,
+                       ffi::Ffi::Bind()
+                           .Ctx<ffi::Stream>()
+                           .Arg<ffi::BufferBase>()  // src0
+                           .Arg<ffi::BufferBase>()  // src1
+                           .Arg<ffi::BufferBase>()  // src2
+                           .Arg<ffi::BufferBase>()  // src3
+                           .Arg<ffi::BufferBase>()  // src4
+                           .Arg<ffi::BufferBase>()  // src5
+                           .Arg<ffi::BufferBase>()  // src6
+                           .Arg<ffi::BufferBase>()  // dst0
+                           .Arg<ffi::BufferBase>()  // dst1
+                           .Arg<ffi::BufferBase>()  // dst2
+                           .Arg<ffi::BufferBase>()  // dst3
+                           .Arg<ffi::BufferBase>()  // dst4
+                           .Arg<ffi::BufferBase>()  // dst5
+                           .Arg<ffi::BufferBase>()  // dst6
+);
+XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "__xla_test$$subbuffers2",
+                         PLATFORM, kSubBuffers2);
+
+TEST_F(AddressComputationFusionTest, CustomCallDUS) {
+  XlaBuilder b(TestName());
+  auto custom_call =
+      CustomCall(&b, "Callback_Memcpy",
+                 /*operands=*/
+                 {Slice(Broadcast(ConstantR0WithType(&b, F32, 42.0), {10, 128}),
+                        {2, 0}, {5, 128}, {1, 1})},
+                 ShapeUtil::MakeShape(F32, {3, 128}), /*opaque=*/"");
+
+  DynamicUpdateSlice(
+      Broadcast(ConstantR0WithType(&b, F32, 92.0), {10, 128}), custom_call,
+      {ConstantR0WithType(&b, S32, 4), ConstantR0WithType(&b, S32, 0)});
+
+  ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
+
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, b.Build());
+  xla::HloModuleConfig hlo_config(
+      xla::ProgramShape(computation.proto().host_program_shape()),
+      /*ignore_layouts=*/false);
+  DebugOptions debug_options = GetDebugOptionsForTest();
+  debug_options.set_xla_gpu_enable_address_computation_fusion(false);
+  hlo_config.set_debug_options(debug_options);
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto(
+                                            computation.proto(), hlo_config));
+
+  debug_options.set_xla_gpu_enable_address_computation_fusion(true);
+  hlo_config.set_debug_options(debug_options);
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto(
+                                            computation.proto(), hlo_config));
+
+  AddressComputationFusionRewriter pass(PLATFORM);
+  TF_ASSERT_OK_AND_ASSIGN(auto changed, this->RunHloPass(&pass, hlo_opt.get()));
+  EXPECT_TRUE(changed);
+
+  EXPECT_TRUE(RunAndCompareTwoModules(std::move(hlo_ref), std::move(hlo_opt),
+                                      error_spec,
+                                      /*run_hlo_passes=*/false));
+}
+
+TEST_F(AddressComputationFusionTest, CustomCallDUSTuple) {
+  XlaBuilder b(TestName());
+  auto big_buffer1 =
+      Parameter(&b, 0, ShapeUtil::MakeShape(F32, {10, 128}), "p0");
+  auto big_buffer2 =
+      Parameter(&b, 1, ShapeUtil::MakeShape(F32, {10, 256}), "p1");
+  auto custom_call = CustomCall(
+      &b, "__xla_test$$subbuffers2", /*operands=*/
+      {
+          Tuple(&b,
+                {
+                    Broadcast(ConstantR0WithType(&b, F32, 1), {128}),
+                    Broadcast(ConstantR0WithType(&b, F32, 2), {256}),
+                }),
+          Tuple(&b,
+                {
+                    Broadcast(ConstantR0WithType(&b, F32, 3), {1024}),
+                    Broadcast(ConstantR0WithType(&b, F32, 4), {8}),
+                }),
+          Slice(Broadcast(ConstantR0WithType(&b, F32, 5), {8, 8}), {0, 0},
+                {4, 8}, {1, 1}),
+          Tuple(
+              &b,
+              {
+                  Tuple(
+                      &b,
+                      {
+                          Broadcast(ConstantR0WithType(&b, F32, 6), {3, 128}),
+                          DynamicSlice(Broadcast(ConstantR0WithType(&b, F32, 7),
+                                                 {8, 128}),
+                                       {ConstantR0WithType(&b, S32, 2),
+                                        ConstantR0WithType(&b, S32, 0)},
+                                       {5, 128}),
+                      }),
+              }),
+      },
+      ShapeUtil::MakeTupleShape({
+          ShapeUtil::MakeShape(F32, {8}),
+          ShapeUtil::MakeTupleShape({
+              ShapeUtil::MakeShape(F32, {128}),
+              ShapeUtil::MakeShape(F32, {256}),
+          }),
+          ShapeUtil::MakeShape(F32, {1024}),
+          ShapeUtil::MakeShape(F32, {4, 8}),
+          ShapeUtil::MakeTupleShape({
+              ShapeUtil::MakeShape(F32, {5, 128}),
+              ShapeUtil::MakeShape(F32, {3, 128}),
+          }),
+      }),
+      /*opaque=*/"",
+      /*has_side_effect=*/false,
+      /*output_operand_aliasing=*/{}, /*literal=*/nullptr,
+      /*schedule=*/CustomCallSchedule::SCHEDULE_NONE,
+      /*api_version=*/CustomCallApiVersion::API_VERSION_TYPED_FFI);
+  auto tuple_gte = GetTupleElement(custom_call, 4);
+  auto dus1 = DynamicUpdateSlice(
+      big_buffer1, GetTupleElement(tuple_gte, 0),
+      {ConstantR0WithType(&b, S32, 2), ConstantR0WithType(&b, S32, 0)});
+  auto dus2 = DynamicUpdateSlice(
+      big_buffer1, GetTupleElement(tuple_gte, 1),
+      {ConstantR0WithType(&b, S32, 7), ConstantR0WithType(&b, S32, 0)});
+  auto dus3 = DynamicUpdateSlice(
+      big_buffer2,
+      xla::internal::XlaBuilderFriend::BuildBitcast(
+          &b, GetTupleElement(custom_call, 2),
+          ShapeUtil::MakeShape(F32, {4, 256})),
+      {Parameter(&b, 2, ShapeUtil::MakeShape(S32, {}), "start0"),
+       Parameter(&b, 3, ShapeUtil::MakeShape(S32, {}), "start1")});
+  Tuple(&b, {dus1, dus2, dus3});
+
+  ErrorSpec error_spec{/*aabs=*/1e-3, /*arel=*/1e-3};
+
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, b.Build());
+  xla::HloModuleConfig hlo_config(
+      xla::ProgramShape(computation.proto().host_program_shape()),
+      /*ignore_layouts=*/false);
+  DebugOptions debug_options = GetDebugOptionsForTest();
+  debug_options.set_xla_gpu_enable_address_computation_fusion(false);
+  hlo_config.set_debug_options(debug_options);
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_ref, xla::HloModule::CreateFromProto(
+                                            computation.proto(), hlo_config));
+
+  debug_options.set_xla_gpu_enable_address_computation_fusion(true);
+  hlo_config.set_debug_options(debug_options);
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_opt, xla::HloModule::CreateFromProto(
+                                            computation.proto(), hlo_config));
+
+  AddressComputationFusionRewriter pass(PLATFORM);
+  TF_ASSERT_OK_AND_ASSIGN(auto changed, this->RunHloPass(&pass, hlo_opt.get()));
+  EXPECT_TRUE(changed);
+
   EXPECT_TRUE(RunAndCompareTwoModules(std::move(hlo_ref), std::move(hlo_opt),
                                       error_spec,
                                       /*run_hlo_passes=*/false));
diff --git a/third_party/xla/xla/service/gpu/fusions/concatenate.cc b/third_party/xla/xla/service/gpu/fusions/concatenate.cc
index 084aece24b1c92..f0f2eee6e16f18 100644
--- a/third_party/xla/xla/service/gpu/fusions/concatenate.cc
+++ b/third_party/xla/xla/service/gpu/fusions/concatenate.cc
@@ -58,8 +58,16 @@ ConcatenateFusion::ConcatenateFusion(const HloFusionAnalysis& analysis)
     : analysis_(analysis) {}
 
 std::optional<IndexingMap> ConcatenateFusion::ComputeThreadIdToOutputIndexing(
-    int64_t output_id, mlir::MLIRContext* ctx) const {
-  return std::nullopt;  // TODO(b/319081342): Implement this.
+    int64_t root_index, mlir::MLIRContext* ctx) const {
+  return std::nullopt;
+}
+
+std::optional<IndexingMap> ConcatenateFusion::ComputeThreadIdToInputIndexing(
+    int64_t root_index, int64_t hero_operand_index,
+    mlir::MLIRContext* ctx) const {
+  return GetDefaultThreadIdIndexingMap(launch_dimensions(), /*unroll_factor=*/1,
+                                       GetLargestConcatOperandShape(analysis_),
+                                       ctx);
 }
 
 absl::Status ConcatenateFusion::EmitKernel(
diff --git a/third_party/xla/xla/service/gpu/fusions/concatenate.h b/third_party/xla/xla/service/gpu/fusions/concatenate.h
index 997033293eff2b..ec9349c9589410 100644
--- a/third_party/xla/xla/service/gpu/fusions/concatenate.h
+++ b/third_party/xla/xla/service/gpu/fusions/concatenate.h
@@ -38,14 +38,11 @@ class ConcatenateFusion : public KernelFusionEmitterBase {
   LaunchDimensions launch_dimensions() const override;
 
   std::optional<IndexingMap> ComputeThreadIdToOutputIndexing(
-      int64_t output_id, mlir::MLIRContext* ctx) const override;
+      int64_t root_index, mlir::MLIRContext* ctx) const override;
 
   std::optional<IndexingMap> ComputeThreadIdToInputIndexing(
       int64_t root_index, int64_t hero_operand_index,
-      mlir::MLIRContext* ctx) const override {
-    // TODO(b/319081342): Implement this.
-    return std::nullopt;
-  }
+      mlir::MLIRContext* ctx) const override;
 
  protected:
   absl::Status EmitKernel(IrEmitterContext& ir_emitter_context,
diff --git a/third_party/xla/xla/service/gpu/fusions/concatenate_mlir.cc b/third_party/xla/xla/service/gpu/fusions/concatenate_mlir.cc
index 638c8ec9436c25..e6091085c2e2c5 100644
--- a/third_party/xla/xla/service/gpu/fusions/concatenate_mlir.cc
+++ b/third_party/xla/xla/service/gpu/fusions/concatenate_mlir.cc
@@ -51,14 +51,6 @@ using llvm::SmallVector;
 using mlir::Value;
 using mlir::ValueRange;
 
-/*static*/ bool MlirConcatenateFusion::IsSupported(
-    const HloFusionAnalysis& analysis) {
-  if (analysis.fusion_roots().size() != 1) return false;
-
-  return mlir_converter::IsHloConversionSupported(
-      analysis.fusion(), analysis.device_info().gpu_compute_capability());
-}
-
 LaunchDimensions MlirConcatenateFusion::launch_dimensions() const {
   return CalculateLaunchDimensions(GetLargestConcatOperandShape(analysis_),
                                    analysis_.device_info());
@@ -74,9 +66,10 @@ std::optional<IndexingMap>
 MlirConcatenateFusion::ComputeThreadIdToInputIndexing(
     int64_t root_index, int64_t hero_operand_index,
     mlir::MLIRContext* ctx) const {
-  return GetDefaultThreadIdToOutputIndexingMap(
-      launch_dimensions(), /*unroll_factor=*/1,
-      GetLargestConcatOperandShape(analysis_), ctx);
+  // TODO(b/331356433): Add constraints depending on the `hero_operand_index`.
+  return GetDefaultThreadIdIndexingMap(launch_dimensions(), /*unroll_factor=*/1,
+                                       GetLargestConcatOperandShape(analysis_),
+                                       ctx);
 }
 
 std::vector<const HloInstruction*>
@@ -90,7 +83,6 @@ absl::Status MlirConcatenateFusion::EmitEntryFunction(
     const mlir_converter::CallTargetProvider& call_targets,
     mlir::func::FuncOp entry_function,
     const HloFusionInstruction& fusion) const {
-  CHECK(IsSupported(analysis_));
   const auto& root_computation = computations.FindPartitionedComputation(
       fusion.fused_instructions_computation());
   const auto* concat = analysis_.fusion_heroes()[0];
diff --git a/third_party/xla/xla/service/gpu/fusions/concatenate_mlir.h b/third_party/xla/xla/service/gpu/fusions/concatenate_mlir.h
index 5003046bf39e41..f14606a1073f47 100644
--- a/third_party/xla/xla/service/gpu/fusions/concatenate_mlir.h
+++ b/third_party/xla/xla/service/gpu/fusions/concatenate_mlir.h
@@ -37,8 +37,6 @@ class MlirConcatenateFusion : public MlirFusionEmitterBase {
   explicit MlirConcatenateFusion(const HloFusionAnalysis& analysis)
       : analysis_(analysis) {}
 
-  static bool IsSupported(const HloFusionAnalysis& analysis);
-
   LaunchDimensions launch_dimensions() const override;
 
   std::optional<IndexingMap> ComputeThreadIdToOutputIndexing(
diff --git a/third_party/xla/xla/service/gpu/fusions/concatenate_mlir_test.cc b/third_party/xla/xla/service/gpu/fusions/concatenate_mlir_test.cc
index 3b8040caad0d2e..515d385408018f 100644
--- a/third_party/xla/xla/service/gpu/fusions/concatenate_mlir_test.cc
+++ b/third_party/xla/xla/service/gpu/fusions/concatenate_mlir_test.cc
@@ -17,7 +17,9 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "xla/error_spec.h"
+#include "xla/service/gpu/fusions/concatenate.h"
 #include "xla/service/gpu/fusions/mlir_emitter_test_base.h"
+#include "xla/service/gpu/model/indexing_test_utils.h"
 #include "tsl/lib/core/status_test_util.h"
 
 namespace xla {
@@ -26,6 +28,59 @@ namespace {
 
 using MlirConcatenateFusionTest = MlirEmitterTestBase<MlirConcatenateFusion>;
 
+TEST_F(MlirConcatenateFusionTest, ThreadIdIndexing) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+    HloModule module
+
+    fused_computation {
+      param0 = f32[200] parameter(0)
+      param1 = f32[400] parameter(1)
+      param2 = f32[300] parameter(2)
+      ROOT concat = f32[900] concatenate(param0, param1, param2), dimensions={0}
+    }
+    ENTRY main {
+      param0 = f32[200] parameter(0)
+      param1 = f32[400] parameter(1)
+      param2 = f32[300] parameter(2)
+      ROOT fusion = f32[900] fusion(param0, param1, param2),
+        calls=fused_computation, kind=kLoop
+    }
+  )"));
+  thread_id_printer_.SetSymbolName(0, "chunk_id");
+  thread_id_printer_.SetSymbolName(1, "unroll_id");
+
+  auto* root = module->entry_computation()->root_instruction();
+  auto analysis = AnalyzeFusion(*root, device_info_);
+  MlirConcatenateFusion fusion(analysis);
+
+  constexpr auto kIndexing = R"(
+    (th_x, th_y, th_z, bl_x, bl_y, bl_z)[chunk_id, unroll_id] -> (
+    (th_x + bl_x * 128) mod 400)
+    domain:
+    th_x in [0, 127]
+    th_y in [0, 0]
+    th_z in [0, 0]
+    bl_x in [0, 3]
+    bl_y in [0, 0]
+    bl_z in [0, 0]
+    chunk_id in [0, 0]
+    unroll_id in [0, 0]
+    th_x + bl_x * 128 in [0, 399]
+  )";
+  auto thread_id_to_output_indexing_0 = fusion.ComputeThreadIdToInputIndexing(
+      /*root_index=*/0, /*hero_operand_index=*/0, &mlir_context_);
+  EXPECT_THAT(thread_id_to_output_indexing_0->ToString(thread_id_printer_),
+              MatchIndexingString(kIndexing));
+  auto thread_id_to_output_indexing_1 = fusion.ComputeThreadIdToInputIndexing(
+      /*root_index=*/0, /*hero_operand_index=*/1, &mlir_context_);
+  EXPECT_THAT(thread_id_to_output_indexing_1->ToString(thread_id_printer_),
+              MatchIndexingString(kIndexing));
+  auto thread_id_to_output_indexing_2 = fusion.ComputeThreadIdToInputIndexing(
+      /*root_index=*/0, /*hero_operand_index=*/2, &mlir_context_);
+  EXPECT_THAT(thread_id_to_output_indexing_2->ToString(thread_id_printer_),
+              MatchIndexingString(kIndexing));
+}
+
 TEST_F(MlirConcatenateFusionTest, StandAloneConcatenate) {
   auto kHloString = R"(
     HloModule module
diff --git a/third_party/xla/xla/service/gpu/fusions/concatenate_test.cc b/third_party/xla/xla/service/gpu/fusions/concatenate_test.cc
new file mode 100644
index 00000000000000..b617fd6513d107
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/fusions/concatenate_test.cc
@@ -0,0 +1,117 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "xla/service/gpu/fusions/concatenate.h"
+
+#include <optional>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "xla/service/gpu/fusions/fusions.h"
+#include "xla/service/gpu/gpu_device_info_for_tests.h"
+#include "xla/service/gpu/hlo_fusion_analysis.h"
+#include "xla/service/gpu/model/affine_map_printer.h"
+#include "xla/service/gpu/model/indexing_test_utils.h"
+#include "xla/stream_executor/device_description.h"
+#include "xla/tests/hlo_test_base.h"
+#include "tsl/platform/statusor.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class ConcatenateTest : public HloTestBase {
+ public:
+  void SetUp() override {
+    HloTestBase::SetUp();
+    printer_ =
+        AffineMapPrinter({"th_x", "th_y", "th_z", "bl_x", "bl_y", "bl_z"},
+                         {"chunk_id", "unroll_id"});
+  }
+
+ protected:
+  AffineMapPrinter printer_;
+  mlir::MLIRContext mlir_context_;
+};
+
+TEST_F(ConcatenateTest, ThreadIndexing) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+    HloModule module
+
+    fused_computation {
+      param0 = f32[200] parameter(0)
+      param1 = f32[400] parameter(1)
+      param2 = f32[300] parameter(2)
+      ROOT concat = f32[900] concatenate(param0, param1, param2), dimensions={0}
+    }
+    ENTRY main {
+      param0 = f32[200] parameter(0)
+      param1 = f32[400] parameter(1)
+      param2 = f32[300] parameter(2)
+      ROOT fusion = f32[900] fusion(param0, param1, param2),
+        calls=fused_computation, kind=kLoop
+    }
+  )")
+                    .value();
+
+  stream_executor::DeviceDescription device_info =
+      TestGpuDeviceInfo::RTXA6000DeviceInfo();
+
+  auto* root = module->entry_computation()->root_instruction();
+  auto analysis_fused = AnalyzeFusion(*root, device_info);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto emitter,
+      GetFusionEmitter(PreBufferAssignmentFusionInfo{analysis_fused}));
+  auto fusion = dynamic_cast<ConcatenateFusion*>(emitter.get());
+  ASSERT_NE(fusion, nullptr);
+
+  constexpr auto kIndexing = R"(
+    (th_x, th_y, th_z, bl_x, bl_y, bl_z)[chunk_id, unroll_id] -> (
+    (th_x + bl_x * 128) mod 400)
+    domain:
+    th_x in [0, 127]
+    th_y in [0, 0]
+    th_z in [0, 0]
+    bl_x in [0, 3]
+    bl_y in [0, 0]
+    bl_z in [0, 0]
+    chunk_id in [0, 0]
+    unroll_id in [0, 0]
+    th_x + bl_x * 128 in [0, 399]
+  )";
+  EXPECT_THAT(
+      fusion
+          ->ComputeThreadIdToInputIndexing(
+              /*root_index=*/0, /*hero_operand_index=*/0, &mlir_context_)
+          ->ToString(printer_),
+      MatchIndexingString(kIndexing));
+  EXPECT_THAT(
+      fusion
+          ->ComputeThreadIdToInputIndexing(
+              /*root_index=*/0, /*hero_operand_index=*/1, &mlir_context_)
+          ->ToString(printer_),
+      MatchIndexingString(kIndexing));
+  EXPECT_THAT(
+      fusion
+          ->ComputeThreadIdToInputIndexing(
+              /*root_index=*/0, /*hero_operand_index=*/2, &mlir_context_)
+          ->ToString(printer_),
+      MatchIndexingString(kIndexing));
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/fusions/copy.cc b/third_party/xla/xla/service/gpu/fusions/copy.cc
index eaa7a515cf8e8f..37cb5ab0dba0e0 100644
--- a/third_party/xla/xla/service/gpu/fusions/copy.cc
+++ b/third_party/xla/xla/service/gpu/fusions/copy.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "xla/service/gpu/fusions/fusion_emitter.h"
 #include "xla/service/gpu/ir_emitter_context.h"
 #include "xla/service/gpu/runtime/copy_thunk.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/statusor.h"
 
 namespace xla {
diff --git a/third_party/xla/xla/service/gpu/fusions/cudnn.cc b/third_party/xla/xla/service/gpu/fusions/cudnn.cc
index 010bba895d1172..60f8af52222eb6 100644
--- a/third_party/xla/xla/service/gpu/fusions/cudnn.cc
+++ b/third_party/xla/xla/service/gpu/fusions/cudnn.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "xla/service/gpu/fusions/cudnn.h"
 
-#include "absl/strings/escaping.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #if GOOGLE_CUDA
 #include "xla/service/gpu/runtime/cudnn_thunk.h"
@@ -30,30 +29,13 @@ absl::StatusOr<FusionEmissionResult> CuDnnFusion::Emit(
 #if GOOGLE_CUDA
   VLOG(3) << fusion.ToString();
 
-  TF_ASSIGN_OR_RETURN(auto gpu_config,
-                      fusion.backend_config<GpuBackendConfig>());
-  if (!gpu_config.fusion_backend_config().has_cudnn_fusion_config() ||
-      !gpu_config.fusion_backend_config()
-           .cudnn_fusion_config()
-           .has_serialized_graph()) {
-    return absl::FailedPreconditionError("cuDNN fusion is not compiled.");
-  }
-  std::string unescaped;
-  std::string error;
-  if (!absl::CUnescape(gpu_config.fusion_backend_config()
-                           .cudnn_fusion_config()
-                           .serialized_graph(),
-                       &unescaped, &error)) {
-    return absl::UnknownError(
-        absl::StrCat("Failed unescaping string: ", error));
-  }
-
   TF_ASSIGN_OR_RETURN(
       auto kernel_arguments,
       KernelArguments::Create(ir_emitter_context.buffer_assignment(), &fusion));
   FusionEmissionResult result;
   result.thunks.emplace_back(std::make_unique<CuDnnThunk>(
-      std::move(unescaped), Thunk::ThunkInfo::WithProfileAnnotation(&fusion),
+      GetComputationFingerprint(fusion.fused_instructions_computation(), {}),
+      Thunk::ThunkInfo::WithProfileAnnotation(&fusion),
       kernel_arguments.args()));
   return result;
 #else
diff --git a/third_party/xla/xla/service/gpu/fusions/cudnn_test.cc b/third_party/xla/xla/service/gpu/fusions/cudnn_test.cc
index 2d69800a63b69a..acd36195bed160 100644
--- a/third_party/xla/xla/service/gpu/fusions/cudnn_test.cc
+++ b/third_party/xla/xla/service/gpu/fusions/cudnn_test.cc
@@ -13,9 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <memory>
+
 #include <gtest/gtest.h>
+#include "absl/status/statusor.h"
+#include "absl/strings/str_replace.h"
+#include "absl/strings/substitute.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/service/executable.h"
 #include "xla/service/gpu/stream_executor_util.h"
 #include "xla/service/gpu/tests/gpu_codegen_test.h"
+#include "xla/tests/filecheck.h"
+#include "tsl/lib/core/status_test_util.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla {
 namespace gpu {
@@ -28,6 +38,7 @@ class CuDnnFusionTest : public GpuCodegenTest {
     // Let this group of tests just use first available plan skipping
     // autotuning.
     debug_options.set_xla_gpu_autotune_level(0);
+    debug_options.set_xla_gpu_cudnn_gemm_fusion_level(1);
     return debug_options;
   }
   bool IsAtLeastHopperWithCuDnn9() {
@@ -37,6 +48,12 @@ class CuDnnFusionTest : public GpuCodegenTest {
                .IsAtLeastHopper() &&
            GetDnnVersionInfo(executor).major_version() >= 9;
   }
+  bool IsAtLeastCuDnn91() {
+    se::StreamExecutor* executor = backend().default_stream_executor();
+    const se::dnn::VersionInfo version = GetDnnVersionInfo(executor);
+    return (version.major_version() == 9 && version.minor_version() >= 1) ||
+           version.major_version() > 9;
+  }
 
  protected:
   void SetUp() override {
@@ -250,47 +267,440 @@ ENTRY e {
                             ErrorSpec{/*aabs=*/1e-5, /*arel=*/1e-5}));
 }
 
-TEST_F(CuDnnFusionExecutionTest, CommandBuffersAreSupported) {
+class CuDnnFusionCommandBufferTest : public CuDnnFusionTest {
+ public:
+  DebugOptions GetDebugOptionsForTest() override {
+    DebugOptions debug_options = CuDnnFusionTest::GetDebugOptionsForTest();
+    debug_options.set_xla_gpu_graph_min_graph_size(1);
+    return debug_options;
+  }
+};
+
+TEST_F(CuDnnFusionCommandBufferTest, CommandBuffersAreSupported) {
   const std::string kHloText = R"(
-HloModule m
+fd0 {
+  p0 = f32[64,64]{1,0} parameter(0)
+  p1 = f32[64,64]{1,0} parameter(1)
+  ROOT d = f32[64,64]{1,0} dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
 
-%fusion0 {
-  %p0 = f32[64,64]{1,0} parameter(0)
-  %p1 = f32[64,64]{1,0} parameter(1)
-  ROOT %d = f32[64,64]{1,0} dot(%p0, %p1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+fd1 {
+  p0 = f32[64,64]{1,0} parameter(0)
+  p1 = f32[64,64]{1,0} parameter(1)
+  ROOT d = f32[64,64]{1,0} dot(p0, p1), lhs_contracting_dims={0}, rhs_contracting_dims={1}
 }
 
-%fusion_a {
-  %p0.2 = f32[64,64]{1,0} parameter(0)
-  %p1.2 = f32[64,64]{1,0} parameter(1)
-  ROOT %a = f32[64,64]{1,0} add(%p0.2, %p1.2)
+ENTRY e {
+  p0 = f32[64,64]{1,0} parameter(0)
+  p1 = f32[64,64]{1,0} parameter(1)
+  d0 = f32[64,64]{1,0} fusion(p0, p1), kind=kCustom, calls=fd0,
+    backend_config={"fusion_backend_config":{"kind":"__cudnn$fusion","cudnn_fusion_config":{"plan_id":"0"}}}
+  a = f32[64,64]{1,0} add(d0, d0)
+  ROOT d1 = f32[64,64]{1,0} fusion(a, d0), kind=kCustom, calls=fd1,
+    backend_config={"fusion_backend_config":{"kind":"__cudnn$fusion","cudnn_fusion_config":{"plan_id":"0"}}}
+})";
+
+  // Verify that a command buffer is applied.
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<Executable> executable,
+      backend().compiler()->RunBackend(
+          GetOptimizedModule(kHloText).value(),
+          backend().default_stream_executor(),
+          backend().default_stream_executor()->GetAllocator()));
+  absl::StatusOr<bool> filecheck_result =
+      RunFileCheck(executable->module().ToString(), R"(
+; CHECK: ENTRY
+; CHECK-NEXT: parameter
+; CHECK-NEXT: parameter
+; CHECK-NEXT: ROOT
+; CHECK-SAME: command_buffer
+)");
+  TF_ASSERT_OK(filecheck_result.status());
+  EXPECT_TRUE(filecheck_result.value());
+
+  // Verify that the command buffer executes correctly.
+  EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
 }
 
-%fusion1 {
-  %p0.1 = f32[64,64]{1,0} parameter(0)
-  %p1.1 = f32[64,64]{1,0} parameter(1)
-  ROOT %d.1 = f32[64,64]{1,0} dot(%p0.1, %p1.1), lhs_contracting_dims={0}, rhs_contracting_dims={1}
+class CuDnnFusionLevel2Test : public CuDnnFusionExecutionTest {
+ public:
+  DebugOptions GetDebugOptionsForTest() override {
+    DebugOptions debug_options =
+        CuDnnFusionExecutionTest::GetDebugOptionsForTest();
+    debug_options.set_xla_gpu_cudnn_gemm_fusion_level(2);
+    return debug_options;
+  }
+};
+
+TEST_F(CuDnnFusionLevel2Test, BroadcastToDim2ExecutesCorrectly) {
+  EXPECT_TRUE(RunAndCompare(R"(
+fusion1 {
+  p0 = f16[16,32,128] parameter(0)
+  p1 = f16[16,128,64] parameter(1)
+  p2 = f16[16,32] parameter(2)
+  p2b = f16[16,32,128] broadcast(p2), dimensions={0,1}
+  a = f16[16,32,128] add(p0, p2b)
+  ROOT r = f16[16,32,64] dot(a, p1),
+    lhs_batch_dims={0}, rhs_batch_dims={0},
+    lhs_contracting_dims={2}, rhs_contracting_dims={1}
 }
 
-%command_buffer {
-  %p0.4 = f32[64,64]{1,0} parameter(0)
-  %p1.4 = f32[64,64]{1,0} parameter(1)
-  %d0.1 = f32[64,64]{1,0} fusion(%p0.4, %p1.4), kind=kCustom, calls=%fusion0,
-    backend_config={"fusion_backend_config":{"kind":"__cudnn$fusion","cudnn_fusion_config":{"plan_id":"0"}}}
-  %a.2 = f32[64,64]{1,0} fusion(%d0.1, %d0.1), kind=kLoop, calls=%fusion_a
-  ROOT %d1.1 = f32[64,64]{1,0} fusion(%a.2, %p1.4), kind=kCustom, calls=%fusion1,
-    backend_config={"fusion_backend_config":{"kind":"__cudnn$fusion","cudnn_fusion_config":{"plan_id":"0"}}}
+ENTRY e {
+  p0 = f16[16,32,128] parameter(0)
+  p1 = f16[16,128,64] parameter(1)
+  p2 = f16[16,32] parameter(2)
+  ROOT _ = f16[16,32,64] fusion(p0, p1, p2), kind=kCustom, calls=fusion1,
+    backend_config={"fusion_backend_config": {kind: "__cudnn$fusion"}}
+})",
+                            ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
+}
+
+TEST_F(CuDnnFusionLevel2Test, BroadcastToDim1ExecutesCorrectly) {
+  EXPECT_TRUE(RunAndCompare(R"(
+fusion1 {
+  p0 = f16[16,32,128] parameter(0)
+  p1 = f16[16,128,64] parameter(1)
+  p2 = f16[16,128] parameter(2)
+  p2b = f16[16,32,128] broadcast(p2), dimensions={0,2}
+  a = f16[16,32,128] add(p0, p2b)
+  ROOT r = f16[16,32,64] dot(a, p1),
+    lhs_batch_dims={0}, rhs_batch_dims={0},
+    lhs_contracting_dims={2}, rhs_contracting_dims={1}
+}
+
+ENTRY e {
+  p0 = f16[16,32,128] parameter(0)
+  p1 = f16[16,128,64] parameter(1)
+  p2 = f16[16,128] parameter(2)
+  ROOT _ = f16[16,32,64] fusion(p0, p1, p2), kind=kCustom, calls=fusion1,
+    backend_config={"fusion_backend_config": {kind: "__cudnn$fusion"}}
+})",
+                            ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
+}
+
+TEST_F(CuDnnFusionLevel2Test, BroadcastToDim0ExecutesCorrectly) {
+  EXPECT_TRUE(RunAndCompare(R"(
+fusion1 {
+  p0 = bf16[32,128] parameter(0)
+  p0b = bf16[5,32,128] broadcast(p0), dimensions={1,2}
+  p1 = bf16[5,128,64] parameter(1)
+  ROOT r = f32[5,32,64] dot(p0b, p1),
+    lhs_batch_dims={0}, rhs_batch_dims={0},
+    lhs_contracting_dims={2}, rhs_contracting_dims={1}
+}
+
+ENTRY e {
+  p0 = bf16[32,128] parameter(0)
+  p1 = bf16[5,128,64] parameter(1)
+  ROOT _ = f32[5,32,64] fusion(p0, p1), kind=kCustom, calls=fusion1,
+    backend_config={"fusion_backend_config": {kind: "__cudnn$fusion"}}
+})",
+                            ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
+}
+
+TEST_F(CuDnnFusionLevel2Test, BroadcastTo2DimsExecutesCorrectly) {
+  EXPECT_TRUE(RunAndCompare(R"(
+fusion1 {
+  p0 = f16[16,32,128] parameter(0)
+  p1 = f16[16,128,64] parameter(1)
+  p2 = f16[128] parameter(2)
+  p2b = f16[16,32,128] broadcast(p2), dimensions={2}
+  a = f16[16,32,128] add(p0, p2b)
+  ROOT r = f16[16,32,64] dot(a, p1),
+    lhs_batch_dims={0}, rhs_batch_dims={0},
+    lhs_contracting_dims={2}, rhs_contracting_dims={1}
+}
+
+ENTRY e {
+  p0 = f16[16,32,128] parameter(0)
+  p1 = f16[16,128,64] parameter(1)
+  p2 = f16[128] parameter(2)
+  ROOT _ = f16[16,32,64] fusion(p0, p1, p2), kind=kCustom, calls=fusion1,
+    backend_config={"fusion_backend_config": {kind: "__cudnn$fusion"}}
+})",
+                            ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
 }
 
-ENTRY %e {
-  %p0.3 = f32[64,64]{1,0} parameter(0)
-  %p1.3 = f32[64,64]{1,0} parameter(1)
-  ROOT %call = f32[64,64]{1,0} call(%p0.3, %p1.3), to_apply=%command_buffer
+TEST_F(CuDnnFusionLevel2Test, BroadcastTo3DimsExecutesCorrectly) {
+  EXPECT_TRUE(RunAndCompare(R"(
+fusion1 {
+  p0 = f16[16,32,128] parameter(0)
+  p1 = f16[16,128,64] parameter(1)
+  p2 = f16[] parameter(2)
+  p2b = f16[16,32,128] broadcast(p2), dimensions={}
+  a = f16[16,32,128] add(p0, p2b)
+  ROOT r = f16[16,32,64] dot(a, p1),
+    lhs_batch_dims={0}, rhs_batch_dims={0},
+    lhs_contracting_dims={2}, rhs_contracting_dims={1}
+}
+
+ENTRY e {
+  p0 = f16[16,32,128] parameter(0)
+  p1 = f16[16,128,64] parameter(1)
+  p2 = f16[] parameter(2)
+  ROOT _ = f16[16,32,64] fusion(p0, p1, p2), kind=kCustom, calls=fusion1,
+    backend_config={"fusion_backend_config": {kind: "__cudnn$fusion"}}
+})",
+                            ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
+}
+
+class CuDnnFusionLevel3Test : public CuDnnFusionExecutionTest {
+ public:
+  DebugOptions GetDebugOptionsForTest() override {
+    DebugOptions debug_options =
+        CuDnnFusionExecutionTest::GetDebugOptionsForTest();
+    debug_options.set_xla_gpu_cudnn_gemm_fusion_level(3);
+    return debug_options;
+  }
+};
+
+TEST_F(CuDnnFusionLevel3Test,
+       DotWithSplitNonContractingInputExecutesCorrectly) {
+  EXPECT_TRUE(RunAndCompare(R"(
+fusion1 {
+  p0 = s8[4,3,16,400]{2,1,3,0} parameter(0)
+  cp0 = s8[4,3,16,400]{3,2,1,0} copy(p0)
+  bc0 = s8[192,400]{1,0} bitcast(cp0)
+  cvt0 = bf16[192,400]{1,0} convert(bc0)
+  p1 = bf16[1,128,400]{2,1,0} parameter(1)
+  bc1 = bf16[128,400]{1,0} reshape(p1)
+  ROOT d = bf16[192,128]{1,0} dot(cvt0, bc1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+}
+
+ENTRY r {
+  p0 = s8[4,3,16,400]{2,1,3,0} parameter(0)
+  p1 = bf16[1,128,400]{2,1,0} parameter(1)
+  ROOT r = bf16[192,128]{1,0} fusion(p0, p1), kind=kCustom, calls=fusion1,
+    backend_config={"fusion_backend_config": {kind: "__cudnn$fusion"}}
+})",
+                            ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
+}
+
+TEST_F(CuDnnFusionLevel3Test,
+       DotWithSplitNonContractingInOutExecutesCorrectly) {
+  EXPECT_TRUE(RunAndCompare(R"(
+fusion1 {
+  p0 = s8[4,3,16,400]{2,1,3,0} parameter(0)
+  cp0 = s8[4,3,16,400]{3,2,1,0} copy(p0)
+  bc0 = s8[192,400]{1,0} bitcast(cp0)
+  cvt0 = bf16[192,400]{1,0} convert(bc0)
+  p1 = bf16[1,128,400]{2,1,0} parameter(1)
+  bc1 = bf16[128,400]{1,0} reshape(p1)
+  d = bf16[192,128]{1,0} dot(cvt0, bc1), lhs_contracting_dims={1}, rhs_contracting_dims={1}
+  bc = bf16[4,3,16,128]{3,2,1,0} bitcast(d)
+  ROOT cp = bf16[4,3,16,128]{2,1,3,0} copy(bc)
+}
+
+ENTRY r {
+  p0 = s8[4,3,16,400]{2,1,3,0} parameter(0)
+  p1 = bf16[1,128,400]{2,1,0} parameter(1)
+  ROOT r = bf16[4,3,16,128]{2,1,3,0} fusion(p0, p1), kind=kCustom, calls=fusion1,
+    backend_config={"fusion_backend_config": {kind: "__cudnn$fusion"}}
+})",
+                            ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
+}
+
+class ElementwiseTest : public CuDnnFusionExecutionTest,
+                        public ::testing::WithParamInterface<
+                            std::tuple<PrimitiveType, HloOpcode, float>> {};
+
+std::string ElementwiseTestParamsToString(
+    const ::testing::TestParamInfo<std::tuple<PrimitiveType, HloOpcode, float>>&
+        data) {
+  PrimitiveType data_type;
+  HloOpcode opcode;
+  float tolerance;
+  std::tie(data_type, opcode, tolerance) = data.param;
+  return absl::StrCat(
+      primitive_util::LowercasePrimitiveTypeName(data_type), "_",
+      absl::StrReplaceAll(HloOpcodeString(opcode), {{"-", "_"}}));
+}
+
+using UnaryElementwiseTest = ElementwiseTest;
+
+TEST_P(UnaryElementwiseTest, ElementwiseFusionExecutesCorrectly) {
+  PrimitiveType data_type;
+  HloOpcode opcode;
+  float tolerance;
+  std::tie(data_type, opcode, tolerance) = GetParam();
+
+  const std::string kHloTemplate = R"(
+fusion_computation {
+  p0 = f32[32,32] parameter(0)
+  p1 = $0[32,32] parameter(1)
+  f1.1 = $0[32,32] $1(p1)
+  c.1 = f32[32,32] convert(f1.1)
+  ROOT _ = f32[32,32] dot(p0, c.1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY e {
+  p1 = $0[32,32] parameter(1)
+  p0 = f32[32,32] parameter(0)
+  ROOT r = f32[32,32] fusion(p0, p1), kind=kCustom,
+    calls=fusion_computation,
+    backend_config={"fusion_backend_config":{"kind":"__cudnn$$fusion"}}
 })";
+  const std::string hlo_test = absl::Substitute(
+      kHloTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
+      HloOpcodeString(opcode));
 
-  EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
+  EXPECT_TRUE(RunAndCompare(hlo_test,
+                            ErrorSpec{/*aabs=*/tolerance, /*arel=*/tolerance}));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    ElementwiseTestSuiteF32, UnaryElementwiseTest,
+    ::testing::Combine(::testing::Values(F32),
+                       ::testing::ValuesIn({HloOpcode::kAbs, HloOpcode::kCos,
+                                            HloOpcode::kExp, HloOpcode::kLog,
+                                            HloOpcode::kNegate,
+                                            HloOpcode::kRsqrt, HloOpcode::kSin,
+                                            HloOpcode::kSqrt, HloOpcode::kTan,
+                                            HloOpcode::kTanh}),
+                       ::testing::Values(5e-4)),
+    ElementwiseTestParamsToString);
+
+using BinaryElementwiseTest = ElementwiseTest;
+
+TEST_P(BinaryElementwiseTest, ElementwiseFusionExecutesCorrectly) {
+  PrimitiveType data_type;
+  HloOpcode opcode;
+  float tolerance;
+  std::tie(data_type, opcode, tolerance) = GetParam();
+
+  const std::string kHloTemplate = R"(
+fusion_computation {
+  p0 = f32[32,32] parameter(0)
+  p1 = $0[32,32] parameter(1)
+  p2 = $0[32,32] parameter(2)
+  f1.1 = $0[32,32] $1(p1, p2)
+  c.1 = f32[32,32] convert(f1.1)
+  ROOT _ = f32[32,32] dot(p0, c.1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  }
+
+ENTRY e {
+  p0 = f32[32,32] parameter(0)
+  p1 = $0[32,32] parameter(1)
+  p2 = $0[32,32] parameter(2)
+  ROOT r = f32[32,32] fusion(p0, p1, p2), kind=kCustom,
+    calls=fusion_computation,
+    backend_config={"fusion_backend_config":{"kind":"__cudnn$$fusion"}}
+})";
+  const std::string hlo_test = absl::Substitute(
+      kHloTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
+      HloOpcodeString(opcode));
+
+  EXPECT_TRUE(RunAndCompare(hlo_test,
+                            ErrorSpec{/*aabs=*/tolerance, /*arel=*/tolerance}));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    ElementwiseTestSuiteF32, BinaryElementwiseTest,
+    ::testing::Combine(
+        ::testing::Values(F32),
+        ::testing::ValuesIn({HloOpcode::kAdd, HloOpcode::kDivide,
+                             HloOpcode::kMaximum, HloOpcode::kMinimum,
+                             HloOpcode::kMultiply, HloOpcode::kPower,
+                             HloOpcode::kSubtract}),
+        ::testing::Values(3e-3)),
+    ElementwiseTestParamsToString);
+
+class CompareTest : public CuDnnFusionExecutionTest,
+                    public ::testing::WithParamInterface<
+                        std::tuple<PrimitiveType, Comparison::Direction>> {};
+
+std::string CompareTestParamsToString(
+    const ::testing::TestParamInfo<
+        std::tuple<PrimitiveType, Comparison::Direction>>& data) {
+  PrimitiveType data_type;
+  Comparison::Direction direction;
+  std::tie(data_type, direction) = data.param;
+  return absl::StrCat(primitive_util::LowercasePrimitiveTypeName(data_type),
+                      "_", ComparisonDirectionToString(direction));
+}
+
+TEST_P(CompareTest, FusedComparisonExecutesCorrectly) {
+  PrimitiveType data_type;
+  Comparison::Direction direction;
+  std::tie(data_type, direction) = GetParam();
+
+  const std::string kHloTemplate = R"(
+fusion_computation {
+  p0 = f32[32,32] parameter(0)
+  p1 = $0[32,32] parameter(1)
+  p2 = $0[32,32] parameter(2)
+  f1.1 = pred[32,32] compare(p1, p2), direction=$1
+  c.1 = f32[32,32] convert(f1.1)
+  ROOT _ = f32[32,32] dot(p0, c.1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  }
+
+ENTRY e {
+  p0 = f32[32,32] parameter(0)
+  p1 = $0[32,32] parameter(1)
+  p2 = $0[32,32] parameter(2)
+  ROOT r = f32[32,32] fusion(p0, p1, p2), kind=kCustom,
+    calls=fusion_computation,
+    backend_config={"fusion_backend_config":{"kind":"__cudnn$$fusion"}}
+})";
+  const std::string hlo_test = absl::Substitute(
+      kHloTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
+      ComparisonDirectionToString(direction));
+
+  EXPECT_TRUE(RunAndCompare(hlo_test, ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
 }
 
+using cd = Comparison::Direction;
+
+INSTANTIATE_TEST_SUITE_P(
+    CompareTestSuite, CompareTest,
+    ::testing::Combine(::testing::Values(PRED, S8, S32, F16, F32),
+                       ::testing::Values(cd::kEq, cd::kNe, cd::kGe, cd::kGt,
+                                         cd::kLe, cd::kLt)),
+    CompareTestParamsToString);
+
+class SelectTest : public CuDnnFusionExecutionTest,
+                   public ::testing::WithParamInterface<PrimitiveType> {};
+
+TEST_P(SelectTest, SelectFusionExecutesCorrectly) {
+  if (!IsAtLeastCuDnn91()) {
+    GTEST_SKIP() << "Select operation requires cuDNN 9.1+.";
+  }
+  const std::string kHloTemplate = R"(
+fusion_computation {
+  p0 = f32[32,32] parameter(0)
+  p1 = $0[32,32] parameter(1)
+  p2 = $0[32,32] parameter(2)
+  p3 = pred[32,32] parameter(3)
+  s = $0[32,32] select(p3, p1, p2)
+  c = f32[32,32] convert(s)
+  ROOT r = f32[32,32] dot(p0, c),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY e {
+  p0 = f32[32,32] parameter(0)
+  p1 = $0[32,32] parameter(1)
+  p2 = $0[32,32] parameter(2)
+  p3 = pred[32,32] parameter(3)
+  ROOT r = f32[32,32] fusion(p0, p1, p2, p3), kind=kCustom,
+    calls=fusion_computation,
+    backend_config={"fusion_backend_config":{"kind":"__cudnn$$fusion"}}
+})";
+  const std::string hlo_test = absl::Substitute(
+      kHloTemplate, primitive_util::LowercasePrimitiveTypeName(GetParam()));
+
+  EXPECT_TRUE(RunAndCompare(hlo_test, ErrorSpec{/*aabs=*/1e-4, /*arel=*/1e-4}));
+}
+
+constexpr std::array<PrimitiveType, 3> kSupportedDataTypes{F16, F32, BF16};
+
+INSTANTIATE_TEST_SUITE_P(SelectTestSuite, SelectTest,
+                         ::testing::ValuesIn(kSupportedDataTypes));
+
 class CuDnnFusionRewriteTest : public CuDnnFusionTest {
  public:
   DebugOptions GetDebugOptionsForTest() override {
diff --git a/third_party/xla/xla/service/gpu/fusions/custom.cc b/third_party/xla/xla/service/gpu/fusions/custom.cc
index fa910bc2589cf6..99b1e9a0918669 100644
--- a/third_party/xla/xla/service/gpu/fusions/custom.cc
+++ b/third_party/xla/xla/service/gpu/fusions/custom.cc
@@ -24,9 +24,11 @@ limitations under the License.
 #include <variant>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
+#include "absl/types/span.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "mlir/AsmParser/AsmParser.h"  // from @llvm-project
@@ -53,10 +55,11 @@ limitations under the License.
 #include "xla/service/gpu/kernels/custom_kernel.h"
 #include "xla/service/gpu/kernels/custom_kernel_fusion.h"
 #include "xla/service/gpu/matmul_utils.h"
+#include "xla/service/gpu/runtime/address_computation_thunk.h"
 #include "xla/service/gpu/runtime/custom_call_thunk.h"
 #include "xla/service/gpu/runtime/gemm_thunk.h"
 #include "xla/service/gpu/runtime/kernel_thunk.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/status.h"
@@ -68,6 +71,9 @@ namespace xla {
 namespace gpu {
 namespace {
 
+constexpr unsigned kGEMMOutputBufferIndex = 0;
+constexpr unsigned kGEMMWorkspaceBufferIndex = 1;
+
 absl::StatusOr<std::unique_ptr<Thunk>> BuildCustomKernelThunkForFusion(
     IrEmitterContext& ir_emitter_context, const HloFusionInstruction& fusion,
     CustomKernel custom_kernel) {
@@ -79,109 +85,166 @@ absl::StatusOr<std::unique_ptr<Thunk>> BuildCustomKernelThunkForFusion(
       &fusion, std::move(custom_kernel), std::move(kernel_arguments.args()));
 }
 
-// TODO(vuson): this is duplicated from ir_emitter_unnested.cc
-// Converts MLIR dictionary attribute attached to a custom call operation to a
-// custom call thunk attributes that are forwarded to the FFI handler.
-static absl::StatusOr<CustomCallThunk::AttributesMap> BuildAttributesMap(
-    mlir::DictionaryAttr dict) {
-  CustomCallThunk::AttributesMap attributes;
-  for (auto& kv : dict) {
-    std::string_view name = kv.getName().strref();
-
-    auto integer = [&](mlir::IntegerAttr integer) {
-      switch (integer.getType().getIntOrFloatBitWidth()) {
-        case 32:
-          attributes[name] = static_cast<int32_t>(integer.getInt());
-          return absl::OkStatus();
-        case 64:
-          attributes[name] = static_cast<int64_t>(integer.getInt());
-          return absl::OkStatus();
-        default:
-          return absl::InvalidArgumentError(absl::StrCat(
-              "Unsupported integer attribute bit width for attribute: ", name));
-      }
-    };
+absl::StatusOr<BufferAllocation::Slice> GetOperandSlice(
+    const BufferAssignment& buffer_assignment, const HloFusionAdaptor& adaptor,
+    const HloInstruction& fusion_instr, const HloInstruction& start_instr,
+    std::vector<HloInstruction*>& slice_instrs, const ShapeIndex& shape_idx,
+    unsigned arg_idx) {
+  if (const auto* param = DynCast<HloParameterInstruction>(&start_instr)) {
+    return GetAllocationSlice(buffer_assignment,
+                              fusion_instr.operand(param->parameter_number()),
+                              shape_idx);
+  }
 
-    auto fp = [&](mlir::FloatAttr fp) {
-      switch (fp.getType().getIntOrFloatBitWidth()) {
-        case 32:
-          attributes[name] = static_cast<float>(fp.getValue().convertToFloat());
-          return absl::OkStatus();
-        default:
-          return absl::InvalidArgumentError(absl::StrCat(
-              "Unsupported float attribute bit width for attribute: ", name));
-      }
-    };
-
-    auto str = [&](mlir::StringAttr str) {
-      attributes[name] = str.getValue().str();
-      return absl::OkStatus();
-    };
-
-    TF_RETURN_IF_ERROR(
-        llvm::TypeSwitch<mlir::Attribute, Status>(kv.getValue())
-            .Case<mlir::IntegerAttr>(integer)
-            .Case<mlir::FloatAttr>(fp)
-            .Case<mlir::StringAttr>(str)
-            .Default([&](mlir::Attribute) {
-              return absl::InvalidArgumentError(absl::StrCat(
-                  "Unsupported attribute type for attribute: ", name));
-            }));
+  // Walk through ShapeIndex to find the real starting point.
+  auto* start = const_cast<HloInstruction*>(&start_instr);
+  for (auto idx : shape_idx) {
+    CHECK(start->shape().IsTuple());
+    start = const_cast<HloInstruction*>(start->operand(idx));
   }
-  return attributes;
-}
 
-absl::StatusOr<BufferAllocation::Slice> GetSliceWithUpdatedOffsetAndSize(
-    const BufferAssignment& buffer_assignment, const HloFusionAdaptor& fusion,
-    const HloInstruction& fusion_instr, const HloInstruction& start,
-    const ShapeIndex& index) {
-  if (const auto* param = DynCast<HloParameterInstruction>(&start)) {
+  if (const auto* param = DynCast<HloParameterInstruction>(start)) {
+    // At this point we've walked through all `shape_idx`, `index` should be
+    // empty.
     return GetAllocationSlice(buffer_assignment,
                               fusion_instr.operand(param->parameter_number()),
-                              index);
+                              /*index*/ {});
   }
 
   auto slice_adaptor =
-      HloFindIf({HloInstructionAdaptor(start)}, fusion,
-                [](auto node) { return node.opcode() == HloOpcode::kSlice; });
-  if (!slice_adaptor.has_value()) {
-    return absl::InternalError(
-        "AddressComputationFusion expects at least one sliced operand");
+      HloFindIf({HloInstructionAdaptor(*start)}, adaptor, [](auto node) {
+        return IsOpcodeAnyOf<HloOpcode::kDynamicSlice, HloOpcode::kSlice>(node);
+      });
+  if (slice_adaptor.has_value()) {
+    auto* slice_instr =
+        const_cast<HloInstruction*>(&slice_adaptor->instruction());
+
+    if (!IsContiguousSlice(slice_instr->operand(0)->shape(),
+                           slice_instr->shape())) {
+      return absl::InternalError(
+          "DynamicAddressComputationFusion only handles contiguous slices "
+          "currently");
+    }
+
+    slice_instrs[arg_idx] = slice_instr;
+
+    const auto* param = Cast<HloParameterInstruction>(slice_instr->operand(0));
+    // At this point we've walked through all `shape_idx`, `index` should be
+    // empty.
+    TF_ASSIGN_OR_RETURN(
+        BufferAllocation::Slice orig_slice,
+        GetAllocationSlice(buffer_assignment,
+                           fusion_instr.operand(param->parameter_number()),
+                           /*index*/ {}));
+
+    if (auto* static_slice = DynCast<HloSliceInstruction>(slice_instr)) {
+      // Update static slices.
+      const Shape& src_shape = static_slice->operand(0)->shape();
+      const Shape& dst_shape = static_slice->shape();
+      int64_t size = ShapeUtil::ByteSizeOf(dst_shape);
+
+      // Given this slice
+      // f16[1,4,8]{2,1,0} slice(f16[2,8,8]{2,1,0}),
+      //                         slice={[1:2], [4:8], [0:8]}
+      //
+      // The offset of the slice should be:
+      //    slice_starts(0) * 8 * 8 * sizeof(f16) +
+      //    slice_starts(1) * 8 * sizeof(f16)
+      int64_t offset = orig_slice.offset();
+      for (auto [start, stride] :
+           llvm::zip(static_slice->slice_starts(),
+                     *ShapeUtil::ByteStrides(src_shape))) {
+        offset += start * stride;
+      }
+
+      return BufferAllocation::Slice(orig_slice.allocation(), offset, size);
+    }
+
+    return orig_slice;
   }
 
-  const auto& slice_instr =
-      *static_cast<const HloSliceInstruction*>(&slice_adaptor->instruction());
+  return absl::InternalError("WTF");
+}
 
-  if (!IsContiguousSlice(slice_instr)) {
-    return absl::InternalError(
-        "AddressComputationFusion only handles contiguous slices currently");
+absl::Status CollectSliceInfo(
+    const BufferAssignment& buffer_assignment,
+    const HloInstruction& fusion_instr,
+    absl::Span<HloInstruction*> slice_instrs,
+    std::vector<std::optional<std::vector<BufferAllocation::Slice>>>&
+        offset_buffer_indices,
+    std::vector<std::optional<Shape>>& orig_shapes,
+    std::vector<std::optional<Shape>>& sliced_shapes,
+    std::vector<std::optional<uint64_t>>& offset_byte_sizes, unsigned arg_idx) {
+  auto* slice_instr =
+      DynCastOrNull<HloDynamicIndexInstruction>(slice_instrs[arg_idx]);
+  if (slice_instr == nullptr) {
+    return absl::OkStatus();
+  }
+
+  std::vector<BufferAllocation::Slice> offset_slices;
+  for (auto idx_op : slice_instr->index_operands()) {
+    const auto* param = Cast<HloParameterInstruction>(idx_op);
+    TF_ASSIGN_OR_RETURN(
+        auto offset_slice,
+        GetAllocationSlice(buffer_assignment,
+                           fusion_instr.operand(param->parameter_number()),
+                           /*index=*/{}));
+    offset_slices.push_back(offset_slice);
   }
+  offset_buffer_indices[arg_idx] = std::move(offset_slices);
+  orig_shapes[arg_idx] = slice_instr->operand(0)->shape();
+  sliced_shapes[arg_idx] = DynCast<HloDynamicSliceInstruction>(slice_instr)
+                               ? slice_instr->shape()
+                               : slice_instr->operand(1)->shape();
+  offset_byte_sizes[arg_idx] = ShapeUtil::ByteSizeOfPrimitiveType(
+      slice_instr->index_operands().front()->shape().element_type());
+
+  return absl::OkStatus();
+}
 
-  const Shape& src_shape = slice_instr.operand(0)->shape();
-  const Shape& dst_shape = slice_instr.shape();
-  int64_t size = ShapeUtil::ByteSizeOf(dst_shape);
+absl::StatusOr<BufferAllocation::Slice> GetResultSlice(
+    const BufferAssignment& buffer_assignment, const HloFusionAdaptor& adaptor,
+    const HloInstruction& fusion_instr, const HloInstruction& start_instr,
+    std::vector<HloInstruction*>& slice_instrs, const ShapeIndex& shape_idx,
+    unsigned arg_idx) {
+  auto* start = const_cast<HloInstruction*>(&start_instr);
+  // Walk through ShapeIndex to find the real "user" (i.e. not get-tuple-element
+  // user). Otherwise one sliced element will mark all buffers of all other
+  // elements "sliced" too.
+  if (start->shape().IsTuple()) {
+    for (auto idx : shape_idx) {
+      std::vector<HloGetTupleElementInstruction*> gte_users(
+          start->shape().tuple_shapes_size(), nullptr);
+      for (auto* user : start->users())
+        if (auto* gte = DynCast<HloGetTupleElementInstruction>(user))
+          gte_users[gte->tuple_index()] = gte;
+
+      start = static_cast<HloInstruction*>(gte_users[idx]);
+      if (start == nullptr)
+        return GetAllocationSlice(buffer_assignment, &fusion_instr, shape_idx);
+    }
+  }
 
-  const auto* param = Cast<HloParameterInstruction>(slice_instr.operand(0));
-  TF_ASSIGN_OR_RETURN(
-      BufferAllocation::Slice orig_slice,
-      GetAllocationSlice(buffer_assignment,
-                         fusion_instr.operand(param->parameter_number()),
-                         index));
-
-  // Given this slice
-  // f16[1,4,8]{2,1,0} slice(f16[2,8,8]{2,1,0}),
-  //                         slice={[1:2], [4:8], [0:8]}
-  //
-  // The offset of the slice should be:
-  //    slice_starts(0) * 8 * 8 * sizeof(f16) +
-  //    slice_starts(1) * 8 * sizeof(f16)
-  int64_t offset = orig_slice.offset();
-  for (auto [start, stride] : llvm::zip(slice_instr.slice_starts(),
-                                        *ShapeUtil::ByteStrides(src_shape))) {
-    offset += start * stride;
+  auto slice_adaptor = HloFindIf(
+      {HloInstructionAdaptor(*start)}, adaptor,
+      [](auto node) { return node.opcode() == HloOpcode::kDynamicUpdateSlice; },
+      /*visit_operands=*/false);
+  if (slice_adaptor.has_value()) {
+    auto* slice_instr =
+        const_cast<HloInstruction*>(&slice_adaptor->instruction());
+    slice_instrs[arg_idx] = slice_instr;
+
+    if (!IsContiguousSlice(slice_instr->shape(),
+                           Cast<HloDynamicUpdateSliceInstruction>(slice_instr)
+                               ->update()
+                               ->shape())) {
+      return absl::InternalError(
+          "DynamicAddressComputationFusion only handles contiguous slices "
+          "currently");
+    }
   }
 
-  return BufferAllocation::Slice(orig_slice.allocation(), offset, size);
+  return GetAllocationSlice(buffer_assignment, &fusion_instr, shape_idx);
 }
 
 absl::StatusOr<FusionEmissionResult> EmitGemm(
@@ -191,30 +254,82 @@ absl::StatusOr<FusionEmissionResult> EmitGemm(
   const BufferAssignment& buffer_assignment =
       ir_emitter_context.buffer_assignment();
 
-  TF_ASSIGN_OR_RETURN(
-      BufferAllocation::Slice lhs_slice,
-      GetSliceWithUpdatedOffsetAndSize(buffer_assignment, adaptor, fusion,
-                                       *custom_call.operand(0), /*index=*/{}));
-
-  TF_ASSIGN_OR_RETURN(
-      BufferAllocation::Slice rhs_slice,
-      GetSliceWithUpdatedOffsetAndSize(buffer_assignment, adaptor, fusion,
-                                       *custom_call.operand(1), /*index=*/{}));
+  std::vector<std::optional<std::vector<BufferAllocation::Slice>>>
+      offset_buffer_indices(4, std::nullopt);
+  std::vector<std::optional<Shape>> orig_shapes(4, std::nullopt);
+  std::vector<std::optional<Shape>> sliced_shapes(4, std::nullopt);
+  std::vector<std::optional<uint64_t>> offset_byte_sizes(4, std::nullopt);
+
+  std::vector<HloInstruction*> slice_instrs(4, nullptr);
+
+  unsigned arg_idx = 0;
+  TF_ASSIGN_OR_RETURN(BufferAllocation::Slice lhs_slice,
+                      GetOperandSlice(buffer_assignment, adaptor, fusion,
+                                      *custom_call.operand(arg_idx),
+                                      slice_instrs, /*shape_idx=*/{}, arg_idx));
+  TF_RETURN_IF_ERROR(CollectSliceInfo(
+      buffer_assignment, fusion, absl::Span<HloInstruction*>(slice_instrs),
+      offset_buffer_indices, orig_shapes, sliced_shapes, offset_byte_sizes,
+      arg_idx++));
+
+  TF_ASSIGN_OR_RETURN(BufferAllocation::Slice rhs_slice,
+                      GetOperandSlice(buffer_assignment, adaptor, fusion,
+                                      *custom_call.operand(arg_idx),
+                                      slice_instrs, /*shape_idx=*/{}, arg_idx));
+  TF_RETURN_IF_ERROR(CollectSliceInfo(
+      buffer_assignment, fusion, absl::Span<HloInstruction*>(slice_instrs),
+      offset_buffer_indices, orig_shapes, sliced_shapes, offset_byte_sizes,
+      arg_idx++));
 
   BufferAllocation::Slice output;
-  std::optional<BufferAllocation::Slice> workspace;
-
-  // Result of a legacy cuBLAS custom call can be a tuple if we explicitly
-  // allocate workspace buffer in HLO. If result is an array, it means that
-  // workspace is not available, and cuBLAS will allocate its own workspace.
-  if (custom_call.shape().IsArray()) {
-    TF_ASSIGN_OR_RETURN(output,
-                        GetAllocationSlice(buffer_assignment, &fusion, {}));
+  std::optional<BufferAllocation::Slice> workspace = std::nullopt;
+  std::optional<BufferAllocation::Slice> slice_workspace_fake = std::nullopt;
+
+  // Handling cases where multiple operands share the same buffer, with
+  // different offset by creating new fake allocations so each operand will have
+  // a different buffer index. The slices can thus always start at offset 0.
+  // AddressComputationThunk will take care of the offset adjustment.
+  std::vector<std::unique_ptr<BufferAllocation>> fake_allocations(4);
+  if (fusion.shape().IsArray()) {
+    TF_ASSIGN_OR_RETURN(
+        output, GetResultSlice(buffer_assignment, adaptor, fusion, custom_call,
+                               slice_instrs, /*shape_idx=*/{}, arg_idx));
+    TF_RETURN_IF_ERROR(CollectSliceInfo(
+        buffer_assignment, fusion, absl::Span<HloInstruction*>(slice_instrs),
+        offset_buffer_indices, orig_shapes, sliced_shapes, offset_byte_sizes,
+        arg_idx));
   } else {
-    TF_ASSIGN_OR_RETURN(output,
-                        GetAllocationSlice(buffer_assignment, &fusion, {0}));
-    TF_ASSIGN_OR_RETURN(workspace,
-                        GetAllocationSlice(buffer_assignment, &fusion, {1}));
+    TF_ASSIGN_OR_RETURN(
+        output,
+        GetResultSlice(buffer_assignment, adaptor, fusion, custom_call,
+                       slice_instrs, /*shape_idx=*/{kGEMMOutputBufferIndex},
+                       arg_idx));
+    TF_RETURN_IF_ERROR(CollectSliceInfo(
+        buffer_assignment, fusion, absl::Span<HloInstruction*>(slice_instrs),
+        offset_buffer_indices, orig_shapes, sliced_shapes, offset_byte_sizes,
+        arg_idx++));
+
+    // TODO(vuson): If we want to support slices of workspace, we'd need to
+    // start `HloFindIf` with `get-tuple-element` with the right index.
+    TF_ASSIGN_OR_RETURN(
+        workspace, GetAllocationSlice(buffer_assignment, &fusion,
+                                      /*index=*/{kGEMMWorkspaceBufferIndex}));
+    TF_RETURN_IF_ERROR(CollectSliceInfo(
+        buffer_assignment, fusion, absl::Span<HloInstruction*>(slice_instrs),
+        offset_buffer_indices, orig_shapes, sliced_shapes, offset_byte_sizes,
+        arg_idx));
+    fake_allocations[arg_idx] = std::make_unique<BufferAllocation>(
+        /*index=*/arg_idx, workspace->size(), /*color=*/0);
+    slice_workspace_fake = BufferAllocation::Slice(
+        fake_allocations[arg_idx].get(), 0, workspace->size());
+  }
+
+  if (absl::c_all_of(slice_instrs, [&](auto slice_instr) {
+        return slice_instr == nullptr;
+      })) {
+    return absl::InternalError(
+        "AddressComputationFusion expects at least one sliced "
+        "operand/result");
   }
 
   bool deterministic_ops =
@@ -223,9 +338,57 @@ absl::StatusOr<FusionEmissionResult> EmitGemm(
   TF_ASSIGN_OR_RETURN(
       GemmConfig config,
       GemmConfig::For(static_cast<const HloInstruction*>(&custom_call)));
-  auto thunk = std::make_unique<GemmThunk>(
-      Thunk::ThunkInfo::WithProfileAnnotation(&custom_call), std::move(config),
-      lhs_slice, rhs_slice, output, workspace, deterministic_ops);
+
+  std::unique_ptr<Thunk> thunk;
+  auto thunk_info = Thunk::ThunkInfo::WithProfileAnnotation(&custom_call);
+
+  if (absl::c_any_of(slice_instrs, [&](auto slice_instr) {
+        return DynCastOrNull<HloDynamicIndexInstruction>(slice_instr) !=
+               nullptr;
+      })) {
+    // Creating embedded GEMM thunk.
+    unsigned fake_arg_idx = 0;
+    int64_t lhs_byte_size =
+        ShapeUtil::ByteSizeOf(custom_call.operand(fake_arg_idx)->shape());
+    fake_allocations[fake_arg_idx] = std::make_unique<BufferAllocation>(
+        /*index=*/fake_arg_idx, lhs_byte_size, /*color=*/0);
+    BufferAllocation::Slice slice_lhs_fake(fake_allocations[fake_arg_idx].get(),
+                                           0, lhs_byte_size);
+
+    fake_arg_idx++;
+    int64_t rhs_byte_size =
+        ShapeUtil::ByteSizeOf(custom_call.operand(fake_arg_idx)->shape());
+    fake_allocations[fake_arg_idx] = std::make_unique<BufferAllocation>(
+        /*index=*/fake_arg_idx, rhs_byte_size, /*color=*/0);
+    BufferAllocation::Slice slice_rhs_fake(fake_allocations[fake_arg_idx].get(),
+                                           0, rhs_byte_size);
+
+    fake_arg_idx++;
+    int64_t out_fake_byte_size = ShapeUtil::ByteSizeOf(
+        custom_call.shape().IsArray() ? custom_call.shape()
+                                      : custom_call.shape().tuple_shapes(0));
+    fake_allocations[fake_arg_idx] = std::make_unique<BufferAllocation>(
+        /*index=*/fake_arg_idx, out_fake_byte_size, /*color=*/0);
+    BufferAllocation::Slice slice_out_fake(fake_allocations[fake_arg_idx].get(),
+                                           0, out_fake_byte_size);
+    ThunkSequence seq;
+    seq.emplace_back(std::make_unique<GemmThunk>(
+        thunk_info, std::move(config), slice_lhs_fake, slice_rhs_fake,
+        slice_out_fake, slice_workspace_fake, deterministic_ops));
+
+    std::vector<std::optional<const BufferAllocation::Slice>> arguments{
+        lhs_slice, rhs_slice, output, workspace};
+
+    thunk = std::make_unique<AddressComputationThunk>(
+        thunk_info, std::make_unique<ThunkSequence>(std::move(seq)),
+        std::move(arguments), std::move(fake_allocations),
+        std::move(offset_buffer_indices), std::move(orig_shapes),
+        std::move(sliced_shapes), std::move(offset_byte_sizes));
+  } else {
+    thunk = std::make_unique<GemmThunk>(thunk_info, std::move(config),
+                                        lhs_slice, rhs_slice, output, workspace,
+                                        deterministic_ops);
+  }
 
   FusionEmissionResult result;
   result.thunks.push_back(std::move(thunk));
@@ -239,7 +402,7 @@ absl::StatusOr<FusionEmissionResult> EmitCustomCall(
   const BufferAssignment& buffer_assignment =
       ir_emitter_context.buffer_assignment();
 
-  const std::string call_target_name = custom_call.custom_call_target();
+  const std::string& call_target_name = custom_call.custom_call_target();
 
   // Typed FFI custom calls is a replacement for legacy custom calls with
   // a rich type safe API. It's under construction and not fully supported.
@@ -249,12 +412,12 @@ absl::StatusOr<FusionEmissionResult> EmitCustomCall(
   void* call_target = CustomCallTargetRegistry::Global()->Lookup(
       call_target_name, std::string(ir_emitter_context.platform_name()));
 
-  absl::StatusOr<XLA_FFI_Handler*> handler =
+  absl::StatusOr<ffi::HandlerRegistration> registration =
       ffi::FindHandler(call_target_name, ir_emitter_context.platform_name());
 
   // At least one implementation should be available at run time.
   bool found_custom_call = !is_ffi_custom_call && call_target != nullptr;
-  bool found_ffi_handler = is_ffi_custom_call && handler.ok();
+  bool found_ffi_handler = is_ffi_custom_call && registration.ok();
 
   if (!found_custom_call && !found_ffi_handler) {
     return absl::InternalError(
@@ -264,30 +427,55 @@ absl::StatusOr<FusionEmissionResult> EmitCustomCall(
 
   using Slices = std::vector<std::optional<CustomCallThunk::Slice>>;
 
+  int64_t num_args = ShapeUtil::GetLeafCount(custom_call.shape());
+  absl::c_for_each(custom_call.operands(), [&](auto* operand) {
+    num_args += ShapeUtil::GetLeafCount(operand->shape());
+  });
+
+  std::vector<std::optional<std::vector<BufferAllocation::Slice>>>
+      offset_buffer_indices(num_args, std::nullopt);
+  std::vector<std::optional<Shape>> orig_shapes(num_args, std::nullopt);
+  std::vector<std::optional<Shape>> sliced_shapes(num_args, std::nullopt);
+  std::vector<std::optional<uint64_t>> offset_byte_sizes(num_args,
+                                                         std::nullopt);
+
+  std::vector<HloInstruction*> slice_instrs(num_args, nullptr);
+  std::vector<std::optional<const BufferAllocation::Slice>> arguments;
+
+  unsigned arg_idx = 0;
+  // TODO(vuson): add test for custom call with token-typed operands
   Slices operands;
-  // TODO(vuson): add test with custom call with tuple-typed operands
   for (auto* operand : custom_call.operands()) {
     TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
         operand->shape(), [&](const Shape& subshape, const ShapeIndex& index) {
           if (subshape.IsToken()) {
+            arg_idx++;
             operands.push_back(std::nullopt);
             return absl::OkStatus();
           }
           if (!subshape.IsArray()) {
             return absl::OkStatus();
           }
-          TF_ASSIGN_OR_RETURN(auto slice, GetSliceWithUpdatedOffsetAndSize(
-                                              buffer_assignment, adaptor,
-                                              fusion, *operand, index));
+          TF_ASSIGN_OR_RETURN(
+              auto slice,
+              GetOperandSlice(buffer_assignment, adaptor, fusion, *operand,
+                              slice_instrs, /*shape_idx=*/index, arg_idx));
+          TF_RETURN_IF_ERROR(CollectSliceInfo(
+              buffer_assignment, fusion,
+              absl::Span<HloInstruction*>(slice_instrs), offset_buffer_indices,
+              orig_shapes, sliced_shapes, offset_byte_sizes, arg_idx++));
+
           operands.push_back(CustomCallThunk::Slice{slice, subshape});
+          arguments.push_back(slice);
           return absl::OkStatus();
         }));
   }
 
   Slices results;
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
-      fusion.shape(), [&](const Shape& subshape, const ShapeIndex& index) {
+      custom_call.shape(), [&](const Shape& subshape, const ShapeIndex& index) {
         if (subshape.IsToken()) {
+          arg_idx++;
           results.push_back(std::nullopt);
           return absl::OkStatus();
         }
@@ -295,11 +483,27 @@ absl::StatusOr<FusionEmissionResult> EmitCustomCall(
           return absl::OkStatus();
         }
         TF_ASSIGN_OR_RETURN(
-            auto slice, GetAllocationSlice(buffer_assignment, &fusion, index));
+            auto slice,
+            GetResultSlice(buffer_assignment, adaptor, fusion, custom_call,
+                           slice_instrs, /*shape_idx=*/index, arg_idx));
+        TF_RETURN_IF_ERROR(CollectSliceInfo(
+            buffer_assignment, fusion,
+            absl::Span<HloInstruction*>(slice_instrs), offset_buffer_indices,
+            orig_shapes, sliced_shapes, offset_byte_sizes, arg_idx++));
+
         results.push_back(CustomCallThunk::Slice{slice, subshape});
+        arguments.push_back(slice);
         return absl::OkStatus();
       }));
 
+  if (absl::c_all_of(slice_instrs, [&](auto slice_instr) {
+        return slice_instr == nullptr;
+      })) {
+    return absl::InternalError(
+        "AddressComputationFusion expects at least one sliced "
+        "operand/result");
+  }
+
   // For legacy custom calls we convert all API versions into the latest
   // status-returning one and pass backend config as an opaque string.
   CustomCallThunk::CustomCallTarget custom_call_target;
@@ -372,22 +576,101 @@ absl::StatusOr<FusionEmissionResult> EmitCustomCall(
                       custom_call.api_version());
   }
 
-  auto ffi_thunk = [&] {
+  std::unique_ptr<Thunk> thunk;
+  auto thunk_info = Thunk::ThunkInfo::WithProfileAnnotation(&custom_call);
+
+  auto ffi_thunk = [&](Slices ops, Slices res) {
     auto& called_computations = custom_call.called_computations();
     return std::make_unique<CustomCallThunk>(
-        Thunk::ThunkInfo::WithProfileAnnotation(&custom_call), *handler,
-        std::move(operands), std::move(results), std::move(attributes),
+        thunk_info, registration->handler, std::move(ops), std::move(res),
+        std::move(attributes),
         called_computations.empty() ? nullptr : called_computations[0]);
   };
 
-  auto legacy_thunk = [&] {
+  auto legacy_thunk = [&](Slices ops, Slices res) {
     return std::make_unique<CustomCallThunk>(
-        Thunk::ThunkInfo::WithProfileAnnotation(&custom_call),
-        std::move(custom_call_target), std::move(operands), std::move(results),
-        std::move(opaque));
+        thunk_info, std::move(custom_call_target), std::move(ops),
+        std::move(res), std::move(opaque));
   };
+
+  std::vector<std::unique_ptr<BufferAllocation>> fake_allocations(num_args);
+  if (absl::c_any_of(slice_instrs, [&](auto slice_instr) {
+        return DynCastOrNull<HloDynamicIndexInstruction>(slice_instr) !=
+               nullptr;
+      })) {
+    // Creating embedded custom call thunk.
+    unsigned fake_arg_idx = 0;
+
+    Slices fake_operands;
+    for (auto* operand : custom_call.operands()) {
+      TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+          operand->shape(),
+          [&](const Shape& subshape, const ShapeIndex& index) {
+            if (subshape.IsToken()) {
+              fake_arg_idx++;
+              fake_operands.push_back(std::nullopt);
+              return absl::OkStatus();
+            }
+            if (!subshape.IsArray()) {
+              return absl::OkStatus();
+            }
+
+            int64_t operand_byte_size = ShapeUtil::ByteSizeOf(subshape);
+            fake_allocations[fake_arg_idx] = std::make_unique<BufferAllocation>(
+                /*index=*/fake_arg_idx, operand_byte_size, /*color=*/0);
+            BufferAllocation::Slice fake_slice(
+                fake_allocations[fake_arg_idx].get(), 0, operand_byte_size);
+
+            fake_arg_idx++;
+            fake_operands.push_back(
+                CustomCallThunk::Slice{fake_slice, subshape});
+            return absl::OkStatus();
+          }));
+    }
+
+    Slices fake_results;
+    TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+        custom_call.shape(),
+        [&](const Shape& subshape, const ShapeIndex& index) {
+          if (subshape.IsToken()) {
+            fake_arg_idx++;
+            fake_results.push_back(std::nullopt);
+            return absl::OkStatus();
+          }
+          if (!subshape.IsArray()) {
+            return absl::OkStatus();
+          }
+
+          int64_t result_byte_size = ShapeUtil::ByteSizeOf(subshape);
+          fake_allocations[fake_arg_idx] = std::make_unique<BufferAllocation>(
+              /*index=*/fake_arg_idx, result_byte_size, /*color=*/0);
+          BufferAllocation::Slice fake_slice(
+              fake_allocations[fake_arg_idx].get(), 0, result_byte_size);
+
+          fake_arg_idx++;
+          fake_results.push_back(CustomCallThunk::Slice{fake_slice, subshape});
+          return absl::OkStatus();
+        }));
+
+    ThunkSequence seq;
+    seq.emplace_back(
+        found_ffi_handler
+            ? ffi_thunk(std::move(fake_operands), std::move(fake_results))
+            : legacy_thunk(std::move(fake_operands), std::move(fake_results)));
+
+    thunk = std::make_unique<AddressComputationThunk>(
+        thunk_info, std::make_unique<ThunkSequence>(std::move(seq)),
+        std::move(arguments), std::move(fake_allocations),
+        std::move(offset_buffer_indices), std::move(orig_shapes),
+        std::move(sliced_shapes), std::move(offset_byte_sizes));
+  } else {
+    thunk = found_ffi_handler
+                ? ffi_thunk(std::move(operands), std::move(results))
+                : legacy_thunk(std::move(operands), std::move(results));
+  }
+
   FusionEmissionResult result;
-  result.thunks.push_back(found_ffi_handler ? ffi_thunk() : legacy_thunk());
+  result.thunks.push_back(std::move(thunk));
   return result;
 }
 
@@ -465,5 +748,26 @@ absl::StatusOr<FusionEmissionResult> AddressComputationFusion::Emit(
   return EmitCustomCall(ir_emitter_context, adaptor, fusion, custom_call);
 }
 
+absl::StatusOr<FusionEmissionResult> DynamicAddressComputationFusion::Emit(
+    IrEmitterContext& ir_emitter_context,
+    const HloFusionInstruction& fusion) const {
+  const HloFusionAdaptor& adaptor = analysis_.fusion();
+  auto maybe_custom_call_adaptor = HloFindIf(
+      adaptor.GetRoots(), adaptor,
+      [](auto node) { return node.opcode() == HloOpcode::kCustomCall; });
+  if (maybe_custom_call_adaptor == std::nullopt) {
+    return absl::InternalError(
+        "DynamicAddressComputationFusion requires a CustomCall hero");
+  }
+
+  const auto& custom_call = *static_cast<const HloCustomCallInstruction*>(
+      &maybe_custom_call_adaptor->instruction());
+  if (IsLegacyCublasMatmul(custom_call)) {
+    return EmitGemm(ir_emitter_context, adaptor, fusion, custom_call);
+  }
+
+  return EmitCustomCall(ir_emitter_context, adaptor, fusion, custom_call);
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/fusions/custom.h b/third_party/xla/xla/service/gpu/fusions/custom.h
index e5f763027f754b..24f82865e05354 100644
--- a/third_party/xla/xla/service/gpu/fusions/custom.h
+++ b/third_party/xla/xla/service/gpu/fusions/custom.h
@@ -36,7 +36,7 @@ class CustomFusion : public FusionInterface {
 
 // Emitter for custom fusions implementing address computation. An address
 // computation contains a custom call hero, with at least one of its operands
-// comes from a static contiguous slice. E.g. operand `%cast` of `%gemm` coming
+// coming from a static contiguous slice. E.g. operand `%cast` of `%gemm` coming
 // from `%slice`:
 // %address_computation {
 //   %p0 = f32[2, 1024, 1024]
@@ -63,6 +63,28 @@ class AddressComputationFusion : public FusionInterface {
   const HloFusionAnalysis& analysis_;
 };
 
+// TODO(vuson): merge these two fusions.
+// Emitter for custom fusions implementing dynamic address computation. A
+// dynamic address computation contains a custom call hero, with at least one of
+// its operands coming from a dynamic contiguous slice, and/or with at least one
+// of its results feeding into a contiguous DUS.
+//
+// The goal is to compute the buffer addresses for sliced operands/results
+// without having to allocate new buffers for these by wrapping
+// AddressComputationThunk around the original custom call thunk.
+class DynamicAddressComputationFusion : public FusionInterface {
+ public:
+  explicit DynamicAddressComputationFusion(const HloFusionAnalysis& analysis)
+      : analysis_(analysis) {}
+
+  absl::StatusOr<FusionEmissionResult> Emit(
+      IrEmitterContext& ir_emitter_context,
+      const HloFusionInstruction& fusion) const final;
+
+ private:
+  const HloFusionAnalysis& analysis_;
+};
+
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/third_party/xla/xla/service/gpu/fusions/fusion_emitter.cc b/third_party/xla/xla/service/gpu/fusions/fusion_emitter.cc
index d4d3a33ce57e78..e652532fd0464c 100644
--- a/third_party/xla/xla/service/gpu/fusions/fusion_emitter.cc
+++ b/third_party/xla/xla/service/gpu/fusions/fusion_emitter.cc
@@ -49,6 +49,7 @@ limitations under the License.
 #include "xla/service/gpu/kernel_arguments.h"
 #include "xla/service/gpu/kernel_reuse_cache.h"
 #include "xla/service/gpu/launch_dimensions.h"
+#include "xla/service/gpu/model/indexing_analysis.h"
 #include "xla/service/gpu/model/indexing_map.h"
 #include "xla/service/gpu/runtime/kernel_thunk.h"
 #include "xla/service/gpu/target_util.h"
@@ -117,10 +118,10 @@ absl::Status AnnotateKernelLaunchDimensions(
   return absl::OkStatus();
 }
 
-IndexingMap KernelFusionInterface::GetDefaultThreadIdToOutputIndexingMap(
-    const LaunchDimensions& launch_dims, int unroll_factor,
-    const Shape& output_shape, mlir::MLIRContext* ctx) {
-  std::vector<mlir::AffineExpr> output_dims(output_shape.rank());
+IndexingMap KernelFusionInterface::GetDefaultThreadIdIndexingMap(
+    const LaunchDimensions& launch_dims, int unroll_factor, const Shape& shape,
+    mlir::MLIRContext* ctx) {
+  std::vector<mlir::AffineExpr> output_dims(shape.rank());
 
   std::array<uint64_t, 3> thread_counts{
       launch_dims.thread_counts_per_block().x,
@@ -163,33 +164,32 @@ IndexingMap KernelFusionInterface::GetDefaultThreadIdToOutputIndexingMap(
 
   // See IndexUtil::LinearIndexToMultidimensionalIndex.
   uint64_t divisor = 1;
-  for (auto dimension : LayoutUtil::MinorToMajor(output_shape)) {
-    output_dims[dimension] =
-        (linear_index.floorDiv(divisor)) %
-        static_cast<uint64_t>(output_shape.dimensions(dimension));
-    divisor *= output_shape.dimensions(dimension);
+  for (auto dimension : LayoutUtil::MinorToMajor(shape)) {
+    output_dims[dimension] = (linear_index.floorDiv(divisor)) %
+                             static_cast<uint64_t>(shape.dimensions(dimension));
+    divisor *= shape.dimensions(dimension);
   }
 
-  std::vector<Interval> dimension_ranges = {
-      {0, static_cast<int64_t>(launch_dims.thread_counts_per_block().x) - 1},
-      {0, static_cast<int64_t>(launch_dims.thread_counts_per_block().y) - 1},
-      {0, static_cast<int64_t>(launch_dims.thread_counts_per_block().z) - 1},
-      {0, static_cast<int64_t>(launch_dims.block_counts().x) - 1},
-      {0, static_cast<int64_t>(launch_dims.block_counts().y) - 1},
-      {0, static_cast<int64_t>(launch_dims.block_counts().z) - 1},
+  std::vector<DimVar> dim_vars = {
+      {{0, static_cast<int64_t>(launch_dims.thread_counts_per_block().x) - 1}},
+      {{0, static_cast<int64_t>(launch_dims.thread_counts_per_block().y) - 1}},
+      {{0, static_cast<int64_t>(launch_dims.thread_counts_per_block().z) - 1}},
+      {{0, static_cast<int64_t>(launch_dims.block_counts().x) - 1}},
+      {{0, static_cast<int64_t>(launch_dims.block_counts().y) - 1}},
+      {{0, static_cast<int64_t>(launch_dims.block_counts().z) - 1}},
   };
-  std::vector<Interval> symbol_ranges;
-  int64_t num_elements = ShapeUtil::ElementsIn(output_shape);
-  symbol_ranges.push_back(
-      {0, CeilOfRatio(num_elements,
-                      static_cast<int64_t>(launch_dims.launch_bound()) *
-                          unroll_factor) -
-              1});
-  symbol_ranges.push_back({0, unroll_factor - 1});
+  std::vector<RangeVar> range_vars;
+  int64_t num_elements = ShapeUtil::ElementsIn(shape);
+  range_vars.push_back(
+      {{0, CeilOfRatio(num_elements,
+                       static_cast<int64_t>(launch_dims.launch_bound()) *
+                           unroll_factor) -
+               1}});
+  range_vars.push_back({0, unroll_factor - 1});
   IndexingMap indexing_map(
       mlir::AffineMap::get(/*dimCount=*/6,
                            /*symbolCount=*/2, output_dims, ctx),
-      dimension_ranges, symbol_ranges);
+      dim_vars, range_vars, /*rt_vars=*/{});
   // Remove the unroll_elem_id symbol if unrolling divides num_elements.
   if (num_elements % unroll_factor == 0) {
     indexing_map.AddConstraint(linear_index.replace({{unroll_elem_id, c0}}),
@@ -197,7 +197,7 @@ IndexingMap KernelFusionInterface::GetDefaultThreadIdToOutputIndexingMap(
   } else {
     indexing_map.AddConstraint(linear_index, Interval{0, num_elements - 1});
   }
-  indexing_map.Simplify();
+  indexing_map.Simplify(GetIndexingMapForInstruction);
   return indexing_map;
 }
 
diff --git a/third_party/xla/xla/service/gpu/fusions/fusion_emitter.h b/third_party/xla/xla/service/gpu/fusions/fusion_emitter.h
index dbc8e8718debe0..dc8c399cbf42fd 100644
--- a/third_party/xla/xla/service/gpu/fusions/fusion_emitter.h
+++ b/third_party/xla/xla/service/gpu/fusions/fusion_emitter.h
@@ -35,7 +35,7 @@ limitations under the License.
 #include "xla/service/gpu/launch_dimensions.h"
 #include "xla/service/gpu/model/indexing_analysis.h"
 #include "xla/service/gpu/model/indexing_map.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/llvm_ir/ir_array.h"
 #include "xla/shape.h"
 #include "xla/status.h"
@@ -91,12 +91,12 @@ class KernelFusionInterface : public FusionInterface {
 
  protected:
   // Returns the default mapping for the given launch dimensions: linearizes
-  // the thread index and then reshapes it into the output layout.
+  // the thread index and then reshapes it into the given layout.
   // Populates the ranges for d0, d1, d2, d3, d4, d5 from the thread counts and
   // block sizes in the given launch dimensions.
-  static IndexingMap GetDefaultThreadIdToOutputIndexingMap(
+  static IndexingMap GetDefaultThreadIdIndexingMap(
       const LaunchDimensions& launch_dims, int unroll_factor,
-      const Shape& output_shape, mlir::MLIRContext* ctx);
+      const Shape& shape, mlir::MLIRContext* ctx);
 };
 
 // Base class for fusions that are implemented using a single kernel, which is
diff --git a/third_party/xla/xla/service/gpu/fusions/fusions.cc b/third_party/xla/xla/service/gpu/fusions/fusions.cc
index 6dd5bf20d80152..5c192c7a5ca9fc 100644
--- a/third_party/xla/xla/service/gpu/fusions/fusions.cc
+++ b/third_party/xla/xla/service/gpu/fusions/fusions.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "xla/service/gpu/fusions/custom.h"
 #include "xla/service/gpu/fusions/fusion_emitter.h"
 #include "xla/service/gpu/fusions/in_place_dynamic_update_slice.h"
+#include "xla/service/gpu/fusions/in_place_dynamic_update_slice_mlir.h"
 #include "xla/service/gpu/fusions/input_slices.h"
 #include "xla/service/gpu/fusions/input_slices_mlir.h"
 #include "xla/service/gpu/fusions/loop.h"
@@ -178,6 +179,9 @@ absl::StatusOr<std::unique_ptr<FusionInterface>> GetFusionEmitter(
       if (config.name() == "address_computation") {
         return std::make_unique<AddressComputationFusion>(analysis);
       }
+      if (config.name() == "dynamic_address_computation") {
+        return std::make_unique<DynamicAddressComputationFusion>(analysis);
+      }
       return std::make_unique<CustomFusion>();
     }
     case HloFusionAnalysis::EmitterFusionKind::kInputSlices:
@@ -188,6 +192,11 @@ absl::StatusOr<std::unique_ptr<FusionInterface>> GetFusionEmitter(
     case HloFusionAnalysis::EmitterFusionKind::kLoop: {
       if (IsDynamicUpdateSliceFusion(analysis) &&
           fusion_info.CanEmitDynamicUpdateSliceInPlace()) {
+        if (check_mlir_emitters(
+                MlirInPlaceDynamicUpdateSliceFusion::IsSupported)) {
+          return std::make_unique<MlirInPlaceDynamicUpdateSliceFusion>(
+              analysis);
+        }
         return std::make_unique<InPlaceDynamicUpdateSliceFusion>(analysis);
       }
 
@@ -212,13 +221,13 @@ absl::StatusOr<std::unique_ptr<FusionInterface>> GetFusionEmitter(
       return std::make_unique<ScatterFusion>(analysis);
     }
     case HloFusionAnalysis::EmitterFusionKind::kTranspose: {
-      if (check_mlir_emitters(MlirTransposeFusion::IsSupported)) {
+      if (check_mlir_emitters(nullptr)) {
         return std::make_unique<MlirTransposeFusion>(analysis);
       }
       return std::make_unique<TransposeFusion>(analysis);
     }
     case HloFusionAnalysis::EmitterFusionKind::kConcatenate: {
-      if (check_mlir_emitters(MlirConcatenateFusion::IsSupported)) {
+      if (check_mlir_emitters(nullptr)) {
         return std::make_unique<MlirConcatenateFusion>(analysis);
       }
       return std::make_unique<ConcatenateFusion>(analysis);
diff --git a/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice.cc b/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice.cc
index 1fedb880c88f89..ea4c56fa576b14 100644
--- a/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice.cc
+++ b/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice.cc
@@ -34,12 +34,32 @@ limitations under the License.
 
 namespace xla {
 namespace gpu {
+namespace {
+
+constexpr int kDUSUpdateIndex = 1;
+
+}  // namespace
 
 LaunchDimensions InPlaceDynamicUpdateSliceFusion::launch_dimensions() const {
   const auto& update_shape = dus_ops_.front()->operand(1)->shape();
   return CalculateLaunchDimensions(update_shape, analysis_.device_info());
 }
 
+std::optional<IndexingMap>
+InPlaceDynamicUpdateSliceFusion::ComputeThreadIdToInputIndexing(
+    int64_t root_index, int64_t hero_operand_index,
+    mlir::MLIRContext* mlir_context) const {
+  if (hero_operand_index != kDUSUpdateIndex) {
+    return std::nullopt;
+  }
+  auto launch_dims = launch_dimensions();
+  // It is guaranteed that all DUS ops have the same output shape at this point.
+  const auto& update_shape =
+      dus_ops_.front()->operand(kDUSUpdateIndex)->shape();
+  return GetDefaultThreadIdIndexingMap(launch_dims, /*unroll_factor=*/1,
+                                       update_shape, mlir_context);
+}
+
 absl::Status InPlaceDynamicUpdateSliceFusion::EmitKernel(
     IrEmitterContext& ir_emitter_context, const HloFusionInstruction& fusion,
     const LaunchDimensions& launch_dims, std::vector<llvm_ir::IrArray> inputs,
diff --git a/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice.h b/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice.h
index 12be8043b05ec1..213f7e7ecbdeab 100644
--- a/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice.h
+++ b/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice.h
@@ -75,10 +75,7 @@ class InPlaceDynamicUpdateSliceFusion : public KernelFusionEmitterBase {
 
   std::optional<IndexingMap> ComputeThreadIdToInputIndexing(
       int64_t root_index, int64_t hero_operand_index,
-      mlir::MLIRContext* ctx) const override {
-    // TODO(b/319081342): Implement this.
-    return std::nullopt;
-  }
+      mlir::MLIRContext* ctx) const override;
 
  protected:
   absl::Status EmitKernel(IrEmitterContext& ir_emitter_context,
diff --git a/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice_mlir.cc b/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice_mlir.cc
new file mode 100644
index 00000000000000..eccdcfceee8a8e
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice_mlir.cc
@@ -0,0 +1,169 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "xla/service/gpu/fusions/in_place_dynamic_update_slice_mlir.h"
+
+#include <cstdint>
+#include <optional>
+#include <vector>
+
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"  // from @llvm-project
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
+#include "mlir/Dialect/Tensor/IR/Tensor.h"  // from @llvm-project
+#include "mlir/IR/AffineExpr.h"  // from @llvm-project
+#include "mlir/IR/AffineMap.h"  // from @llvm-project
+#include "mlir/IR/ImplicitLocOpBuilder.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/Value.h"  // from @llvm-project
+#include "mlir/IR/ValueRange.h"  // from @llvm-project
+#include "xla/hlo/ir/hlo_casting_utils.h"
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/primitive_util.h"
+#include "xla/service/gpu/fusions/mlir/computation_partitioner.h"
+#include "xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.h"
+#include "xla/service/gpu/hlo_fusion_analysis.h"
+#include "xla/service/gpu/launch_dimensions.h"
+#include "xla/service/gpu/model/indexing_analysis.h"
+#include "xla/service/gpu/model/indexing_map.h"
+#include "xla/xla_data.pb.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+using llvm::SmallVector;
+using mlir::ImplicitLocOpBuilder;
+using mlir::MLIRContext;
+using mlir::Value;
+using mlir::ValueRange;
+using mlir::arith::AddIOp;
+using mlir::func::ReturnOp;
+using mlir::tensor::InsertOp;
+using mlir_converter::ApplyAffineMap;
+using mlir_converter::CallTargetProvider;
+using mlir_converter::ClampIndex;
+using mlir_converter::PartitionedComputations;
+using mlir_converter::ProvideParameter;
+
+constexpr int kDUSUpdateIndex = 1;
+
+}  // namespace
+
+/*static*/ bool MlirInPlaceDynamicUpdateSliceFusion::IsSupported(
+    const HloFusionAnalysis& analysis) {
+  return analysis.fusion_roots().size() == 1;
+}
+
+LaunchDimensions MlirInPlaceDynamicUpdateSliceFusion::launch_dimensions()
+    const {
+  const auto& update_shape =
+      dus_ops_.front()->operand(kDUSUpdateIndex)->shape();
+  return CalculateLaunchDimensions(update_shape, analysis_.device_info());
+}
+
+std::optional<IndexingMap>
+MlirInPlaceDynamicUpdateSliceFusion::ComputeThreadIdToInputIndexing(
+    int64_t root_index, int64_t hero_operand_index,
+    mlir::MLIRContext* mlir_context) const {
+  // TODO(b/331355203): Implement thread ID -> operand indexing.
+  if (hero_operand_index != kDUSUpdateIndex) {
+    return std::nullopt;
+  }
+  auto launch_dims = launch_dimensions();
+  // It is guaranteed that all DUS ops have the same output shape at this point.
+  const auto& update_shape =
+      dus_ops_.front()->operand(kDUSUpdateIndex)->shape();
+  return GetDefaultThreadIdIndexingMap(launch_dims, /*unroll_factor=*/1,
+                                       update_shape, mlir_context);
+}
+
+std::vector<const HloInstruction*>
+MlirInPlaceDynamicUpdateSliceFusion::GetInstructionsWithCustomCodegen(
+    const HloFusionInstruction& fusion) const {
+  return dus_ops_;
+}
+
+absl::Status MlirInPlaceDynamicUpdateSliceFusion::EmitEntryFunction(
+    const PartitionedComputations& computations,
+    const CallTargetProvider& call_targets, mlir::func::FuncOp entry_function,
+    const HloFusionInstruction& fusion) const {
+  ImplicitLocOpBuilder b(entry_function.getLoc(), entry_function);
+  b.setInsertionPointToStart(entry_function.addEntryBlock());
+
+  mlir::MLIRContext* mlir_context = entry_function.getContext();
+
+  auto indexing = *ComputeThreadIdToInputIndexing(
+      /*root_index=*/0,
+      /*hero_operand_index=*/kDUSUpdateIndex, mlir_context);
+  indexing.Simplify(GetIndexingMapForInstruction);
+  indexing.RemoveUnusedSymbols();
+
+  int num_inputs = fusion.fused_instructions_computation()->num_parameters();
+  auto output_tensor_args =
+      entry_function.getArguments().drop_front(num_inputs);
+
+  const auto& root_computation = computations.FindPartitionedComputation(
+      fusion.fused_instructions_computation());
+  const auto& dus_subgraph = root_computation.FindSubgraph(dus_ops_.front());
+
+  const auto* dus_instr =
+      Cast<HloDynamicUpdateSliceInstruction>(dus_ops_.front());
+  const auto& update_shape = dus_instr->update()->shape();
+  auto result_tensors = EmitThreadLoopNest(
+      b, output_tensor_args, indexing,
+      [&](ValueRange output_tensors, ValueRange dim_values,
+          ValueRange symbol_values) -> llvm::SmallVector<Value> {
+        auto input_indices = ApplyAffineMap(indexing.GetAffineMap(), dim_values,
+                                            symbol_values, b);
+        SmallVector<Value> update_indices;
+        for (int i = 0; i < update_shape.rank(); ++i) {
+          int64_t update_size = update_shape.dimensions(i);
+          auto start_index =
+              ProvideParameter(dus_subgraph, dus_instr,
+                               i + dus_instr->first_index_operand_number(), {},
+                               call_targets, entry_function, b)[0];
+          start_index = ClampIndex(
+              start_index,
+              primitive_util::IsUnsignedIntegralType(
+                  dus_instr
+                      ->operand(i + dus_instr->first_index_operand_number())
+                      ->shape()
+                      .element_type()),
+              dus_instr->shape().dimensions(i) - update_size, b);
+
+          update_indices.push_back(
+              b.create<AddIOp>(input_indices[i], start_index));
+        }
+
+        auto updated_value =
+            ProvideParameter(dus_subgraph, dus_instr, kDUSUpdateIndex,
+                             input_indices, call_targets, entry_function, b)[0];
+        auto insert = b.create<InsertOp>(updated_value, output_tensors[0],
+                                         update_indices);
+
+        return {insert.getResult()};
+      });
+
+  b.create<ReturnOp>(result_tensors);
+  return absl::OkStatus();
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice_mlir.h b/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice_mlir.h
new file mode 100644
index 00000000000000..bac44f13144cd3
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice_mlir.h
@@ -0,0 +1,81 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef XLA_SERVICE_GPU_FUSIONS_IN_PLACE_DYNAMIC_UPDATE_SLICE_MLIR_H_
+#define XLA_SERVICE_GPU_FUSIONS_IN_PLACE_DYNAMIC_UPDATE_SLICE_MLIR_H_
+
+#include <cstdint>
+#include <optional>
+
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/Interfaces/DataLayoutInterfaces.h"  // from @llvm-project
+#include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/service/gpu/fusions/mlir/mlir_fusion_emitter.h"
+#include "xla/service/gpu/hlo_fusion_analysis.h"
+#include "xla/service/gpu/launch_dimensions.h"
+#include "xla/status.h"
+
+namespace xla {
+namespace gpu {
+
+// Fusion node where the root is either:
+// 1. a dynamic-update-slice op
+// 2. a bitcast of a dynamic-update-slice op
+// 3. a tuple op returning the result of several dynamic-update-slice ops
+// 4. a tuple op returning the result of several bitcast
+//    dynamic-update-slice ops
+//
+// Lowers to LLVM via MLIR.
+class MlirInPlaceDynamicUpdateSliceFusion : public MlirFusionEmitterBase {
+ public:
+  explicit MlirInPlaceDynamicUpdateSliceFusion(
+      const HloFusionAnalysis& analysis)
+      : analysis_(analysis),
+        dus_ops_(
+            GetOutputDefiningDynamicUpdateSlices(analysis.fusion_roots())) {}
+
+  static bool IsSupported(const HloFusionAnalysis& analysis);
+
+  LaunchDimensions launch_dimensions() const override;
+
+  std::optional<IndexingMap> ComputeThreadIdToOutputIndexing(
+      int64_t root_index, mlir::MLIRContext* indexing_context) const override {
+    // The mapping cannot be statically computed in general, since the offsets
+    // are unknown.
+    return std::nullopt;
+  }
+
+  std::optional<IndexingMap> ComputeThreadIdToInputIndexing(
+      int64_t root_index, int64_t hero_operand_index,
+      mlir::MLIRContext* indexing_context) const override;
+
+ protected:
+  absl::Status EmitEntryFunction(
+      const mlir_converter::PartitionedComputations& computations,
+      const mlir_converter::CallTargetProvider& call_targets,
+      mlir::func::FuncOp entry_function,
+      const HloFusionInstruction& fusion) const override;
+
+  std::vector<const HloInstruction*> GetInstructionsWithCustomCodegen(
+      const HloFusionInstruction& fusion) const override;
+
+ private:
+  const HloFusionAnalysis& analysis_;
+  std::vector<const HloInstruction*> dus_ops_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // XLA_SERVICE_GPU_FUSIONS_IN_PLACE_DYNAMIC_UPDATE_SLICE_MLIR_H_
diff --git a/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice_mlir_test.cc b/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice_mlir_test.cc
new file mode 100644
index 00000000000000..3aabb901b498c9
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice_mlir_test.cc
@@ -0,0 +1,181 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "xla/service/gpu/fusions/in_place_dynamic_update_slice_mlir.h"
+
+#include <gtest/gtest.h>
+#include "xla/error_spec.h"
+#include "xla/service/gpu/fusions/mlir_emitter_test_base.h"
+#include "xla/service/gpu/model/indexing_test_utils.h"
+#include "tsl/lib/core/status_test_util.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+using MlirInPlaceDynamicUpdateSliceFusionTest =
+    MlirEmitterTestBase<MlirInPlaceDynamicUpdateSliceFusion>;
+
+TEST_F(MlirInPlaceDynamicUpdateSliceFusionTest, ThreadIndexing) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+    HloModule module
+
+    fused_computation {
+      in = f32[20,30] parameter(0)
+      updates = f32[5,6] parameter(1)
+      i0 = s32[] parameter(2)
+      i1 = s32[] parameter(3)
+      ROOT updated = f32[20,30] dynamic-update-slice(in, updates, i0, i1)
+    }
+    ENTRY entry {
+      in = f32[20,30] parameter(0)
+      updates = f32[5,6] parameter(1)
+      i0 = s32[] constant(2)
+      i1 = s32[] constant(3)
+      ROOT fusion = f32[20,30] fusion(in, updates, i0, i1), kind=kLoop, calls=fused_computation
+    }
+  )"));
+  thread_id_printer_.SetSymbolName(0, "chunk_id");
+  thread_id_printer_.SetSymbolName(1, "unroll_id");
+
+  auto* root = module->entry_computation()->root_instruction();
+
+  auto analysis = AnalyzeFusion(*root, device_info_);
+  MlirInPlaceDynamicUpdateSliceFusion fusion(analysis);
+
+  auto thread_id_update_indexing = fusion.ComputeThreadIdToInputIndexing(
+      /*root_index=*/0, /*hero_operand_index=*/1, &mlir_context_);
+  EXPECT_THAT(thread_id_update_indexing->ToString(thread_id_printer_),
+              MatchIndexingString(R"(
+    (th_x, th_y, th_z, bl_x, bl_y, bl_z)[chunk_id, unroll_id] -> (
+    th_x floordiv 6, th_x mod 6)
+    domain:
+    th_x in [0, 29]
+    th_y in [0, 0]
+    th_z in [0, 0]
+    bl_x in [0, 0]
+    bl_y in [0, 0]
+    bl_z in [0, 0]
+    chunk_id in [0, 0]
+    unroll_id in [0, 0]
+  )"));
+  auto thread_id_dst_indexing = fusion.ComputeThreadIdToInputIndexing(
+      /*root_index=*/0, /*hero_operand_index=*/0, &mlir_context_);
+  EXPECT_THAT(thread_id_dst_indexing, ::testing::Eq(std::nullopt));
+}
+
+TEST_F(MlirInPlaceDynamicUpdateSliceFusionTest, SimpleDUS) {
+  auto kHloString = R"(
+    HloModule module
+
+    fused_computation {
+      in = f32[20,30] parameter(0)
+      updates = f32[5,6] parameter(1)
+      i0 = s32[] parameter(2)
+      i1 = s32[] parameter(3)
+      ROOT updated = f32[20,30] dynamic-update-slice(in, updates, i0, i1)
+    }
+    ENTRY entry {
+      in = f32[20,30] parameter(0)
+      updates = f32[5,6] parameter(1)
+      i0 = s32[] constant(2)
+      i1 = s32[] constant(3)
+      ROOT fusion = f32[20,30] fusion(in, updates, i0, i1), kind=kLoop, calls=fused_computation
+    }
+  )";
+  TF_ASSERT_OK(EmitAndCheckIR(kHloString, R"(
+    // CHECK-DAG: #[[MAP_1:.*]] = affine_map<()[s0] -> (s0 floordiv 6)>
+    // CHECK-DAG: #[[MAP_2:.*]] = affine_map<()[s0] -> (s0 mod 6)>
+    // CHECK:     func.func @fused_computation
+    // CHECK-SAME:  %arg0: tensor<20x30xf32>
+    // CHECK-SAME:  %arg1: tensor<5x6xf32>
+    // CHECK-SAME:  %arg2: tensor<i32>
+    // CHECK-SAME:  %arg3: tensor<i32>
+    // CHECK-SAME:  %arg4: tensor<20x30xf32>
+    // CHECK-DAG:   %[[C_24:.*]] = arith.constant 24
+    // CHECK-DAG:   %[[C_15:.*]] = arith.constant 15
+    // CHECK-DAG:   %[[C_0:.*]] = arith.constant 0
+    // CHECK:       %[[THREAD_ID:.*]] = gpu.thread_id  x
+    // CHECK:       %[[INPUT_INDEX_0:.*]] = affine.apply #[[MAP_1]]()[%[[THREAD_ID]]]
+    // CHECK:       %[[INPUT_INDEX_1:.*]] = affine.apply #[[MAP_2]]()[%[[THREAD_ID]]]
+    // CHECK:       %[[I0:.*]] = xla_gpu.pure_call @fused_computation_i0
+    // CHECK:       %[[IDX0:.*]] = arith.index_cast %[[I0]]
+    // CHECK:       %[[MIN0:.*]] = arith.minsi %[[IDX0]], %[[C_15]]
+    // CHECK:       %[[MAX0:.*]] = arith.maxsi %[[MIN0]], %[[C_0]]
+    // CHECK:       %[[ADD0:.*]] = arith.addi %[[INPUT_INDEX_0]], %[[MAX0]]
+    // CHECK:       %[[I1:.*]] = xla_gpu.pure_call @fused_computation_i1
+    // CHECK:       %[[IDX1:.*]] = arith.index_cast %[[I1]]
+    // CHECK:       %[[MIN1:.*]] = arith.minsi %[[IDX1]], %[[C_24]]
+    // CHECK:       %[[MAX1:.*]] = arith.maxsi %[[MIN1]], %[[C_0]]
+    // CHECK:       %[[ADD1:.*]] = arith.addi %[[INPUT_INDEX_1]], %[[MAX1]]
+    // CHECK:       %[[UPDATE:.*]] = xla_gpu.pure_call @fused_computation_updates
+    // CHECK:       %[[INSERT:.*]] = tensor.insert %[[UPDATE:.*]] into %arg4[%[[ADD0]], %[[ADD1]]]
+    // CHECK:       return %[[INSERT]]
+  )"));
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1e-3}));
+}
+
+TEST_F(MlirInPlaceDynamicUpdateSliceFusionTest, OutOfBoundDUS) {
+  auto kHloString = R"(
+    HloModule module
+
+    fused_computation {
+      in = f32[7,8] parameter(0)
+      updates = f32[2,3] parameter(1)
+      i0 = s32[] parameter(2)
+      i1 = s32[] parameter(3)
+      ROOT updated = f32[7,8] dynamic-update-slice(in, updates, i0, i1)
+    }
+    ENTRY entry {
+      in = f32[7,8] parameter(0)
+      updates = f32[2,3] parameter(1)
+      i0 = s32[] constant(-20)
+      i1 = s32[] constant(30)
+      ROOT fusion = f32[7,8] fusion(in, updates, i0, i1), kind=kLoop, calls=fused_computation
+    }
+  )";
+  TF_ASSERT_OK(EmitAndCheckIR(kHloString, R"(
+    // CHECK-DAG: #[[MAP_1:.*]] = affine_map<()[s0] -> (s0 floordiv 3)>
+    // CHECK-DAG: #[[MAP_2:.*]] = affine_map<()[s0] -> (s0 mod 3)>
+    // CHECK:     func.func @fused_computation
+    // CHECK-SAME:  %arg0: tensor<7x8xf32>
+    // CHECK-SAME:  %arg1: tensor<2x3xf32>
+    // CHECK-SAME:  %arg2: tensor<i32>
+    // CHECK-SAME:  %arg3: tensor<i32>
+    // CHECK-SAME:  %arg4: tensor<7x8xf32>
+    // CHECK-DAG:   %[[C_5:.*]] = arith.constant 5
+    // CHECK-DAG:   %[[C_0:.*]] = arith.constant 0
+    // CHECK:       %[[THREAD_ID:.*]] = gpu.thread_id  x
+    // CHECK:       %[[INPUT_INDEX_0:.*]] = affine.apply #[[MAP_1]]()[%[[THREAD_ID]]]
+    // CHECK:       %[[INPUT_INDEX_1:.*]] = affine.apply #[[MAP_2]]()[%[[THREAD_ID]]]
+    // CHECK:       %[[I0:.*]] = xla_gpu.pure_call @fused_computation_i0
+    // CHECK:       %[[IDX0:.*]] = arith.index_cast %[[I0]]
+    // CHECK:       %[[MIN0:.*]] = arith.minsi %[[IDX0]], %[[C_5]]
+    // CHECK:       %[[MAX0:.*]] = arith.maxsi %[[MIN0]], %[[C_0]]
+    // CHECK:       %[[ADD0:.*]] = arith.addi %[[INPUT_INDEX_0]], %[[MAX0]]
+    // CHECK:       %[[I1:.*]] = xla_gpu.pure_call @fused_computation_i1
+    // CHECK:       %[[IDX1:.*]] = arith.index_cast %[[I1]]
+    // CHECK:       %[[MIN1:.*]] = arith.minsi %[[IDX1]], %[[C_5]]
+    // CHECK:       %[[MAX1:.*]] = arith.maxsi %[[MIN1]], %[[C_0]]
+    // CHECK:       %[[ADD1:.*]] = arith.addi %[[INPUT_INDEX_1]], %[[MAX1]]
+    // CHECK:       %[[UPDATE:.*]] = xla_gpu.pure_call @fused_computation_updates
+    // CHECK:       %[[INSERT:.*]] = tensor.insert %[[UPDATE:.*]] into %arg4[%[[ADD0]], %[[ADD1]]]
+    // CHECK:       return %[[INSERT]]
+  )"));
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1e-3}));
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice_test.cc b/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice_test.cc
new file mode 100644
index 00000000000000..c0382560399c14
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/fusions/in_place_dynamic_update_slice_test.cc
@@ -0,0 +1,105 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "xla/service/gpu/fusions/in_place_dynamic_update_slice.h"
+
+#include <optional>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "xla/service/gpu/fusions/fusions.h"
+#include "xla/service/gpu/gpu_device_info_for_tests.h"
+#include "xla/service/gpu/hlo_fusion_analysis.h"
+#include "xla/service/gpu/model/affine_map_printer.h"
+#include "xla/service/gpu/model/indexing_test_utils.h"
+#include "xla/stream_executor/device_description.h"
+#include "xla/tests/hlo_test_base.h"
+#include "tsl/platform/statusor.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class InPlaceDynamicUpdateSliceFusionTest : public HloTestBase {
+ public:
+  void SetUp() override {
+    HloTestBase::SetUp();
+    printer_ =
+        AffineMapPrinter({"th_x", "th_y", "th_z", "bl_x", "bl_y", "bl_z"},
+                         {"chunk_id", "unroll_id"});
+  }
+
+ protected:
+  AffineMapPrinter printer_;
+  mlir::MLIRContext mlir_context_;
+};
+
+TEST_F(InPlaceDynamicUpdateSliceFusionTest, ThreadIndexing) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+    HloModule module
+
+    fused_computation {
+      in = f32[20,30] parameter(0)
+      updates = f32[5,6] parameter(1)
+      i0 = s32[] parameter(2)
+      i1 = s32[] parameter(3)
+      ROOT updated = f32[20,30] dynamic-update-slice(in, updates, i0, i1)
+    }
+    ENTRY entry {
+      in = f32[20,30] parameter(0)
+      updates = f32[5,6] parameter(1)
+      i0 = s32[] constant(2)
+      i1 = s32[] constant(3)
+      ROOT fusion = f32[20,30] fusion(in, updates, i0, i1), kind=kLoop, calls=fused_computation
+    }
+  )")
+                    .value();
+
+  stream_executor::DeviceDescription device_info =
+      TestGpuDeviceInfo::RTXA6000DeviceInfo();
+
+  auto* root = module->entry_computation()->root_instruction();
+  auto analysis_fused = AnalyzeFusion(*root, device_info);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto emitter,
+      GetFusionEmitter(PreBufferAssignmentFusionInfo{analysis_fused}));
+  auto fusion = dynamic_cast<InPlaceDynamicUpdateSliceFusion*>(emitter.get());
+  ASSERT_NE(fusion, nullptr);
+
+  auto thread_id_update_indexing = fusion->ComputeThreadIdToInputIndexing(
+      /*root_index=*/0, /*hero_operand_index=*/1, &mlir_context_);
+  EXPECT_THAT(thread_id_update_indexing->ToString(printer_),
+              MatchIndexingString(R"(
+    (th_x, th_y, th_z, bl_x, bl_y, bl_z)[chunk_id, unroll_id] -> (
+    th_x floordiv 6, th_x mod 6)
+    domain:
+    th_x in [0, 29]
+    th_y in [0, 0]
+    th_z in [0, 0]
+    bl_x in [0, 0]
+    bl_y in [0, 0]
+    bl_z in [0, 0]
+    chunk_id in [0, 0]
+    unroll_id in [0, 0]
+  )"));
+  auto thread_id_dst_indexing = fusion->ComputeThreadIdToInputIndexing(
+      /*root_index=*/0, /*hero_operand_index=*/0, &mlir_context_);
+  EXPECT_THAT(thread_id_dst_indexing, ::testing::Eq(std::nullopt));
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/fusions/input_slices.cc b/third_party/xla/xla/service/gpu/fusions/input_slices.cc
index 85f661a8f125f5..225de1da8be49b 100644
--- a/third_party/xla/xla/service/gpu/fusions/input_slices.cc
+++ b/third_party/xla/xla/service/gpu/fusions/input_slices.cc
@@ -190,8 +190,7 @@ std::optional<IndexingMap> InputSlicesFusion::ComputeThreadIdToOutputIndexing(
   // The implementation requires the shapes and layouts to be the same, but we
   // still use the requested output's shape for clarity.
   const auto& shape = analysis_.fusion_roots()[output_id]->shape();
-  return GetDefaultThreadIdToOutputIndexingMap(launch_dims, unroll_factor_,
-                                               shape, ctx);
+  return GetDefaultThreadIdIndexingMap(launch_dims, unroll_factor_, shape, ctx);
 }
 
 absl::Status InputSlicesFusion::EmitKernel(
diff --git a/third_party/xla/xla/service/gpu/fusions/input_slices_mlir.cc b/third_party/xla/xla/service/gpu/fusions/input_slices_mlir.cc
index c1108ca37e8cd3..16c4fbf5062343 100644
--- a/third_party/xla/xla/service/gpu/fusions/input_slices_mlir.cc
+++ b/third_party/xla/xla/service/gpu/fusions/input_slices_mlir.cc
@@ -59,8 +59,7 @@ MlirInputSlicesFusion::ComputeThreadIdToOutputIndexing(
   // The implementation requires the shapes and layouts to be the same, but we
   // still use the requested output's shape for clarity.
   const auto& shape = analysis_.fusion_roots()[output_id]->shape();
-  return GetDefaultThreadIdToOutputIndexingMap(launch_dims, unroll_factor_,
-                                               shape, ctx);
+  return GetDefaultThreadIdIndexingMap(launch_dims, unroll_factor_, shape, ctx);
 }
 
 LaunchDimensions MlirInputSlicesFusion::launch_dimensions() const {
diff --git a/third_party/xla/xla/service/gpu/fusions/loop.cc b/third_party/xla/xla/service/gpu/fusions/loop.cc
index e7a13200fe391f..e417f96923f4e3 100644
--- a/third_party/xla/xla/service/gpu/fusions/loop.cc
+++ b/third_party/xla/xla/service/gpu/fusions/loop.cc
@@ -217,8 +217,8 @@ LoopFusion::LoopFusion(const HloFusionAnalysis& analysis)
 std::optional<IndexingMap> LoopFusion::ComputeThreadIdToOutputIndexing(
     int64_t root_index, mlir::MLIRContext* ctx) const {
   auto launch_dims = launch_dimensions();
-  return GetDefaultThreadIdToOutputIndexingMap(
-      launch_dims, config_.unroll_factor, GetElementShape(analysis_), ctx);
+  return GetDefaultThreadIdIndexingMap(launch_dims, config_.unroll_factor,
+                                       GetElementShape(analysis_), ctx);
 }
 
 std::optional<IndexingMap> LoopFusion::ComputeThreadIdToInputIndexing(
@@ -239,7 +239,7 @@ std::optional<IndexingMap> LoopFusion::ComputeThreadIdToInputIndexing(
   CHECK_EQ(output_to_input_indexing_set.size(), 1);
   IndexingMap thread_id_to_input_indexing_map = ComposeIndexingMaps(
       *thread_id_to_output_indexing, *output_to_input_indexing_set.begin());
-  thread_id_to_input_indexing_map.Simplify();
+  thread_id_to_input_indexing_map.Simplify(GetIndexingMapForInstruction);
   return thread_id_to_input_indexing_map;
 }
 
diff --git a/third_party/xla/xla/service/gpu/fusions/loop_mlir.cc b/third_party/xla/xla/service/gpu/fusions/loop_mlir.cc
index 82734d06cc9c9a..bf41d50930ea95 100644
--- a/third_party/xla/xla/service/gpu/fusions/loop_mlir.cc
+++ b/third_party/xla/xla/service/gpu/fusions/loop_mlir.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.h"
 #include "xla/service/gpu/fusions/mlir/ir/xla_gpu_ops.h"
 #include "xla/service/gpu/launch_dimensions.h"
+#include "xla/service/gpu/model/indexing_analysis.h"
 #include "xla/shape.h"
 #include "xla/status_macros.h"
 #include "xla/xla_data.pb.h"
@@ -65,8 +66,8 @@ const Shape& GetFusionResultShape(const HloFusionAnalysis& analysis) {
 std::optional<IndexingMap> MlirLoopFusion::ComputeThreadIdToOutputIndexing(
     int64_t root_index, mlir::MLIRContext* ctx) const {
   auto launch_dims = launch_dimensions();
-  return GetDefaultThreadIdToOutputIndexingMap(
-      launch_dims, config_.unroll_factor, GetFusionResultShape(analysis_), ctx);
+  return GetDefaultThreadIdIndexingMap(launch_dims, config_.unroll_factor,
+                                       GetFusionResultShape(analysis_), ctx);
 }
 
 std::optional<IndexingMap> MlirLoopFusion::ComputeThreadIdToInputIndexing(
@@ -87,7 +88,7 @@ std::optional<IndexingMap> MlirLoopFusion::ComputeThreadIdToInputIndexing(
   CHECK_EQ(output_to_input_indexing_set.size(), 1);
   IndexingMap thread_id_to_input_indexing_map = ComposeIndexingMaps(
       *thread_id_to_output_indexing, *output_to_input_indexing_set.begin());
-  thread_id_to_input_indexing_map.Simplify();
+  thread_id_to_input_indexing_map.Simplify(GetIndexingMapForInstruction);
   return thread_id_to_input_indexing_map;
 }
 
diff --git a/third_party/xla/xla/service/gpu/fusions/mlir/BUILD b/third_party/xla/xla/service/gpu/fusions/mlir/BUILD
index 9f08ec5c244089..3e475b1a9cdbeb 100644
--- a/third_party/xla/xla/service/gpu/fusions/mlir/BUILD
+++ b/third_party/xla/xla/service/gpu/fusions/mlir/BUILD
@@ -33,6 +33,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:DataLayoutInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LLVMDialect",
@@ -69,24 +70,29 @@ cc_library(
         "//xla/mlir_hlo",
         "//xla/mlir_hlo:map_mhlo_to_scalar_op",
         "//xla/mlir_hlo:type_conversion",
+        "//xla/service:algorithm_util",
         "//xla/service/gpu:hlo_traversal",
         "//xla/service/gpu/fusions/mlir/ir:xla_gpu",
         "//xla/service/gpu/model:indexing_analysis",
         "//xla/service/gpu/model:indexing_map",
+        "//xla/service/llvm_ir:llvm_util",
         "//xla/stream_executor:device_description",
         "//xla/translate/hlo_to_mhlo:hlo_utils",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:node_hash_map",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:AffineDialect",
         "@llvm-project//mlir:AffineUtils",
         "@llvm-project//mlir:ArithDialect",
+        "@llvm-project//mlir:DataLayoutInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LLVMDialect",
@@ -174,9 +180,11 @@ cc_library(
         "@llvm-project//mlir:AffineToStandard",
         "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:BufferizationDialect",
+        "@llvm-project//mlir:BufferizationInterfaces",
         "@llvm-project//mlir:BuiltinToLLVMIRTranslation",
         "@llvm-project//mlir:ComplexToStandard",
         "@llvm-project//mlir:ControlFlowDialect",
+        "@llvm-project//mlir:DataLayoutInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FuncExtensions",
         "@llvm-project//mlir:GPUDialect",
@@ -218,9 +226,10 @@ xla_cc_test(
         "@llvm-project//llvm:Support",
         "@llvm-project//llvm:ir_headers",
         "@llvm-project//mlir:AffineDialect",
-        "@llvm-project//mlir:BufferizationDialect",
+        "@llvm-project//mlir:BufferizationInterfaces",
         "@llvm-project//mlir:BuiltinToLLVMIRTranslation",
         "@llvm-project//mlir:ComplexDialect",
+        "@llvm-project//mlir:DataLayoutInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FuncExtensions",
         "@llvm-project//mlir:GPUDialect",
@@ -276,6 +285,7 @@ cc_library(
         "//xla/mlir_hlo:map_mhlo_to_scalar_op",
         "//xla/service/gpu:ir_emission_utils",
         "//xla/service/gpu/fusions/mlir/ir:xla_gpu",
+        "//xla/service/gpu/model:indexing_analysis",
         "//xla/service/gpu/model:indexing_map",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -291,6 +301,7 @@ cc_library(
         "@llvm-project//mlir:ComplexDialect",
         "@llvm-project//mlir:ComplexToLLVM",
         "@llvm-project//mlir:ControlFlowToLLVM",
+        "@llvm-project//mlir:DataLayoutInterfaces",
         "@llvm-project//mlir:DialectUtils",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FuncToLLVM",
diff --git a/third_party/xla/xla/service/gpu/fusions/mlir/computation_partitioner.cc b/third_party/xla/xla/service/gpu/fusions/mlir/computation_partitioner.cc
index 472efdb0197501..ad46b914a9e484 100644
--- a/third_party/xla/xla/service/gpu/fusions/mlir/computation_partitioner.cc
+++ b/third_party/xla/xla/service/gpu/fusions/mlir/computation_partitioner.cc
@@ -341,15 +341,19 @@ mlir::func::FuncOp CreateSubgraphMlirFunction(
     return *ConvertPrimitiveTypeToMlirType(shape.element_type(), b);
   };
 
-  const xla::Shape* one_root_shape = nullptr;
+  const xla::Shape* first_root_shape = nullptr;
   for (auto* root : subgraph.roots) {
     if (root->shape().IsTuple()) {
       for (auto& shape : root->shape().tuple_shapes()) {
-        one_root_shape = &shape;
+        if (!first_root_shape) {
+          first_root_shape = &shape;
+        }
         result_types.push_back(element_type(shape));
       }
     } else {
-      one_root_shape = &root->shape();
+      if (!first_root_shape) {
+        first_root_shape = &root->shape();
+      }
       result_types.push_back(element_type(root->shape()));
     }
   }
@@ -362,13 +366,13 @@ mlir::func::FuncOp CreateSubgraphMlirFunction(
       parameter_types.push_back(TensorShapeToMlirType(param->shape(), b));
       arg_attrs.emplace_back();
     }
-    for (int dim = 0; dim < one_root_shape->rank(); ++dim) {
+    for (int dim = 0; dim < first_root_shape->rank(); ++dim) {
       parameter_types.push_back(b.getIndexType());
       arg_attrs.emplace_back(mlir::DictionaryAttr::get(
           b.getContext(),
-          {b.getNamedAttr(
-              "xla.range",
-              b.getIndexArrayAttr({0, one_root_shape->dimensions(dim) - 1}))}));
+          {b.getNamedAttr("xla.range",
+                          b.getIndexArrayAttr(
+                              {0, first_root_shape->dimensions(dim) - 1}))}));
     }
 
     // Populate arguments for injected parameters (values that are computed
diff --git a/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.cc b/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.cc
index 0701425c81ca5b..4e3e181f94c817 100644
--- a/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.cc
+++ b/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.cc
@@ -25,9 +25,11 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/node_hash_map.h"
 #include "absl/log/check.h"
+#include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/STLExtras.h"
@@ -44,6 +46,7 @@ limitations under the License.
 #include "mlir/IR/Attributes.h"  // from @llvm-project
 #include "mlir/IR/BuiltinAttributeInterfaces.h"  // from @llvm-project
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
 #include "mlir/IR/IRMapping.h"  // from @llvm-project
 #include "mlir/IR/ImplicitLocOpBuilder.h"  // from @llvm-project
@@ -64,10 +67,13 @@ limitations under the License.
 #include "xla/mlir_hlo/mhlo/transforms/map_mhlo_to_scalar_op.h"
 #include "xla/mlir_hlo/mhlo/utils/type_conversion.h"
 #include "xla/primitive_util.h"
+#include "xla/service/algorithm_util.h"
+#include "xla/service/gpu/fusions/mlir/computation_partitioner.h"
 #include "xla/service/gpu/fusions/mlir/ir/xla_gpu_ops.h"
 #include "xla/service/gpu/hlo_traversal.h"
 #include "xla/service/gpu/model/indexing_analysis.h"
 #include "xla/service/gpu/model/indexing_map.h"
+#include "xla/service/llvm_ir/llvm_util.h"
 #include "xla/shape_util.h"
 #include "xla/status_macros.h"
 #include "xla/stream_executor/device_description.h"
@@ -83,9 +89,11 @@ namespace {
 using llvm::SmallVector;
 using llvm::SmallVectorImpl;
 using mlir::Block;
+using mlir::FloatType;
 using mlir::ImplicitLocOpBuilder;
 using mlir::IRMapping;
 using mlir::Location;
+using mlir::MLIRContext;
 using mlir::OpBuilder;
 using mlir::Value;
 using mlir::ValueRange;
@@ -159,8 +167,7 @@ static auto& kUnsupportedOps =
                                         HloOpcode::kCall};
 
 static auto& kUnimplementedOps = *new absl::flat_hash_set<HloOpcode>{
-    HloOpcode::kConvolution, HloOpcode::kDot, HloOpcode::kMap,
-    HloOpcode::kReduceWindow};
+    HloOpcode::kConvolution, HloOpcode::kMap};
 
 bool IsUnsupportedConstant(const HloInstruction* instr) {
   return instr->opcode() == HloOpcode::kConstant &&
@@ -186,12 +193,14 @@ bool IsUnsupportedTuple(const HloInstruction* instr) {
     return true;
   }
 
-  // All tuple elements must have the same dimensions (element types may
-  // differ).
+  // All tuple elements must have bitcast-compatible dimensions (element types
+  // may differ).
   auto first_shape = instr->shape().tuple_shapes(0);
   for (int i = 1; i < instr->operand_count(); ++i) {
-    if (instr->shape().tuple_shapes(i).dimensions() !=
-        first_shape.dimensions()) {
+    const auto& tuple_shape = instr->shape().tuple_shapes(i);
+    if (!ShapeUtil::EqualIgnoringElementType(tuple_shape, first_shape) &&
+        !ShapeUtil::IsReshapeOrTransposeBitcast(tuple_shape, first_shape,
+                                                /*ignore_element_type=*/true)) {
       return true;
     }
   }
@@ -246,57 +255,115 @@ absl::StatusOr<SmallVector<Value>> EmitReduce(
     const HloInstruction* instr, ValueRange indices,
     const OperandProvider& operand_provider,
     const CallTargetProvider& call_target_provider, ImplicitLocOpBuilder& b) {
-  SmallVector<Value> reduction_indices(indices);
-  SmallVector<Value> accumulators;
+  auto* mlir_context = b.getContext();
+  HloInstructionIndexing indexing =
+      ComputeOutputToInputIndexing(instr, 0, mlir_context);
+  const auto& indexing_map = *indexing.indexing_maps[0].begin();
+
+  SmallVector<Value> init_values;
   for (int i = instr->operand_count() / 2; i < instr->operand_count(); ++i) {
-    TF_ASSIGN_OR_RETURN(accumulators.emplace_back(),
+    TF_ASSIGN_OR_RETURN(init_values.emplace_back(),
                         GetSingleOperandValue(operand_provider, instr, i, {}));
     // Convert back to signed type.
     TF_ASSIGN_OR_RETURN(auto element_mlir_type,
                         ConvertPrimitiveTypeToMlirType(
                             instr->operand(i)->shape().element_type(), b));
-    accumulators.back() = b.create<mlir::UnrealizedConversionCastOp>(
-                               element_mlir_type, accumulators.back())
-                              .getResult(0);
+    init_values.back() = b.create<mlir::UnrealizedConversionCastOp>(
+                              element_mlir_type, init_values.back())
+                             .getResult(0);
   }
-  auto dims = llvm::to_vector(instr->dimensions());
-  absl::c_sort(dims);
-  ForOp outermost_loop = nullptr;
-  for (int dim : dims) {
-    auto bound = instr->operands()[0]->shape().dimensions(dim);
-    auto loop = b.create<ForOp>(b.create<ConstantIndexOp>(0),
-                                b.create<ConstantIndexOp>(bound),
-                                b.create<ConstantIndexOp>(1), accumulators);
-    if (outermost_loop == nullptr) {
-      outermost_loop = loop;
-    } else {
-      b.create<YieldOp>(loop.getResults());
+
+  auto body =
+      [&](ValueRange iter_args, ValueRange dim_values,
+          ValueRange symbol_values) -> absl::StatusOr<SmallVector<Value>> {
+    auto indices = ApplyAffineMap(indexing_map.GetAffineMap(), dim_values,
+                                  symbol_values, b);
+
+    SmallVector<Value> args{iter_args};
+    for (int i = 0; i < instr->operand_count() / 2; ++i) {
+      TF_ASSIGN_OR_RETURN(
+          args.emplace_back(),
+          GetSingleOperandValue(operand_provider, instr, i, indices));
+      // Convert back to signed type.
+      TF_ASSIGN_OR_RETURN(auto element_mlir_type,
+                          ConvertPrimitiveTypeToMlirType(
+                              instr->operand(i)->shape().element_type(), b));
+      args.back() = b.create<mlir::UnrealizedConversionCastOp>(
+                         element_mlir_type, args.back())
+                        .getResult(0);
     }
-    b.setInsertionPointToStart(loop.getBody());
-    reduction_indices.insert(reduction_indices.begin() + dim,
-                             loop.getInductionVar());
-    accumulators = {loop.getRegionIterArgs().begin(),
-                    loop.getRegionIterArgs().end()};
-  }
-  SmallVector<Value> args = accumulators;
-  for (int i = 0; i < instr->operand_count() / 2; ++i) {
+    auto reducer = call_target_provider(
+        instr->called_computations().front()->root_instruction());
+    return b.create<mlir::func::CallOp>(reducer, args).getResults();
+  };
+
+  TF_ASSIGN_OR_RETURN(
+      auto result,
+      EmitLoopNestWithStatus(b, indices, init_values, indexing_map, body));
+
+  return ConvertToSignless(result, b);
+}
+
+absl::StatusOr<SmallVector<Value>> EmitReduceWindow(
+    const HloInstruction* instr, mlir::Type result_element_type,
+    ValueRange indices, const OperandProvider& operand_provider,
+    const CallTargetProvider& call_target_provider, ImplicitLocOpBuilder& b) {
+  MLIRContext* mlir_context = b.getContext();
+  HloInstructionIndexing indexing =
+      ComputeOutputToInputIndexing(instr, 0, mlir_context);
+  auto indexing_map = *indexing.indexing_maps[0].begin();
+  indexing_map.RescaleSymbols();
+
+  auto reduce_window = DynCast<HloReduceWindowInstruction>(instr);
+  CHECK(reduce_window != nullptr);
+
+  SmallVector<Value> init_values;
+  for (auto [index, init_value] :
+       llvm::enumerate(reduce_window->init_values())) {
     TF_ASSIGN_OR_RETURN(
-        args.emplace_back(),
-        GetSingleOperandValue(operand_provider, instr, i, reduction_indices));
+        init_values.emplace_back(),
+        GetSingleOperandValue(operand_provider, instr,
+                              reduce_window->input_count() + index, {}));
     // Convert back to signed type.
-    TF_ASSIGN_OR_RETURN(auto element_mlir_type,
-                        ConvertPrimitiveTypeToMlirType(
-                            instr->operand(i)->shape().element_type(), b));
-    args.back() = b.create<mlir::UnrealizedConversionCastOp>(element_mlir_type,
-                                                             args.back())
-                      .getResult(0);
+    TF_ASSIGN_OR_RETURN(
+        auto element_mlir_type,
+        ConvertPrimitiveTypeToMlirType(init_value->shape().element_type(), b));
+    init_values.back() = b.create<mlir::UnrealizedConversionCastOp>(
+                              element_mlir_type, init_values.back())
+                             .getResult(0);
   }
-  auto reducer = call_target_provider(
-      instr->called_computations().front()->root_instruction());
-  b.create<YieldOp>(b.create<mlir::func::CallOp>(reducer, args).getResults());
 
-  b.setInsertionPointAfter(outermost_loop);
-  return ConvertToSignless(outermost_loop.getResults(), b);
+  auto body =
+      [&](ValueRange iter_args, ValueRange dim_values,
+          ValueRange symbol_values) -> absl::StatusOr<SmallVector<Value>> {
+    auto indices = ApplyAffineMap(indexing_map.GetAffineMap(), dim_values,
+                                  symbol_values, b);
+
+    SmallVector<Value> args{iter_args};
+    for (auto [index, input] : llvm::enumerate(reduce_window->inputs())) {
+      TF_ASSIGN_OR_RETURN(
+          args.emplace_back(),
+          GetSingleOperandValue(operand_provider, instr, index, indices));
+
+      // Convert back to signed type.
+      TF_ASSIGN_OR_RETURN(
+          auto element_mlir_type,
+          ConvertPrimitiveTypeToMlirType(input->shape().element_type(), b));
+      args.back() = b.create<mlir::UnrealizedConversionCastOp>(
+                         element_mlir_type, args.back())
+                        .getResult(0);
+    }
+
+    auto reducer = call_target_provider(
+        instr->called_computations().front()->root_instruction());
+    return b.create<mlir::func::CallOp>(reducer, args).getResults();
+  };
+
+  TF_ASSIGN_OR_RETURN(
+      auto result,
+      EmitLoopNestWithStatus(b, indices, init_values, indexing_map, body));
+
+  return ConvertToSignless(result, b);
 }
 
 absl::StatusOr<SmallVector<Value>> EmitConcat(
@@ -491,6 +558,103 @@ absl::StatusOr<SmallVector<Value>> EmitPad(
   return if_op.getResults();
 }
 
+absl::StatusOr<Value> EmitFloatCast(Value value, mlir::Type target_type,
+                                    ImplicitLocOpBuilder& b) {
+  if (value.getType().getIntOrFloatBitWidth() <
+      target_type.getIntOrFloatBitWidth()) {
+    return b.create<arith::ExtFOp>(target_type, value);
+  }
+  if (value.getType().getIntOrFloatBitWidth() >
+      target_type.getIntOrFloatBitWidth()) {
+    return b.create<arith::TruncFOp>(target_type, value);
+  }
+  return value;
+}
+
+absl::StatusOr<Value> EmitMulAdd(Value lhs, Value rhs, Value accumulator,
+                                 mlir::Type result_element_type,
+                                 mlir::Type accumulator_type,
+                                 ImplicitLocOpBuilder& b) {
+  if (result_element_type.isa<FloatType>()) {
+    if (result_element_type.isBF16()) {
+      lhs = b.create<arith::ExtFOp>(b.getF32Type(), lhs);
+      rhs = b.create<arith::ExtFOp>(b.getF32Type(), rhs);
+    }
+    TF_ASSIGN_OR_RETURN(
+        Value casted,
+        EmitFloatCast(b.create<arith::MulFOp>(lhs, rhs), accumulator_type, b));
+    return b.create<arith::AddFOp>(accumulator, casted);
+  }
+  if (result_element_type.isInteger(1)) {
+    return b.create<arith::OrIOp>(accumulator,
+                                  b.create<arith::AndIOp>(lhs, rhs));
+  }
+  return b.create<arith::AddIOp>(accumulator,
+                                 b.create<arith::MulIOp>(lhs, rhs));
+}
+
+absl::StatusOr<SmallVector<Value>> EmitDot(
+    const HloInstruction* instr, mlir::Type result_element_type,
+    ValueRange indices, const OperandProvider& operand_provider,
+    ImplicitLocOpBuilder& b) {
+  VLOG(1) << "EmitDot: " << instr->ToString() << " "
+          << llvm_ir::DumpToString(result_element_type);
+
+  if (!algorithm_util::IsSupportedByElementalIrEmitter(
+          instr->precision_config().algorithm())) {
+    return absl::InvalidArgumentError(
+        absl::StrFormat("Algorithm not supported by the ElementalIrEmitter: %s",
+                        PrecisionConfig::Algorithm_Name(
+                            instr->precision_config().algorithm())));
+  }
+  auto* dot = DynCast<HloDotInstruction>(instr);
+  TF_RET_CHECK(dot != nullptr);
+  if (dot->sparse_operands()) {
+    return absl::UnimplementedError(
+        "Sparse dot is supported by Triton emitter only.");
+  }
+
+  HloInstructionIndexing indexing =
+      ComputeOutputToInputIndexing(instr, /*output_id=*/0, b.getContext());
+  const IndexingMap& lhs_indexing_map = *indexing.indexing_maps.at(0).begin();
+  const IndexingMap& rhs_indexing_map = *indexing.indexing_maps.at(1).begin();
+
+  const mlir::Type accumulator_type =
+      result_element_type.isBF16() ? b.getF32Type() : result_element_type;
+  Value accum_init_value =
+      b.create<ConstantOp>(b.getZeroAttr(accumulator_type)).getResult();
+
+  auto body =
+      [&](ValueRange iter_args, ValueRange dim_values,
+          ValueRange symbol_values) -> absl::StatusOr<SmallVector<Value>> {
+    llvm::SmallVector<Value> lhs_indices = ApplyAffineMap(
+        lhs_indexing_map.GetAffineMap(), dim_values, symbol_values, b);
+    llvm::SmallVector<Value> rhs_indices = ApplyAffineMap(
+        rhs_indexing_map.GetAffineMap(), dim_values, symbol_values, b);
+
+    TF_ASSIGN_OR_RETURN(Value lhs_value, GetSingleOperandValue(
+                                             operand_provider, instr,
+                                             /*operand_index=*/0, lhs_indices));
+    TF_ASSIGN_OR_RETURN(Value rhs_value, GetSingleOperandValue(
+                                             operand_provider, instr,
+                                             /*operand_index=*/1, rhs_indices));
+    Value accum = iter_args[0];
+
+    TF_ASSIGN_OR_RETURN(
+        accum, EmitMulAdd(lhs_value, rhs_value, accum, result_element_type,
+                          accumulator_type, b));
+    return {{accum}};
+  };
+
+  TF_ASSIGN_OR_RETURN(SmallVector<Value> results,
+                      EmitLoopNestWithStatus(b, indices, {accum_init_value},
+                                             lhs_indexing_map, body));
+  if (result_element_type.isBF16()) {
+    results[0] = b.create<arith::TruncFOp>(b.getBF16Type(), results[0]);
+  }
+  return results;
+}
+
 absl::StatusOr<SmallVector<Value>> EmitParameter(const HloInstruction* instr,
                                                  mlir::func::FuncOp this_fn,
                                                  ValueRange indices,
@@ -542,6 +706,8 @@ Value ApplyAffineExpr(mlir::AffineExpr expr, ValueRange dims,
 
 SmallVector<Value> ApplyAffineMap(mlir::AffineMap map, ValueRange dims,
                                   ValueRange symbols, ImplicitLocOpBuilder& b) {
+  CHECK_EQ(map.getNumDims(), dims.size());
+  CHECK_EQ(map.getNumSymbols(), symbols.size());
   SmallVector<Value> result;
   result.reserve(map.getNumResults());
   for (auto expr : map.getResults()) {
@@ -570,8 +736,8 @@ Value CheckConstraints(const IndexingMap& map, ValueRange dims,
         ret, CheckConstraint(ApplyAffineExpr(expression, dims, symbols, b),
                              range, b));
   }
-  for (auto&& [index, range] : llvm::enumerate(map.GetDimensionRanges())) {
-    ret = b.create<AndIOp>(ret, CheckConstraint(dims[index], range, b));
+  for (auto&& [index, bound] : llvm::enumerate(map.GetDimensionBounds())) {
+    ret = b.create<AndIOp>(ret, CheckConstraint(dims[index], bound, b));
   }
   return ret;
 }
@@ -604,6 +770,7 @@ absl::StatusOr<SmallVector<Value>> HloToMlir(
     result_element_type = sign_converter.convertType(element_mlir_type);
   }
 
+  auto* mlir_context = builder.getContext();
   // Handle ops that aren't elementwise and aren't just indexing
   // transformations.
   switch (instr->opcode()) {
@@ -639,18 +806,39 @@ absl::StatusOr<SmallVector<Value>> HloToMlir(
     case HloOpcode::kPad:
       return EmitPad(instr, result_element_type, indices, operand_provider,
                      builder);
+    case HloOpcode::kDot:
+      return EmitDot(instr, result_element_type, indices, operand_provider,
+                     builder);
     case HloOpcode::kParameter:
       return EmitParameter(instr, this_fn, indices, builder);
     case HloOpcode::kReduce:
       return EmitReduce(instr, indices, operand_provider, call_target_provider,
                         builder);
+    case HloOpcode::kReduceWindow:
+      return EmitReduceWindow(instr, result_element_type, indices,
+                              operand_provider, call_target_provider, builder);
     case HloOpcode::kTuple: {
       CHECK(!IsUnsupportedTuple(instr));
+      const auto& first_shape = instr->shape().tuple_shapes(0);
+      CHECK_EQ(first_shape.rank(), indices.size())
+          << "Indices for tuple must be for the first tuple element";
       SmallVector<Value> operands;
       for (int i = 0; i < instr->operand_count(); ++i) {
+        llvm::SmallVector<Value> operand_indices;
+        // The tuple shapes only need to be bitcast compatible, so insert
+        // bitcasts where necessary.
+        if (i > 0 && !ShapeUtil::EqualIgnoringElementType(
+                         first_shape, instr->operand(i)->shape())) {
+          auto operand_map = GetBitcastMap(
+              first_shape, instr->operand(i)->shape(), mlir_context);
+          operand_indices =
+              ApplyAffineMap(operand_map.GetAffineMap(), indices, {}, builder);
+        } else {
+          operand_indices = indices;
+        }
         TF_ASSIGN_OR_RETURN(
             operands.emplace_back(),
-            GetSingleOperandValue(operand_provider, instr, i, indices));
+            GetSingleOperandValue(operand_provider, instr, i, operand_indices));
       }
       return operands;
     }
@@ -674,8 +862,7 @@ absl::StatusOr<SmallVector<Value>> HloToMlir(
     arg_types.push_back(operand_element_type);
   }
   auto input_indices = GetInputIndices(
-      ComputeOutputToInputIndexing(instr, 0, builder.getContext()), indices,
-      builder);
+      ComputeOutputToInputIndexing(instr, 0, mlir_context), indices, builder);
   SmallVector<Value> operands;
   for (auto&& [operand_number, operand_indices] :
        llvm::enumerate(input_indices)) {
@@ -1039,12 +1226,9 @@ void GetLoopBoundsFromIndexingMap(ImplicitLocOpBuilder& b,
                                   SmallVectorImpl<Value>* steps) {
   Value c1 = b.create<ConstantIndexOp>(1);
 
-  for (const Interval& range : indexing_map.GetSymbolRanges()) {
-    lbs->push_back(b.create<ConstantIndexOp>(range.lower));
-    ubs->push_back(b.create<ConstantIndexOp>(range.upper + 1));
-    // Note that this is not optimal, when there are mod constraints on symbols,
-    // e.g. for reduce-window. In that case we have to extract loop steps from
-    // the mod constraints.
+  for (const Interval& bound : indexing_map.GetSymbolBounds()) {
+    lbs->push_back(b.create<ConstantIndexOp>(bound.lower));
+    ubs->push_back(b.create<ConstantIndexOp>(bound.upper + 1));
     steps->push_back(c1);
   }
 }
@@ -1088,9 +1272,10 @@ absl::Status SubgraphToMlirFunction(
 SmallVector<Value> EmitLoopNest(
     ImplicitLocOpBuilder& b, ValueRange dim_values, ValueRange iter_args_inits,
     const IndexingMap& indexing_map,
-    const std::function<
-        SmallVector<Value>(ValueRange /*iter_args*/, ValueRange /*dim_values*/,
-                           ValueRange /*symbol_values*/)>& create_body) {
+    mlir::function_ref<SmallVector<Value>(ValueRange /*iter_args*/,
+                                          ValueRange /*dim_values*/,
+                                          ValueRange /*symbol_values*/)>
+        create_body) {
   SmallVector<Value, 4> lbs, ubs, steps;
   GetLoopBoundsFromIndexingMap(b, indexing_map, &lbs, &ubs, &steps);
 
@@ -1120,6 +1305,34 @@ SmallVector<Value> EmitLoopNest(
   return loop_nest.results;
 }
 
+absl::StatusOr<SmallVector<Value>> EmitLoopNestWithStatus(
+    ImplicitLocOpBuilder& b, ValueRange dim_values, ValueRange iter_args_inits,
+    const IndexingMap& indexing_map,
+    mlir::function_ref<absl::StatusOr<SmallVector<Value>>(
+        ValueRange /*iter_args*/, ValueRange /*dim_values*/,
+        ValueRange /*symbol_values*/)>
+        create_body) {
+  absl::Status status = absl::OkStatus();
+
+  auto result = EmitLoopNest(
+      b, dim_values, iter_args_inits, indexing_map,
+      [&](ValueRange iter_args, ValueRange dim_values,
+          ValueRange symbol_values) -> SmallVector<Value> {
+        auto body_result = create_body(iter_args, dim_values, symbol_values);
+        if (!body_result.ok()) {
+          status = std::move(body_result.status());
+          return SmallVector<Value>{};
+        }
+
+        return std::move(body_result.value());
+      });
+
+  if (!status.ok()) {
+    return status;
+  }
+  return result;
+}
+
 mlir::Value ClampIndex(mlir::Value index, bool is_unsigned, int64_t high,
                        ImplicitLocOpBuilder& b) {
   auto zero = b.create<ConstantOp>(b.getIndexAttr(0));
diff --git a/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.h b/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.h
index 186c83765249ee..85d48087a0e983 100644
--- a/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.h
+++ b/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "mlir/IR/ImplicitLocOpBuilder.h"  // from @llvm-project
 #include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/IR/ValueRange.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/service/gpu/fusions/mlir/computation_partitioner.h"
@@ -97,9 +98,20 @@ mlir::Value CheckConstraints(const IndexingMap& map, mlir::ValueRange dims,
 llvm::SmallVector<mlir::Value> EmitLoopNest(
     mlir::ImplicitLocOpBuilder& b, mlir::ValueRange dim_values,
     mlir::ValueRange iter_args_inits, const IndexingMap& indexing_map,
-    const std::function<llvm::SmallVector<mlir::Value>(
+    mlir::function_ref<llvm::SmallVector<mlir::Value>(
         mlir::ValueRange iter_args, mlir::ValueRange dim_values,
-        mlir::ValueRange symbol_values)>& create_body);
+        mlir::ValueRange symbol_values)>
+        create_body);
+
+// Same as EmitLoopNest, but the body building function can return an error
+// which gets returned from EmitLoopNestWithStatus.
+absl::StatusOr<llvm::SmallVector<mlir::Value>> EmitLoopNestWithStatus(
+    mlir::ImplicitLocOpBuilder& b, mlir::ValueRange dim_values,
+    mlir::ValueRange iter_args_inits, const IndexingMap& indexing_map,
+    mlir::function_ref<absl::StatusOr<llvm::SmallVector<mlir::Value>>(
+        mlir::ValueRange iter_args, mlir::ValueRange dim_values,
+        mlir::ValueRange symbol_values)>
+        create_body);
 
 // Clamps `index` to [0, high] boundaries.
 mlir::Value ClampIndex(mlir::Value index, bool is_unsigned, int64_t high,
diff --git a/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir_test.cc b/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir_test.cc
index cd07c607d11f93..1acdff315457e4 100644
--- a/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir_test.cc
+++ b/third_party/xla/xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir_test.cc
@@ -17,7 +17,6 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/status/status.h"
 #include "llvm/Support/raw_ostream.h"
@@ -199,6 +198,94 @@ TEST_F(ElementalHloToMlirTest, ReduceUnsigned) {
   )"));
 }
 
+TEST_F(ElementalHloToMlirTest, ReduceWindow) {
+  TF_EXPECT_OK(Run(R"(
+    add {
+      p0 = f32[] parameter(0)
+      p1 = f32[] parameter(1)
+      ROOT sum = f32[] add(p0, p1)
+    }
+
+    ENTRY main {
+      p0 = f32[42,12,8] parameter(0)
+      p1 = f32[] parameter(1)
+      ROOT r = f32[42,3,8] reduce-window(p0, p1), window={
+                                                size=1x1x7
+                                                stride=1x4x1
+                                                pad=0_0x0_0x3_3
+                                               },
+                                               to_apply=add
+    })",
+                   R"(
+    // CHECK:      @main_r(
+    // CHECK-SAME:   %[[ARG0:.*]]: tensor<42x12x8xf32>
+    // CHECK-SAME:   %[[ARG1:.*]]: tensor<f32>
+    // CHECK-SAME:   %[[X:arg[0-9]*]]: index {{[^}]*}}},
+    // CHECK-SAME:   %[[Y:arg[0-9]*]]: index {{[^}]*}}},
+    // CHECK-SAME:   %[[Z:arg[0-9]*]]: index {{[^}]*}}}) -> f32
+    // CHECK-DAG:  %[[C10:.*]] = arith.constant 10
+    // CHECK-DAG:  %[[C0:.*]] = arith.constant 0
+    // CHECK-DAG:  %[[C1:.*]] = arith.constant 1
+    // CHECK-DAG:  %[[C7:.*]] = arith.constant 7
+    // CHECK:      %[[INIT:.*]] = tensor.extract %[[ARG1]][]
+    // CHECK:      %[[RET:.*]] = scf.for %[[I:.*]] = %[[C0]] to %[[C7]]
+    // CHECK-SAME:   step %[[C1]] iter_args(%[[ACC:.*]] = %[[INIT]])
+    // CHECK:      %[[J:.*]] = affine.apply affine_map<()[s0] ->
+    // CHECK-SAME: (s0 * 4)>()[%[[Y]]]
+    // CHECK:      %[[K:.*]] = affine.apply affine_map<()[s0, s1] ->
+    // CHECK-SAME: (s0 + s1 - 3)>()[%[[I]], %[[Z]]]
+    // CHECK:          %[[VAL:.*]] = tensor.extract %[[ARG0]]
+    // CHECK-SAME:        [%[[X]], %[[J]], %[[K]]]
+    // CHECK:          %[[UPD:.*]] = func.call @add_sum(%[[ACC]],
+    // CHECK-SAME:                                      %[[VAL]])
+    // CHECK:          scf.yield %[[UPD]]
+    // CHECK:        }
+    // CHECK:      }
+    // CHECK:      return %[[RET]]
+  )"));
+}
+
+TEST_F(ElementalHloToMlirTest, ReduceWindowWithRescaling) {
+  TF_EXPECT_OK(Run(R"(
+    add {
+      p0 = f32[] parameter(0)
+      p1 = f32[] parameter(1)
+      ROOT sum = f32[] add(p0, p1)
+    }
+
+    ENTRY main {
+      p0 = f32[42,12,8] parameter(0)
+      p1 = f32[] parameter(1)
+      ROOT r = f32[19,12,8] reduce-window(p0, p1), window={
+                                                size=8x1x1
+                                                stride=4x1x1
+                                                pad=0_0x0_0x0_0
+                                                lhs_dilate=2x1x1
+                                               },
+                                               to_apply=add
+    })",
+                   R"(
+    // CHECK:      @main_r(
+    // CHECK-SAME:   %[[ARG0:.*]]: tensor<42x12x8xf32>
+    // CHECK-SAME:   %[[ARG1:.*]]: tensor<f32>
+    // CHECK-SAME:   %[[X:arg[0-9]*]]: index {{[^}]*}}},
+    // CHECK-SAME:   %[[Y:arg[0-9]*]]: index {{[^}]*}}},
+    // CHECK-SAME:   %[[Z:arg[0-9]*]]: index {{[^}]*}}}) -> f32
+    // CHECK-DAG:  %[[C0:.*]] = arith.constant 0 : index
+    // CHECK-DAG:  %[[C1:.*]] = arith.constant 1 : index
+    // CHECK-DAG:  %[[C4:.*]] = arith.constant 4 : index
+
+    // We have a window size of 8, but expect a loop from 0 to 4
+    // due to the base dilation of 2 and the applied symbol rescaling:
+    // CHECK:      scf.for %[[I:.*]] = %[[C0]] to %[[C4]] step %[[C1]]
+    // CHECK:      %[[K:.*]] = affine.apply affine_map<()[s0, s1] ->
+    // If symbol rescaling wasn't working we would have a
+    // `s0 floordiv <base_dilation>` in the map:
+    // CHECK-SAME: (s0 + s1 * 2)>()[%[[I]], %[[X]]]
+    // CHECK:      tensor.extract %[[ARG0]][%[[K]], %[[Y]], %[[Z]]]
+  )"));
+}
+
 TEST_F(ElementalHloToMlirTest, Concatenate) {
   TF_EXPECT_OK(Run(R"(
     ENTRY main {
@@ -388,6 +475,289 @@ TEST_F(ElementalHloToMlirTest, PadUnsigned) {
   )"));
 }
 
+TEST_F(ElementalHloToMlirTest, DotWithF32Type) {
+  TF_EXPECT_OK(Run(R"(
+    ENTRY main {
+      p0 = f32[3, 4] parameter(0)
+      p1 = f32[4, 5] parameter(1)
+      ROOT dot = f32[3, 5] dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    })",
+                   R"(
+    // CHECK:      @main_dot(
+    // CHECK-SAME: %[[A:.*]]: tensor<3x4xf32>, %[[B:.*]]: tensor<4x5xf32>,
+    // CHECK-SAME: %[[I:.*]]: index {xla.range = [0 : index, 2 : index]},
+    // CHECK-SAME: %[[J:.*]]: index {xla.range = [0 : index, 4 : index]})
+    // CHECK-SAME: -> f32
+    // CHECK-SAME: {
+    // CHECK-DAG:    %[[ACCUM_INIT:.*]] = arith.constant 0.000000e+00 : f32
+    // CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : index
+    // CHECK-DAG:    %[[C1:.*]] = arith.constant 1 : index
+    // CHECK-DAG:    %[[C2:.*]] = arith.constant 2 : index
+    // CHECK-DAG:    %[[C4:.*]] = arith.constant 4 : index
+    // CHECK:        %[[FOR0:.*]] = scf.for %[[K:.*]] = %[[C0]] to %[[C4]] step %[[C1]]
+    // CHECK-SAME:   iter_args(%[[ACCUM:.*]] = %[[ACCUM_INIT]]) -> (f32) {
+    // CHECK-DAG:      %[[CMPI0:.*]] = arith.cmpi sge, %[[I]], %[[C0]] : index
+    // CHECK-DAG:      %[[CMPI1:.*]] = arith.cmpi sle, %[[I]], %[[C2]] : index
+    // CHECK-DAG:      %[[I_IN_RANGE:.*]] = arith.andi %[[CMPI0]], %[[CMPI1]] : i1
+    // CHECK-DAG:      %[[CMPI2:.*]] = arith.cmpi sge, %[[J]], %[[C0]] : index
+    // CHECK-DAG:      %[[CMPI3:.*]] = arith.cmpi sle, %[[J]], %[[C4]] : index
+    // CHECK-DAG:      %[[J_IN_RANGE:.*]] = arith.andi %[[CMPI2]], %[[CMPI3]] : i1
+    // CHECK-DAG:      %[[I_J_IN_RANGE:.*]] = arith.andi %[[I_IN_RANGE]], %[[J_IN_RANGE]] : i1
+    // CHECK:          %[[IF0:.*]] = scf.if %[[I_J_IN_RANGE]] -> (f32) {
+    // CHECK-DAG:        %[[A_I_K:.*]] = tensor.extract %[[A]][%[[I]], %[[K]]] : tensor<3x4xf32>
+    // CHECK-DAG:        %[[B_K_J:.*]] = tensor.extract %[[B]][%[[K]], %[[J]]] : tensor<4x5xf32>
+    // CHECK-DAG:        %[[MULF0:.*]] = arith.mulf %[[A_I_K]], %[[B_K_J]] : f32
+    // CHECK-DAG:        %[[ADDF0:.*]] = arith.addf %[[ACCUM]], %[[MULF0]] : f32
+    // CHECK-DAG:        scf.yield %[[ADDF0]] : f32
+    // CHECK:          } else {
+    // CHECK:            scf.yield %[[ACCUM]] : f32
+    // CHECK:          }
+    // CHECK:          scf.yield %[[IF0]] : f32
+    // CHECK:        }
+    // CHECK:        return %[[FOR0]] : f32
+    // CHECK:      }
+  )"));
+}
+
+TEST_F(ElementalHloToMlirTest, DotWithBF16Type) {
+  TF_EXPECT_OK(Run(R"(
+    ENTRY main {
+      p0 = bf16[3, 4] parameter(0)
+      p1 = bf16[4, 5] parameter(1)
+      ROOT dot = bf16[3, 5] dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    })",
+                   R"(
+    // CHECK:      @main_dot(
+    // CHECK-SAME: %[[A:.*]]: tensor<3x4xbf16>, %[[B:.*]]: tensor<4x5xbf16>,
+    // CHECK-SAME: %[[I:.*]]: index {xla.range = [0 : index, 2 : index]},
+    // CHECK-SAME: %[[J:.*]]: index {xla.range = [0 : index, 4 : index]})
+    // CHECK-SAME: -> bf16
+    // CHECK-SAME: {
+    // CHECK-DAG:    %[[ACCUM_INIT:.*]] = arith.constant 0.000000e+00 : f32
+    // CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : index
+    // CHECK-DAG:    %[[C1:.*]] = arith.constant 1 : index
+    // CHECK-DAG:    %[[C2:.*]] = arith.constant 2 : index
+    // CHECK-DAG:    %[[C4:.*]] = arith.constant 4 : index
+    // CHECK:        %[[FOR0:.*]] = scf.for %[[K:.*]] = %[[C0]] to %[[C4]] step %[[C1]]
+    // CHECK-SAME:   iter_args(%[[ACCUM:.*]] = %[[ACCUM_INIT]]) -> (f32) {
+    // CHECK-DAG:      %[[CMPI0:.*]] = arith.cmpi sge, %[[I]], %[[C0]] : index
+    // CHECK-DAG:      %[[CMPI1:.*]] = arith.cmpi sle, %[[I]], %[[C2]] : index
+    // CHECK-DAG:      %[[I_IN_RANGE:.*]] = arith.andi %[[CMPI0]], %[[CMPI1]] : i1
+    // CHECK-DAG:      %[[CMPI2:.*]] = arith.cmpi sge, %[[J]], %[[C0]] : index
+    // CHECK-DAG:      %[[CMPI3:.*]] = arith.cmpi sle, %[[J]], %[[C4]] : index
+    // CHECK-DAG:      %[[J_IN_RANGE:.*]] = arith.andi %[[CMPI2]], %[[CMPI3]] : i1
+    // CHECK-DAG:      %[[I_J_IN_RANGE:.*]] = arith.andi %[[I_IN_RANGE]], %[[J_IN_RANGE]] : i1
+    // CHECK:          %[[IF0:.*]] = scf.if %[[I_J_IN_RANGE]] -> (f32) {
+    // CHECK-DAG:        %[[A_I_K:.*]] = tensor.extract %[[A]][%[[I]], %[[K]]] : tensor<3x4xbf16>
+    // CHECK-DAG:        %[[B_K_J:.*]] = tensor.extract %[[B]][%[[K]], %[[J]]] : tensor<4x5xbf16>
+    // CHECK-DAG:        %[[A_I_K_F32:.*]] = arith.extf %[[A_I_K]] :  bf16 to f32
+    // CHECK-DAG:        %[[B_K_J_F32:.*]] = arith.extf %[[B_K_J]] :  bf16 to f32
+    // CHECK-DAG:        %[[MULF0:.*]] = arith.mulf %[[A_I_K_F32]], %[[B_K_J_F32]] : f32
+    // CHECK-DAG:        %[[ADDF0:.*]] = arith.addf %[[ACCUM]], %[[MULF0]] : f32
+    // CHECK-DAG:        scf.yield %[[ADDF0]] : f32
+    // CHECK:          } else {
+    // CHECK:            scf.yield %[[ACCUM]] : f32
+    // CHECK:          }
+    // CHECK:          scf.yield %[[IF0]] : f32
+    // CHECK:        }
+    // CHECK:        %[[FOR0_BF16:.*]] = arith.truncf %[[FOR0]] : f32 to bf16
+    // CHECK:        return %[[FOR0_BF16]] : bf16
+    // CHECK:      }
+  )"));
+}
+
+TEST_F(ElementalHloToMlirTest, DotWithS32Type) {
+  TF_EXPECT_OK(Run(R"(
+    ENTRY main {
+      p0 = s32[3, 4] parameter(0)
+      p1 = s32[4, 5] parameter(1)
+      ROOT dot = s32[3, 5] dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    })",
+                   R"(
+    // CHECK:      @main_dot(
+    // CHECK-SAME: %[[A:.*]]: tensor<3x4xi32>, %[[B:.*]]: tensor<4x5xi32>,
+    // CHECK-SAME: %[[I:.*]]: index {xla.range = [0 : index, 2 : index]},
+    // CHECK-SAME: %[[J:.*]]: index {xla.range = [0 : index, 4 : index]})
+    // CHECK-SAME: -> i32
+    // CHECK-SAME: {
+    // CHECK-DAG:    %[[ACCUM_INIT:.*]] = arith.constant 0 : i32
+    // CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : index
+    // CHECK-DAG:    %[[C1:.*]] = arith.constant 1 : index
+    // CHECK-DAG:    %[[C2:.*]] = arith.constant 2 : index
+    // CHECK-DAG:    %[[C4:.*]] = arith.constant 4 : index
+    // CHECK:        %[[FOR0:.*]] = scf.for %[[K:.*]] = %[[C0]] to %[[C4]] step %[[C1]]
+    // CHECK-SAME:   iter_args(%[[ACCUM:.*]] = %[[ACCUM_INIT]]) -> (i32) {
+    // CHECK-DAG:      %[[CMPI0:.*]] = arith.cmpi sge, %[[I]], %[[C0]] : index
+    // CHECK-DAG:      %[[CMPI1:.*]] = arith.cmpi sle, %[[I]], %[[C2]] : index
+    // CHECK-DAG:      %[[I_IN_RANGE:.*]] = arith.andi %[[CMPI0]], %[[CMPI1]] : i1
+    // CHECK-DAG:      %[[CMPI2:.*]] = arith.cmpi sge, %[[J]], %[[C0]] : index
+    // CHECK-DAG:      %[[CMPI3:.*]] = arith.cmpi sle, %[[J]], %[[C4]] : index
+    // CHECK-DAG:      %[[J_IN_RANGE:.*]] = arith.andi %[[CMPI2]], %[[CMPI3]] : i1
+    // CHECK-DAG:      %[[I_J_IN_RANGE:.*]] = arith.andi %[[I_IN_RANGE]], %[[J_IN_RANGE]] : i1
+    // CHECK:          %[[IF0:.*]] = scf.if %[[I_J_IN_RANGE]] -> (i32) {
+    // CHECK-DAG:        %[[A_I_K:.*]] = tensor.extract %[[A]][%[[I]], %[[K]]] : tensor<3x4xi32>
+    // CHECK-DAG:        %[[B_K_J:.*]] = tensor.extract %[[B]][%[[K]], %[[J]]] : tensor<4x5xi32>
+    // CHECK-DAG:        %[[MUL0:.*]] = arith.muli %[[A_I_K]], %[[B_K_J]] : i32
+    // CHECK-DAG:        %[[ADD0:.*]] = arith.addi %[[ACCUM]], %[[MUL0]] : i32
+    // CHECK-DAG:        scf.yield %[[ADD0]] : i32
+    // CHECK:          } else {
+    // CHECK:            scf.yield %[[ACCUM]] : i32
+    // CHECK:          }
+    // CHECK:          scf.yield %[[IF0]] : i32
+    // CHECK:        }
+    // CHECK:        return %[[FOR0]] : i32
+    // CHECK:      }
+  )"));
+}
+
+TEST_F(ElementalHloToMlirTest, DotWithU32Type) {
+  TF_EXPECT_OK(Run(R"(
+    ENTRY main {
+      p0 = u32[3, 4] parameter(0)
+      p1 = u32[4, 5] parameter(1)
+      ROOT dot = u32[3, 5] dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    })",
+                   R"(
+    // CHECK:      @main_dot(
+    // CHECK-SAME: %[[A:.*]]: tensor<3x4xui32>, %[[B:.*]]: tensor<4x5xui32>,
+    // CHECK-SAME: %[[I:.*]]: index {xla.range = [0 : index, 2 : index]},
+    // CHECK-SAME: %[[J:.*]]: index {xla.range = [0 : index, 4 : index]})
+    // CHECK-SAME: -> ui32
+    // CHECK-SAME: {
+    // CHECK-DAG:    %[[ACCUM_INIT:.*]] = arith.constant 0 : i32
+    // CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : index
+    // CHECK-DAG:    %[[C1:.*]] = arith.constant 1 : index
+    // CHECK-DAG:    %[[C2:.*]] = arith.constant 2 : index
+    // CHECK-DAG:    %[[C4:.*]] = arith.constant 4 : index
+    // CHECK:        %[[FOR0:.*]] = scf.for %[[K:.*]] = %[[C0]] to %[[C4]] step %[[C1]]
+    // CHECK-SAME:   iter_args(%[[ACCUM:.*]] = %[[ACCUM_INIT]]) -> (i32) {
+    // CHECK-DAG:      %[[CMPI0:.*]] = arith.cmpi sge, %[[I]], %[[C0]] : index
+    // CHECK-DAG:      %[[CMPI1:.*]] = arith.cmpi sle, %[[I]], %[[C2]] : index
+    // CHECK-DAG:      %[[I_IN_RANGE:.*]] = arith.andi %[[CMPI0]], %[[CMPI1]] : i1
+    // CHECK-DAG:      %[[CMPI2:.*]] = arith.cmpi sge, %[[J]], %[[C0]] : index
+    // CHECK-DAG:      %[[CMPI3:.*]] = arith.cmpi sle, %[[J]], %[[C4]] : index
+    // CHECK-DAG:      %[[J_IN_RANGE:.*]] = arith.andi %[[CMPI2]], %[[CMPI3]] : i1
+    // CHECK-DAG:      %[[I_J_IN_RANGE:.*]] = arith.andi %[[I_IN_RANGE]], %[[J_IN_RANGE]] : i1
+    // CHECK:          %[[IF0:.*]] = scf.if %[[I_J_IN_RANGE]] -> (i32) {
+    // CHECK-DAG:        %[[A_I_K:.*]] = tensor.extract %[[A]][%[[I]], %[[K]]] : tensor<3x4xui32>
+    // CHECK-DAG:        %[[A_I_K_I32:.*]] = builtin.unrealized_conversion_cast %[[A_I_K]] : ui32 to i32
+    // CHECK-DAG:        %[[B_K_J:.*]] = tensor.extract %[[B]][%[[K]], %[[J]]] : tensor<4x5xui32>
+    // CHECK-DAG:        %[[B_K_J_I32:.*]] = builtin.unrealized_conversion_cast %[[B_K_J]] : ui32 to i32
+    // CHECK-DAG:        %[[MUL0:.*]] = arith.muli %[[A_I_K_I32]], %[[B_K_J_I32]] : i32
+    // CHECK-DAG:        %[[ADD0:.*]] = arith.addi %[[ACCUM]], %[[MUL0]] : i32
+    // CHECK-DAG:        scf.yield %[[ADD0]] : i32
+    // CHECK:          } else {
+    // CHECK:            scf.yield %[[ACCUM]] : i32
+    // CHECK:          }
+    // CHECK:          scf.yield %[[IF0]] : i32
+    // CHECK:        }
+    // CHECK:        %[[FOR0_UI32:.*]] = builtin.unrealized_conversion_cast %[[FOR0]] : i32 to ui32
+    // CHECK:        return %[[FOR0_UI32]] : ui32
+    // CHECK:      }
+  )"));
+}
+
+TEST_F(ElementalHloToMlirTest, DotWithPredType) {
+  TF_EXPECT_OK(Run(R"(
+    ENTRY main {
+      p0 = pred[3, 4] parameter(0)
+      p1 = pred[4, 5] parameter(1)
+      ROOT dot = pred[3, 5] dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    })",
+                   R"(
+    // CHECK:      @main_dot(
+    // CHECK-SAME: %[[A:.*]]: tensor<3x4xi1>, %[[B:.*]]: tensor<4x5xi1>,
+    // CHECK-SAME: %[[I:.*]]: index {xla.range = [0 : index, 2 : index]},
+    // CHECK-SAME: %[[J:.*]]: index {xla.range = [0 : index, 4 : index]})
+    // CHECK-SAME: -> i1
+    // CHECK-SAME: {
+    // CHECK-DAG:    %[[ACCUM_INIT:.*]] = arith.constant false
+    // CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : index
+    // CHECK-DAG:    %[[C1:.*]] = arith.constant 1 : index
+    // CHECK-DAG:    %[[C2:.*]] = arith.constant 2 : index
+    // CHECK-DAG:    %[[C4:.*]] = arith.constant 4 : index
+    // CHECK:        %[[FOR0:.*]] = scf.for %[[K:.*]] = %[[C0]] to %[[C4]] step %[[C1]]
+    // CHECK-SAME:   iter_args(%[[ACCUM:.*]] = %[[ACCUM_INIT]]) -> (i1) {
+    // CHECK-DAG:      %[[CMPI0:.*]] = arith.cmpi sge, %[[I]], %[[C0]] : index
+    // CHECK-DAG:      %[[CMPI1:.*]] = arith.cmpi sle, %[[I]], %[[C2]] : index
+    // CHECK-DAG:      %[[I_IN_RANGE:.*]] = arith.andi %[[CMPI0]], %[[CMPI1]] : i1
+    // CHECK-DAG:      %[[CMPI2:.*]] = arith.cmpi sge, %[[J]], %[[C0]] : index
+    // CHECK-DAG:      %[[CMPI3:.*]] = arith.cmpi sle, %[[J]], %[[C4]] : index
+    // CHECK-DAG:      %[[J_IN_RANGE:.*]] = arith.andi %[[CMPI2]], %[[CMPI3]] : i1
+    // CHECK-DAG:      %[[I_J_IN_RANGE:.*]] = arith.andi %[[I_IN_RANGE]], %[[J_IN_RANGE]] : i1
+    // CHECK:          %[[IF0:.*]] = scf.if %[[I_J_IN_RANGE]] -> (i1) {
+    // CHECK-DAG:        %[[A_I_K:.*]] = tensor.extract %[[A]][%[[I]], %[[K]]] : tensor<3x4xi1>
+    // CHECK-DAG:        %[[B_K_J:.*]] = tensor.extract %[[B]][%[[K]], %[[J]]] : tensor<4x5xi1>
+    // CHECK-DAG:        %[[AND0:.*]] = arith.andi %[[A_I_K]], %[[B_K_J]] : i1
+    // CHECK-DAG:        %[[OR0:.*]] = arith.ori %[[ACCUM]], %[[AND0]] : i1
+    // CHECK-DAG:        scf.yield %[[OR0]] : i1
+    // CHECK:          } else {
+    // CHECK:            scf.yield %[[ACCUM]] : i1
+    // CHECK:          }
+    // CHECK:          scf.yield %[[IF0]] : i1
+    // CHECK:        }
+    // CHECK:        return %[[FOR0]] : i1
+    // CHECK:      }
+  )"));
+}
+
+TEST_F(ElementalHloToMlirTest, DotWithBatchAnd2ContractingDims) {
+  TF_EXPECT_OK(Run(R"(
+    ENTRY main {
+      p0 = f32[7, 3, 4, 5] parameter(0)
+      p1 = f32[5, 6, 4, 7] parameter(1)
+      ROOT dot = f32[7, 3, 6] dot(p0, p1),
+                 lhs_contracting_dims={2, 3}, rhs_contracting_dims={2, 0},
+                 lhs_batch_dims={0}, rhs_batch_dims={3}
+    })",
+                   R"(
+    // CHECK:      @main_dot(
+    // CHECK-SAME: %[[A:.*]]: tensor<7x3x4x5xf32>, %[[B:.*]]: tensor<5x6x4x7xf32>,
+    // CHECK-SAME: %[[N:.*]]: index {xla.range = [0 : index, 6 : index]},
+    // CHECK-SAME: %[[I:.*]]: index {xla.range = [0 : index, 2 : index]},
+    // CHECK-SAME: %[[J:.*]]: index {xla.range = [0 : index, 5 : index]})
+    // CHECK-SAME: -> f32
+    // CHECK-SAME: {
+    // CHECK-DAG:    %[[C0F:.*]] = arith.constant 0.000000e+00 : f32
+    // CHECK-DAG:    %[[C0:.*]] = arith.constant 0 : index
+    // CHECK-DAG:    %[[C1:.*]] = arith.constant 1 : index
+    // CHECK-DAG:    %[[C2:.*]] = arith.constant 2 : index
+    // CHECK-DAG:    %[[C4:.*]] = arith.constant 4 : index
+    // CHECK-DAG:    %[[C5:.*]] = arith.constant 5 : index
+    // CHECK-DAG:    %[[C6:.*]] = arith.constant 6 : index
+    // CHECK:        %[[FOR0:.*]] = scf.for %[[K:.*]] = %[[C0]] to %[[C4]] step %[[C1]]
+    // CHECK-SAME:   iter_args(%[[ACCUM0:.*]] = %[[C0F]]) -> (f32) {
+    // CHECK:          %[[FOR1:.*]] = scf.for %[[L:.*]] = %[[C0]] to %[[C5]] step %[[C1]]
+    // CHECK-SAME:     iter_args(%[[ACCUM1:.*]] = %[[ACCUM0]]) -> (f32) {
+    // CHECK-DAG:        %[[CMPI0:.*]] = arith.cmpi sge, %[[N]], %[[C0]] : index
+    // CHECK-DAG:        %[[CMPI1:.*]] = arith.cmpi sle, %[[N]], %[[C6]] : index
+    // CHECK-DAG:        %[[N_IN_RANGE:.*]] = arith.andi %[[CMPI0]], %[[CMPI1]] : i1
+    // CHECK-DAG:        %[[CMPI2:.*]] = arith.cmpi sge, %[[I]], %[[C0]] : index
+    // CHECK-DAG:        %[[CMPI3:.*]] = arith.cmpi sle, %[[I]], %[[C2]] : index
+    // CHECK-DAG:        %[[I_IN_RANGE:.*]] = arith.andi %[[CMPI2]], %[[CMPI3]] : i1
+    // CHECK-DAG:        %[[N_I_IN_RANGE:.*]] = arith.andi %[[N_IN_RANGE]], %[[I_IN_RANGE]] : i1
+    // CHECK-DAG:        %[[CMPI4:.*]] = arith.cmpi sge, %[[J]], %[[C0]] : index
+    // CHECK-DAG:        %[[CMPI5:.*]] = arith.cmpi sle, %[[J]], %[[C5]] : index
+    // CHECK-DAG:        %[[J_IN_RANGE:.*]] = arith.andi %[[CMPI4]], %[[CMPI5]] : i1
+    // CHECK-DAG:        %[[N_I_J_IN_RANGE:.*]] = arith.andi %[[N_I_IN_RANGE]], %[[J_IN_RANGE]] : i1
+    // CHECK:            %[[IF0:.*]] = scf.if %[[N_I_J_IN_RANGE]] -> (f32) {
+    // CHECK-DAG:          %[[A_N_I_K_L:.*]] = tensor.extract %[[A]][%[[N]], %[[I]], %[[K]], %[[L]]] : tensor<7x3x4x5xf32>
+    // CHECK-DAG:          %[[B_L_J_K_N:.*]] = tensor.extract %[[B]][%[[L]], %[[J]], %[[K]], %[[N]]] : tensor<5x6x4x7xf32>
+    // CHECK-DAG:          %[[MULF0:.*]] = arith.mulf %[[A_N_I_K_L]], %[[B_L_J_K_N]] : f32
+    // CHECK-DAG:          %[[ADDF0:.*]] = arith.addf %[[ACCUM1]], %[[MULF0]] : f32
+    // CHECK-DAG:          scf.yield %[[ADDF0]] : f32
+    // CHECK:            } else {
+    // CHECK:              scf.yield %[[ACCUM1]] : f32
+    // CHECK:            }
+    // CHECK:            scf.yield %[[IF0]] : f32
+    // CHECK:          }
+    // CHECK:          scf.yield %[[FOR1]] : f32
+    // CHECK:        }
+    // CHECK:        return %[[FOR0]] : f32
+    // CHECK:      }
+  )"));
+}
+
 TEST_F(ElementalHloToMlirTest, Transpose) {
   TF_EXPECT_OK(Run(R"(
     ENTRY main {
@@ -728,6 +1098,27 @@ TEST_F(ElementalHloToMlirTest, IotaComplex) {
   )"));
 }
 
+TEST_F(ElementalHloToMlirTest, MixedIndexingTuple) {
+  TF_EXPECT_OK(Run(R"(
+    ENTRY main {
+      %p0 = f32[10,10] parameter(0)
+      %p1 = f32[100] parameter(1)
+      ROOT tuple = (f32[10,10], f32[100]) tuple(%p0, %p1)
+    })",
+                   R"(
+    // CHECK:      @main_tuple(
+    // CHECK-SAME:     %[[P0:.*]]: tensor<10x10xf32>,
+    // CHECK-SAME:     %[[P1:.*]]: tensor<100xf32>,
+    // CHECK-SAME:     %[[X:.*]]: index {{{.*}}}, %[[Y:.*]]: index {{{.*}}}
+    // CHECK:        %[[A:.*]] = tensor.extract %[[P0]][%[[X]], %[[Y]]]
+    // CHECK:        %[[IDX:.*]] = affine.apply
+    // CHECK-SAME:       affine_map<()[s0, s1] -> (s0 * 10 + s1)>()
+    // CHECK-SAME:       [%[[X]], %[[Y]]]
+    // CHECK:        %[[B:.*]] = tensor.extract %[[P1]][%[[IDX]]]
+    // CHECK:        return %[[A]], %[[B]]
+  )"));
+}
+
 }  // namespace
 }  // namespace mlir_converter
 }  // namespace gpu
diff --git a/third_party/xla/xla/service/gpu/fusions/mlir/ir/BUILD b/third_party/xla/xla/service/gpu/fusions/mlir/ir/BUILD
index cd2f620c0266a1..e82df33e965214 100644
--- a/third_party/xla/xla/service/gpu/fusions/mlir/ir/BUILD
+++ b/third_party/xla/xla/service/gpu/fusions/mlir/ir/BUILD
@@ -58,9 +58,13 @@ cc_library(
     deps = [
         ":xla_gpu_ops_inc_gen",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:BytecodeOpInterface",
+        "@llvm-project//mlir:CallOpInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:InferTypeOpInterface",
+        "@llvm-project//mlir:InliningUtils",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:Support",
     ],
 )
diff --git a/third_party/xla/xla/service/gpu/fusions/mlir/simplify_affine.cc b/third_party/xla/xla/service/gpu/fusions/mlir/simplify_affine.cc
index c86fbda41c4cf1..e4020c97a57e7d 100644
--- a/third_party/xla/xla/service/gpu/fusions/mlir/simplify_affine.cc
+++ b/third_party/xla/xla/service/gpu/fusions/mlir/simplify_affine.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "mlir/Support/LogicalResult.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
 #include "xla/service/gpu/fusions/mlir/passes.h"
+#include "xla/service/gpu/model/indexing_analysis.h"
 #include "xla/service/gpu/model/indexing_map.h"
 
 namespace xla {
@@ -99,26 +100,28 @@ struct RewriteAffineApply
       mlir::affine::AffineApplyOp op,
       mlir::PatternRewriter& rewriter) const override {
     auto affine_map = op.getAffineMap();
-    std::vector<Interval> dim_ranges(affine_map.getNumDims());
-    std::vector<Interval> symbol_ranges(affine_map.getNumSymbols());
+    std::vector<DimVar> dim_ranges(affine_map.getNumDims());
+    std::vector<RangeVar> symbol_ranges(affine_map.getNumSymbols());
 
     for (int i = 0; i < affine_map.getNumInputs(); ++i) {
       if (auto range = GetRange(op->getOperand(i))) {
         if (i >= dim_ranges.size()) {
-          symbol_ranges[i - dim_ranges.size()] = *range;
+          symbol_ranges[i - dim_ranges.size()] = RangeVar{*range};
         } else {
-          dim_ranges[i] = *range;
+          dim_ranges[i] = DimVar{*range};
         }
       } else {
         return rewriter.notifyMatchFailure(op, "failed to deduce range");
       }
     }
 
-    IndexingMap map(op.getAffineMap(), dim_ranges, symbol_ranges);
-    map.Simplify();
+    IndexingMap map(op.getAffineMap(), dim_ranges, symbol_ranges,
+                    /*rt_vars=*/{});
+    map.Simplify(GetIndexingMapForInstruction);
     auto expr = map.GetAffineMap().getResult(0);
 
-    RangeEvaluator range_evaluator(dim_ranges, symbol_ranges, op->getContext());
+    RangeEvaluator range_evaluator(map.GetDimensionBounds(),
+                                   map.GetSymbolBounds(), op->getContext());
     std::function<bool(mlir::AffineExpr)> can_be_lowered;
     bool fits_32_bits = true;
     can_be_lowered = [&](mlir::AffineExpr expr) {
diff --git a/third_party/xla/xla/service/gpu/fusions/reduction.cc b/third_party/xla/xla/service/gpu/fusions/reduction.cc
index f861fa7bfb932e..193fc36d20266c 100644
--- a/third_party/xla/xla/service/gpu/fusions/reduction.cc
+++ b/third_party/xla/xla/service/gpu/fusions/reduction.cc
@@ -62,8 +62,8 @@ limitations under the License.
 #include "xla/service/gpu/parallel_loop_emitter.h"
 #include "xla/service/gpu/reduction_utils.h"
 #include "xla/service/gpu/runtime/kernel_thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/gpu/target_util.h"
-#include "xla/service/gpu/thunk.h"
 #include "xla/service/llvm_ir/fused_ir_emitter.h"
 #include "xla/service/llvm_ir/ir_array.h"
 #include "xla/service/llvm_ir/kernel_support_library.h"
diff --git a/third_party/xla/xla/service/gpu/fusions/reduction_base.cc b/third_party/xla/xla/service/gpu/fusions/reduction_base.cc
index 57db8735cc0e60..86bb721129d009 100644
--- a/third_party/xla/xla/service/gpu/fusions/reduction_base.cc
+++ b/third_party/xla/xla/service/gpu/fusions/reduction_base.cc
@@ -332,12 +332,12 @@ std::optional<IndexingMap> ReductionInfo::ComputeThreadIdToOutputIndexing(
 
   auto physical_shape = ShapeUtil::DeleteDimensions(hero->dimensions(),
                                                     hero->operand(0)->shape());
-  std::vector<Interval> dimension_ranges{
-      {0, tiling_.GetNumThreadsPerBlock() - 1},
+  std::vector<DimVar> dimension_ranges{
+      {{0, tiling_.GetNumThreadsPerBlock() - 1}},
       {},
       {},
-      {0, tiling_.GetNumBlocks() - 1},
-      {0, static_cast<int64_t>(groups_.grouped_roots.size() - 1)},
+      {{0, tiling_.GetNumBlocks() - 1}},
+      {{0, static_cast<int64_t>(groups_.grouped_roots.size() - 1)}},
       {},
   };
 
@@ -355,7 +355,7 @@ std::optional<IndexingMap> ReductionInfo::ComputeThreadIdToOutputIndexing(
           mlir::AffineMap::get(
               6, 0, block_offsets.getResult(kRowKept) + thread_ids[kRowKept],
               ctx),
-          dimension_ranges, {});
+          dimension_ranges, /*range_vars=*/{}, /*rt_vars=*/{});
       int rows_per_warp = GetRowsPerWarp();
       if (rows_per_warp > 1) {
         linear_index.AddConstraint(
@@ -376,7 +376,7 @@ std::optional<IndexingMap> ReductionInfo::ComputeThreadIdToOutputIndexing(
             {block_offsets.getResult(kColMajorKept),
              block_offsets.getResult(kColMinorKept) + thread_ids[kColReduced]},
             ctx),
-        dimension_ranges, {});
+        dimension_ranges, /*range_vars=*/{}, /*rt_vars=*/{});
 
     projected_index.AddConstraint(
         mlir::getAffineDimExpr(
diff --git a/third_party/xla/xla/service/gpu/fusions/scatter.cc b/third_party/xla/xla/service/gpu/fusions/scatter.cc
index c784d08bf794e9..0625f9efd4653b 100644
--- a/third_party/xla/xla/service/gpu/fusions/scatter.cc
+++ b/third_party/xla/xla/service/gpu/fusions/scatter.cc
@@ -32,10 +32,12 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/gpu/elemental_ir_emitter.h"
+#include "xla/service/gpu/fusions/loop.h"
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/ir_emitter_context.h"
 #include "xla/service/gpu/ir_emitter_nested.h"
 #include "xla/service/gpu/launch_dimensions.h"
+#include "xla/service/gpu/model/indexing_analysis.h"
 #include "xla/service/gpu/parallel_loop_emitter.h"
 #include "xla/service/llvm_ir/fused_ir_emitter.h"
 #include "xla/service/llvm_ir/ir_array.h"
@@ -47,9 +49,15 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+ScatterFusion::ScatterFusion(const HloFusionAnalysis& analysis)
+    : analysis_(analysis), config_(ComputeLoopFusionConfig(analysis)) {
+  CHECK_EQ(analysis.fusion_roots().size(), 1);
+  CHECK_EQ(analysis.fusion_roots()[0]->opcode(), HloOpcode::kScatter);
+}
+
 LaunchDimensions ScatterFusion::launch_dimensions() const {
   const auto& updates_shape =
-      analysis_.fusion_roots().front()->operand(2)->shape();
+      analysis_.fusion_roots().front()->operands().back()->shape();
   return CalculateLaunchDimensions(updates_shape, analysis_.device_info());
 }
 
@@ -232,5 +240,47 @@ absl::Status ScatterFusion::EmitKernel(IrEmitterContext& ir_emitter_context,
       .EmitLoop(name, index_type);
 }
 
+std::optional<IndexingMap> ScatterFusion::ComputeThreadIdToInputIndexing(
+    int64_t root_index, int64_t hero_operand_index,
+    mlir::MLIRContext* ctx) const {
+  auto* scatter =
+      DynCast<HloScatterInstruction>(analysis_.fusion_heroes().front());
+  int64_t scatter_operand_count = scatter->scatter_operand_count();
+  // Scatter operands a packed in the following way:
+  // Operand IDs [0, scatter_operand_count - 1] for `scatter operands`.
+  // Operand ID  scatter_operand_count for `scatter indices`.
+  // Operand IDs [scatter_operand_count + 1, 2 * scatter_operand_count] for
+  // `scatter updates`.
+
+  // For scatter operands we do not know the thread ID indexing.
+  if (hero_operand_index < scatter_operand_count) {
+    return std::nullopt;
+  }
+  // Compute thread id mapping based on the first update operand.
+  Shape scatter_update_shape = scatter->scatter_updates().front()->shape();
+  IndexingMap scatter_update_map = GetDefaultThreadIdIndexingMap(
+      launch_dimensions(), config_.unroll_factor, scatter_update_shape, ctx);
+
+  // For scatter indices we project indexing for scatter updates and take the
+  // first result of the affine map only, because they coincide.
+  if (hero_operand_index == scatter_operand_count) {
+    Shape scatter_indices_shape = scatter->scatter_indices()->shape();
+    CHECK_EQ(scatter_indices_shape.rank(), 2) << scatter->ToString();
+    // Create a map from scatter update to scatter indices.
+    IndexingMap updates_to_indices_map{
+        mlir::AffineMap::get(
+            /*dimCount=*/scatter_update_shape.rank(), /*symbolCount=*/1,
+            {mlir::getAffineDimExpr(0, ctx), mlir::getAffineSymbolExpr(0, ctx)},
+            ctx),
+        DimVarsFromTensorSizes(scatter_update_shape.dimensions()),
+        RangeVarsFromTensorSizes({scatter_indices_shape.dimensions(1)}),
+        /*rt_vars=*/{}};
+    auto scatter_indices_map = scatter_update_map * updates_to_indices_map;
+    scatter_indices_map.Simplify(GetIndexingMapForInstruction);
+    return scatter_indices_map;
+  }
+  return scatter_update_map;
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/fusions/scatter.h b/third_party/xla/xla/service/gpu/fusions/scatter.h
index 6982bbc8e6bd2c..289328fd7a7c14 100644
--- a/third_party/xla/xla/service/gpu/fusions/scatter.h
+++ b/third_party/xla/xla/service/gpu/fusions/scatter.h
@@ -35,11 +35,7 @@ namespace gpu {
 // A scatter, implemented as a loop over the updates. All scatters are in-place.
 class ScatterFusion : public KernelFusionEmitterBase {
  public:
-  explicit ScatterFusion(const HloFusionAnalysis& analysis)
-      : analysis_(analysis) {
-    CHECK_EQ(analysis.fusion_roots().size(), 1);
-    CHECK_EQ(analysis.fusion_roots()[0]->opcode(), HloOpcode::kScatter);
-  }
+  explicit ScatterFusion(const HloFusionAnalysis& analysis);
 
   LaunchDimensions launch_dimensions() const override;
 
@@ -52,10 +48,7 @@ class ScatterFusion : public KernelFusionEmitterBase {
 
   std::optional<IndexingMap> ComputeThreadIdToInputIndexing(
       int64_t root_index, int64_t hero_operand_index,
-      mlir::MLIRContext* ctx) const override {
-    // TODO(b/319081342): Implement this.
-    return std::nullopt;
-  }
+      mlir::MLIRContext* ctx) const override;
 
  protected:
   absl::Status EmitKernel(IrEmitterContext& ir_emitter_context,
@@ -67,6 +60,7 @@ class ScatterFusion : public KernelFusionEmitterBase {
 
  private:
   const HloFusionAnalysis& analysis_;
+  LaunchDimensionsConfig config_;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/service/gpu/fusions/scatter_mlir.cc b/third_party/xla/xla/service/gpu/fusions/scatter_mlir.cc
index 85242c0740e7b6..6d8969a5251f31 100644
--- a/third_party/xla/xla/service/gpu/fusions/scatter_mlir.cc
+++ b/third_party/xla/xla/service/gpu/fusions/scatter_mlir.cc
@@ -42,6 +42,7 @@ limitations under the License.
 #include "xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.h"
 #include "xla/service/gpu/fusions/mlir/ir/xla_gpu_ops.h"
 #include "xla/service/gpu/launch_dimensions.h"
+#include "xla/service/gpu/model/indexing_analysis.h"
 #include "xla/service/gpu/model/indexing_map.h"
 #include "xla/shape.h"
 #include "xla/xla_data.pb.h"
@@ -105,7 +106,7 @@ std::optional<IndexingMap> MlirScatterFusion::ComputeThreadIdToInputIndexing(
   }
   // Compute thread id mapping based on the first update operand.
   Shape scatter_update_shape = scatter->scatter_updates().front()->shape();
-  IndexingMap scatter_update_map = GetDefaultThreadIdToOutputIndexingMap(
+  IndexingMap scatter_update_map = GetDefaultThreadIdIndexingMap(
       launch_dimensions(), config_.unroll_factor, scatter_update_shape, ctx);
 
   // For scatter indices we project indexing for scatter updates and take the
@@ -119,11 +120,11 @@ std::optional<IndexingMap> MlirScatterFusion::ComputeThreadIdToInputIndexing(
             /*dimCount=*/scatter_update_shape.rank(), /*symbolCount=*/1,
             {mlir::getAffineDimExpr(0, ctx), mlir::getAffineSymbolExpr(0, ctx)},
             ctx),
-        /*dim_ranges=*/RangesFromTensorSizes(scatter_update_shape.dimensions()),
-        /*symbol_ranges=*/
-        RangesFromTensorSizes({scatter_indices_shape.dimensions(1)})};
+        DimVarsFromTensorSizes(scatter_update_shape.dimensions()),
+        RangeVarsFromTensorSizes({scatter_indices_shape.dimensions(1)}),
+        /*rt_vars=*/{}};
     auto scatter_indices_map = scatter_update_map * updates_to_indices_map;
-    scatter_indices_map.Simplify();
+    scatter_indices_map.Simplify(GetIndexingMapForInstruction);
     return scatter_indices_map;
   }
   return scatter_update_map;
@@ -190,7 +191,7 @@ absl::Status MlirScatterFusion::EmitEntryFunction(
           /*root_index=*/0, /*hero_operand_index=*/kScatterUpdateIndex,
           mlir_context)
           .value();
-  thread_id_to_update_map.Simplify();
+  thread_id_to_update_map.Simplify(GetIndexingMapForInstruction);
   thread_id_to_update_map.RemoveUnusedSymbols();
 
   const auto& root_computation = computations.FindPartitionedComputation(
diff --git a/third_party/xla/xla/service/gpu/fusions/scatter_mlir_test.cc b/third_party/xla/xla/service/gpu/fusions/scatter_mlir_test.cc
index f7fdba3b97db30..12fca854ae5fb7 100644
--- a/third_party/xla/xla/service/gpu/fusions/scatter_mlir_test.cc
+++ b/third_party/xla/xla/service/gpu/fusions/scatter_mlir_test.cc
@@ -28,7 +28,7 @@ namespace {
 
 using MlirScatterFusionTest = MlirEmitterTestBase<MlirScatterFusion>;
 
-TEST_F(MlirScatterFusionTest, ThreadId_IndexingUnrolled) {
+TEST_F(MlirScatterFusionTest, ThreadIdIndexing) {
   TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
     HloModule module
 
diff --git a/third_party/xla/xla/service/gpu/fusions/scatter_test.cc b/third_party/xla/xla/service/gpu/fusions/scatter_test.cc
index e000c05caf9938..2be8dc86d75540 100644
--- a/third_party/xla/xla/service/gpu/fusions/scatter_test.cc
+++ b/third_party/xla/xla/service/gpu/fusions/scatter_test.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include "xla/service/gpu/fusions/fusions.h"
 #include "xla/service/gpu/gpu_device_info_for_tests.h"
 #include "xla/service/gpu/hlo_fusion_analysis.h"
+#include "xla/service/gpu/model/affine_map_printer.h"
+#include "xla/service/gpu/model/indexing_test_utils.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/tests/hlo_test_base.h"
 #include "tsl/platform/statusor.h"
@@ -29,7 +31,19 @@ namespace xla {
 namespace gpu {
 namespace {
 
-class ScatterFusionTest : public HloTestBase {};
+class ScatterFusionTest : public HloTestBase {
+ public:
+  void SetUp() override {
+    HloTestBase::SetUp();
+    printer_ =
+        AffineMapPrinter({"th_x", "th_y", "th_z", "bl_x", "bl_y", "bl_z"},
+                         {"chunk_id", "unroll_id", "index_id"});
+  }
+
+ protected:
+  AffineMapPrinter printer_;
+  mlir::MLIRContext mlir_context_;
+};
 
 TEST_F(ScatterFusionTest, ScatterFusion) {
   auto module = ParseAndReturnVerifiedModule(R"(
@@ -76,6 +90,130 @@ TEST_F(ScatterFusionTest, ScatterFusion) {
             3 * 9 /* updates size */);
 }
 
+TEST_F(ScatterFusionTest, ThreadIdIndexing) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+    HloModule module
+
+    computation {
+      %p0 = f32[] parameter(0)
+      %p1 = f32[] parameter(1)
+      %p2 = f32[] parameter(2)
+      %p3 = f32[] parameter(3)
+      ROOT %tuple = (f32[], f32[]) tuple(f32[] %p2, f32[] %p3)
+    }
+    scatter {
+      %operand0 = f32[300,200] parameter(0)
+      %operand1 = f32[300,200] parameter(1)
+      %indices = s32[42,1] parameter(2)
+      %update.1 = f32[42,10,20] parameter(3)
+      %update.2 = f32[42,10,20]parameter(4)
+
+      ROOT %scatter = (f32[300,200], f32[300,200]) scatter(
+          f32[300,200] %operand0,
+          f32[300,200] %operand1,
+          s32[42,1] %indices,
+          f32[42,10,20] %update.1,
+          f32[42,10,20] %update.2
+        ),
+        update_window_dims={1,2},
+        inserted_window_dims={},
+        scatter_dims_to_operand_dims={0},
+        index_vector_dim=1,
+        to_apply=computation
+    }
+    ENTRY entry {
+      %operand0 = f32[300,200] parameter(0)
+      %operand1 = f32[300,200] parameter(1)
+      %indices = s32[42,1] parameter(2)
+      %update.1 = f32[42,10,20] parameter(3)
+      %update.2 = f32[42,10,20]parameter(4)
+      ROOT %fusion = (f32[300,200], f32[300,200]) fusion(
+        %operand0, %operand1, %indices, %update.1, %update.2),
+        kind=kLoop, calls=scatter
+    }
+  )"));
+  stream_executor::DeviceDescription device_info =
+      TestGpuDeviceInfo::RTXA6000DeviceInfo();
+
+  auto* root = module->entry_computation()->root_instruction();
+  auto analysis_fused = AnalyzeFusion(*root, device_info);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto emitter,
+      GetFusionEmitter(PreBufferAssignmentFusionInfo{analysis_fused}));
+  auto fusion = dynamic_cast<ScatterFusion*>(emitter.get());
+  ASSERT_NE(fusion, nullptr);
+
+  constexpr auto kUpdatesIndexing = R"(
+    (th_x, th_y, th_z, bl_x, bl_y, bl_z)[chunk_id, unroll_id] -> (
+    ((bl_x * 16 + th_x floordiv 8) floordiv 25) mod 42,
+    ((bl_x * 32 + th_x floordiv 4) floordiv 5) mod 10,
+    (th_x + bl_x * 128) mod 20)
+    domain:
+    th_x in [0, 127]
+    th_y in [0, 0]
+    th_z in [0, 0]
+    bl_x in [0, 65]
+    bl_y in [0, 0]
+    bl_z in [0, 0]
+    chunk_id in [0, 0]
+    unroll_id in [0, 0]
+    th_x + bl_x * 128 in [0, 8399]
+  )";
+  EXPECT_THAT(
+      fusion
+          ->ComputeThreadIdToInputIndexing(
+              /*root_index=*/0, /*hero_operand_index=*/3, &mlir_context_)
+          ->ToString(printer_),
+      MatchIndexingString(kUpdatesIndexing));
+  EXPECT_THAT(
+      fusion
+          ->ComputeThreadIdToInputIndexing(
+              /*root_index=*/0, /*hero_operand_index=*/4, &mlir_context_)
+          ->ToString(printer_),
+      MatchIndexingString(kUpdatesIndexing));
+  EXPECT_THAT(
+      fusion
+          ->ComputeThreadIdToInputIndexing(
+              /*root_index=*/1, /*hero_operand_index=*/3, &mlir_context_)
+          ->ToString(printer_),
+      MatchIndexingString(kUpdatesIndexing));
+  EXPECT_THAT(
+      fusion
+          ->ComputeThreadIdToInputIndexing(
+              /*root_index=*/1, /*hero_operand_index=*/4, &mlir_context_)
+          ->ToString(printer_),
+      MatchIndexingString(kUpdatesIndexing));
+
+  constexpr auto kIndicesIndexing = R"(
+    (th_x, th_y, th_z, bl_x, bl_y, bl_z)[chunk_id, unroll_id, index_id] -> (
+    ((bl_x * 16 + th_x floordiv 8) floordiv 25) mod 42, 0)
+    domain:
+    th_x in [0, 127]
+    th_y in [0, 0]
+    th_z in [0, 0]
+    bl_x in [0, 65]
+    bl_y in [0, 0]
+    bl_z in [0, 0]
+    chunk_id in [0, 0]
+    unroll_id in [0, 0]
+    index_id in [0, 0]
+    th_x + bl_x * 128 in [0, 8399]
+  )";
+  EXPECT_THAT(
+      fusion
+          ->ComputeThreadIdToInputIndexing(
+              /*root_index=*/0, /*hero_operand_index=*/2, &mlir_context_)
+          ->ToString(printer_),
+      MatchIndexingString(kIndicesIndexing));
+  EXPECT_THAT(
+      fusion
+          ->ComputeThreadIdToInputIndexing(
+              /*root_index=*/1, /*hero_operand_index=*/2, &mlir_context_)
+          ->ToString(printer_),
+      MatchIndexingString(kIndicesIndexing));
+}
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/fusions/thunk_util.cc b/third_party/xla/xla/service/gpu/fusions/thunk_util.cc
index 36b5cb300453bd..b356f76d083f8a 100644
--- a/third_party/xla/xla/service/gpu/fusions/thunk_util.cc
+++ b/third_party/xla/xla/service/gpu/fusions/thunk_util.cc
@@ -29,7 +29,7 @@ limitations under the License.
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/ir_emitter_context.h"
 #include "xla/service/gpu/runtime/memset_thunk.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 
diff --git a/third_party/xla/xla/service/gpu/fusions/thunk_util.h b/third_party/xla/xla/service/gpu/fusions/thunk_util.h
index 32ba8e6a267a8c..a78bb76f3cdd2e 100644
--- a/third_party/xla/xla/service/gpu/fusions/thunk_util.h
+++ b/third_party/xla/xla/service/gpu/fusions/thunk_util.h
@@ -22,7 +22,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/ir_emitter_context.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/service/gpu/fusions/transpose.cc b/third_party/xla/xla/service/gpu/fusions/transpose.cc
index 99f113cbafbea7..ca7b3f7ff79228 100644
--- a/third_party/xla/xla/service/gpu/fusions/transpose.cc
+++ b/third_party/xla/xla/service/gpu/fusions/transpose.cc
@@ -306,7 +306,7 @@ std::optional<IndexingMap> TransposeFusion::ComputeThreadIdToOutputIndexing(
           tiling_.GetNumBlocks(), tiling_.GetThreadTileSize(),
           permuted_tiled_shape.dimensions()),
       GetBitcastMap(permuted_tiled_shape, hero.shape(), ctx));
-  map.Simplify();
+  map.Simplify(GetIndexingMapForInstruction);
   return map;
 }
 
@@ -318,7 +318,7 @@ std::optional<IndexingMap> TransposeFusion::ComputeThreadIdToInputIndexing(
   auto map = ComposeIndexingMaps(
       GetIndexingMapForTiling(tiling_, ctx),
       GetBitcastMap(tiling_.GetXlaShape(), hero.operand(0)->shape(), ctx));
-  map.Simplify();
+  map.Simplify(GetIndexingMapForInstruction);
   return map;
 }
 
diff --git a/third_party/xla/xla/service/gpu/fusions/transpose_mlir.cc b/third_party/xla/xla/service/gpu/fusions/transpose_mlir.cc
index 9311097e0093f8..9e2e5be2ea564b 100644
--- a/third_party/xla/xla/service/gpu/fusions/transpose_mlir.cc
+++ b/third_party/xla/xla/service/gpu/fusions/transpose_mlir.cc
@@ -111,17 +111,16 @@ Tiling ComputeTransposeTiling(const TransposeDescription& tiled_transpose) {
 }
 
 // Returns transpose heroes that should be codegened via shmem.
-absl::flat_hash_set<const HloInstruction*> GetShMemTranposes(
+std::vector<const HloInstruction*> GetShMemTransposes(
     const HloFusionAnalysis& analysis) {
-  absl::flat_hash_set<const HloInstruction*> tranposes_to_tile;
+  ConstHloInstructionSet transposes_to_tile;
   for (const auto [hero, root] :
        llvm::zip(analysis.fusion_heroes(), analysis.fusion_roots())) {
-    if (!GetDescriptionForTiledTransposeEmitter(*root, *hero)) {
-      continue;
+    if (GetDescriptionForTiledTransposeEmitter(*root, *hero)) {
+      transposes_to_tile.insert(hero);
     }
-    tranposes_to_tile.insert(hero);
   }
-  return tranposes_to_tile;
+  return {transposes_to_tile.begin(), transposes_to_tile.end()};
 }
 
 }  // namespace
@@ -129,7 +128,7 @@ absl::flat_hash_set<const HloInstruction*> GetShMemTranposes(
 MlirTransposeFusion::MlirTransposeFusion(const HloFusionAnalysis& analysis)
     : analysis_(analysis),
       tiling_(ComputeTransposeTiling(analysis.tiled_transpose())),
-      shmem_transposes_(GetShMemTranposes(analysis)) {
+      shmem_transposes_(GetShMemTransposes(analysis)) {
   for (auto [root, hero] :
        llvm::zip(analysis_.fusion_roots(), analysis_.fusion_heroes())) {
     if (auto transpose = GetDescriptionForTiledTransposeEmitter(*root, *hero)) {
@@ -139,34 +138,14 @@ MlirTransposeFusion::MlirTransposeFusion(const HloFusionAnalysis& analysis)
   }
 }
 
-/*static*/ bool MlirTransposeFusion::IsSupported(
-    const HloFusionAnalysis& analysis) {
-  // If there is a hero, which does not have a transpose, the codegen might
-  // fail because of the incorrect thread ID mapping for that particular case.
-  for (const auto [hero, root] :
-       llvm::zip(analysis.fusion_heroes(), analysis.fusion_roots())) {
-    if (!GetDescriptionForTiledTransposeEmitter(*root, *hero)) {
-      return false;
-    }
-  }
-  return mlir_converter::IsHloConversionSupported(
-      analysis.fusion(), analysis.device_info().gpu_compute_capability());
-}
-
 std::optional<IndexingMap> MlirTransposeFusion::ComputeThreadIdToOutputIndexing(
-    int64_t root_index, MLIRContext* ctx) const {
+    int64_t root_index, MLIRContext* mlir_context) const {
   const auto& hero = *analysis_.fusion_heroes()[root_index];
-  const auto& root = *analysis_.fusion_roots()[root_index];
-  if (!GetDescriptionForTiledTransposeEmitter(root, hero)) {
-    // Non-transpose roots are elementwise by definition.
-    return ComputeThreadIdToInputIndexing(root_index, 0, ctx);
-  }
-
   // The block offsets are permuted, but the thread offsets remain the same.
-  auto block_offset = GetBlockOffsetsForTiling(tiling_, ctx)
+  auto block_offset = GetBlockOffsetsForTiling(tiling_, mlir_context)
                           .getSubMap(std::vector<unsigned>{permutation_.begin(),
                                                            permutation_.end()});
-  auto thread_offset = GetThreadOffsetsForTiling(tiling_, ctx);
+  auto thread_offset = GetThreadOffsetsForTiling(tiling_, mlir_context);
   auto permuted_tiled_shape =
       ShapeUtil::MakeShape(U8, Permute(tiling_.GetShape(), permutation_));
 
@@ -175,22 +154,34 @@ std::optional<IndexingMap> MlirTransposeFusion::ComputeThreadIdToOutputIndexing(
           block_offset, thread_offset, tiling_.GetNumThreadsPerBlock(),
           tiling_.GetNumBlocks(), tiling_.GetThreadTileSize(),
           permuted_tiled_shape.dimensions()),
-      GetBitcastMap(permuted_tiled_shape, hero.shape(), ctx));
-  map.Simplify();
+      GetBitcastMap(permuted_tiled_shape, hero.shape(), mlir_context));
+  map.Simplify(GetIndexingMapForInstruction);
   return map;
 }
 
-std::optional<IndexingMap> MlirTransposeFusion::ComputeThreadIdToInputIndexing(
-    int64_t root_index, int64_t hero_operand_index, MLIRContext* ctx) const {
-  const auto& hero = *analysis_.fusion_heroes()[root_index];
-
+IndexingMap MlirTransposeFusion::ComputeThreadIdToInputIndexing(
+    const HloInstruction& hero, MLIRContext* mlir_context) const {
   auto map = ComposeIndexingMaps(
-      GetIndexingMapForTiling(tiling_, ctx),
-      GetBitcastMap(tiling_.GetXlaShape(), hero.operand(0)->shape(), ctx));
-  map.Simplify();
+      GetIndexingMapForTiling(tiling_, mlir_context),
+      GetBitcastMap(tiling_.GetXlaShape(), hero.operand(0)->shape(),
+                    mlir_context));
+  map.Simplify(GetIndexingMapForInstruction);
   return map;
 }
 
+std::optional<IndexingMap> MlirTransposeFusion::ComputeThreadIdToInputIndexing(
+    int64_t root_index, int64_t hero_operand_index,
+    MLIRContext* mlir_context) const {
+  const auto& hero = *analysis_.fusion_heroes()[root_index];
+  const auto& root = *analysis_.fusion_roots()[root_index];
+  if (!GetDescriptionForTiledTransposeEmitter(root, hero)) {
+    // Non-transpose roots are elementwise by definition.
+    return ComputeThreadIdToOutputIndexing(root_index, mlir_context);
+  }
+  return ComputeThreadIdToInputIndexing(*analysis_.fusion_heroes()[root_index],
+                                        mlir_context);
+}
+
 LaunchDimensions MlirTransposeFusion::launch_dimensions() const {
   return LaunchDimensions(tiling_.GetNumBlocks(),
                           tiling_.GetNumThreadsPerBlock());
@@ -212,10 +203,11 @@ IndexingMap GetSharedMemoryWriteIndexingMap(
           thread_id_indexing.GetSymbolCount(),
           {c0, th_x.floorDiv(32) + 4 * tile_sizes[loop_dim], th_x % 32},
           mlir_context),
-      thread_id_indexing.GetDimensionRanges(),
-      thread_id_indexing.GetSymbolRanges(),
+      thread_id_indexing.GetDimVars(),
+      thread_id_indexing.GetRangeVars(),
+      thread_id_indexing.GetRTVars(),
       thread_id_indexing.GetConstraints()};
-  shmem_write_indexing.Simplify();
+  shmem_write_indexing.Simplify(GetIndexingMapForInstruction);
   return shmem_write_indexing;
 }
 
@@ -226,8 +218,8 @@ IndexingMap GetSharedMemoryReadIndexingMap(
   IndexingMap write_indexing =
       GetSharedMemoryWriteIndexingMap(thread_id_indexing, loop_dim);
   return IndexingMap{write_indexing.GetAffineMap().getSubMap({0, 2, 1}),
-                     write_indexing.GetDimensionRanges(),
-                     write_indexing.GetSymbolRanges(),
+                     write_indexing.GetDimVars(), write_indexing.GetRangeVars(),
+                     write_indexing.GetRTVars(),
                      write_indexing.GetConstraints()};
 }
 
@@ -242,32 +234,13 @@ absl::StatusOr<SmallVector<Value, 4>> MlirTransposeFusion::EmitWriteToShMemMlir(
   int num_inputs = fusion.fused_instructions_computation()->num_parameters();
   int num_outputs = entry_function.getArguments().size() - num_inputs;
 
-  SmallPtrSet<const HloInstruction*, 8> emitted_heros;
-
+  MLIRContext* mlir_context = builder.getContext();
   SmallVector<Value> shmem_intermediate_result;
-  for (const auto& [root_index, hero_and_root] : llvm::enumerate(
-           llvm::zip(analysis_.fusion_heroes(), analysis_.fusion_roots()))) {
-    const HloInstruction* transpose = std::get<0>(hero_and_root);
-    const HloInstruction* root = std::get<1>(hero_and_root);
-
-    // The same hero can occure only multiple (hero, root) pair. We should emit
-    // the write to shmem only once.
-    if (!emitted_heros.insert(transpose).second) {
-      continue;
-    }
-
-    // Skip non-transpose heroes and handle them in EmitReadFromShMemMlir.
-    auto description =
-        GetDescriptionForTiledTransposeEmitter(*root, *transpose);
-    if (!description.has_value()) {
-      continue;
-    }
-
-    auto input_indexing = ComputeThreadIdToInputIndexing(
-        root_index, /*hero_operand_index=*/0, builder.getContext());
-    TF_RET_CHECK(input_indexing) << "Indexing is never nullopt";
+  for (auto* transpose : shmem_transposes_) {
+    auto input_indexing =
+        ComputeThreadIdToInputIndexing(*transpose, mlir_context);
     IndexingMap shmem_input_indexing =
-        GetSharedMemoryWriteIndexingMap(*input_indexing, permutation_[2]);
+        GetSharedMemoryWriteIndexingMap(input_indexing, permutation_[2]);
 
     // Allocate shared memory.
     const HloInstruction* transpose_operand = transpose->operand(0);
@@ -278,11 +251,11 @@ absl::StatusOr<SmallVector<Value, 4>> MlirTransposeFusion::EmitWriteToShMemMlir(
 
     // Emit loop that writes subgraphs of transpose operands to shmem.
     auto shmem_result = EmitThreadLoopNest(
-        builder, {shmem}, *input_indexing,
+        builder, {shmem}, input_indexing,
         [&](ValueRange output_tensors, ValueRange dim_values,
             ValueRange symbol_values) -> SmallVector<Value> {
           auto input_indices =
-              ApplyAffineMap(input_indexing->GetAffineMap(), dim_values,
+              ApplyAffineMap(input_indexing.GetAffineMap(), dim_values,
                              symbol_values, builder);
           auto shmem_indices =
               ApplyAffineMap(shmem_input_indexing.GetAffineMap(), dim_values,
@@ -313,115 +286,52 @@ absl::Status MlirTransposeFusion::EmitReadFromShMemMlir(
     const HloFusionInstruction& fusion,
     const mlir_converter::PartitionedComputations& computations,
     const CallTargetProvider& call_targets, ValueRange shmem_tensors) const {
-  SmallVector<Value, 4> result_tensors;
-
   int num_inputs = fusion.fused_instructions_computation()->num_parameters();
-
-  SmallPtrSet<const HloInstruction*, 16> hero_roots{
-      analysis_.fusion_roots().begin(), analysis_.fusion_roots().end()};
-
-  // Cache for root indexing per hero. If multiple roots use the same hero, they
-  // will have identical indexing.
-  absl::flat_hash_map<const HloInstruction*, IndexingMap> root_to_hero_indexing;
-
-  int transpose_hero_count = 0;
-
-  // Map from hero instruction to shmem tensor value.
-  absl::flat_hash_map<const HloInstruction*, Value> hero_to_shmem_tensor;
-
+  auto* mlir_context = builder.getContext();
   ValueRange output_tensor_args =
       entry_function.getArguments().drop_front(num_inputs);
+  auto output_indexing = *ComputeThreadIdToOutputIndexing(0, mlir_context);
+  auto shmem_output_indexing =
+      GetSharedMemoryReadIndexingMap(output_indexing, permutation_[2]);
+  auto epilogue_indexing = ComputeEpilogueInputToOutputIndexing(
+      analysis_.fusion_heroes()[0], mlir_context);
+  auto root_indexing = ComposeIndexingMaps(output_indexing, epilogue_indexing);
+  auto result_tensors = EmitThreadLoopNest(
+      builder, output_tensor_args, output_indexing,
+      [&](ValueRange output_tensors, ValueRange dim_values,
+          ValueRange symbol_values) -> SmallVector<Value> {
+        auto shmem_indices =
+            ApplyAffineMap(shmem_output_indexing.GetAffineMap(), dim_values,
+                           symbol_values, builder);
+        llvm::SmallVector<Value> transpose_values;
+        for (auto shmem : shmem_tensors) {
+          transpose_values.push_back(
+              builder.create<ExtractOp>(shmem, shmem_indices));
+        }
+        auto root_indices = ApplyAffineMap(root_indexing.GetAffineMap(),
+                                           dim_values, symbol_values, builder);
+        auto result_scalars =
+            EmitEpilogue(computations, entry_function, transpose_values,
+                         root_indices, builder);
+        SmallVector<Value> results;
+        results.reserve(output_tensor_args.size());
+        const auto& first_shape = analysis_.fusion_roots().front()->shape();
+        for (auto [tensor, value, root] : llvm::zip(
+                 output_tensors, result_scalars, analysis_.fusion_roots())) {
+          llvm::SmallVector<Value> indices;
+          if (ShapeUtil::EqualIgnoringElementType(first_shape, root->shape())) {
+            indices = root_indices;
+          } else {
+            auto bitcast_map =
+                GetBitcastMap(first_shape, root->shape(), mlir_context);
+            indices = ApplyAffineMap(bitcast_map.GetAffineMap(), root_indices,
+                                     {}, builder);
+          }
+          results.push_back(builder.create<InsertOp>(value, tensor, indices));
+        }
+        return results;
+      });
 
-  for (const auto& [root_index, hero_and_root] : llvm::enumerate(
-           llvm::zip(analysis_.fusion_heroes(), analysis_.fusion_roots()))) {
-    const HloInstruction* transpose = std::get<0>(hero_and_root);
-    const HloInstruction* root = std::get<1>(hero_and_root);
-
-    auto* mlir_context = builder.getContext();
-    auto output_indexing =
-        ComputeThreadIdToOutputIndexing(root_index, mlir_context);
-    TF_RET_CHECK(output_indexing) << "Indexing is never nullopt";
-
-    if (!root_to_hero_indexing.contains(transpose)) {
-      auto epilogue_indexing = ComputeEpilogueInputToOutputIndexing(
-          transpose, mlir_context,
-          /*is_root=*/[&](const HloInstruction* instr) {
-            return hero_roots.contains(instr);
-          });
-      root_to_hero_indexing.emplace(
-          transpose, ComposeIndexingMaps(*output_indexing, epilogue_indexing));
-    }
-
-    const IndexingMap& root_indexing = root_to_hero_indexing.at(transpose);
-
-    IndexingMap shmem_output_indexing =
-        GetSharedMemoryReadIndexingMap(*output_indexing, permutation_[2]);
-    auto description =
-        GetDescriptionForTiledTransposeEmitter(*root, *transpose);
-
-    if (description.has_value()) {
-      auto subresult_tensors = EmitThreadLoopNest(
-          builder, output_tensor_args[root_index], *output_indexing,
-          [&](ValueRange output_tensors, ValueRange dim_values,
-              ValueRange symbol_values) -> SmallVector<Value> {
-            auto root_indices =
-                ApplyAffineMap(root_indexing.GetAffineMap(), dim_values,
-                               symbol_values, builder);
-            auto shmem_indices =
-                ApplyAffineMap(shmem_output_indexing.GetAffineMap(), dim_values,
-                               symbol_values, builder);
-
-            if (!hero_to_shmem_tensor.contains(transpose)) {
-              hero_to_shmem_tensor[transpose] =
-                  shmem_tensors[transpose_hero_count];
-              ++transpose_hero_count;
-            }
-
-            mlir::Value value = builder.create<ExtractOp>(
-                hero_to_shmem_tensor[transpose], shmem_indices);
-            auto result_scalars = EmitEpilogue(computations, entry_function,
-                                               value, root_indices, builder);
-            SmallVector<Value> results;
-            results.reserve(output_tensor_args.size());
-            for (auto [tensor, value] :
-                 llvm::zip(output_tensors, result_scalars)) {
-              results.push_back(
-                  builder.create<InsertOp>(value, tensor, root_indices));
-            }
-            return results;
-          });
-      result_tensors.append(subresult_tensors.begin(), subresult_tensors.end());
-    } else {
-      auto indexing = ComputeThreadIdToOutputIndexing(0, builder.getContext());
-      TF_RET_CHECK(indexing) << "Indexing is never nullopt";
-      auto subresult_tensors = EmitThreadLoopNest(
-          builder, output_tensor_args, *indexing,
-          [&](ValueRange output_tensors, ValueRange dim_values,
-              ValueRange symbol_values) -> SmallVector<Value> {
-            auto output_indices = ApplyAffineMap(
-                indexing->GetAffineMap(), dim_values, symbol_values, builder);
-
-            // Generate the operands for the root function: input tensors +
-            // output indices.
-            llvm::SmallVector<Value> operands(
-                entry_function.getArguments().take_front(num_inputs));
-            absl::c_copy(output_indices, std::back_inserter(operands));
-
-            auto result_scalars =
-                builder.create<PureCallOp>(call_targets(root), operands);
-
-            SmallVector<Value> results;
-            results.reserve(output_tensor_args.size());
-            for (auto [tensor, value] :
-                 llvm::zip(output_tensors, result_scalars.getResults())) {
-              results.push_back(
-                  builder.create<InsertOp>(value, tensor, output_indices));
-            }
-            return results;
-          });
-      result_tensors.append(subresult_tensors.begin(), subresult_tensors.end());
-    }
-  }
   builder.create<ReturnOp>(result_tensors);
   return absl::OkStatus();
 }
@@ -429,7 +339,7 @@ absl::Status MlirTransposeFusion::EmitReadFromShMemMlir(
 std::vector<const HloInstruction*>
 MlirTransposeFusion::GetInstructionsWithCustomCodegen(
     const HloFusionInstruction& fusion) const {
-  return {shmem_transposes_.begin(), shmem_transposes_.end()};
+  return GetShMemTransposes(analysis_);
 }
 
 absl::Status MlirTransposeFusion::EmitEntryFunction(
diff --git a/third_party/xla/xla/service/gpu/fusions/transpose_mlir.h b/third_party/xla/xla/service/gpu/fusions/transpose_mlir.h
index 3df6073f5d924e..8329cdd852ae63 100644
--- a/third_party/xla/xla/service/gpu/fusions/transpose_mlir.h
+++ b/third_party/xla/xla/service/gpu/fusions/transpose_mlir.h
@@ -17,15 +17,14 @@ limitations under the License.
 
 #include <cstdint>
 #include <optional>
+#include <vector>
 
-#include "absl/container/flat_hash_set.h"
 #include "llvm/ADT/SmallVector.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/ImplicitLocOpBuilder.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/IR/ValueRange.h"  // from @llvm-project
-#include "mlir/Interfaces/DataLayoutInterfaces.h"  // from @llvm-project
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/gpu/fusions/mlir/computation_partitioner.h"
 #include "xla/service/gpu/fusions/mlir/mlir_fusion_emitter.h"
@@ -52,16 +51,17 @@ class MlirTransposeFusion : public MlirFusionEmitterBase {
   explicit MlirTransposeFusion(const HloFusionAnalysis& analysis);
   LaunchDimensions launch_dimensions() const override;
 
-  static bool IsSupported(const HloFusionAnalysis& analysis);
-
   std::optional<IndexingMap> ComputeThreadIdToOutputIndexing(
-      int64_t root_index, mlir::MLIRContext* ctx) const override;
+      int64_t root_index, mlir::MLIRContext* mlir_context) const override;
 
   std::optional<IndexingMap> ComputeThreadIdToInputIndexing(
       int64_t root_index, int64_t hero_operand_index,
-      mlir::MLIRContext* ctx) const override;
+      mlir::MLIRContext* mlir_context) const override;
 
  protected:
+  IndexingMap ComputeThreadIdToInputIndexing(
+      const HloInstruction& hero, mlir::MLIRContext* mlir_context) const;
+
   absl::Status EmitEntryFunction(
       const mlir_converter::PartitionedComputations& computations,
       const mlir_converter::CallTargetProvider& call_targets,
@@ -87,7 +87,7 @@ class MlirTransposeFusion : public MlirFusionEmitterBase {
   const HloFusionAnalysis& analysis_;
   Tiling tiling_;
   Vector3 permutation_;
-  absl::flat_hash_set<const HloInstruction*> shmem_transposes_;
+  std::vector<const HloInstruction*> shmem_transposes_;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/service/gpu/fusions/transpose_mlir_test.cc b/third_party/xla/xla/service/gpu/fusions/transpose_mlir_test.cc
index e1d64067afb90a..2b3ddc04e5a3eb 100644
--- a/third_party/xla/xla/service/gpu/fusions/transpose_mlir_test.cc
+++ b/third_party/xla/xla/service/gpu/fusions/transpose_mlir_test.cc
@@ -251,7 +251,6 @@ TEST_F(MlirTransposeFusionTest, Transpose021_NoEpilogue) {
     // CHECK-DAG:  %[[C0:.*]] = arith.constant 0 : index
     // CHECK-DAG:  %[[C1:.*]] = arith.constant 1 : index
     // CHECK-DAG:  %[[C8:.*]] = arith.constant 8 : index
-
     // CHECK:      %[[SHMEM:.*]] = xla_gpu.allocate_shared : tensor<1x32x32xf32>
     // CHECK:      %[[SHMEM_WITH_VALS:.*]] = scf.for
     // CHECK-SAME:     %[[C0]] to %[[C8]] step %[[C1]]
@@ -285,6 +284,7 @@ TEST_F(MlirTransposeFusionTest, Transpose_4D) {
         calls=%fused_computation
     }
   )";
+  TF_EXPECT_OK(EmitAndCheckIR(kHloString, "// CHECK: xla_gpu.allocate_shared"));
   EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1e-3}));
 }
 
@@ -303,6 +303,7 @@ TEST_F(MlirTransposeFusionTest, Transpose_2D) {
         calls=%fused_computation
     }
   )";
+  TF_EXPECT_OK(EmitAndCheckIR(kHloString, "// CHECK: xla_gpu.allocate_shared"));
   EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1e-3}));
 }
 
@@ -328,6 +329,7 @@ TEST_F(MlirTransposeFusionTest, Transpose_2D_2) {
       ROOT %fusion = f32[2820,17]{1,0} fusion(%p0, %p1), kind=kInput, calls=%fused_computation
     }
   )";
+  TF_EXPECT_OK(EmitAndCheckIR(kHloString, "// CHECK: xla_gpu.allocate_shared"));
   EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1e-3}));
 }
 
@@ -352,6 +354,7 @@ TEST_F(MlirTransposeFusionTest, MultipleRootsForTranspose) {
         fusion(), kind=kInput, calls=%fused_computation
     }
   )";
+  TF_EXPECT_OK(EmitAndCheckIR(kHloString, "// CHECK: xla_gpu.allocate_shared"));
   EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1e-3}));
 }
 
@@ -369,6 +372,60 @@ TEST_F(MlirTransposeFusionTest, PartialTile) {
       ROOT %fusion = f64[6,4,2,24] fusion(%p0), kind=kInput, calls=%fused_computation
     }
   )";
+  TF_EXPECT_OK(EmitAndCheckIR(kHloString, "// CHECK: xla_gpu.allocate_shared"));
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1e-3}));
+}
+
+TEST_F(MlirTransposeFusionTest, MixedIndexing) {
+  auto kHloString = R"(
+    HloModule m
+
+    fused_computation {
+      %p0 = f64[24,2,6,4] parameter(0)
+      %bc = f64[24,2,24] bitcast(%p0)
+      %t1 = f64[6,4,2,24] transpose(%p0), dimensions={2,3,1,0}
+      %t2 = f64[24,2,24] transpose(%bc), dimensions={2,1,0}
+      %p1 = f64[] parameter(1)
+      %bc1 = f64[6,4,2,24] broadcast(%p1), dimensions={}
+      %bc2 = f64[24,2,24] broadcast(%p1), dimensions={}
+      %a1 = f64[6,4,2,24] add(%t1, %bc1)
+      %a2 = f64[24,2,24] add(%t2, %bc2)
+      ROOT %t = (f64[6,4,2,24], f64[24,2,24]) tuple(%a1, %a2)
+    }
+
+    ENTRY main {
+      %p0 = f64[24,2,6,4] parameter(0)
+      %p1 = f64[] parameter(1)
+      ROOT %fusion = (f64[6,4,2,24], f64[24,2,24]) fusion(%p0, %p1),
+        kind=kInput, calls=%fused_computation
+    }
+  )";
+  TF_EXPECT_OK(EmitAndCheckIR(kHloString, "// CHECK: xla_gpu.allocate_shared"));
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1e-3}));
+}
+
+TEST_F(MlirTransposeFusionTest, SideOutputs) {
+  auto kHloString = R"(
+    HloModule m
+
+    fused_computation {
+      %p0 = f64[24,2,36] parameter(0)
+      %p1 = f64[36,2,24] parameter(1)
+      %tr = f64[36,2,24] transpose(%p0), dimensions={2,1,0}
+      %neg = f64[36,2,24] negate(%p1)
+      %log = f64[24,2,36] log(%p0)
+      ROOT %t = (f64[36,2,24], f64[36,2,24], f64[24,2,36])
+        tuple(%neg, %tr, %log)
+    }
+
+    ENTRY main {
+      %p0 = f64[24,2,36] parameter(0)
+      %p1 = f64[36,2,24] parameter(1)
+      ROOT %fusion = (f64[36,2,24], f64[36,2,24], f64[24,2,36])
+        fusion(%p0, %p1), kind=kInput, calls=%fused_computation
+    }
+  )";
+  TF_EXPECT_OK(EmitAndCheckIR(kHloString, "// CHECK: xla_gpu.allocate_shared"));
   EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1e-3}));
 }
 
diff --git a/third_party/xla/xla/service/gpu/fusions/triton.cc b/third_party/xla/xla/service/gpu/fusions/triton.cc
index d7944a318a12e5..2fc6d15898da6c 100644
--- a/third_party/xla/xla/service/gpu/fusions/triton.cc
+++ b/third_party/xla/xla/service/gpu/fusions/triton.cc
@@ -137,7 +137,6 @@ absl::StatusOr<FusionEmissionResult> TritonFusion::Emit(
       TF_ASSIGN_OR_RETURN(
           triton_wrapper_result,
           TritonWrapper(analysis, impl_fn_name, hlo_computation,
-                        kTritonSoftmaxFusionKind,
                         ir_emitter_context.cuda_compute_capability(),
                         ir_emitter_context.gpu_device_info(), config,
                         ir_emitter_context.llvm_module(), &EmitSoftMax,
@@ -154,6 +153,7 @@ absl::StatusOr<FusionEmissionResult> TritonFusion::Emit(
         triton_config.set_split_k(1);
         triton_config.set_num_stages(1);
         triton_config.set_num_warps(2);
+        triton_config.set_num_ctas(1);
       }
       TF_ASSIGN_OR_RETURN(
           TritonGemmConfig config,
@@ -164,13 +164,13 @@ absl::StatusOr<FusionEmissionResult> TritonFusion::Emit(
       TF_ASSIGN_OR_RETURN(
           triton_wrapper_result,
           TritonWrapper(analysis, impl_fn_name, hlo_computation,
-                        kTritonGemmFusionKind,
                         ir_emitter_context.cuda_compute_capability(),
                         ir_emitter_context.gpu_device_info(), config,
                         ir_emitter_context.llvm_module(), &EmitMatMul,
                         *ir_emitter_context.mlir_context()));
-      launch_dimensions =
-          GetMatMulLaunchDimensions(analysis, analysis_.fusion(), config);
+      TF_ASSIGN_OR_RETURN(
+          launch_dimensions,
+          GetMatMulLaunchDimensions(analysis, analysis_.fusion(), config));
     }
 
     llvm::Function* impl_fn =
diff --git a/third_party/xla/xla/service/gpu/gemm_algorithm_picker.cc b/third_party/xla/xla/service/gpu/gemm_algorithm_picker.cc
index a5ff6665031619..515a05f2ffb9d8 100644
--- a/third_party/xla/xla/service/gpu/gemm_algorithm_picker.cc
+++ b/third_party/xla/xla/service/gpu/gemm_algorithm_picker.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_set.h"
-#include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
@@ -49,11 +48,11 @@ limitations under the License.
 #include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/gpu/redzone_allocator.h"
 #include "xla/stream_executor/scratch_allocator.h"
+#include "xla/tsl/util/proto/proto_utils.h"
 #include "xla/util.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/statusor.h"
-#include "tsl/util/proto/proto_utils.h"
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "xla/service/gpu/buffer_comparator.h"
@@ -187,6 +186,12 @@ class GemmAutotuner {
         -> absl::StatusOr<se::blas::ProfileResult> {
       se::OwningScratchAllocator<> scratch_allocator(
           stream_->parent()->device_ordinal(), autotune_config_.GetAllocator());
+      // Run a warmup iteration without the profiler active.
+      TF_RETURN_IF_ERROR(plan->ExecuteOnStream(
+          stream_, lhs_buffer_, rhs_buffer_, output_buffer_, output_buffer_,
+          bias_buffer, aux_buffer, a_scale_buffer, b_scale_buffer,
+          c_scale_buffer, d_scale_buffer, d_amax_buffer, algorithm,
+          scratch_allocator));
       se::blas::ProfileResult profile_result;
       TF_RETURN_IF_ERROR(plan->ExecuteOnStream(
           stream_, lhs_buffer_, rhs_buffer_, output_buffer_, output_buffer_,
@@ -241,15 +246,6 @@ class GemmAutotuner {
 
     auto tuned_func = [&](const se::blas::AlgorithmType& algorithm)
         -> absl::StatusOr<se::blas::ProfileResult> {
-      // Do a warm-up run first, without a profile result. This avoids a timeout
-      // and error message if lazy module loading is enabled by ensuring that
-      // lazy loading happens outside the GpuTimer. RunGemm swallows error codes
-      // when profile_result is passed, as it is in the measurement below, but
-      // not otherwise. It is, therefore, consistent to ignore the error code
-      // here.
-      static_cast<void>(RunGemm(gemm_config, lhs_buffer_, rhs_buffer_,
-                                output_buffer_, workspace_buffer,
-                                deterministic_ops_, stream_, algorithm));
       se::blas::ProfileResult profile_result;
       // We expect GemmWithAlgorithm to fail sometimes -- in fact, it will fail
       // for all algorithms if we're targeting < sm_50. But because we pass a
@@ -449,7 +445,7 @@ absl::StatusOr<bool> RunOnComputation(HloComputation* computation,
                                       AutotuneConfig config) {
   bool changed = false;
   for (HloInstruction* instr : computation->instructions()) {
-    if (IsCublasGemm(*instr) || IsCublasLtMatmulF8(*instr)) {
+    if (IsCublasGemm(*instr)) {
       TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(instr, config));
       changed |= result;
     }
diff --git a/third_party/xla/xla/service/gpu/gemm_fusion.cc b/third_party/xla/xla/service/gpu/gemm_fusion.cc
index 999989208d73ce..05e758a73f3d47 100644
--- a/third_party/xla/xla/service/gpu/gemm_fusion.cc
+++ b/third_party/xla/xla/service/gpu/gemm_fusion.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "xla/hlo/ir/dfs_hlo_visitor_with_default.h"
@@ -162,14 +163,17 @@ struct HlosAndRequirements {
 HloInstruction& FuseDot(const HloDotInstruction& dot,
                         const HloInstruction& fused_lhs,
                         const HloInstruction& fused_rhs,
+                        std::optional<const HloInstruction*> fused_meta,
                         HloComputation::Builder& builder  // append
 ) {
-  CHECK_EQ(dot.operand_count(), 2);
   VLOG(3) << "Fusing " << dot.ToString();
 
-  std::array<HloInstruction*, 2> hlo_new_operands = {
+  std::vector<HloInstruction*> hlo_new_operands = {
       const_cast<HloInstruction*>(&fused_lhs),
       const_cast<HloInstruction*>(&fused_rhs)};
+  if (fused_meta.has_value()) {
+    hlo_new_operands.push_back(const_cast<HloInstruction*>(fused_meta.value()));
+  }
   return *builder.AddInstruction(
       dot.CloneWithNewOperands(dot.shape(), hlo_new_operands));
 }
@@ -477,15 +481,15 @@ HlosAndRequirements FuseTowardOperands(
 //
 // The return value contains the HLOs corresponding to the given dot operand and
 // the requirements corresponding to the whole fusion so far.
-HlosAndRequirements FuseDotOperand(
+absl::StatusOr<HlosAndRequirements> FuseDotOperand(
     const HloInstruction& dot, int operand_index,
     const se::GpuComputeCapability& gpu_version,
     HloComputation::Builder& builder,            // append
     std::vector<HloInstruction*>& fusion_params  // append
 ) {
   // Direct dot inputs have well defined dimension orders.
-  const FusionContext context =
-      FusionContext::FromDotOperand(dot, operand_index);
+  TF_ASSIGN_OR_RETURN(const FusionContext context,
+                      FusionContext::FromDotOperand(dot, operand_index));
   const HloInstruction& operand = *dot.operand(operand_index);
   return FuseTowardOperands(operand, context.dim_orders().at(&operand),
                             TritonFusionAnalysis::kMaxParameterPerDotOperand,
@@ -614,18 +618,43 @@ absl::StatusOr<FusionDecision> CreateDotFusion(
     std::vector<HloInstruction*>& fusion_inputs,
     HloInstruction** fusion_output_ptr) {
   VLOG(5) << dot.ToString();
-  if (FusionDecision can_handle = CanTritonHandleGEMM(dot, gpu_version);
-      !can_handle) {
-    VLOG(3) << can_handle.Explain();
-    return can_handle;
-  }
-
-  HlosAndRequirements lhs_hlos_and_reqs = FuseDotOperand(
-      dot, /*operand_index=*/0, gpu_version, builder, fusion_inputs);
-  HlosAndRequirements rhs_hlos_and_reqs = FuseDotOperand(
-      dot, /*operand_index=*/1, gpu_version, builder, fusion_inputs);
-  HloInstruction& fused_dot = FuseDot(dot, *lhs_hlos_and_reqs.fused_hlo,
-                                      *rhs_hlos_and_reqs.fused_hlo, builder);
+  if (CodegenDecision is_supported =
+          IsTritonSupportedInstruction(dot, gpu_version);
+      !is_supported) {
+    VLOG(3) << is_supported.Explain();
+    return is_supported;
+  }
+
+  // Verify sparse dot constraints.
+  if (dot.sparse_operands()) {
+    const SparsityDescriptor& descriptor = dot.sparsity().front();
+    if (dot.sparse_operands() != 1 || descriptor.index() != 0) {
+      return InvalidArgument("Sparsity is only supported on left operand");
+    }
+    if (descriptor.type() != SparsityType::SPARSITY_STRUCTURED_N_M ||
+        descriptor.n() != 2 || descriptor.m() != 4) {
+      return InvalidArgument("Only 2:4 structured sparsity is supported");
+    }
+    // DotDimensionSorter pass makes sure the sparse dimension is minor.
+    CHECK_EQ(descriptor.dimension(), dot.operand(0)->shape().rank() - 1);
+  }
+
+  TF_ASSIGN_OR_RETURN(HlosAndRequirements lhs_hlos_and_reqs,
+                      FuseDotOperand(dot, /*operand_index=*/0, gpu_version,
+                                     builder, fusion_inputs));
+  TF_ASSIGN_OR_RETURN(HlosAndRequirements rhs_hlos_and_reqs,
+                      FuseDotOperand(dot, /*operand_index=*/1, gpu_version,
+                                     builder, fusion_inputs));
+  std::optional<const HloInstruction*> meta_hlo;
+  if (dot.sparse_operands()) {
+    TF_ASSIGN_OR_RETURN(HlosAndRequirements meta_hlos_and_reqs,
+                        FuseDotOperand(dot, /*operand_index=*/2, gpu_version,
+                                       builder, fusion_inputs));
+    meta_hlo.emplace(meta_hlos_and_reqs.fused_hlo);
+  }
+  HloInstruction& fused_dot =
+      FuseDot(dot, *lhs_hlos_and_reqs.fused_hlo, *rhs_hlos_and_reqs.fused_hlo,
+              meta_hlo, builder);
   // For now the RHS doesn't support splits, so it also doesn't impose any
   // requirements.
   HlosAndRequirements fused_output_and_reqs =
@@ -642,7 +671,8 @@ absl::StatusOr<FusionDecision> CreateDotFusion(
       dot.precision_config().algorithm();
   if (algorithm == PrecisionConfig::ALG_DOT_BF16_BF16_F32_X6 ||
       algorithm == PrecisionConfig::ALG_DOT_BF16_BF16_F32_X3 ||
-      dot.GetModule()->config().debug_options().xla_gpu_triton_gemm_any()) {
+      dot.GetModule()->config().debug_options().xla_gpu_triton_gemm_any() ||
+      dot.sparse_operands()) {
     return FusionDecision{};
   }
 
@@ -703,9 +733,14 @@ class GemmFusionVisitor : public DfsHloRewriteVisitor {
     // If a GEMM requiring padding for cuBLAS is encountered here this
     // happened because earlier ShouldTritonHandleGEMM() accepted it and padding
     // was skipped. Accept it ignoring profitability checks.
-    if (!CublasRequiresPadding(*Cast<HloDotInstruction>(dot), gpu_version_) &&
-        !should_fuse) {
-      return absl::OkStatus();
+    // TODO(rocm): check ROCM padding requirements.
+    if (std::holds_alternative<se::CudaComputeCapability>(gpu_version_)) {
+      if (!CublasRequiresPadding(
+              *Cast<HloDotInstruction>(dot),
+              std::get<se::CudaComputeCapability>(gpu_version_)) &&
+          !should_fuse) {
+        return OkStatus();
+      }
     }
 
     HloComputation* computation =
@@ -751,108 +786,9 @@ absl::StatusOr<bool> RunOnComputation(
   return visitor.changed();
 }
 
-bool IsSupportedByTriton(
-    PrecisionConfig::Algorithm algorithm,
-    const se::CudaComputeCapability& cuda_compute_capability) {
-  switch (algorithm) {
-    case PrecisionConfig::ALG_DOT_BF16_BF16_F32:
-      return true;
-
-    case PrecisionConfig::ALG_DOT_TF32_TF32_F32:
-    case PrecisionConfig::ALG_DOT_BF16_BF16_F32_X3:
-    case PrecisionConfig::ALG_DOT_BF16_BF16_F32_X6:
-      return cuda_compute_capability.IsAtLeastAmpere();
-
-    // TODO(b/326579472): Fix the support of this algorithm and maybe allow it
-    // here.
-    case PrecisionConfig::ALG_DOT_F16_F16_F32:
-    // Slow to compile:
-    case PrecisionConfig::ALG_DOT_F32_F32_F32:
-    default:
-      return false;
-  }
-}
 
 }  // namespace
 
-FusionDecision CanTritonHandleGEMM(
-    const HloDotInstruction& dot, const se::GpuComputeCapability& gpu_version) {
-  auto cuda_compute_capability =
-      std::get_if<se::CudaComputeCapability>(&gpu_version);
-
-  if (!cuda_compute_capability) return "Non CUDA device.";
-
-  if (dot.precision_config().algorithm() == PrecisionConfig::ALG_UNSET) {
-    if (!tsl::tensor_float_32_execution_enabled() ||
-        absl::c_any_of(dot.precision_config().operand_precision(),
-                       [](int x) { return x != PrecisionConfig::DEFAULT; })) {
-      return "Non-default precision.";
-    }
-  } else {
-    if (!IsSupportedByTriton(dot.precision_config().algorithm(),
-                             *cuda_compute_capability)) {
-      return "Unsupported algorithm on the current device(s).";
-    }
-  }
-
-  auto supported_output_type = [&](const PrimitiveType t) {
-    switch (t) {
-      case F16:
-      case F32:
-        return true;
-      case BF16:
-        return cuda_compute_capability->IsAtLeast(
-            stream_executor::CudaComputeCapability::AMPERE);
-      default:
-        return false;
-    }
-  };
-
-  // TODO(b/266862493): Support more output types.
-  if (!supported_output_type(dot.shape().element_type())) {
-    return "Unsupported output data type.";
-  }
-
-  if (!IsTritonSupportedDataType(dot.operand(0)->shape().element_type(),
-                                 gpu_version) ||
-      !IsTritonSupportedDataType(dot.operand(1)->shape().element_type(),
-                                 gpu_version)) {
-    return "Unsupported input data type.";
-  }
-
-  const DotDimensionNumbers& dim_numbers = dot.dot_dimension_numbers();
-
-  // TODO(b/269580541): support multiple batch dimensions.
-  if (dim_numbers.lhs_batch_dimensions().size() > 1) {
-    return "Multiple batch dimensions.";
-  }
-
-  // Cases where lhs or rhs have no non-contracting dims are not handled.
-  if (dim_numbers.lhs_batch_dimensions().size() +
-              dim_numbers.lhs_contracting_dimensions().size() ==
-          dot.operand(0)->shape().rank() ||
-      dim_numbers.rhs_batch_dimensions().size() +
-              dim_numbers.rhs_contracting_dimensions().size() ==
-          dot.operand(1)->shape().rank()) {
-    return "No non-contracting dimensions.";
-  }
-
-  for (int operand_number = 0; operand_number <= 1; ++operand_number) {
-    // This pass relies on dot decomposer which ensures that all non-contracting
-    // dimensions are merged into one. Using NonContractingDimensionIndex is
-    // sufficient.
-    const int64_t nc_size =
-        dot.operand(operand_number)
-            ->shape()
-            .dimensions(NonContractingDimensionIndex(dot, operand_number));
-    if (nc_size <= 1) {
-      return "Trivial non-contracting dimensions.";
-    }
-  }
-
-  return FusionDecision{};
-}
-
 bool ShouldTritonHandleGEMM(HloDotInstruction& dot,
                             const se::GpuComputeCapability& gpu_version) {
   std::vector<HloInstruction*> fusion_inputs;
@@ -865,6 +801,13 @@ bool ShouldTritonHandleGEMM(HloDotInstruction& dot,
 absl::StatusOr<bool> GemmFusion::Run(
     HloModule* module,
     const absl::flat_hash_set<absl::string_view>& execution_threads) {
+  auto cuda_compute_capability =
+      std::get_if<se::CudaComputeCapability>(&gpu_version_);
+  if (!cuda_compute_capability || !cuda_compute_capability->IsAtLeastAmpere()) {
+    return absl::FailedPreconditionError(
+        "Triton support is only enabled for Ampere GPUs and up.");
+  }
+
   bool changed = false;
   for (HloComputation* computation :
        module->MakeNonfusionComputations(execution_threads)) {
diff --git a/third_party/xla/xla/service/gpu/gemm_fusion.h b/third_party/xla/xla/service/gpu/gemm_fusion.h
index 1ddf1bd850fc95..1138ad28a36a5f 100644
--- a/third_party/xla/xla/service/gpu/gemm_fusion.h
+++ b/third_party/xla/xla/service/gpu/gemm_fusion.h
@@ -30,10 +30,6 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-// Filters GEMMs which can be handled using Triton.
-FusionDecision CanTritonHandleGEMM(const HloDotInstruction&,
-                                   const se::GpuComputeCapability&);
-
 // Filters GEMMs which are better to handle using Triton.
 bool ShouldTritonHandleGEMM(HloDotInstruction&,
                             const se::GpuComputeCapability&);
diff --git a/third_party/xla/xla/service/gpu/gemm_fusion_autotuner.cc b/third_party/xla/xla/service/gpu/gemm_fusion_autotuner.cc
index a53affc736bb3c..58615453d7ea65 100644
--- a/third_party/xla/xla/service/gpu/gemm_fusion_autotuner.cc
+++ b/third_party/xla/xla/service/gpu/gemm_fusion_autotuner.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
@@ -79,6 +80,7 @@ limitations under the License.
 #include "xla/stream_executor/gpu/redzone_allocator.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tools/hlo_decomposer.h"
+#include "xla/tsl/util/proto/proto_utils.h"
 #include "xla/util.h"
 #include "xla/xla.pb.h"
 #include "xla/xla_data.pb.h"
@@ -88,7 +90,6 @@ limitations under the License.
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
 #include "tsl/platform/threadpool.h"
-#include "tsl/util/proto/proto_utils.h"
 
 // Log levels used in this file:
 // VLOG(1): Overview
@@ -188,6 +189,9 @@ class GemmFusionAutotunerVisitor : public DfsHloRewriteVisitor {
 // This contains all alternative Triton GEMM configs related to one fusion.
 struct GemmConfigSet {
   std::vector<TritonGemmConfig> configs;
+  // Setting this to true disallows verification and fallback to cuBLAS, and
+  // the usage of cuDNN.
+  bool has_sparsity = false;
 };
 
 using CuDnnPlanId = int64_t;
@@ -238,11 +242,14 @@ class GemmConfigSetCollector : public ConstDfsHloVisitorWithDefault {
 
     if (backend_config.kind() == kTritonGemmFusionKind &&
         !backend_config.has_triton_gemm_config()) {
-      CHECK(
-          gemm_config_sets_.insert({fusion, GetGemmConfigSet(fusion)}).second);
+      TF_ASSIGN_OR_RETURN(GemmConfigSet gemm_config_set,
+                          GetGemmConfigSet(fusion));
+      TF_RET_CHECK(
+          gemm_config_sets_.insert({fusion, std::move(gemm_config_set)})
+              .second);
     } else if (backend_config.kind() == kCuDnnFusionKind &&
                !backend_config.has_cudnn_fusion_config()) {
-      CHECK(gemm_config_sets_.insert({fusion, {}}).second);
+      TF_RET_CHECK(gemm_config_sets_.insert({fusion, {}}).second);
     }
 
     handled_fusions_.insert(key);
@@ -254,15 +261,20 @@ class GemmConfigSetCollector : public ConstDfsHloVisitorWithDefault {
   }
 
  private:
-  GemmConfigSet GetGemmConfigSet(const HloFusionInstruction* fusion) {
+  absl::StatusOr<GemmConfigSet> GetGemmConfigSet(
+      const HloFusionInstruction* fusion) {
     const DebugOptions& debug_options =
         fusion->GetModule()->config().debug_options();
     auto cuda_comp =
         std::get<se::CudaComputeCapability>(config_.GetGpuComputeCapability());
-    return {GetPossibleMatmulAutotuneConfigs(
-        *Cast<HloDotInstruction>(hlo_query::GetFirstInstructionWithOpcode(
-            *fusion->called_computations().at(0), HloOpcode::kDot)),
-        cuda_comp, debug_options, config_.ExhaustiveTilingSearch())};
+    const HloDotInstruction* dot_instr =
+        Cast<HloDotInstruction>(hlo_query::GetFirstInstructionWithOpcode(
+            *fusion->called_computations().at(0), HloOpcode::kDot));
+    TF_ASSIGN_OR_RETURN(auto configs, GetPossibleMatmulAutotuneConfigs(
+                                          *dot_instr, cuda_comp, debug_options,
+                                          config_.ExhaustiveTilingSearch()));
+    return GemmConfigSet{std::move(configs),
+                         /*has_sparsity=*/dot_instr->sparse_operands() > 0};
   }
 
   AutotuneConfig config_;
@@ -277,26 +289,32 @@ struct TileSizeLimit {
   int64_t block_k = 0;
 };
 
-TileSizeLimit GetUpperLimit(const HloDotInstruction& dot) {
+absl::StatusOr<TileSizeLimit> GetUpperLimit(const HloDotInstruction& dot) {
+  TF_ASSIGN_OR_RETURN(int64_t non_contracting_index0,
+                      NonContractingDimensionIndex(dot, /*operand_number=*/0));
+  TF_ASSIGN_OR_RETURN(int64_t non_contracting_index1,
+                      NonContractingDimensionIndex(dot, /*operand_number=*/1));
+  TF_ASSIGN_OR_RETURN(int64_t contracting_index0,
+                      ContractingDimensionIndex(dot, /*operand_number=*/0));
   // This is not a sharp upper limit, the actual m value can be much smaller
   // based on how much of the m dimension is physically contiguous.
   // TODO(tdanyluk): Get the exact m value by running a TritonFusionAnalysis.
-  const int64_t m = dot.operand(0)->shape().dimensions(
-      NonContractingDimensionIndex(dot, /*operand_number=*/0));
+  const int64_t m = dot.operand(0)->shape().dimensions(non_contracting_index0);
   // Theoretically the same is true as for m, but that is not possible in
   // practice with the current implementation.
-  const int64_t n = dot.operand(1)->shape().dimensions(
-      NonContractingDimensionIndex(dot, /*operand_number=*/1));
+  const int64_t n = dot.operand(1)->shape().dimensions(non_contracting_index1);
   // This is before doing the split-k transform.
-  const int64_t k = dot.operand(0)->shape().dimensions(
-      ContractingDimensionIndex(dot, /*operand_number=*/0));
+  const int64_t k = dot.operand(0)->shape().dimensions(contracting_index0);
   const int64_t block_m_limit =
       std::max<int64_t>(tsl::NextPowerOfTwoS64(m), kMinTileSize);
   const int64_t block_n_limit =
       std::max<int64_t>(tsl::NextPowerOfTwoS64(n), kMinTileSize);
+  // Increase minimum tile size for the contracting dimension proportionally
+  // to the sparsity multiplier (assume 2:4 structured sparsity).
   const int64_t block_k_limit =
-      std::max<int64_t>(tsl::NextPowerOfTwoS64(k), kMinTileSize);
-  return {block_m_limit, block_n_limit, block_k_limit};
+      std::max<int64_t>(tsl::NextPowerOfTwoS64(k),
+                        kMinTileSize * (dot.sparse_operands() ? 2 : 1));
+  return TileSizeLimit{block_m_limit, block_n_limit, block_k_limit};
 }
 
 int64_t GetSplitKLimit(int64_t block_k, int64_t block_k_limit) {
@@ -315,11 +333,12 @@ constexpr std::array<int, 5> SPLIT_K = {1, 2, 4, 8, 16};
 // It's possible that some other values may be(come) supported.
 constexpr std::array<int, 5> NUM_CTAS = {1, 2, 4, 8, 16};
 
-std::vector<TritonGemmConfig> GetExhaustiveMatmulAutotuneConfigs(
+absl::StatusOr<std::vector<TritonGemmConfig>>
+GetExhaustiveMatmulAutotuneConfigs(
     const HloDotInstruction& dot,
     const se::CudaComputeCapability compute_capability, const int max_split_k,
     const DebugOptions& debug_options) {
-  const TileSizeLimit limit = GetUpperLimit(dot);
+  TF_ASSIGN_OR_RETURN(const TileSizeLimit limit, GetUpperLimit(dot));
   std::vector<TritonGemmConfig> configs;
   bool mma_layout_v2 =
       compute_capability.IsAtLeast(se::CudaComputeCapability::AMPERE);
@@ -345,6 +364,12 @@ std::vector<TritonGemmConfig> GetExhaustiveMatmulAutotuneConfigs(
             if (block_k > limit.block_k) {
               continue;
             }
+            // Sparse meta should have at least one element per thread.
+            // Note: only 2:4 structured sparsity is currently supported.
+            if (dot.sparse_operands() &&
+                block_m * block_k / 16 < num_warps * WarpSize()) {
+              continue;
+            }
             for (int split_k : SPLIT_K) {
               if (split_k >
                   std::min<int64_t>(max_split_k,
@@ -419,9 +444,9 @@ std::vector<TritonGemmConfig> GetFixedMatmulAutotuneConfigs(
 }
 
 // This prefers to take the parameter by moving it.
-std::vector<TritonGemmConfig> ReduceTileSizes(
+absl::StatusOr<std::vector<TritonGemmConfig>> ReduceTileSizes(
     const HloDotInstruction& dot, std::vector<TritonGemmConfig> configs) {
-  const TileSizeLimit limit = GetUpperLimit(dot);
+  TF_ASSIGN_OR_RETURN(const TileSizeLimit limit, GetUpperLimit(dot));
   // Decrease the block sizes and split_k if they are unnecessarily big.
   for (TritonGemmConfig& config : configs) {
     config.block_m = std::min<int64_t>(config.block_m, limit.block_m);
@@ -429,6 +454,13 @@ std::vector<TritonGemmConfig> ReduceTileSizes(
     config.block_k = std::min<int64_t>(config.block_k, limit.block_k);
     config.split_k = std::min<int64_t>(
         config.split_k, GetSplitKLimit(config.block_k, limit.block_k));
+    // Sparse meta should have at least one element per thread.
+    // Note: only 2:4 structured sparsity is currently supported.
+    if (dot.sparse_operands()) {
+      int meta_elements = config.block_m * config.block_k / 16;
+      config.num_warps =
+          std::min<int64_t>(config.num_warps, meta_elements / WarpSize());
+    }
   }
 
   // Remove duplicates.
@@ -438,7 +470,7 @@ std::vector<TritonGemmConfig> ReduceTileSizes(
                                  return !configs_so_far.insert(config).second;
                                }),
                 configs.end());
-  CHECK(!configs.empty());
+  TF_RET_CHECK(!configs.empty());
   return configs;
 }
 
@@ -550,8 +582,6 @@ absl::StatusOr<std::unique_ptr<HloModule>> CudnnGemmAutotuneExtractor(
   TF_RETURN_IF_ERROR(
       new_module->entry_computation()->root_instruction()->set_backend_config(
           gpu_config));
-  CuDnnFusionCompiler compiler(autotune_config);
-  TF_RETURN_IF_ERROR(compiler.Run(new_module.get()).status());
 
   return new_module;
 }
@@ -571,18 +601,19 @@ bool IsFusionKind(const HloInstruction& hlo, absl::string_view kind) {
 
 int GetCuDnnPlanCount(const HloInstruction& hlo,
                       const AutotuneConfig& autotune_config) {
-  auto gpu_config = hlo.backend_config<GpuBackendConfig>();
-  if (!gpu_config.ok() ||
+  if (auto gpu_config = hlo.backend_config<GpuBackendConfig>();
+      !gpu_config.ok() ||
       gpu_config->fusion_backend_config().has_cudnn_fusion_config()) {
     return {};
   }
-  return CuDnnFusionCompiler(autotune_config)
-      .GetAvailablePlanCount(*DynCast<HloFusionInstruction>(&hlo));
+  return CuDnnFusionCompiler::GetAvailablePlanCount(
+      *autotune_config.GetExecutor(), *DynCast<HloFusionInstruction>(&hlo));
 }
 
 bool IsCuDnnEnabled(const AutotuneConfig& config,
                     const DebugOptions& debug_opts) {
-  return std::get<se::CudaComputeCapability>(config.GetGpuComputeCapability())
+  return !config.IsDeviceless() &&
+         std::get<se::CudaComputeCapability>(config.GetGpuComputeCapability())
              .IsAtLeastHopper() &&
          debug_opts.xla_gpu_cudnn_gemm_fusion_level() > 0 &&
          GetDnnVersionInfo(config.GetExecutor()).major_version() >= 9;
@@ -632,16 +663,16 @@ CompileMany(const AutotuneConfig& config, AutotunerCompileUtil& util,
 
     if (IsFusionKind(hlo, kTritonGemmFusionKind)) {
       config_count += gemm_config_set.configs.size();
-      if (IsCuDnnEnabled(config, debug_opts) &&
+      if (!gemm_config_set.has_sparsity && IsCuDnnEnabled(config, debug_opts) &&
           HasAlgorithmSupportedByCudnn(hlo)) {
         config_count += GetCuDnnPlanCount(hlo, config);
       }
     } else if (IsFusionKind(hlo, kCuDnnFusionKind)) {
       config_count += GetCuDnnPlanCount(hlo, config);
     }
+    // Reference config for verification (uses cuBLAS).
+    config_count += !gemm_config_set.has_sparsity;
   }
-  // cuBLAS configs: one per fusion.
-  config_count += gemm_config_sets.size();
 
   std::atomic<int> done_count = 0;
   std::atomic<int> good_count = 0;
@@ -756,16 +787,19 @@ CompileMany(const AutotuneConfig& config, AutotunerCompileUtil& util,
         });
       }
 
-      thread_pool->Schedule([&, fusion] {
-        absl::StatusOr<bool> has_executable =
-            compile_reference_executable(fusion);
-        TF_CHECK_OK(has_executable.status());
-        log(has_executable.value());
-        counter.DecrementCount();
-      });
+      if (!gemm_config_set.has_sparsity) {
+        thread_pool->Schedule([&, fusion] {
+          absl::StatusOr<bool> has_executable =
+              compile_reference_executable(fusion);
+          TF_CHECK_OK(has_executable.status());
+          log(has_executable.value());
+          counter.DecrementCount();
+        });
+      }
 
       if (IsFusionKind(*fusion, kCuDnnFusionKind) ||
           (IsFusionKind(*fusion, kTritonGemmFusionKind) &&
+           !gemm_config_set.has_sparsity &&
            IsCuDnnEnabled(config, debug_opts) &&
            HasAlgorithmSupportedByCudnn(*fusion))) {
         const int plan_count = GetCuDnnPlanCount(*fusion, config);
@@ -803,12 +837,15 @@ CompileMany(const AutotuneConfig& config, AutotunerCompileUtil& util,
         log(has_executable);
       }
 
-      TF_ASSIGN_OR_RETURN(bool has_executable,
-                          compile_reference_executable(fusion));
-      log(has_executable);
+      if (!gemm_config_set.has_sparsity) {
+        TF_ASSIGN_OR_RETURN(bool has_executable,
+                            compile_reference_executable(fusion));
+        log(has_executable);
+      }
 
       if (IsFusionKind(*fusion, kCuDnnFusionKind) ||
           (IsFusionKind(*fusion, kTritonGemmFusionKind) &&
+           !gemm_config_set.has_sparsity &&
            IsCuDnnEnabled(config, debug_opts) &&
            HasAlgorithmSupportedByCudnn(*fusion))) {
         const int plan_count = GetCuDnnPlanCount(*fusion, config);
@@ -864,11 +901,10 @@ absl::StatusOr<AutotuneResult> Execute(const AutotuneConfig& config,
     input_shapes.push_back(param->shape());
   }
 
-  // Run with cuBLAS.
+  // Run with cuBLAS (optional).
   std::optional<ScopedShapedBuffer> reference_buffer;
-  absl::Duration cublas_duration;
-  {
-    TF_RET_CHECK(executable_set.reference != nullptr);
+  absl::Duration cublas_duration = absl::InfiniteDuration();
+  if (executable_set.reference != nullptr) {
     TF_ASSIGN_OR_RETURN(std::optional<ProfilingOutput> output,
                         util.ProfileExecutable(&*executable_set.reference,
                                                stream, inputs, input_shapes));
@@ -925,7 +961,9 @@ absl::StatusOr<AutotuneResult> Execute(const AutotuneConfig& config,
     *res.mutable_run_time() =
         tsl::proto_utils::ToDurationProto(profiling_output->duration);
 
-    if (config.should_check_correctness()) {
+    // Reference buffer is available when `config.should_check_correctness()`
+    // is set and reference executable was compiled.
+    if (reference_buffer.has_value()) {
       TF_ASSIGN_OR_RETURN(
           se::RedzoneAllocator::RedzoneCheckStatus rz_check_status,
           rz_allocator.CheckRedzones());
@@ -1100,7 +1138,7 @@ absl::Status Autotune(
 
 }  // anonymous namespace
 
-std::vector<TritonGemmConfig> GetPossibleMatmulAutotuneConfigs(
+absl::StatusOr<std::vector<TritonGemmConfig>> GetPossibleMatmulAutotuneConfigs(
     const HloDotInstruction& dot,
     const se::CudaComputeCapability compute_capability,
     const DebugOptions& debug_options, bool exhaustive_tiling_search) {
@@ -1157,7 +1195,13 @@ absl::StatusOr<bool> GemmFusionAutotuner::Run(
       if (IsFusionKind(*fusion, kCuDnnFusionKind)) {
         res.mutable_algorithm()->set_algo_id(-1);
       } else {
-        *res.mutable_triton() = kDefaultGemmTiling.ToProto();
+        const HloDotInstruction* dot_instr =
+            Cast<HloDotInstruction>(hlo_query::GetFirstInstructionWithOpcode(
+                *fusion->called_computations().at(0), HloOpcode::kDot));
+        TF_ASSIGN_OR_RETURN(auto configs,
+                            ReduceTileSizes(*dot_instr, {kDefaultGemmTiling}));
+        auto config = configs.front();
+        *res.mutable_triton() = config.ToProto();
       }
       *res.mutable_run_time() =
           tsl::proto_utils::ToDurationProto(absl::ZeroDuration());
diff --git a/third_party/xla/xla/service/gpu/gemm_fusion_autotuner.h b/third_party/xla/xla/service/gpu/gemm_fusion_autotuner.h
index 318875ae9872a8..18a6e1b76caa44 100644
--- a/third_party/xla/xla/service/gpu/gemm_fusion_autotuner.h
+++ b/third_party/xla/xla/service/gpu/gemm_fusion_autotuner.h
@@ -55,7 +55,7 @@ class GemmFusionAutotuner : public HloModulePass {
 
 // TODO(b/266210099): have a way to generate/load these dynamically.
 // Returns a list of possible tilings for a GEMM performed in Triton.
-std::vector<TritonGemmConfig> GetPossibleMatmulAutotuneConfigs(
+absl::StatusOr<std::vector<TritonGemmConfig>> GetPossibleMatmulAutotuneConfigs(
     const HloDotInstruction& dot, se::CudaComputeCapability compute_capability,
     const DebugOptions& debug_options, bool exhaustive_tiling_search = false);
 
diff --git a/third_party/xla/xla/service/gpu/gemm_fusion_autotuner_test.cc b/third_party/xla/xla/service/gpu/gemm_fusion_autotuner_test.cc
index 4d886a8f68988d..f8def9455b2c1f 100644
--- a/third_party/xla/xla/service/gpu/gemm_fusion_autotuner_test.cc
+++ b/third_party/xla/xla/service/gpu/gemm_fusion_autotuner_test.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "xla/service/gpu/autotuner_util.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/gemm_fusion.h"
+#include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/matmul_utils.h"
 #include "xla/service/hlo_module_config.h"
 #include "xla/service/hlo_pass_pipeline.h"
@@ -229,27 +230,6 @@ class GemmFusionAutotunerTestWithMorePreciseReduction
   }
 };
 
-TEST_F(GemmFusionAutotunerTest, VoltaUsesNoMoreThanTwoStages) {
-  std::unique_ptr<VerifiedHloModule> module = ParseAndReturnVerifiedModule(R"(
-ENTRY e {
-  p0 = f32[1024,1024] parameter(0)
-  p1 = f32[1024,1024] parameter(1)
-  ROOT r = f32[1024,1024] dot(p0, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-})")
-                                                  .value();
-  const se::CudaComputeCapability compute_capability{
-      se::CudaComputeCapability::VOLTA, /*minor=*/0};
-  const std::vector<TritonGemmConfig> configs =
-      GetPossibleMatmulAutotuneConfigs(
-          *Cast<HloDotInstruction>(
-              module->entry_computation()->root_instruction()),
-          compute_capability, GetDebugOptionsForTest());
-  EXPECT_FALSE(std::any_of(
-      configs.begin(), configs.end(),
-      [](const TritonGemmConfig& config) { return config.num_stages > 2; }));
-}
-
 TEST_F(GemmFusionAutotunerTest, AmpereUsesMoreThanTwoStages) {
   std::unique_ptr<VerifiedHloModule> module = ParseAndReturnVerifiedModule(R"(
 ENTRY e {
@@ -261,11 +241,12 @@ ENTRY e {
                                                   .value();
   const se::CudaComputeCapability compute_capability{
       se::CudaComputeCapability::AMPERE, /*minor=*/0};
-  const std::vector<TritonGemmConfig> configs =
+  TF_ASSERT_OK_AND_ASSIGN(
+      const std::vector<TritonGemmConfig> configs,
       GetPossibleMatmulAutotuneConfigs(
           *Cast<HloDotInstruction>(
               module->entry_computation()->root_instruction()),
-          compute_capability, GetDebugOptionsForTest());
+          compute_capability, GetDebugOptionsForTest()));
   EXPECT_TRUE(std::any_of(
       configs.begin(), configs.end(),
       [](const TritonGemmConfig& config) { return config.num_stages > 2; }));
@@ -282,11 +263,12 @@ ENTRY e {
                                                   .value();
   const se::CudaComputeCapability compute_capability{
       se::CudaComputeCapability::AMPERE, /*minor=*/0};
-  const std::vector<TritonGemmConfig> configs =
+  TF_ASSERT_OK_AND_ASSIGN(
+      const std::vector<TritonGemmConfig> configs,
       GetPossibleMatmulAutotuneConfigs(
           *Cast<HloDotInstruction>(
               module->entry_computation()->root_instruction()),
-          compute_capability, GetDebugOptionsForTest());
+          compute_capability, GetDebugOptionsForTest()));
   EXPECT_TRUE(std::any_of(
       configs.begin(), configs.end(),
       [](const TritonGemmConfig& config) { return config.split_k >= 16; }));
@@ -303,11 +285,12 @@ ENTRY e {
                                                   .value();
   const se::CudaComputeCapability compute_capability{
       se::CudaComputeCapability::AMPERE, /*minor=*/0};
-  const std::vector<TritonGemmConfig> configs =
+  TF_ASSERT_OK_AND_ASSIGN(
+      const std::vector<TritonGemmConfig> configs,
       GetPossibleMatmulAutotuneConfigs(
           *Cast<HloDotInstruction>(
               module->entry_computation()->root_instruction()),
-          compute_capability, GetDebugOptionsForTest());
+          compute_capability, GetDebugOptionsForTest()));
   EXPECT_FALSE(std::any_of(
       configs.begin(), configs.end(),
       [](const TritonGemmConfig& config) { return config.split_k > 1; }));
@@ -361,10 +344,6 @@ ENTRY e {
 }
 
 TEST_F(GemmFusionAutotunerTest, SelectsSplitK) {
-  if (!GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   // Shapes with K >> M, N have to force split-K configurations.
   const std::string kHloText = R"(
 HloModule t
@@ -390,10 +369,6 @@ ENTRY e {
 }
 
 TEST_F(GemmFusionAutotunerTestWithMorePreciseReduction, SelectsSplitK) {
-  if (!GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   // Shapes with K >> M, N have to force split-K configurations.
   constexpr absl::string_view kHloText = R"(
 HloModule t
@@ -463,10 +438,6 @@ ENTRY %e {
     backend_config={"fusion_backend_config":{"kind":"__triton_gemm","triton_gemm_config":{"block_m":"256","block_n":"256","block_k":"16","split_k":"1","num_stages":"1","num_warps":"16","num_ctas":"1"}}}
 })";
 
-  if (!GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "Not enough shared memory to run big tiles before Ampere.";
-  }
   auto module = ParseAndReturnVerifiedModule(kHloText).value();
   EXPECT_THAT(
       backend().compiler()->RunBackend(std::move(module),
@@ -502,10 +473,6 @@ ENTRY %e {
     backend_config={"fusion_backend_config":{"kind":"__triton_gemm","triton_gemm_config":{"block_m":"256","block_n":"256","block_k":"16","split_k":"1","num_stages":"1","num_warps":"16","num_ctas":"1"}}}
 })";
 
-  if (!GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "Not enough shared memory to run big tiles before Ampere.";
-  }
   auto module = ParseAndReturnVerifiedModule(kHloText).value();
   HloModuleConfig config = module->config();
   DebugOptions debug_options = config.debug_options();
@@ -589,10 +556,6 @@ ENTRY e {
 }
 
 TEST_F(GemmFusionAutotunerTest, AutotuneCuDnnFusion) {
-  if (!GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "cuDNN fusion autotuning is not tested before Ampere.";
-  }
   const std::string kHlo = R"(
 fusion1 {
   p0 = f32[3,28,32] parameter(0)
@@ -694,7 +657,7 @@ ENTRY e {
         RunFileCheck(
             module->ToString(HloPrintOptions{}.set_print_operand_shape(false)),
             R"(
-// CHECK: backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"fusion_backend_config":{"kind":"__triton_gemm","triton_gemm_config":{"block_m":"32","block_n":"32","block_k":"32","split_k":"1","num_stages":"1","num_warps":"4","num_ctas":"1"}}}
+// CHECK: backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"fusion_backend_config":{"kind":"__triton_gemm","triton_gemm_config":{"block_m":"16","block_n":"16","block_k":"16","split_k":"1","num_stages":"1","num_warps":"4","num_ctas":"1"}},"force_earliest_schedule":false}
             )"));
     EXPECT_TRUE(filecheck_matches);
   } else {
@@ -760,16 +723,50 @@ ENTRY e {
                                                   .value();
   const se::CudaComputeCapability compute_capability{
       se::CudaComputeCapability::AMPERE, /*minor=*/0};
-  const std::vector<TritonGemmConfig> configs =
+  TF_ASSERT_OK_AND_ASSIGN(
+      const std::vector<TritonGemmConfig> configs,
       GetPossibleMatmulAutotuneConfigs(
           *Cast<HloDotInstruction>(
               module->entry_computation()->root_instruction()),
-          compute_capability, GetDebugOptionsForTest());
+          compute_capability, GetDebugOptionsForTest()));
   EXPECT_TRUE(std::all_of(
       configs.begin(), configs.end(),
       [](const TritonGemmConfig& config) { return config.split_k == 1; }));
 }
 
+class GemmFusionAutotunerConfigTest
+    : public StatelessAutotunerTest,
+      public ::testing::WithParamInterface<bool> {};
+
+TEST_P(GemmFusionAutotunerConfigTest, SparseDotDiscardsUnsupportedTiles) {
+  const std::string kHloText = R"(
+HloModule test
+ENTRY wais {
+  lhs = f16[5,1600] parameter(0)
+  rhs = f16[3200,10] parameter(1)
+  meta = u16[5,200] parameter(2)
+  ROOT dot = f32[5,10] dot(lhs, rhs, meta),
+      lhs_contracting_dims={1}, rhs_contracting_dims={0}, sparsity=L.1@2:4
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(kHloText));
+  auto dot =
+      Cast<HloDotInstruction>(module->entry_computation()->root_instruction());
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto configs,
+      GetPossibleMatmulAutotuneConfigs(
+          *dot, se::CudaComputeCapability{8, 0}, GetDebugOptionsForTest(),
+          /*exhaustive_tiling_search=*/GetParam()));
+  for (const auto& config : configs) {
+    int metadata_size = config.block_m * config.block_k / 16;
+    EXPECT_LE(config.num_warps * WarpSize(), metadata_size);
+    EXPECT_GT(config.block_k, 16);  // kMinTileSize
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(GemmFusionAutotunerConfigSweep,
+                         GemmFusionAutotunerConfigTest, ::testing::Bool());
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/gemm_fusion_test.cc b/third_party/xla/xla/service/gpu/gemm_fusion_test.cc
index 43c1c155fd4189..e5986c9968b5ea 100644
--- a/third_party/xla/xla/service/gpu/gemm_fusion_test.cc
+++ b/third_party/xla/xla/service/gpu/gemm_fusion_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/status/status.h"
 #include "absl/strings/string_view.h"
 #include "xla/autotuning.pb.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
@@ -36,6 +37,7 @@ limitations under the License.
 #include "xla/tests/verified_hlo_module.h"
 #include "xla/xla.pb.h"
 #include "xla/xla_data.pb.h"
+#include "tsl/platform/status_matchers.h"
 #include "tsl/platform/statusor.h"
 
 namespace xla {
@@ -163,30 +165,23 @@ ENTRY e {
   EXPECT_FALSE(GemmFusion(gpu_version_).Run(module.get()).value());
 }
 
-TEST_F(GemmFusionTest, DoNotTriggerWhenTheLhsNoncontractingDimIs1) {
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
-                          ParseAndReturnVerifiedModule(R"(
-ENTRY e {
-  p0 = s8[1,256] parameter(0)
-  p0c = f16[1,256] convert(p0)
-  p1 = f16[256,512] parameter(1)
-  ROOT r = f16[1,512] dot(p0c, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-})"));
-  EXPECT_FALSE(GemmFusion(gpu_version_).Run(module.get()).value());
-}
+TEST_F(GemmFusionTest, FuseDotWithTrivialNoncontractingDim) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+HloModule m
 
-TEST_F(GemmFusionTest, DoNotTriggerWhenTheRhsNoncontractingDimIs1) {
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
-                          ParseAndReturnVerifiedModule(R"(
 ENTRY e {
-  p0 = s8[128,256] parameter(0)
-  p0c = f16[128,256] convert(p0)
-  p1 = f16[256,1] parameter(1)
-  ROOT r = f16[128,1] dot(p0c, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-})"));
-  EXPECT_FALSE(GemmFusion(gpu_version_).Run(module.get()).value());
+  p0 = s8[60,5] parameter(0)
+  r0 = s8[3,20,5] reshape(p0)
+  c0 = f16[3,20,5] convert(r0)
+  p1 = f16[3,1,20] parameter(1)
+  ROOT d = f16[3,5,1] dot(c0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={2},
+    lhs_batch_dims={0}, rhs_batch_dims={0}
+})")
+                    .value();
+  EXPECT_TRUE(GemmFusion(gpu_version_).Run(module.get()).value());
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              GmockMatch(m::Fusion(m::Parameter(), m::Parameter())));
 }
 
 TEST_F(GemmFusionTest, HandleDotIfCublasRequiresPadding) {
@@ -201,7 +196,7 @@ ENTRY e {
     lhs_contracting_dims={0}, rhs_contracting_dims={0}
 })"));
 
-  const se::CudaComputeCapability cc{se::CudaComputeCapability::VOLTA, 0};
+  const se::CudaComputeCapability cc{se::CudaComputeCapability::AMPERE, 0};
   EXPECT_TRUE(CublasRequiresPadding(
       *xla::Cast<HloDotInstruction>(
           module->entry_computation()->root_instruction()),
@@ -222,7 +217,7 @@ ENTRY e {
   ROOT t = tuple(d, s1)
 })"));
 
-  const se::CudaComputeCapability cc{se::CudaComputeCapability::VOLTA, 0};
+  const se::CudaComputeCapability cc{se::CudaComputeCapability::AMPERE, 0};
   EXPECT_TRUE(GemmFusion(cc).Run(module.get()).value());
 }
 
@@ -766,7 +761,7 @@ e {
                                     m::Parameter(), m::Parameter()))));
 }
 
-TEST_F(GemmFusionLevel2Test, FusionLevelIsLimitedOnVolta) {
+TEST_F(GemmFusionLevel2Test, GemmFusionBailsOutPreAmpere) {
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
                           ParseAndReturnVerifiedModule(R"(
 ENTRY e {
@@ -777,12 +772,13 @@ ENTRY e {
   ROOT dot = f32[2,2] dot(p0e, p1c),
     lhs_contracting_dims={1}, rhs_contracting_dims={0}
 })"));
-  EXPECT_TRUE(
+  EXPECT_THAT(
       GemmFusion(se::CudaComputeCapability{se::CudaComputeCapability::VOLTA, 0})
-          .Run(module.get())
-          .value());
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              GmockMatch((m::Fusion(m::Exp(), m::Parameter()))));
+          .Run(module.get()),
+      tsl::testing::StatusIs(
+          absl::StatusCode::kFailedPrecondition,
+          ::testing::StrEq(
+              "Triton support is only enabled for Ampere GPUs and up.")));
 }
 
 TEST_F(GemmFusionLevel2Test, ParameterUsedElementwiseTwiceIsFused) {
@@ -1148,6 +1144,63 @@ ENTRY e {
 })");
 }
 
+class SparseDotTest : public GemmFusionTest {};
+
+TEST_F(SparseDotTest, DotWithSparseLhsOperandIsRewritten) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+HloModule test
+ENTRY main {
+  lhs = f16[2,16] parameter(0)
+  rhs = f16[32,2] parameter(1)
+  meta = u16[2,2] parameter(2)
+  ROOT dot = f32[2,2] dot(lhs, rhs, meta),
+      lhs_contracting_dims={1}, rhs_contracting_dims={0}, sparsity=L.1@2:4
+})")
+                    .value();
+  EXPECT_TRUE(GemmFusion(gpu_version_).Run(module.get()).value());
+
+  MatchHloModule(*module, R"(
+; CHECK-LABEL: ENTRY %main ({{.*}}: f16[2,16], {{.*}}: f16[32,2], {{.*}}: u16[2,2]) -> f32[2,2] {
+; CHECK-NEXT: [[P0:%[^ ]+]] = f16[2,16]{1,0} parameter(0)
+; CHECK-NEXT: [[P1:%[^ ]+]] = f16[32,2]{1,0} parameter(1)
+; CHECK-NEXT: [[META:%[^ ]+]] = u16[2,2]{1,0} parameter(2)
+; CHECK:      ROOT {{.*}} = f32[2,2]{1,0}
+; CHECK-SAME:   fusion(f16[2,16]{1,0} [[P0]], f16[32,2]{1,0} [[P1]], u16[2,2]{1,0} [[META]]),
+; CHECK-SAME:   kind=kCustom
+; CHECK-SAME:   __triton_gemm
+})");
+}
+
+TEST_F(SparseDotTest, DotWithSparseRhsOperandIsNotSupported) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+HloModule test
+ENTRY main {
+  lhs = f16[2,32] parameter(0)
+  rhs = f16[16,2] parameter(1)
+  meta = u16[2,2] parameter(2)
+  ROOT dot = f32[2,2] dot(lhs, rhs, meta),
+      lhs_contracting_dims={1}, rhs_contracting_dims={0}, sparsity=R.0@2:4
+})")
+                    .value();
+  auto result = GemmFusion(gpu_version_).Run(module.get());
+  EXPECT_FALSE(result.ok());
+}
+
+TEST_F(SparseDotTest, UnsupportedSparsityType) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+HloModule test
+ENTRY main {
+  lhs = f16[2,8] parameter(0)
+  rhs = f16[32,2] parameter(1)
+  meta = u16[2,1] parameter(2)
+  ROOT dot = f32[2,2] dot(lhs, rhs, meta),
+      lhs_contracting_dims={1}, rhs_contracting_dims={0}, sparsity=L.1@1:4
+})")
+                    .value();
+  auto result = GemmFusion(gpu_version_).Run(module.get());
+  EXPECT_FALSE(result.ok());
+}
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter.cc b/third_party/xla/xla/service/gpu/gemm_rewriter.cc
index 82860ef6324fa6..0aa610fc92f335 100644
--- a/third_party/xla/xla/service/gpu/gemm_rewriter.cc
+++ b/third_party/xla/xla/service/gpu/gemm_rewriter.cc
@@ -821,8 +821,13 @@ class GemmRewriterVisitor : public DfsHloRewriteVisitor {
                                         .WithOneUser()))
                       .WithOneUser(),
                   m::Op(&bias).WithPredicate(is_not_broadcast)))) {
-      return FuseMatrixBiasAdd(instr, bias, existing_gemm,
-                               optional_bitcast_matrix, optional_slice_matrix);
+      // The matrix bias must not be FP8, see
+      // https://docs.nvidia.com/cuda/cublas/index.html.
+      if (!IsF8Type(bias)) {
+        return FuseMatrixBiasAdd(instr, bias, existing_gemm,
+                                 optional_bitcast_matrix,
+                                 optional_slice_matrix);
+      }
     }
 
     return absl::OkStatus();
@@ -922,8 +927,10 @@ class GemmRewriterVisitor : public DfsHloRewriteVisitor {
 
 #endif  // TENSORFLOW_USE_ROCM
 
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     PrimitiveType a_type = a->shape().element_type();
     PrimitiveType b_type = b->shape().element_type();
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
     // cuBLASLt FP8 GEMM kernels require one of the two operands to be in
     // F8E4M3FN format.
@@ -1612,6 +1619,16 @@ class GemmRewriterVisitor : public DfsHloRewriteVisitor {
       return absl::OkStatus();
     }
 
+#if CUDA_VERSION < 12040
+    // For CUDA versions less than 12.3.2, cuBLAS LT returns
+    // CUBLAS_STATUS_NOT_SUPPORTED in some cases when fusing gelu into an FP8
+    // matmul. We cannot check the patch version, so disable this fusion with
+    // CUDA versions less than 12.4.
+    if (IsCublasLtMatmulF8(*gemm)) {
+      return absl::OkStatus();
+    }
+#endif
+
     // There are four users of the gemm output within the GELU calculation.
     bool has_aux = gemm->user_count() > 4;
 
diff --git a/third_party/xla/xla/service/gpu/gemm_rewriter_test.cc b/third_party/xla/xla/service/gpu/gemm_rewriter_test.cc
deleted file mode 100644
index a72c8255d348bc..00000000000000
--- a/third_party/xla/xla/service/gpu/gemm_rewriter_test.cc
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright 2024 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/service/gpu/gemm_rewriter.h"
-
-#include <gtest/gtest.h>
-#include "absl/strings/string_view.h"
-#include "xla/autotuning.pb.h"
-#include "xla/stream_executor/device_description.h"
-#include "xla/tests/hlo_test_base.h"
-#include "xla/xla.pb.h"
-#include "xla/xla_data.pb.h"
-#include "tsl/platform/statusor.h"
-
-namespace xla::gpu {
-namespace {
-
-class GemmRewriterTest : public HloTestBase {
- public:
-  GemmRewriterTest()
-      : HloTestBase(/*verifier_layout_sensitive=*/true,
-                    /*allow_mixed_precision_in_hlo_verifier=*/false) {}
-
-  DebugOptions GetDebugOptionsForTest() override {
-    DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest();
-    debug_options.set_xla_gpu_enable_cublaslt(false);
-    return debug_options;
-  }
-
-  se::GpuComputeCapability gpu_version_{
-      se::CudaComputeCapability{se::CudaComputeCapability::AMPERE, 0}};
-};
-
-TEST_F(GemmRewriterTest, MatrixVectorMultiplication) {
-  const char* hlo = R"(
-HloModule m
-
-ENTRY e {
-  p0 = f32[2048] parameter(0)
-  p1 = f32[2048, 16384] parameter(1)
-  ROOT d = f32[16384] dot(p0, p1),
-    lhs_contracting_dims={0}, rhs_contracting_dims={0}
-})";
-
-  const char* expected = R"(
-// CHECK:  %[[P0:.+]] = f32[2048]{0} parameter(0)
-// CHECK:  %[[P1:.+]] = f32[2048,16384]{1,0} parameter(1)
-// CHECK:  %[[CUSTOM_CALL:.+]] = (f32[16384]{0}, s8[4194304]{0}) custom-call(%[[P0]], %[[P1]]), custom_call_target="__cublas$gemm"
-)";
-
-  RunAndFilecheckHloRewrite(hlo, GemmRewriter(gpu_version_), expected);
-}
-
-TEST_F(GemmRewriterTest, MatrixVectorMultiplicationWithBatch) {
-  const char* hlo = R"(
-HloModule m
-
-ENTRY e {
-  p0 = f32[10, 10, 2048] parameter(0)
-  p1 = f32[10, 10, 2048, 16384] parameter(1)
-  ROOT d = f32[10, 10, 16384] dot(p0, p1),
-   lhs_batch_dims={0, 1}, rhs_batch_dims={0, 1},
-   lhs_contracting_dims={2}, rhs_contracting_dims={2}
-})";
-
-  const char* expected = R"(
-// CHECK:  %[[P0:.+]] = f32[10,10,2048]{2,1,0} parameter(0)
-// CHECK:  %[[P1:.+]] = f32[10,10,2048,16384]{3,2,1,0} parameter(1)
-// CHECK:  %[[CUSTOM_CALL:.+]] = (f32[10,10,16384]{2,1,0}, s8[4194304]{0}) custom-call(%[[P0]], %[[P1]]), custom_call_target="__cublas$gemm"
-)";
-
-  RunAndFilecheckHloRewrite(hlo, GemmRewriter(gpu_version_), expected);
-}
-
-TEST_F(GemmRewriterTest, SparseDotNotSupported) {
-  const char* hlo = R"(
-HloModule test
-
-ENTRY main {
-  lhs = f16[5,16] parameter(0)
-  rhs = f16[32,10] parameter(1)
-  meta = u16[5,2] parameter(2)
-  ROOT dot = f32[5,10] dot(lhs, rhs, meta),
-      lhs_contracting_dims={1}, rhs_contracting_dims={0}, sparsity=L.1@2:4
-})";
-  auto hlo_pass = GemmRewriter(gpu_version_);
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo));
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloPass(&hlo_pass, module.get()));
-  EXPECT_FALSE(changed);
-}
-
-}  // namespace
-}  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/gemv_rewriter.cc b/third_party/xla/xla/service/gpu/gemv_rewriter.cc
new file mode 100644
index 00000000000000..21e5f477e4b059
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/gemv_rewriter.cc
@@ -0,0 +1,183 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/gemv_rewriter.h"
+
+#include <cstdint>
+#include <vector>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
+#include "xla/hlo/ir/dfs_hlo_visitor_with_default.h"
+#include "xla/hlo/ir/hlo_casting_utils.h"
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/layout.h"
+#include "xla/layout_util.h"
+#include "xla/shape.h"
+#include "xla/util.h"
+#include "xla/xla_data.pb.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/statusor.h"
+
+namespace xla {
+namespace gpu {
+
+namespace {
+
+// Construct a new layout by adding a new minor-most dimension to the input
+// layout. For example, {3, 2, 1, 0} is extended to {4, 3, 2, 1, 0}.
+// We expect that the input layout is normalized by LayoutNormalizer, so that
+// the input layout has a descending ordering.
+absl::StatusOr<Layout> GetLayoutWithNewMinorMostDimension(
+    const Layout& layout) {
+  // Check that the layout is normalized.
+  if (!LayoutUtil::IsMonotonicWithDim0Major(layout)) {
+    return absl::InvalidArgumentError("Layout is not normalized.");
+  }
+  return LayoutUtil::MakeDescendingLayout(layout.minor_to_major_size() + 1);
+}
+
+class GemvRewriterVisitor : public DfsHloRewriteVisitor {
+ public:
+  absl::Status HandleDot(HloInstruction* instr) override {
+    HloDotInstruction* dot = Cast<HloDotInstruction>(instr);
+    const DotDimensionNumbers& dim_numbers = dot->dot_dimension_numbers();
+    HloInstruction* lhs = dot->mutable_operand(0);
+    HloInstruction* rhs = dot->mutable_operand(1);
+
+    // This pass relies on dot decomposer which ensures that all non-batch
+    // dimensions are merged into one.
+    bool lhs_has_non_contracting_dim =
+        lhs->shape().rank() ==
+        dim_numbers.lhs_batch_dimensions_size() +
+            dim_numbers.lhs_contracting_dimensions_size() + 1;
+    bool rhs_has_non_contracting_dim =
+        rhs->shape().rank() ==
+        dim_numbers.rhs_batch_dimensions_size() +
+            dim_numbers.rhs_contracting_dimensions_size() + 1;
+
+    // Skip matrix-matrix multiplication.
+    if (lhs_has_non_contracting_dim && rhs_has_non_contracting_dim) {
+      return absl::OkStatus();
+    }
+
+    // Skip vector-vector multiplication.
+    if (!lhs_has_non_contracting_dim && !rhs_has_non_contracting_dim) {
+      return absl::OkStatus();
+    }
+
+    if (dot->shape().is_dynamic()) {
+      return absl::OkStatus();
+    }
+
+    changed_ = true;
+
+    HloComputation* computation = dot->parent();
+    HloInstruction* new_lhs = lhs;
+    if (!lhs_has_non_contracting_dim) {
+      const Shape& lhs_shape = lhs->shape();
+      absl::Span<const int64_t> lhs_dimensions = lhs_shape.dimensions();
+      std::vector<int64_t> new_lhs_dimensions(lhs_dimensions.begin(),
+                                              lhs_dimensions.end());
+      new_lhs_dimensions.push_back(1);
+      Shape new_lhs_shape(
+          lhs_shape.element_type(), new_lhs_dimensions,
+          absl::InlinedVector<bool, 4>(new_lhs_dimensions.size(), false),
+          /*tuple_shapes=*/{});
+      TF_ASSIGN_OR_RETURN(
+          *new_lhs_shape.mutable_layout(),
+          GetLayoutWithNewMinorMostDimension(lhs_shape.layout()));
+      new_lhs = computation->AddInstruction(
+          HloInstruction::CreateBitcast(new_lhs_shape, lhs));
+    }
+
+    HloInstruction* new_rhs = rhs;
+    if (!rhs_has_non_contracting_dim) {
+      const Shape& rhs_shape = rhs->shape();
+      absl::Span<const int64_t> rhs_dimensions = rhs_shape.dimensions();
+      std::vector<int64_t> new_rhs_dimensions(rhs_dimensions.begin(),
+                                              rhs_dimensions.end());
+      new_rhs_dimensions.push_back(1);
+      Shape new_rhs_shape(
+          rhs_shape.element_type(), new_rhs_dimensions,
+          absl::InlinedVector<bool, 4>(new_rhs_dimensions.size(), false),
+          /*tuple_shapes=*/{});
+      TF_ASSIGN_OR_RETURN(
+          *new_rhs_shape.mutable_layout(),
+          GetLayoutWithNewMinorMostDimension(rhs_shape.layout()));
+      new_rhs = computation->AddInstruction(
+          HloInstruction::CreateBitcast(new_rhs_shape, rhs));
+    }
+
+    std::vector<int64_t> new_out_dimensions;
+    new_out_dimensions.reserve(dot->shape().dimensions().size() + 1);
+    for (int64_t dim_size : dot->shape().dimensions()) {
+      new_out_dimensions.push_back(dim_size);
+    }
+    if (!lhs_has_non_contracting_dim) {
+      // Insert the trivial dimension before the non-contracting dimension from
+      // rhs.
+      int non_contracting_dim_size = new_out_dimensions.back();
+      new_out_dimensions[new_out_dimensions.size() - 1] = 1;
+      new_out_dimensions.push_back(non_contracting_dim_size);
+    } else {
+      new_out_dimensions.push_back(1);
+    }
+
+    Shape new_out_shape(
+        dot->shape().element_type(), new_out_dimensions,
+        absl::InlinedVector<bool, 4>(new_out_dimensions.size(), false),
+        /*tuple_shapes=*/{});
+    TF_ASSIGN_OR_RETURN(
+        *new_out_shape.mutable_layout(),
+        GetLayoutWithNewMinorMostDimension(dot->shape().layout()));
+
+    HloInstruction* new_dot =
+        computation->AddInstruction(HloInstruction::CreateDot(
+            new_out_shape, new_lhs, new_rhs, dot->dot_dimension_numbers(),
+            dot->precision_config()));
+    HloInstruction* bitcast = computation->AddInstruction(
+        HloInstruction::CreateBitcast(dot->shape(), new_dot));
+    return computation->ReplaceInstruction(dot, bitcast);
+  }
+
+  bool changed() const { return changed_; }
+
+ private:
+  bool changed_ = false;
+};
+
+}  // namespace
+
+absl::StatusOr<bool> GemvRewriter::Run(
+    HloModule* module,
+    const absl::flat_hash_set<absl::string_view>& execution_threads) {
+  GemvRewriterVisitor gemv_rewriter;
+  for (HloComputation* computation :
+       module->MakeNonfusionComputations(execution_threads)) {
+    TF_RETURN_IF_ERROR(computation->Accept(&gemv_rewriter));
+  }
+  return gemv_rewriter.changed();
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/fusion_merger_triton.h b/third_party/xla/xla/service/gpu/gemv_rewriter.h
similarity index 52%
rename from third_party/xla/xla/service/gpu/fusion_merger_triton.h
rename to third_party/xla/xla/service/gpu/gemv_rewriter.h
index 72ab4d8e70dfa5..a041138b8af5c6 100644
--- a/third_party/xla/xla/service/gpu/fusion_merger_triton.h
+++ b/third_party/xla/xla/service/gpu/gemv_rewriter.h
@@ -1,4 +1,4 @@
-/* Copyright 2023 The OpenXLA Authors.
+/* Copyright 2024 The OpenXLA Authors.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,36 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
-#ifndef XLA_SERVICE_GPU_FUSION_MERGER_TRITON_H_
-#define XLA_SERVICE_GPU_FUSION_MERGER_TRITON_H_
+#ifndef XLA_SERVICE_GPU_GEMV_REWRITER_H_
+#define XLA_SERVICE_GPU_GEMV_REWRITER_H_
 
 #include "absl/container/flat_hash_set.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
-#include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/hlo_pass_interface.h"
 
 namespace xla {
 namespace gpu {
 
-// An HLO pass that attempts to merge producer fusions into triton softmax
-// fusions.
-//
-// Producer kernels are only merged if the resulting fusion can be correctly
-// tiled. If the result can be tiled, all operations from the auxiliary
-// producer fusion will be merged into the triton softmax computation, and this
-// computation will replace both the auxiliary and original triton softmax
-// fusion.
-//
-// Auxiliary fusions are not merged into consumer triton fusions if:
-// * The auxiliary fusion has multiple users
-// * The resulting merged fusion is not tilable
-class FusionMergerTriton : public HloModulePass {
+// Rewrite a matrix-vector or a vector-matrix multiplication into a
+// matrix-matrix multiplication with a trivial dimension. For example,
+// [m x n] @ [n] is rewritten to [m x n] @ [n x 1], and [n] @ [m x n] is
+// rewritten to [n x 1] @ [m x n].
+class GemvRewriter : public HloModulePass {
  public:
-  explicit FusionMergerTriton() = default;
-  absl::string_view name() const override { return "fusion-merger-triton"; }
+  absl::string_view name() const override { return "gemv-rewriter"; }
 
   using HloPassInterface::Run;
   absl::StatusOr<bool> Run(
@@ -52,4 +41,4 @@ class FusionMergerTriton : public HloModulePass {
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // XLA_SERVICE_GPU_FUSION_MERGER_TRITON_H_
+#endif  // XLA_SERVICE_GPU_GEMV_REWRITER_H_
diff --git a/third_party/xla/xla/service/gpu/gemv_rewriter_test.cc b/third_party/xla/xla/service/gpu/gemv_rewriter_test.cc
new file mode 100644
index 00000000000000..2a8b8103e0a94e
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/gemv_rewriter_test.cc
@@ -0,0 +1,149 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/gemv_rewriter.h"
+
+#include <memory>
+#include <optional>
+
+#include <gtest/gtest.h>
+#include "absl/status/statusor.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/tests/hlo_test_base.h"
+#include "tsl/platform/statusor.h"
+
+namespace xla::gpu {
+namespace {
+
+class GemvRewriterTest : public HloTestBase {};
+
+TEST_F(GemvRewriterTest, RewriteMatrixVectorMultiplicationToGemm) {
+  const char* hlo = R"(
+  HloModule m
+
+  ENTRY e {
+    p0 = f32[32,7] parameter(0)
+    p1 = f32[7] parameter(1)
+    ROOT d = f32[32] dot(p0, p1),
+      lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  })";
+
+  const char* expected = R"()
+// CHECK:  %[[P0:.*]] = f32[32,7]{1,0} parameter(0)
+// CHECK:  %[[P1:.*]] = f32[7]{0} parameter(1)
+// CHECK:  %[[BITCAST:.*]] = f32[7,1]{1,0} bitcast(%[[P1]])
+// CHECK:  %[[DOT:.*]] = f32[32,1]{1,0} dot(%[[P0]], %[[BITCAST]]), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+// CHECK:  ROOT %[[ROOT:.*]] = f32[32]{0} bitcast(%[[DOT]])
+})";
+
+  RunAndFilecheckHloRewrite(hlo, GemvRewriter(), expected);
+}
+
+TEST_F(GemvRewriterTest, RewriteVectorMatrixMultiplicationToGemm) {
+  const char* hlo = R"(
+  HloModule m
+
+  ENTRY e {
+    p0 = f32[7] parameter(0)
+    p1 = f32[7,32] parameter(1)
+    ROOT d = f32[32] dot(p0, p1),
+      lhs_contracting_dims={0}, rhs_contracting_dims={0}
+  })";
+
+  const char* expected = R"()
+// CHECK:  %[[P0:.*]] = f32[7]{0} parameter(0)
+// CHECK:  %[[BITCAST:.*]] = f32[7,1]{1,0} bitcast(%[[P0]])
+// CHECK:  %[[P1:.*]] = f32[7,32]{1,0} parameter(1)
+// CHECK:  %[[DOT:.*]] = f32[1,32]{1,0} dot(%[[BITCAST]], %[[P1]]), lhs_contracting_dims={0}, rhs_contracting_dims={0}
+// CHECK:  ROOT %[[ROOT:.*]].1 = f32[32]{0} bitcast(%[[DOT]])
+})";
+
+  RunAndFilecheckHloRewrite(hlo, GemvRewriter(), expected);
+}
+
+TEST_F(GemvRewriterTest, RewriteMatrixVectorMultiplicationWithBatch) {
+  const char* hlo = R"(
+  HloModule m
+
+  ENTRY e {
+    p0 = f32[2,5,32,7] parameter(0)
+    p1 = f32[2,5,7] parameter(1)
+    ROOT d = f32[2,5,32] dot(p0, p1),
+      lhs_batch_dims={0,1}, rhs_batch_dims={0,1},
+      lhs_contracting_dims={3}, rhs_contracting_dims={2}
+  })";
+
+  const char* expected = R"()
+// CHECK:  %[[P0:.*]] = f32[2,5,32,7]{3,2,1,0} parameter(0)
+// CHECK:  %[[P1:.*]] = f32[2,5,7]{2,1,0} parameter(1)
+// CHECK:  %[[BITCAST:.*]] = f32[2,5,7,1]{3,2,1,0} bitcast(%[[P1]])
+// CHECK:  %[[DOT:.*]] = f32[2,5,32,1]{3,2,1,0} dot(%[[P0]], %[[BITCAST]]),
+// CHECK-SAME: lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
+// CHECK:  ROOT %[[ROOT:.*]] = f32[2,5,32]{2,1,0} bitcast(%[[DOT]])
+})";
+
+  RunAndFilecheckHloRewrite(hlo, GemvRewriter(), expected);
+}
+
+TEST_F(GemvRewriterTest, DotNotRewriteVectorVectorMultiplication) {
+  const char* hlo = R"(
+  HloModule m
+
+  ENTRY e {
+    p0 = f32[7] parameter(0)
+    p1 = f32[7] parameter(1)
+    ROOT d = f32[] dot(p0, p1),
+      lhs_contracting_dims={0}, rhs_contracting_dims={0}
+  })";
+
+  RunAndFilecheckHloRewrite(hlo, GemvRewriter(), /*expected=*/std::nullopt);
+}
+
+TEST_F(GemvRewriterTest, DotNotRewriteMatrixMatrixMultiplication) {
+  const char* hlo = R"(
+  HloModule m
+
+  ENTRY e {
+    p0 = f32[5,7] parameter(0)
+    p1 = f32[7,32] parameter(1)
+    ROOT d = f32[5,32] dot(p0, p1),
+      lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  })";
+
+  RunAndFilecheckHloRewrite(hlo, GemvRewriter(), /*expected=*/std::nullopt);
+}
+
+TEST_F(GemvRewriterTest, DoNotRewriteDotsWithNonNormalizedLayout) {
+  const char* hlo = R"(
+  HloModule m
+
+  ENTRY e {
+    p0 = f32[5,32,7]{2,1,0} parameter(0)
+    p1 = f32[5,7]{0,1} parameter(1)
+    ROOT d = f32[5,32]{0,1} dot(p0, p1),
+      lhs_batch_dims={0}, rhs_batch_dims={0},
+      lhs_contracting_dims={2}, rhs_contracting_dims={1}
+  })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(hlo));
+  GemvRewriter rewriter;
+  absl::StatusOr<bool> result = this->RunHloPass(&rewriter, module.get());
+  EXPECT_FALSE(result.ok());
+  EXPECT_EQ(result.status().message(), "Layout is not normalized.");
+}
+
+}  // namespace
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index d2acde8de1244d..63d1082da8ef9e 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -127,7 +127,6 @@ limitations under the License.
 #include "xla/service/gpu/custom_kernel_fusion_rewriter.h"
 #include "xla/service/gpu/dot_dimension_sorter.h"
 #include "xla/service/gpu/dot_operand_converter.h"
-#include "xla/service/gpu/fusion_merger_triton.h"
 #include "xla/service/gpu/fusion_pipeline.h"
 #include "xla/service/gpu/fusion_wrapper.h"
 #include "xla/service/gpu/gemm_broadcast_folding_rewriter.h"
@@ -165,12 +164,12 @@ limitations under the License.
 #include "xla/service/gpu/reduction_splitter.h"
 #include "xla/service/gpu/reduction_utils.h"
 #include "xla/service/gpu/rename_fusions.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/gpu/runtime_intrinsics.h"
 #include "xla/service/gpu/scatter_slice_simplifier.h"
 #include "xla/service/gpu/softmax_rewriter_triton.h"
 #include "xla/service/gpu/stream_attribute_annotator.h"
 #include "xla/service/gpu/stream_attribute_async_wrapper.h"
-#include "xla/service/gpu/thunk.h"
 #include "xla/service/gpu/topk_specializer.h"
 #include "xla/service/gpu/topk_splitter.h"
 #include "xla/service/gpu/tree_reduction_rewriter.h"
@@ -391,13 +390,16 @@ class GpuThunkAotCompilationResult : public AotCompilationResult {
   static absl::StatusOr<std::unique_ptr<GpuThunkAotCompilationResult>>
   FromModule(const HloModule* hlo_module,
              const BufferAssignment* buffer_assignment,
-             std::string_view asm_text, absl::Span<const uint8_t> binary) {
+             std::string_view asm_text, absl::Span<const uint8_t> binary,
+             const Thunk::BinaryMap& dnn_compiled_graphs) {
     CompilationResultProto proto;
     TF_ASSIGN_OR_RETURN(*proto.mutable_hlo_module_with_config(),
                         hlo_module->ToProtoWithConfig());
     *proto.mutable_buffer_assignment() = buffer_assignment->ToProto();
     proto.set_asm_text(std::string(asm_text));
     proto.set_binary(binary.data(), binary.size());
+    proto.mutable_dnn_compiled_graphs()->insert(dnn_compiled_graphs.cbegin(),
+                                                dnn_compiled_graphs.cend());
     return std::unique_ptr<GpuThunkAotCompilationResult>(
         new GpuThunkAotCompilationResult(hlo_module->Clone(),
                                          std::move(proto)));
@@ -506,6 +508,9 @@ GpuThunkAotCompilationResult::LoadExecutable(
       GpuExecutable::Create(GpuExecutable::Params{
           /*asm_text=*/proto_.asm_text(),
           /*binary=*/binary,
+          /*dnn_compiled_graphs=*/
+          Thunk::BinaryMap(proto_.dnn_compiled_graphs().cbegin(),
+                           proto_.dnn_compiled_graphs().cend()),
           /*gpu_version=*/gpu_device_info.gpu_compute_capability(),
           /*executable=*/std::move(thunk_sequence),
           /*constants=*/std::move(constants),
@@ -959,6 +964,18 @@ absl::Status RunCollectiveOptimizationPasses(
           .debug_options()
           .xla_gpu_collective_permute_decomposer_threshold());
 
+  collectives_pipeline.AddPass<CollectivePermuteDecomposer>(
+      hlo_module->config()
+          .debug_options()
+          .xla_gpu_collective_permute_decomposer_threshold());
+
+  if (hlo_module->config()
+          .debug_options()
+          .xla_gpu_enable_pipelined_collectives() ||
+      hlo_module->config().debug_options().xla_gpu_enable_pipelined_p2p()) {
+    AddP2PPipeliner(collectives_pipeline);
+  }
+
   // Run algebraic simplifier to reshape(broadcast) into a broadcast when
   // the reshape is just adding a unit dimension. This will help with the
   // AllGatherBroadcastReorder pass.
@@ -1024,12 +1041,6 @@ absl::Status RunFusionPasses(HloModule* hlo_module,
                          .Run(hlo_module)
                          .status());
 
-  if (hlo_module->config()
-          .debug_options()
-          .xla_gpu_enable_triton_softmax_fusion()) {
-    TF_RETURN_IF_ERROR(FusionMergerTriton().Run(hlo_module).status());
-  }
-
   if (hlo_module->config().debug_options().xla_gpu_collect_cost_model_stats()) {
     GpuHloCostAnalysis::Options cost_analysis_options{
         shape_size_fn,
@@ -1167,17 +1178,6 @@ absl::Status RunPostFusionCollectiveOptimizationPasses(HloModule* hlo_module) {
   };
   pipeline.AddPass<GpuAsyncCollectiveAnnotator>(convert_to_async);
 
-  pipeline.AddPass<CollectivePermuteDecomposer>(
-      hlo_module->config()
-          .debug_options()
-          .xla_gpu_collective_permute_decomposer_threshold());
-
-  if (hlo_module->config()
-          .debug_options()
-          .xla_gpu_enable_pipelined_collectives() ||
-      hlo_module->config().debug_options().xla_gpu_enable_pipelined_p2p()) {
-    AddP2PPipeliner(pipeline);
-  }
   return pipeline.Run(hlo_module).status();
 }
 
@@ -1286,6 +1286,17 @@ absl::Status GpuCompiler::OptimizeHloModule(
   TF_RETURN_IF_ERROR(OptimizeHloPostLayoutAssignment(
       hlo_module, stream_exec, options, gpu_target_config, thread_pool.get()));
 
+  // This is a "low effort, high impact" fusion that should be run first.
+  if (hlo_module->config()
+          .debug_options()
+          .xla_gpu_enable_address_computation_fusion()) {
+    HloPassPipeline pipeline("address-computation");
+    TF_ASSIGN_OR_RETURN(se::Platform * platform,
+                        se::PlatformManager::PlatformWithId(PlatformId()));
+    pipeline.AddPass<AddressComputationFusionRewriter>(platform->Name());
+    TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
+  }
+
   TF_RETURN_IF_ERROR(RunFusionPasses(hlo_module, gpu_target_config,
                                      thread_pool.get(),
                                      ShapeSizeBytesFunction()));
@@ -1375,6 +1386,7 @@ absl::Status GpuCompiler::OptimizeHloPostLayoutAssignment(
     pipeline.AddPass<TransposeFolding>(CanFoldTransposeOperandIntoDot,
                                        TransposeFolding::NeverFoldTranspose);
 
+    pipeline.AddPass<ReshapeDecomposer>();
     pipeline.AddPass<ReduceDecomposer>([&](const HloInstruction* r) {
       return IsReductionFromOrToContiguousDimensions(*r);
     });
@@ -1401,7 +1413,7 @@ absl::Status GpuCompiler::OptimizeHloPostLayoutAssignment(
     // and may rewrite quantized FP8 GEMMs as higher-precision GEMMs.
     pipeline.AddPass<GemmRewriter>(gpu_version, /*f8_rewrite=*/true);
     if (debug_options.xla_gpu_enable_triton_gemm() && cuda_cc != nullptr &&
-        cuda_cc->IsAtLeast(se::CudaComputeCapability::VOLTA)) {
+        cuda_cc->IsAtLeast(se::CudaComputeCapability::AMPERE)) {
       pipeline.AddPass<GemmFusion>(gpu_version);
     }
     // Rewrite non-FP8 GEMMs.
@@ -1410,6 +1422,9 @@ absl::Status GpuCompiler::OptimizeHloPostLayoutAssignment(
     // Rewrite GEMMs with broadcasted inputs as strided GEMMs.
     pipeline.AddPass<GemmBroadcastFoldingRewriter>();
 
+    if (debug_options.xla_gpu_normalize_layouts()) {
+      pipeline.AddPass<LayoutNormalization>(&NormalizeLayoutForGpuCustomCalls);
+    }
     pipeline.AddPass<BroadcastCanonicalizer>();
 
     pipeline.AddPass<ReductionDegenerateDimRemover>();
@@ -1420,7 +1435,7 @@ absl::Status GpuCompiler::OptimizeHloPostLayoutAssignment(
     // harder.
     if (debug_options.xla_gpu_enable_triton_softmax_fusion() &&
         cuda_cc != nullptr &&
-        cuda_cc->IsAtLeast(se::CudaComputeCapability::VOLTA)) {
+        cuda_cc->IsAtLeast(se::CudaComputeCapability::AMPERE)) {
       pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(simplifier_options);
       pipeline.AddPass<SoftmaxRewriterTriton>(gpu_version);
     }
@@ -1691,12 +1706,12 @@ absl::StatusOr<std::unique_ptr<BufferAssignment>> GpuCompiler::AssignBuffers(
 using OutputInfoMap =
     absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>;
 
-static void NullDiagnosticHandler(const llvm::DiagnosticInfo& diag_info,
+static void NullDiagnosticHandler(const llvm::DiagnosticInfo* diag_info,
                                   void* context) {
   std::string error_string;
   llvm::raw_string_ostream string_printer(error_string);
   llvm::DiagnosticPrinterRawOStream diagnostic_printer(string_printer);
-  diag_info.print(diagnostic_printer);
+  diag_info->print(diagnostic_printer);
 
   VLOG(5) << error_string;
 }
@@ -1973,6 +1988,12 @@ GpuCompiler::CompileToBackendResult(
 absl::StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
     const CompileOptions& options) {
+  Thunk::BinaryMap dnn_compiled_graphs;
+  if (stream_exec) {
+    TF_RETURN_IF_ERROR(RunCudnnFusionCompilerPass(module.get(), stream_exec,
+                                                  &dnn_compiled_graphs));
+  }
+
   const DebugOptions& debug_opts = module->config().debug_options();
   TF_ASSIGN_OR_RETURN(TargetConfig gpu_target_config,
                       GetTargetConfig(options, debug_opts, stream_exec));
@@ -2047,6 +2068,8 @@ absl::StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
               ? std::string()
               : std::move(res.backend_result.asm_text),
           /*binary=*/std::move(res.backend_result.binary),
+          /*dnn_compiled_graphs=*/
+          std::move(dnn_compiled_graphs),
           /*gpu_version=*/gpu_device_info.gpu_compute_capability(),
           /*executable=*/std::move(res.compile_module_results.executable),
           /*constants=*/std::move(res.compile_module_results.constants),
@@ -2139,7 +2162,8 @@ GpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
         results.emplace_back(),
         GpuThunkAotCompilationResult::FromModule(
             module.get(), res.compile_module_results.buffer_assignment.get(),
-            res.backend_result.asm_text, res.backend_result.binary));
+            res.backend_result.asm_text, res.backend_result.binary,
+            res.backend_result.dnn_compiled_graphs));
   }
 
   return std::move(results);
@@ -2159,7 +2183,8 @@ absl::StatusOr<std::unique_ptr<AotCompilationResult>> GpuCompiler::Export(
 
   return GpuThunkAotCompilationResult::FromModule(
       &gpu_executable->module(), gpu_executable->buffer_assignment(),
-      gpu_executable->text(), gpu_executable->binary());
+      gpu_executable->text(), gpu_executable->binary(),
+      gpu_executable->dnn_compiled_graphs());
 }
 
 absl::Status GpuCompiler::RunPostSchedulingPipelines(
@@ -2194,6 +2219,7 @@ absl::Status GpuCompiler::RunPostSchedulingPipelines(
         /*host_memory_offload_config=*/std::nullopt);
     HloRematerialization::RematerializationSizes sizes;
     pipeline.AddPass<HloRematerialization>(options, sizes);
+    pipeline.AddPass<StreamAttributeAnnotator>();
     pipeline.AddPass<OptimizationBarrierExpander>();
 
     TF_ASSIGN_OR_RETURN(bool changed, pipeline.Run(module));
@@ -2203,16 +2229,6 @@ absl::Status GpuCompiler::RunPostSchedulingPipelines(
     }
   }
 
-  if (module->config()
-          .debug_options()
-          .xla_gpu_enable_address_computation_fusion()) {
-    HloPassPipeline pipeline("address-computation");
-    TF_ASSIGN_OR_RETURN(se::Platform * platform,
-                        se::PlatformManager::PlatformWithId(PlatformId()));
-    pipeline.AddPass<AddressComputationFusionRewriter>(platform->Name());
-    TF_RETURN_IF_ERROR(pipeline.Run(module).status());
-  }
-
   {
     HloPassPipeline pipeline("fusion-wrapper");
     pipeline.AddPass<FusionWrapper>();
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.h b/third_party/xla/xla/service/gpu/gpu_compiler.h
index 36ede049f2f76a..87824610328860 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.h
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.h
@@ -123,6 +123,7 @@ class GpuCompiler : public LLVMCompiler {
   struct BackendCompileResult {
     std::string asm_text;
     std::vector<uint8_t> binary;
+    Thunk::BinaryMap dnn_compiled_graphs;
   };
 
   // During compilation with device, stream_exec != null and autotune_results
@@ -165,6 +166,13 @@ class GpuCompiler : public LLVMCompiler {
     return absl::OkStatus();
   }
 
+  // Runs CUDNN fusion compiler pass.
+  virtual absl::Status RunCudnnFusionCompilerPass(
+      HloModule* module, se::StreamExecutor* stream_exec,
+      Thunk::BinaryMap* dnn_compiled_graphs) {
+    return absl::OkStatus();
+  }
+
   AlgebraicSimplifierOptions GetAlgebraicSimplifierOptions(
       const HloModuleConfig& config);
 
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
index 19e9f8a2a7a5af..7539adf1933504 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
@@ -349,16 +349,21 @@ ENTRY main {
 )";
 
   HloModuleConfig config;
-  DebugOptions debug_options = GetDebugOptionsForTest();
-  config.set_debug_options(GetDebugOptionsForTest());
+  DebugOptions triton_enabled_debug_options = GetDebugOptionsForTest();
+  triton_enabled_debug_options.set_xla_gpu_enable_address_computation_fusion(
+      false);
+  config.set_debug_options(triton_enabled_debug_options);
   config.set_replica_count(1);
   config.set_num_partitions(1);
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(hlo_string, config));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> triton_enabled_module,
                           GetOptimizedModule(std::move(module)));
-  debug_options.set_xla_gpu_enable_triton_gemm(false);
-  config.set_debug_options(debug_options);
+  DebugOptions triton_disabled_debug_options = GetDebugOptionsForTest();
+  triton_disabled_debug_options.set_xla_gpu_enable_address_computation_fusion(
+      false);
+  triton_disabled_debug_options.set_xla_gpu_enable_triton_gemm(false);
+  config.set_debug_options(triton_disabled_debug_options);
   TF_ASSERT_OK_AND_ASSIGN(module,
                           ParseAndReturnVerifiedModule(hlo_string, config));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> triton_disabled_module,
diff --git a/third_party/xla/xla/service/gpu/gpu_executable.cc b/third_party/xla/xla/service/gpu/gpu_executable.cc
index 5e78157c6342e8..25c03adb2e9c53 100644
--- a/third_party/xla/xla/service/gpu/gpu_executable.cc
+++ b/third_party/xla/xla/service/gpu/gpu_executable.cc
@@ -54,8 +54,8 @@ limitations under the License.
 #include "xla/service/gpu/nccl_clique.h"
 #include "xla/service/gpu/nccl_clique_key.h"
 #include "xla/service/gpu/runtime/annotation.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/gpu/stream_executor_util.h"
-#include "xla/service/gpu/thunk.h"
 #include "xla/service/hlo_execution_profile.h"
 #include "xla/service/hlo_module_config.h"
 #include "xla/service/hlo_parser.h"
@@ -167,6 +167,7 @@ GpuExecutable::GpuExecutable(GpuExecutable::Params params)
     : Executable(std::move(params.debug_module)),
       text_(std::move(params.asm_text)),
       binary_(std::move(params.binary)),
+      dnn_compiled_graphs_(std::move(params.dnn_compiled_graphs)),
       gpu_version_(params.gpu_version),
       thunks_(std::move(params.executable)),
       execution_stream_ids_(has_module()
@@ -1015,7 +1016,8 @@ absl::Status GpuExecutable::ExecuteThunksOrXlaRuntime(
   ModuleIdentifier unique_id = has_module() ? module().unique_id() : -1;
 
   if (thunks_) {
-    Thunk::ExecutableSource executable_source = {text_, binary_};
+    Thunk::ExecutableSource executable_source = {text_, binary_,
+                                                 dnn_compiled_graphs_};
     int64_t collective_max_nchannels =
         has_module() ? module_config()
                            .debug_options()
diff --git a/third_party/xla/xla/service/gpu/gpu_executable.h b/third_party/xla/xla/service/gpu/gpu_executable.h
index 85d45fc3873ee6..e2e0daafe7554a 100644
--- a/third_party/xla/xla/service/gpu/gpu_executable.h
+++ b/third_party/xla/xla/service/gpu/gpu_executable.h
@@ -40,7 +40,7 @@ limitations under the License.
 #include "xla/service/gpu/buffer_allocations.h"
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/runtime/annotation.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/hlo_execution_profile.h"
 #include "xla/service/hlo_module_config.h"
 #include "xla/service/service_executable_run_options.h"
@@ -88,6 +88,7 @@ class GpuExecutable : public Executable {
   struct Params {
     std::string asm_text;
     std::vector<uint8_t> binary;
+    Thunk::BinaryMap dnn_compiled_graphs;
     se::GpuComputeCapability gpu_version;
     OwnedThunkSequence executable;
     std::vector<ConstantInfo> constants;
@@ -138,6 +139,10 @@ class GpuExecutable : public Executable {
   // compiled.
   const std::vector<uint8_t>& binary() const { return binary_; }
 
+  const Thunk::BinaryMap& dnn_compiled_graphs() const {
+    return dnn_compiled_graphs_;
+  }
+
   // ExecuteAsyncOnStream will fail if the compute capability of the stream
   // doesn't match the compute capability passed to this object's constructor.
   absl::StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
@@ -241,6 +246,8 @@ class GpuExecutable : public Executable {
   // May be empty, in which case we leave compilation up to the GPU driver.
   std::vector<uint8_t> binary_;
 
+  Thunk::BinaryMap dnn_compiled_graphs_;
+
   // The GPU version for compute compatibility check.
   se::GpuComputeCapability gpu_version_;
 
diff --git a/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc b/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc
index 4c25424d0dc3dd..08725514ceecf0 100644
--- a/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc
+++ b/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc
@@ -317,41 +317,10 @@ int64_t GetPipelineStream(const HloInstruction& start) {
 // Returns the resource type and resource usage for a P2P instruction.
 std::pair<GpuResourceType, ResourceUsageType> GetP2PResourceAndUsage(
     const HloInstruction& instr, const CanonicalAsyncOp& op) {
-  ResourceUsageType usage;
-  int64_t pipeline = 0;
-  if (op.outer == HloOpcode::kAsyncStart) {
-    usage = ResourceUsageType::kResourceRelease;
-    pipeline = GetPipelineStream(instr);
-  } else {
-    usage = ResourceUsageType::kResourceOccupy;
-    // Check the operand for the Send-done or Recv-done instruction.
-    const HloInstruction* operand = instr.operand(0);
-    HloOpcode operand_opcode = operand->opcode();
-    if (operand_opcode == HloOpcode::kSend ||
-        operand_opcode == HloOpcode::kRecv) {
-      // Not a pipelined P2P.
-      pipeline = GetPipelineStream(*operand);
-    } else {
-      // A pipelined P2P. Find the corresponding start-op.
-      const HloSendRecvInstruction* start;
-      const HloGetTupleElementInstruction* gte =
-          DynCast<HloGetTupleElementInstruction>(operand);
-      int64_t tuple_index = gte->tuple_index();
-      if (gte->operand(0)->opcode() == HloOpcode::kWhile) {
-        // The op is a while-result, so the start-op should be a value in the
-        // while-op operands.
-        start = DynCast<HloSendRecvInstruction>(
-            gte->operand(0)->operand(0)->operand(tuple_index));
-      } else {
-        // The op is a while-body parameter, so the start-op should be a value
-        // in the while-body result.
-        const HloComputation* computation = instr.parent();
-        start = DynCast<HloSendRecvInstruction>(
-            computation->root_instruction()->operand(tuple_index));
-      }
-      pipeline = GetPipelineStream(*start);
-    }
-  }
+  ResourceUsageType usage = op.outer == HloOpcode::kAsyncStart
+                                ? ResourceUsageType::kResourceRelease
+                                : ResourceUsageType::kResourceOccupy;
+  int64_t pipeline = GetPipelineStream(instr);
   HloOpcode opcode = op.inner;
   GpuResourceType resource;
   if (pipeline == 0) {
@@ -399,6 +368,22 @@ class GpuAsyncTrackerBase : public AsyncTracker {
             !IsSyncCollective(&hlo)) ||
            IsAsyncComputeOp(hlo);
   }
+
+  void PostProcessScheduleGraph(
+      HloScheduleGraph* schedule_graph,
+      const LatencyEstimator* latency_estimator) const override {
+    for (auto inst : schedule_graph->GetOriginalInstrList()) {
+      if (inst->has_backend_config()) {
+        auto gpu_config = inst->backend_config<GpuBackendConfig>();
+        if (gpu_config.ok()) {
+          HloGraphNode& node = schedule_graph->GetNode(inst);
+          node.SetForceDelay(gpu_config->force_earliest_schedule());
+          VLOG(5) << "Setting force delay for instruction: "
+                  << inst->ToString();
+        }
+      }
+    }
+  }
 };
 
 // GPU async tracker maps all collectives onto an async stream resource.
@@ -523,26 +508,30 @@ class GpuAsyncTracker : public GpuAsyncTrackerBase {
     }
     auto find_instruction_for_pipeline = [&](HloOpcode opcode,
                                              int64_t pipeline) {
-      for (auto operand : instr.operand(0)->operands()) {
-        if (operand->opcode() == opcode) {
-          int64_t cur_pipeline = GetPipelineStream(*operand);
-          if (cur_pipeline == pipeline) {
-            return true;
+      for (auto user1 : instr.users()) {
+        if (user1->opcode() == HloOpcode::kGetTupleElement) {
+          for (auto user2 : user1->users()) {
+            if (user2->opcode() == opcode) {
+              if (GetPipelineStream(*user2) == pipeline) {
+                return true;
+              }
+            }
           }
         }
       }
       return false;
     };
     bool found;
-    // Look into the while-op init-values to find pipelined Send/Recv.
+    // Look into the users of the while-result to find pipelined Send-done or
+    // Recv-done.
     if (resource_type == first_p2p_resource) {
-      found = find_instruction_for_pipeline(HloOpcode::kSend, 0);
+      found = find_instruction_for_pipeline(HloOpcode::kSendDone, 0);
     } else if (resource_type == first_p2p_resource + 1) {
-      found = find_instruction_for_pipeline(HloOpcode::kSend, 1);
+      found = find_instruction_for_pipeline(HloOpcode::kSendDone, 1);
     } else if (resource_type == first_p2p_resource + 2) {
-      found = find_instruction_for_pipeline(HloOpcode::kRecv, 0);
+      found = find_instruction_for_pipeline(HloOpcode::kRecvDone, 0);
     } else {
-      found = find_instruction_for_pipeline(HloOpcode::kRecv, 1);
+      found = find_instruction_for_pipeline(HloOpcode::kRecvDone, 1);
     }
     return num_resources - (found ? 1 : 0);
   }
diff --git a/third_party/xla/xla/service/gpu/gpu_hlo_schedule_test.cc b/third_party/xla/xla/service/gpu/gpu_hlo_schedule_test.cc
index 7eaf836d69f10e..aa669290f9c9d7 100644
--- a/third_party/xla/xla/service/gpu/gpu_hlo_schedule_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_hlo_schedule_test.cc
@@ -790,11 +790,17 @@ TEST_F(GpuHloScheduleTest, LHSSendRecvPipelined1) {
     count = get-tuple-element(param), index=0
 
     recv.1.q = (f32[1,1024,1024], u32[], token[]) get-tuple-element(param), index=1
-    recv-done.1 = (f32[1,1024,1024], token[]) recv-done(recv.1.q), channel_id=1
+    recv-done.1 = (f32[1,1024,1024], token[]) recv-done(recv.1.q), channel_id=1,
+      frontend_attributes={
+      _xla_send_recv_pipeline="0"
+      }
     recv-data = f32[1, 1024, 1024] get-tuple-element(recv-done.1), index=0
 
     send.1.q = (f32[1,1024,1024], u32[], token[]) get-tuple-element(param), index=2
-    send-done.1 = token[] send-done(send.1.q), channel_id=1
+    send-done.1 = token[] send-done(send.1.q), channel_id=1,
+      frontend_attributes={
+      _xla_send_recv_pipeline="0"
+      }
 
     c1 = u32[] constant(1)
     new-count = u32[] add(count, c1)
@@ -852,10 +858,16 @@ TEST_F(GpuHloScheduleTest, LHSSendRecvPipelined1) {
       backend_config={"known_trip_count":{"n":"25"}}
 
     recv.2.q = (f32[1,1024,1024], u32[], token[]) get-tuple-element(while-result), index=1
-    recv-done.2 = (f32[1,1024,1024], token[]) recv-done(recv.2.q), channel_id=1
+    recv-done.2 = (f32[1,1024,1024], token[]) recv-done(recv.2.q), channel_id=1,
+      frontend_attributes={
+      _xla_send_recv_pipeline="0"
+      }
 
     send.2.q = (f32[1,1024,1024], u32[], token[]) get-tuple-element(while-result), index=2
-    send-done.2 = token[] send-done(send.2.q), channel_id=1
+    send-done.2 = token[] send-done(send.2.q), channel_id=1,
+      frontend_attributes={
+      _xla_send_recv_pipeline="0"
+      }
 
     ROOT entry-result = f32[1, 1024, 1024] get-tuple-element(recv-done.2), index=0
   }
@@ -920,18 +932,30 @@ TEST_F(GpuHloScheduleTest, LHSSendRecvPipelined2) {
     count = get-tuple-element(param), index=0
 
     recv.0.q = (f32[1,1024,1024], u32[], token[]) get-tuple-element(param), index=1
-    recv-done.0 = (f32[1,1024,1024], token[]) recv-done(recv.0.q), channel_id=1
+    recv-done.0 = (f32[1,1024,1024], token[]) recv-done(recv.0.q), channel_id=1,
+      frontend_attributes={
+      _xla_send_recv_pipeline="0"
+      }
     recv-data.0 = f32[1, 1024, 1024] get-tuple-element(recv-done.0), index=0
 
     send.0.q = (f32[1,1024,1024], u32[], token[]) get-tuple-element(param), index=2
-    send-done.0 = token[] send-done(send.0.q), channel_id=1
+    send-done.0 = token[] send-done(send.0.q), channel_id=1,
+      frontend_attributes={
+      _xla_send_recv_pipeline="0"
+      }
 
     recv.1.q = (f32[1,1024,1024], u32[], token[]) get-tuple-element(param), index=3
-    recv-done.1 = (f32[1,1024,1024], token[]) recv-done(recv.1.q), channel_id=2
+    recv-done.1 = (f32[1,1024,1024], token[]) recv-done(recv.1.q), channel_id=2,
+      frontend_attributes={
+      _xla_send_recv_pipeline="1"
+      }
     recv-data.1 = f32[1, 1024, 1024] get-tuple-element(recv-done.1), index=0
 
     send.1.q = (f32[1,1024,1024], u32[], token[]) get-tuple-element(param), index=4
-    send-done.1 = token[] send-done(send.1.q), channel_id=2
+    send-done.1 = token[] send-done(send.1.q), channel_id=2,
+      frontend_attributes={
+      _xla_send_recv_pipeline="1"
+      }
 
     replica = u32[] replica-id()
     constant0 = u32[] constant(0)
@@ -1021,18 +1045,30 @@ TEST_F(GpuHloScheduleTest, LHSSendRecvPipelined2) {
       backend_config={"known_trip_count":{"n":"25"}}
 
     recv.2.q = (f32[1,1024,1024], u32[], token[]) get-tuple-element(while-result), index=1
-    recv-done.2 = (f32[1,1024,1024], token[]) recv-done(recv.2.q), channel_id=1
+    recv-done.2 = (f32[1,1024,1024], token[]) recv-done(recv.2.q), channel_id=1,
+      frontend_attributes={
+      _xla_send_recv_pipeline="0"
+      }
     recv-data.2 = f32[1, 1024, 1024] get-tuple-element(recv-done.2), index=0
 
     send.2.q = (f32[1,1024,1024], u32[], token[]) get-tuple-element(while-result), index=2
-    send-done.2 = token[] send-done(send.2.q), channel_id=1
+    send-done.2 = token[] send-done(send.2.q), channel_id=1,
+      frontend_attributes={
+      _xla_send_recv_pipeline="0"
+      }
 
     recv.3.q = (f32[1,1024,1024], u32[], token[]) get-tuple-element(while-result), index=3
-    recv-done.3 = (f32[1,1024,1024], token[]) recv-done(recv.3.q), channel_id=2
+    recv-done.3 = (f32[1,1024,1024], token[]) recv-done(recv.3.q), channel_id=2,
+      frontend_attributes={
+      _xla_send_recv_pipeline="1"
+      }
     recv-data.3 = f32[1, 1024, 1024] get-tuple-element(recv-done.3), index=0
 
     send.3.q = (f32[1,1024,1024], u32[], token[]) get-tuple-element(while-result), index=4
-    send-done.3 = token[] send-done(send.3.q), channel_id=2
+    send-done.3 = token[] send-done(send.3.q), channel_id=2,
+      frontend_attributes={
+      _xla_send_recv_pipeline="1"
+      }
 
     replica = u32[] replica-id()
     constant0 = u32[] constant(0)
@@ -1111,6 +1147,77 @@ ENTRY e {
 )"));
 }
 
+TEST_F(GpuHloScheduleTest, ProfileGuidedCostModelWithForceEarliestSchedule) {
+  const char* hlo_text = R"(
+  HloModule AsyncAR
+  apply_op {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT apply_op = f32[] add(x, y)
+  }
+
+  ENTRY main {
+    p0 = f32[32] parameter(0)
+    p1 = f32[32, 32] parameter(1)
+    p2 = f32[32, 32] parameter(2)
+    p3 = f32[32] parameter(3)
+
+    // Independent compute
+    dot0 = f32[32,32]{1,0} custom-call(p1, p2), custom_call_target="__cublas$gemm", backend_config={"force_earliest_schedule":true}
+    dot1 = f32[32,32]{1,0} custom-call(p1, p2), custom_call_target="__cublas$gemm"
+    add0 = f32[32,32] add(dot0, dot1)
+
+    // 2 Independent collectives.
+    ar-start = f32[32] all-reduce-start(p0), to_apply=apply_op
+    ar-done = f32[32] all-reduce-done(ar-start)
+
+    ROOT t = (f32[32], f32[32,32]) tuple(ar-done, add0)
+  })";
+
+  const std::string ar_long_latency_proto_text = R"pb(
+    costs { name: "dot0" cost_us: 100.0 }
+    costs { name: "dot1" cost_us: 100.0 }
+    costs { name: "add0" cost_us: 10.0 }
+    costs { name: "ar-start" cost_us: 1000.0 }
+  )pb";
+
+  tensorflow::profiler::ProfiledInstructionsProto profile;
+  ASSERT_TRUE(tsl::protobuf::TextFormat::ParseFromString(
+      ar_long_latency_proto_text, &profile));
+  std::string ar_long_latency_proto_binary = profile.SerializeAsString();
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto module,
+      ParseAndReturnVerifiedModule(
+          hlo_text,
+          GetModuleConfig(/*enable_latency_hiding_scheduler=*/true,
+                          // Post processing should work even with
+                          // GpuAsyncTrackerBase.
+                          /*enable_gpu_async_tracker=*/false,
+                          /*fdo_profile=*/ar_long_latency_proto_binary)));
+  SequentialHloOrdering order = BuildHloOrdering(module.get());
+
+  const std::vector<HloInstruction*>& main =
+      order.SequentialOrder(*module->GetComputationWithName("main"))
+          ->instructions();
+  auto get_index =
+      [](absl::string_view hlo_name,
+         const std::vector<HloInstruction*>& instruction_sequence) {
+        return absl::c_find_if(instruction_sequence,
+                               [hlo_name](HloInstruction* instruction) {
+                                 return instruction->name() == hlo_name;
+                               }) -
+               instruction_sequence.begin();
+      };
+  // Using the profile, LHS should schedule all computes between ar pair,
+  // but since dot0 is marked as force delay, it should be scheduled
+  // before ar-start now.
+  EXPECT_LT(get_index("dot0", main), get_index("ar-start", main));
+  // Also verify that dot1 is scheduled between ar start and ar done.
+  EXPECT_GT(get_index("dot1", main), get_index("ar-start", main));
+  EXPECT_LT(get_index("dot1", main), get_index("ar-done", main));
+}
+
 class GpuHloScheduleParameterizedTest
     : public GpuHloScheduleTest,
       public ::testing::WithParamInterface<bool> {};
diff --git a/third_party/xla/xla/service/gpu/gpu_layout_assignment.cc b/third_party/xla/xla/service/gpu/gpu_layout_assignment.cc
index a30a071e68b8c3..9aa622b3ea79f6 100644
--- a/third_party/xla/xla/service/gpu/gpu_layout_assignment.cc
+++ b/third_party/xla/xla/service/gpu/gpu_layout_assignment.cc
@@ -55,7 +55,7 @@ limitations under the License.
 #include "tsl/platform/errors.h"
 #include "tsl/platform/statusor.h"
 
-#include "tsl/util/env_var.h"
+#include "xla/tsl/util/env_var.h"
 
 namespace xla {
 namespace gpu {
@@ -474,6 +474,19 @@ absl::Status GpuLayoutAssignment::AddBackendConstraints(
           ShapeUtil::MoveDimToMajor(all_to_all->shape(),
                                     *all_to_all->split_dimension()),
           all_to_all));
+    } else if (instruction->opcode() == HloOpcode::kSend) {
+      Shape s = instruction->operand(0)->shape();
+      LayoutUtil::SetToDefaultLayout(&s);
+      TF_RETURN_IF_ERROR(SetInstructionLayout(s, instruction->operand(0)));
+      TF_RETURN_IF_ERROR(
+          SetArrayOperandLayout(s.layout(), instruction->operand(0), 0));
+    } else if (instruction->opcode() == HloOpcode::kRecv) {
+      Shape s = instruction->shape();
+      ShapeUtil::ForEachMutableSubshape(
+          &s, [&](Shape* subshape, const ShapeIndex& index) {
+            LayoutUtil::SetToDefaultLayout(subshape);
+          });
+      TF_RETURN_IF_ERROR(SetInstructionLayout(s, instruction));
     }
   }
   return absl::OkStatus();
diff --git a/third_party/xla/xla/service/gpu/gpu_layout_assignment_test.cc b/third_party/xla/xla/service/gpu/gpu_layout_assignment_test.cc
index 9bde416b2393fe..0a92f45cacc8e0 100644
--- a/third_party/xla/xla/service/gpu/gpu_layout_assignment_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_layout_assignment_test.cc
@@ -555,6 +555,67 @@ ENTRY main {
             LayoutUtil::MakeLayout({1, 3, 2, 0}).minor_to_major());
 }
 
+TEST_F(LayoutAssignmentTest, SendRcvLayout) {
+  const char* hlo = R"(
+HloModule Module
+
+condition  {
+    p = (f32[100,100], (f32[100,100], u32[], token[])) parameter(0)
+    ROOT lt = pred[] constant(1)
+}
+
+body {
+    p = (f32[100,100], (f32[100,100], u32[], token[])) parameter(0)
+
+    t1 = f32[100,100] get-tuple-element(p), index=0
+    t = (f32[100,100], u32[], token[]) get-tuple-element(p), index=1
+    sdone = token[] send-done(t), channel_id=3, frontend_attributes={
+      _xla_send_recv_pipeline="0"
+    }
+    tk = token[] after-all()
+
+
+    rcvd = (f32[100,100]{0,1}, u32[], token[]) recv(tk), channel_id=2
+    zz = (f32[100,100]{0,1}, token[]) recv-done(rcvd), channel_id=2
+
+    rcvd_d = get-tuple-element(zz), index=0
+
+    snd = (f32[100,100]{0,1}, u32[], token[]) send(t1, tk), channel_id=3, frontend_attributes={
+      _xla_send_recv_pipeline="0"
+    }
+    a = add(t1, t1)
+
+    b = add(rcvd_d, a)
+
+    ROOT tup =  tuple(b, snd)
+}
+
+ENTRY %main {
+    p0 = f32[100,100] parameter(0)
+    tk = token[] after-all()
+    snd = (f32[100,100]{0,1}, u32[], token[]) send(p0, tk), channel_id=1, frontend_attributes={
+      _xla_send_recv_pipeline="0"
+    }
+    t = tuple(p0, snd)
+    ROOT loop = while(t), condition=condition, body=body
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo));
+  ComputationLayout computation_layout(
+      m->entry_computation()->ComputeProgramShape());
+
+  RunAndFilecheckHloRewrite(
+      hlo,
+      GpuLayoutAssignment{&computation_layout, GetGpuComputeCapability(),
+                          GetDnnVersion()},
+      R"(
+// CHECK: (f32[100,100]{1,0}, u32[], token[]) recv
+// CHECK:  (f32[100,100]{1,0}, token[]) recv-done
+// CHECK:  (f32[100,100]{1,0}, u32[], token[]) send
+                                )");
+}
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/gpu_memory_space_assignment.h b/third_party/xla/xla/service/gpu/gpu_memory_space_assignment.h
index 79c9187cfbde5a..faa9195bc37fc4 100644
--- a/third_party/xla/xla/service/gpu/gpu_memory_space_assignment.h
+++ b/third_party/xla/xla/service/gpu/gpu_memory_space_assignment.h
@@ -35,20 +35,28 @@ inline constexpr int64_t kCollectiveMemorySpaceColor = 1;
 // collective memory using ncclMemAlloc in the runtime.
 inline BufferAssigner::Colorer CollectiveColorer() {
   return [](HloAliasAnalysis* alias_analysis, const HloOrdering&) {
+    static const auto* kSupportedOpcodes = new absl::flat_hash_set<HloOpcode>{
+        HloOpcode::kAllReduce,
+        HloOpcode::kAllReduceStart,
+        HloOpcode::kAllReduceDone,
+        HloOpcode::kAllGather,
+        HloOpcode::kAllGatherStart,
+        HloOpcode::kAllGatherDone,
+        HloOpcode::kReduceScatter,
+        HloOpcode::kCollectivePermute,
+        HloOpcode::kCollectivePermuteStart,
+        HloOpcode::kCollectivePermuteDone,
+        HloOpcode::kAllToAll,
+    };
     for (HloValue* value : alias_analysis->dataflow_analysis().values()) {
       auto& buffer = alias_analysis->GetBufferContainingValue(*value);
       for (const auto& alias : buffer.values()) {
-        if ((alias->instruction()->opcode() == HloOpcode::kAllReduce ||
-             alias->instruction()->opcode() == HloOpcode::kAllReduceStart ||
-             alias->instruction()->opcode() == HloOpcode::kAllReduceDone ||
-             alias->instruction()->opcode() == HloOpcode::kAllGather ||
-             alias->instruction()->opcode() == HloOpcode::kAllGatherStart ||
-             alias->instruction()->opcode() == HloOpcode::kAllGatherDone ||
-             alias->instruction()->opcode() == HloOpcode::kReduceScatter) ||
+        // opcode or async wrapped opcode is in kSupportedOpcodes.
+        if (kSupportedOpcodes->contains(alias->instruction()->opcode()) ||
             ((alias->instruction()->opcode() == HloOpcode::kAsyncStart ||
               alias->instruction()->opcode() == HloOpcode::kAsyncDone) &&
-             alias->instruction()->async_wrapped_opcode() ==
-                 HloOpcode::kReduceScatter)) {
+             kSupportedOpcodes->contains(
+                 alias->instruction()->async_wrapped_opcode()))) {
           value->set_color(kCollectiveMemorySpaceColor);
         }
       }
diff --git a/third_party/xla/xla/service/gpu/gpu_p2p_pipeliner_test.cc b/third_party/xla/xla/service/gpu/gpu_p2p_pipeliner_test.cc
index abefe7451c4568..7e71820ed87a14 100644
--- a/third_party/xla/xla/service/gpu/gpu_p2p_pipeliner_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_p2p_pipeliner_test.cc
@@ -93,7 +93,10 @@ TEST_F(GpuP2PPipelinerTest,
         _xla_send_recv_pipeline="0",
         _xla_send_recv_validation="{{1,7}}"
       }
-    recv-done.0 = (u32[2], token[]) recv-done(recv.0), channel_id=1
+    recv-done.0 = (u32[2], token[]) recv-done(recv.0), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
     recv-data = u32[2] get-tuple-element(recv-done.0), index=0
 
     c1 = u32[] constant(1)
@@ -102,7 +105,10 @@ TEST_F(GpuP2PPipelinerTest,
     r = u32[2] broadcast(c1), dimensions={}
     s = u32[2] add(r, recv-data)
 
-    send-done.0 = token[] send-done(send.0), channel_id=1
+    send-done.0 = token[] send-done(send.0), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
     ROOT result = (u32[], u32[2]) tuple(new_count, s)
   }
 
diff --git a/third_party/xla/xla/service/gpu/gpu_windowed_einsum_handler.cc b/third_party/xla/xla/service/gpu/gpu_windowed_einsum_handler.cc
index 24ff8a18b8d9ce..fa5339fc007f44 100644
--- a/third_party/xla/xla/service/gpu/gpu_windowed_einsum_handler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_windowed_einsum_handler.cc
@@ -62,6 +62,16 @@ absl::Status UpdateDotAndConsumerConfig(HloInstruction* dot,
   return absl::OkStatus();
 }
 
+absl::Status SetForceDelayForInstruction(HloInstruction* instr,
+                                         bool force_delay) {
+  auto gpu_config = instr->backend_config<gpu::GpuBackendConfig>();
+
+  gpu_config->set_force_earliest_schedule(force_delay);
+
+  TF_RETURN_IF_ERROR(instr->set_backend_config(gpu_config.value()));
+  return absl::OkStatus();
+}
+
 absl::StatusOr<bool> HandleRsWindowedEinsumLoop(HloComputation* comp,
                                                 int64_t stream_id) {
   bool changed = false;
@@ -82,6 +92,16 @@ absl::StatusOr<bool> HandleRsWindowedEinsumLoop(HloComputation* comp,
       ++stream_id;
       changed = true;
     }
+
+    // We need to enforce the first collective-permute to be always scheduled
+    // at the beginning of the loop.
+    HloInstruction* matched_cp;
+    if (Match(inst, m::CollectivePermute(
+                        &matched_cp, m::GetTupleElement(m::Parameter(), 2)))) {
+      TF_RETURN_IF_ERROR(
+          SetForceDelayForInstruction(matched_cp, /*force_delay=*/true));
+      changed = true;
+    }
   }
   return changed;
 }
@@ -104,6 +124,18 @@ absl::StatusOr<bool> HandleAgWindowedEinsumLoop(HloComputation* comp,
       // Dispatch the dot to additional compute stream.
       TF_RETURN_IF_ERROR(UpdateDotAndConsumerConfig(matched_dot, stream_id));
       ++stream_id;
+      TF_RETURN_IF_ERROR(
+          SetForceDelayForInstruction(matched_dot, /*force_delay=*/true));
+      changed = true;
+    }
+
+    // We need to enforce the first collective-permute to be always scheduled
+    // at the beginning of the loop.
+    HloInstruction* matched_cp;
+    if (Match(inst, m::CollectivePermute(
+                        &matched_cp, m::GetTupleElement(m::Parameter(), 0)))) {
+      TF_RETURN_IF_ERROR(
+          SetForceDelayForInstruction(matched_cp, /*force_delay=*/true));
       changed = true;
     }
   }
diff --git a/third_party/xla/xla/service/gpu/gpu_windowed_einsum_handler_test.cc b/third_party/xla/xla/service/gpu/gpu_windowed_einsum_handler_test.cc
index 06fa8b107067f6..c70fbf2b08d126 100644
--- a/third_party/xla/xla/service/gpu/gpu_windowed_einsum_handler_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_windowed_einsum_handler_test.cc
@@ -106,8 +106,14 @@ ENTRY test_main {
       module->entry_computation()->root_instruction()->mutable_operand(0);
   HloComputation* ag_loop_body = ag_loop->while_body();
   HloInstruction* inst = FindInstructionByName(ag_loop_body, "dot.2");
-  EXPECT_TRUE(inst->backend_config<GpuBackendConfig>()->operation_queue_id() >
-              0);
+  EXPECT_GT(inst->backend_config<GpuBackendConfig>()->operation_queue_id(), 0);
+  EXPECT_TRUE(
+      inst->backend_config<GpuBackendConfig>()->force_earliest_schedule());
+
+  HloInstruction* cp1 =
+      FindInstructionByName(ag_loop_body, "collective-permute");
+  EXPECT_TRUE(
+      cp1->backend_config<GpuBackendConfig>()->force_earliest_schedule());
 }
 
 TEST_F(GpuWindowedEinsumHanlderTest, RsLoopsHaveStreamIds) {
@@ -180,6 +186,11 @@ ENTRY main.9_spmd {
   HloInstruction* inst = FindInstructionByName(rs_loop_body, "dot.7");
   EXPECT_TRUE(inst->backend_config<GpuBackendConfig>()->operation_queue_id() >
               0);
+
+  HloInstruction* cp1 =
+      FindInstructionByName(rs_loop_body, "collective-permute.1");
+  EXPECT_TRUE(
+      cp1->backend_config<GpuBackendConfig>()->force_earliest_schedule());
 }
 
 }  // namespace
diff --git a/third_party/xla/xla/service/gpu/hlo_traversal.h b/third_party/xla/xla/service/gpu/hlo_traversal.h
index fa5bc0f81817fb..d77e669b48f162 100644
--- a/third_party/xla/xla/service/gpu/hlo_traversal.h
+++ b/third_party/xla/xla/service/gpu/hlo_traversal.h
@@ -68,6 +68,11 @@ H AbslHashValue(H h, const HloInstructionAdaptor& m) {
                     m.instruction_->unique_id());
 }
 
+template <HloOpcode op, HloOpcode... rest>
+bool IsOpcodeAnyOf(const HloInstructionAdaptor& adaptor) {
+  return (adaptor.opcode() == op) || ((adaptor.opcode() == rest) || ...);
+}
+
 class HloFusionAdaptor {
  public:
   virtual ~HloFusionAdaptor() = default;
diff --git a/third_party/xla/xla/service/gpu/hlo_traversal_test.cc b/third_party/xla/xla/service/gpu/hlo_traversal_test.cc
index 4ca50f4538cf6b..c7e3f0db3b7b47 100644
--- a/third_party/xla/xla/service/gpu/hlo_traversal_test.cc
+++ b/third_party/xla/xla/service/gpu/hlo_traversal_test.cc
@@ -226,7 +226,6 @@ TEST_F(HloTraversalTest, FindIf) {
   auto module = ParseAndReturnVerifiedModule(kTestModule).value();
   auto fusion = HloFusionAdaptor::ForInstruction(
       module->entry_computation()->GetInstructionWithName("fusion"));
-  std::vector<std::string> visited_nodes;
   auto result =
       HloFindIf(fusion->GetRoots(), *fusion, [&](HloInstructionAdaptor node) {
         return node.opcode() == HloOpcode::kMultiply;
@@ -239,7 +238,6 @@ TEST_F(HloTraversalTest, NotFound) {
   auto module = ParseAndReturnVerifiedModule(kTestModule).value();
   auto fusion = HloFusionAdaptor::ForInstruction(
       module->entry_computation()->GetInstructionWithName("fusion"));
-  std::vector<std::string> visited_nodes;
   auto result = HloFindIf(fusion->GetRoots(), *fusion,
                           [&](HloInstructionAdaptor node) { return false; });
   ASSERT_EQ(result, std::nullopt);
diff --git a/third_party/xla/xla/service/gpu/ir_emission_utils.cc b/third_party/xla/xla/service/gpu/ir_emission_utils.cc
index 2d083f4bc33b68..6265c5845d036c 100644
--- a/third_party/xla/xla/service/gpu/ir_emission_utils.cc
+++ b/third_party/xla/xla/service/gpu/ir_emission_utils.cc
@@ -519,7 +519,7 @@ absl::StatusOr<bool> CanEmitFusedDynamicUpdateSliceInPlaceForGpu(
     // be necessary for the shape to be the same for all the dynamic slice
     // updates. Note that this equality check purposefully ignores the element
     // type.
-    if (dus->operand(1)->shape() != update_shape) {
+    if (dus->update()->shape() != update_shape) {
       return false;
     }
 
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_context.h b/third_party/xla/xla/service/gpu/ir_emitter_context.h
index cc79e4cd3c8266..afbf212bad0369 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_context.h
+++ b/third_party/xla/xla/service/gpu/ir_emitter_context.h
@@ -35,7 +35,7 @@ limitations under the License.
 #include "xla/service/gpu/gpu_executable.h"
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/kernel_reuse_cache.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/service/name_uniquer.h"
 #include "xla/stream_executor/device_description.h"
 
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
index 26b73fa8a84bb0..427ac4dbd6b4fa 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <climits>
 #include <cmath>
+#include <cstddef>
 #include <cstdint>
 #include <functional>
 #include <limits>
@@ -26,6 +27,7 @@ limitations under the License.
 #include <string>
 #include <system_error>  // NOLINT(build/c++11): required to interface with LLVM
 #include <utility>
+#include <variant>
 #include <vector>
 
 #include "nvidia/include/NVGPUToLLVM/NVGPUToLLVMPass.h"
@@ -47,11 +49,14 @@ limitations under the License.
 #include "llvm/IR/Module.h"
 #include "llvm/Linker/Linker.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Triple.h"
+#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"  // from @llvm-project
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"  // from @llvm-project
 #include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h"  // from @llvm-project
 #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"  // from @llvm-project
+#include "mlir/Dialect/Affine/IR/AffineOps.h"  // from @llvm-project
 #include "mlir/Dialect/Arith/IR/Arith.h"  // from @llvm-project
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"  // from @llvm-project
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"  // from @llvm-project
@@ -59,6 +64,7 @@ limitations under the License.
 #include "mlir/Dialect/Math/IR/Math.h"  // from @llvm-project
 #include "mlir/Dialect/SCF/IR/SCF.h"  // from @llvm-project
 #include "mlir/ExecutionEngine/OptUtils.h"  // from @llvm-project
+#include "mlir/IR/AffineExpr.h"  // from @llvm-project
 #include "mlir/IR/Attributes.h"  // from @llvm-project
 #include "mlir/IR/Builders.h"  // from @llvm-project
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
@@ -68,6 +74,7 @@ limitations under the License.
 #include "mlir/IR/DialectRegistry.h"  // from @llvm-project
 #include "mlir/IR/ImplicitLocOpBuilder.h"  // from @llvm-project
 #include "mlir/IR/Location.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/IR/OwningOpRef.h"  // from @llvm-project
 #include "mlir/IR/PatternMatch.h"  // from @llvm-project
 #include "mlir/IR/TypeUtilities.h"  // from @llvm-project
@@ -93,17 +100,22 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/utils/hlo_query.h"
+#include "xla/layout_util.h"
 #include "xla/literal.h"
 #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
 #include "xla/mlir_hlo/mhlo/transforms/map_mhlo_to_scalar_op.h"
 #include "xla/primitive_util.h"
 #include "xla/service/algorithm_util.h"
 #include "xla/service/dump.h"
+#include "xla/service/gpu/fusions/mlir/elemental_hlo_to_mlir.h"
 #include "xla/service/gpu/hlo_traversal.h"
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/launch_dimensions.h"
 #include "xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
 #include "xla/service/gpu/matmul_utils.h"
+#include "xla/service/gpu/model/indexing_analysis.h"
+#include "xla/service/gpu/model/indexing_map.h"
+#include "xla/service/gpu/model/symbolic_tile_analysis.h"
 #include "xla/service/gpu/target_util.h"
 #include "xla/service/gpu/triton_fusion_analysis.h"
 #include "xla/service/gpu/triton_tiling_propagation.h"
@@ -112,6 +124,7 @@ limitations under the License.
 #include "xla/shape_util.h"
 #include "xla/status.h"
 #include "xla/status_macros.h"
+#include "xla/statusor.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/launch_dim.h"
 #include "xla/translate/hlo_to_mhlo/hlo_function_importer.h"
@@ -258,7 +271,10 @@ Value Cast(ImplicitLocOpBuilder& b, Value value, Type dst_element_ty) {
     return Cast(b, b.create<ma::ExtFOp>(fp32_ty, value), dst_element_ty);
   }
   if (dst_element_ty.isBF16()) {
-    return b.create<ma::TruncFOp>(dst_ty, Cast(b, value, b.getF32Type()));
+    // S8 -> BF16 is directly supported and doesn't need to go through f32.
+    if (!src_element_ty.isInteger(8)) {
+      return b.create<ma::TruncFOp>(dst_ty, Cast(b, value, b.getF32Type()));
+    }
   }
 
   // float => float
@@ -335,8 +351,7 @@ Value Compare(ImplicitLocOpBuilder& b, ValueRange values,
 
 Value Maximum(ImplicitLocOpBuilder& b, const se::DeviceDescription& device_info,
               ValueRange values) {
-  if (mlir::getElementTypeOrSelf(values[0]).isa<mlir::FloatType>() &&
-      device_info.cuda_compute_capability().IsAtLeastAmpere()) {
+  if (mlir::getElementTypeOrSelf(values[0]).isa<mlir::FloatType>()) {
     return b.create<ma::MaximumFOp>(values);
   }
   // logic: isNaN(lhs) || (!isNan(rhs) && lhs >= rhs) ? lhs : rhs
@@ -357,8 +372,7 @@ Value Maximum(ImplicitLocOpBuilder& b, const se::DeviceDescription& device_info,
 
 Value Minimum(ImplicitLocOpBuilder& b, const se::DeviceDescription& device_info,
               ValueRange values) {
-  if (mlir::getElementTypeOrSelf(values[0]).isa<mlir::FloatType>() &&
-      device_info.cuda_compute_capability().IsAtLeastAmpere()) {
+  if (mlir::getElementTypeOrSelf(values[0]).isa<mlir::FloatType>()) {
     return b.create<ma::MinimumFOp>(values);
   }
   // logic: isNaN(lhs) || (!isNan(rhs) && lhs <= rhs) ? lhs : rhs
@@ -401,9 +415,11 @@ Value AddPtr(ImplicitLocOpBuilder& b, Value ptr, Value offset) {
   return b.create<mt::AddPtrOp>(ptr.getType(), ptr, offset);
 }
 
-Value EmitElementwise(ImplicitLocOpBuilder& b, absl::string_view libdevice_path,
-                      const se::DeviceDescription& device_info,
-                      const HloInstruction& hlo, ValueRange inputs) {
+absl::StatusOr<Value> EmitElementwise(ImplicitLocOpBuilder& b,
+                                      absl::string_view libdevice_path,
+                                      const se::DeviceDescription& device_info,
+                                      const HloInstruction& hlo,
+                                      ValueRange inputs) {
   if (mlir::getElementTypeOrSelf(inputs[0]).isF32() ||
       mlir::getElementTypeOrSelf(inputs[0]).isF64()) {
     auto dev_fn_id = GetTargetDeviceFunctionID(hlo.opcode());
@@ -475,7 +491,8 @@ Value EmitElementwise(ImplicitLocOpBuilder& b, absl::string_view libdevice_path,
                   mlir::mhlo::ComparisonDirection::NE),
           inputs[1], inputs[2]);
     default:
-      LOG(FATAL) << "Unsupported operation " << hlo.ToString();
+      return absl::InvalidArgumentError(
+          absl::StrCat("Unsupported elementwise operation ", hlo.ToString()));
   }
 }
 
@@ -528,12 +545,12 @@ struct DimProperties {
   int split_value;
 };
 
-Value EmitBroadcast(ImplicitLocOpBuilder& b,
-                    const TritonFusionAnalysis* analysis,
-                    TritonFusionAnalysis::Scope scope,
-                    absl::Span<const DimProperties> tiled_dimensions,
-                    const HloInstruction& broadcast, Value input) {
-  CHECK(analysis != nullptr);
+absl::StatusOr<Value> EmitBroadcast(
+    ImplicitLocOpBuilder& b, const TritonFusionAnalysis* analysis,
+    TritonFusionAnalysis::Scope scope,
+    absl::Span<const DimProperties> tiled_dimensions,
+    const HloInstruction& broadcast, Value input) {
+  TF_RET_CHECK(analysis != nullptr);
   std::vector<int64_t> out_shape;
   for (const DimProperties& dim : tiled_dimensions) {
     const TensorIterationSpec::DimIterationSpec* spec =
@@ -586,12 +603,13 @@ absl::StatusOr<Value> EmitReduce(ImplicitLocOpBuilder& b,
 
   // At the moment, we should only emit a full reduction over the last axis of
   // a single input.
-  CHECK_EQ(hlo_reduce.operand_count(), 2);
-  CHECK_EQ(hlo_reduce.dimensions().size(), 1);
-  CHECK_EQ(hlo_reduce.dimensions(0), hlo_reduce.operand(0)->shape().rank() - 1);
+  TF_RET_CHECK(hlo_reduce.operand_count() == 2);
+  TF_RET_CHECK(hlo_reduce.dimensions().size() == 1);
+  TF_RET_CHECK(hlo_reduce.dimensions(0) ==
+               hlo_reduce.operand(0)->shape().rank() - 1);
   const int block_row = input_shape.back();
   const int row_len = hlo_reduce.operand(0)->shape().dimensions_minor(0);
-  CHECK_GE(block_row, row_len);
+  TF_RET_CHECK(block_row >= row_len);
 
   const HloInstruction* operand = hlo_reduce.operand(1);
   Value neutral;
@@ -599,14 +617,14 @@ absl::StatusOr<Value> EmitReduce(ImplicitLocOpBuilder& b,
   // We assume that the reduction value was input as a constant, or in the case
   // of a data type affected by float normalization, a convert of a constant.
   if (operand->opcode() == HloOpcode::kConvert) {
-    CHECK_EQ(operand->operand(0)->opcode(), HloOpcode::kConstant);
-    CHECK_EQ(operand->operand(0)->shape().element_type(), BF16);
+    TF_RET_CHECK(operand->operand(0)->opcode() == HloOpcode::kConstant);
+    TF_RET_CHECK(operand->operand(0)->shape().element_type() == BF16);
     PrimitiveType dest_ty = operand->shape().element_type();
-    CHECK_EQ(dest_ty, F32);
+    TF_RET_CHECK(dest_ty == F32);
     neutral = EmitConstant(b, *operand->operand(0));
     neutral = Cast(b, neutral, TritonType(b, dest_ty));
   } else {
-    CHECK_EQ(operand->opcode(), HloOpcode::kConstant);
+    TF_RET_CHECK(operand->opcode() == HloOpcode::kConstant);
     neutral = EmitConstant(b, *operand);
   }
 
@@ -645,16 +663,17 @@ absl::StatusOr<Value> EmitReduce(ImplicitLocOpBuilder& b,
          reduction_computation->MakeInstructionPostOrder()) {
       if (instr->opcode() == HloOpcode::kParameter) {
         int parameter_number = instr->parameter_number();
-        CHECK_LT(parameter_number, 2);
-        CHECK(region_values
-                  .insert({instr, reducer->getArgument(parameter_number)})
-                  .second);
+        TF_RET_CHECK(parameter_number < 2);
+        TF_RET_CHECK(
+            region_values
+                .insert({instr, reducer->getArgument(parameter_number)})
+                .second);
       } else {
         to_emit.push_back(instr);
       }
     }
 
-    CHECK(!to_emit.empty());
+    TF_RET_CHECK(!to_emit.empty());
 
     b.setInsertionPointToStart(reducer);
     TF_ASSIGN_OR_RETURN(
@@ -678,6 +697,176 @@ absl::StatusOr<Value> EmitReduce(ImplicitLocOpBuilder& b,
   return Cast(b, result, TritonType(b, hlo_reduce.shape().element_type()));
 }
 
+// Emit code corresponding to a fusion instruction somehow nested within the
+// initial Triton fusion. This can happen when we carry around auxiliary
+// computations, e.g. with reduces. Since we are emitting a single Triton
+// fusion, we simply flatten the fusion inside the computation.
+//
+// TODO(b/331413981): get rid of this special handling once this is solved.
+absl::StatusOr<Value> EmitNestedFusion(
+    ImplicitLocOpBuilder& b, absl::string_view libdevice_path,
+    const se::DeviceDescription& device_info,
+    const HloFusionInstruction& fusion_instruction,
+    absl::flat_hash_map<const HloInstruction*, Value>& values) {
+  // TODO(b/331402498): revisit the order of scope once we completely deprecate
+  // Triton fusion analysis.
+  const HloComputation* fusion_computation =
+      fusion_instruction.fused_instructions_computation();
+
+  absl::flat_hash_map<const HloInstruction*, Value> region_values;
+
+  std::vector<const HloInstruction*> to_emit;
+  for (const HloInstruction* instr :
+       fusion_computation->MakeInstructionPostOrder()) {
+    if (instr->opcode() == HloOpcode::kParameter) {
+      int64_t parameter_number = instr->parameter_number();
+      auto it = values.find(fusion_instruction.operand(parameter_number));
+      TF_RET_CHECK(it != values.end());
+      TF_RET_CHECK(region_values.insert({instr, it->second}).second);
+    } else {
+      to_emit.push_back(instr);
+    }
+  }
+
+  TF_RET_CHECK(to_emit.back() == fusion_computation->root_instruction());
+
+  return EmitScope(b, libdevice_path, device_info, /*analysis=*/nullptr,
+                   TritonFusionAnalysis::Scope::OUTPUT, {}, to_emit,
+                   region_values);
+}
+
+// TODO(b/331332678): Add unit tests to target this function specifically.
+Value EmitTiledBroadcast(
+    ImplicitLocOpBuilder& b, const SymbolicTileAnalysis& analysis,
+    const TiledHloInstruction& tiled_broadcast,
+    absl::flat_hash_map<const TiledHloInstruction*, Value>& values) {
+  auto input_tile_shape = analysis.TileSizes(*tiled_broadcast.operands[0]);
+  auto output_tile_shape = analysis.TileSizes(tiled_broadcast);
+
+  Value expanded_input = values[tiled_broadcast.operands[0]];
+
+  // Returns true if `dim_id` is broadcasted.
+  auto is_broadcasted_dim = [&](int64_t dim_id) {
+    return !llvm::is_contained(tiled_broadcast.hlo->dimensions(), dim_id);
+  };
+
+  // The loop below iterates over output dimensions and tracks matching dims in
+  // input_tile_shape and expended_input value.
+  // `input_dim_id != expanded_input_dim_id`, because size-1 dims are present in
+  // the input tile shape, but not in the MLIR value. Triton doesn't like size-1
+  // dims, so they are inserted only for dimensions that will be broadcasted.
+  int64_t input_dim_id = 0;
+  int64_t expanded_input_dim_id = 0;
+  for (size_t output_dim_id = 0; output_dim_id < output_tile_shape.size();
+       ++output_dim_id) {
+    if (is_broadcasted_dim(output_dim_id)) {
+      // The dim is broadcasted in the original instruction, but tiled to 1 in
+      // this case. Nothing to broadcast.
+      if (output_tile_shape[output_dim_id] == 1) continue;
+
+      // Expand dim for broadcast.
+      expanded_input =
+          b.create<mt::ExpandDimsOp>(expanded_input, expanded_input_dim_id);
+      ++expanded_input_dim_id;
+    } else {
+      // The dim is not broadcasted. Validate that it's equal in the input and
+      // output tile.
+      CHECK_EQ(input_tile_shape[input_dim_id],
+               output_tile_shape[output_dim_id]);
+      ++input_dim_id;
+
+      // Size-1 dims are not present in the tensor type.
+      if (output_tile_shape[output_dim_id] != 1) {
+        ++expanded_input_dim_id;
+      }
+    }
+  }
+
+  SmallVector<int64_t> padded_output_tile_shape;
+  padded_output_tile_shape.reserve(output_tile_shape.size());
+
+  for (int64_t tile_dim : output_tile_shape) {
+    if (tile_dim != 1) {
+      padded_output_tile_shape.push_back(llvm::PowerOf2Ceil(tile_dim));
+    }
+  }
+
+  return Broadcast(b, expanded_input.cast<TensorValue>(),
+                   padded_output_tile_shape);
+}
+
+absl::StatusOr<Value> EmitTiledHloInstruction(
+    ImplicitLocOpBuilder& b, absl::string_view libdevice_path,
+    const se::DeviceDescription& device_info,
+    const SymbolicTileAnalysis& analysis, const TiledHloInstruction& tiled_hlo,
+    std::function<absl::StatusOr<Value>(const TiledHloInstruction&)>
+        emit_param_load_fn,
+    absl::flat_hash_map<const TiledHloInstruction*, Value>& values) {
+  const HloInstruction* hlo = tiled_hlo.hlo;
+
+  if (hlo->opcode() == HloOpcode::kParameter) {
+    return emit_param_load_fn(tiled_hlo);
+  }
+
+  if (hlo->opcode() == HloOpcode::kConstant &&
+      ShapeUtil::IsEffectiveScalar(hlo->shape())) {
+    // Splat makes it a tensor to avoid type mismatches.
+    return Splat(b, EmitConstant(b, *hlo), {});
+  }
+
+  if (hlo->opcode() == HloOpcode::kBroadcast) {
+    return EmitTiledBroadcast(b, analysis, tiled_hlo, values);
+  }
+
+  if (hlo->opcode() == HloOpcode::kReduce) {
+    return EmitReduce(b, libdevice_path, device_info, *hlo,
+                      values[tiled_hlo.operands[0]]);
+  }
+
+  if (hlo->IsElementwise()) {
+    std::vector<Value> operands;
+    operands.reserve(hlo->operands().size());
+
+    for (const TiledHloInstruction* operand : tiled_hlo.operands) {
+      operands.push_back(values[operand]);
+    }
+    return EmitElementwise(b, libdevice_path, device_info, *hlo, operands);
+  }
+
+  if (hlo->opcode() == HloOpcode::kTranspose ||
+      hlo->opcode() == HloOpcode::kSlice || hlo->opcode() == HloOpcode::kPad) {
+    // All these are currently supported only as operations on indices
+    // which are pushed to loads and stores. No operations on tiles are
+    // performed here.
+    return values[tiled_hlo.operands[0]];
+  }
+
+  return absl::UnimplementedError(
+      absl::StrCat("Unsupported opcode: ", hlo->opcode()));
+}
+
+// Emit sequence of instructions using compatible tiling ordered producers
+// before consumers.
+absl::StatusOr<Value> EmitTiledScope(
+    ImplicitLocOpBuilder& b, absl::string_view libdevice_path,
+    const se::DeviceDescription& device_info,
+    const SymbolicTileAnalysis& analysis,
+    std::function<absl::StatusOr<Value>(const TiledHloInstruction&)>
+        emit_param_load_fn,
+    absl::flat_hash_map<const TiledHloInstruction*, Value>& values) {
+  for (const auto& tiled_hlo : analysis.GetTiledHloInstructions()) {
+    TF_ASSIGN_OR_RETURN(
+        Value result,
+        EmitTiledHloInstruction(b, libdevice_path, device_info, analysis,
+                                *tiled_hlo, emit_param_load_fn, values));
+    TF_RET_CHECK(values.insert({tiled_hlo.get(), result}).second)
+        << tiled_hlo->hlo->ToString();
+    VLOG(8) << "Emitted "
+            << tiled_hlo->hlo->ToString(HloPrintOptions::ShortParsable());
+  }
+  return values[analysis.GetRoot()];
+}
+
 // Emit sequence of instructions using compatible tiling ordered producers
 // before consumers.
 absl::StatusOr<Value> EmitScope(
@@ -703,18 +892,20 @@ absl::StatusOr<Value> EmitScope(
       // Splat makes it a tensor to avoid type mismatches.
       result = Splat(b, EmitConstant(b, *hlo), {});
     } else if (hlo->opcode() == HloOpcode::kBroadcast) {
-      result = EmitBroadcast(b, analysis, scope, tiled_dimensions, *hlo,
-                             values[hlo->operand(0)]);
+      TF_ASSIGN_OR_RETURN(
+          result, EmitBroadcast(b, analysis, scope, tiled_dimensions, *hlo,
+                                values[hlo->operand(0)]));
     } else if (hlo->opcode() == HloOpcode::kReduce) {
       TF_ASSIGN_OR_RETURN(result, EmitReduce(b, libdevice_path, device_info,
                                              *hlo, values[hlo->operand(0)]));
-    } else if (hlo->IsElementwise()) {
+    } else if (HloInstruction::IsOpElementwise(hlo->opcode())) {
       std::vector<Value> operands;
       operands.reserve(hlo->operands().size());
       for (const HloInstruction* operand : hlo->operands()) {
         operands.push_back(values[operand]);
       }
-      result = EmitElementwise(b, libdevice_path, device_info, *hlo, operands);
+      TF_ASSIGN_OR_RETURN(result, EmitElementwise(b, libdevice_path,
+                                                  device_info, *hlo, operands));
     } else if (hlo->opcode() == HloOpcode::kTuple) {
       TF_RET_CHECK(hlo->IsRoot()) << hlo->ToString();
     } else if (hlo->opcode() == HloOpcode::kBitcast ||
@@ -726,8 +917,14 @@ absl::StatusOr<Value> EmitScope(
       // which are pushed to loads and stores. No operations on tiles are
       // performed here.
       result = values[hlo->operand(0)];
+    } else if (hlo->opcode() == HloOpcode::kFusion) {
+      const auto* fusion_instruction = ::xla::Cast<HloFusionInstruction>(hlo);
+      TF_ASSIGN_OR_RETURN(result,
+                          EmitNestedFusion(b, libdevice_path, device_info,
+                                           *fusion_instruction, values));
     } else {
-      LOG(FATAL) << hlo->ToString();
+      return absl::InvalidArgumentError(
+          absl::StrCat("Unsupported operation ", hlo->ToString()));
     }
     TF_RET_CHECK(values.insert({hlo, result}).second) << hlo->ToString();
     VLOG(8) << "Emitted " << hlo->ToString(HloPrintOptions::ShortParsable());
@@ -775,10 +972,9 @@ absl::Status CreateTritonPipeline(
   pm.addPass(mt::gpu::createOptimizeDotOperandsPass());
   pm.addPass(mlir::createCSEPass());
 
-  if (cc.IsAtLeastAmpere()) {
-    pm.addPass(mt::gpu::createPipelinePass(config.num_stages, config.num_warps,
-                                           config.num_ctas, ccAsInt));
-  }
+  pm.addPass(mt::gpu::createPipelinePass(config.num_stages, config.num_warps,
+                                         config.num_ctas, ccAsInt));
+
   if (!cc.IsAtLeastHopper()) {
     pm.addPass(mt::gpu::createPrefetchPass());
   }
@@ -918,8 +1114,9 @@ const TensorIterationSpec::DimIterationSpec* GetLhsNoncontractingSplitSpec(
 //   split-K, batch, non-contracting LHS, non-contracting RHS,
 // where split-K and batch are optional.
 struct MatMulDims {
-  MatMulDims(const TritonGemmConfig& config, const HloDotInstruction& dot,
-             const TritonFusionAnalysis& analysis);
+  static absl::StatusOr<MatMulDims> Create(
+      const TritonGemmConfig& config, const HloDotInstruction& dot,
+      const TritonFusionAnalysis& analysis);
 
   std::optional<int> out_split_k_dim_idx = std::nullopt;
 
@@ -942,6 +1139,9 @@ struct MatMulDims {
   int64_t m;
   int64_t n;
   int64_t k;
+
+ private:
+  MatMulDims() = default;
 };
 
 // Structure for parameters relating to the MatMul launch grid.
@@ -957,85 +1157,90 @@ struct MatMulLaunchConfig {
   mt::ProgramIDDim noncontracting_program_id_dim;
 };
 
-MatMulDims::MatMulDims(const TritonGemmConfig& config,
-                       const HloDotInstruction& dot,
-                       const TritonFusionAnalysis& analysis) {
+/*static*/ absl::StatusOr<MatMulDims> MatMulDims::Create(
+    const TritonGemmConfig& config, const HloDotInstruction& dot,
+    const TritonFusionAnalysis& analysis) {
+  MatMulDims matmul_dims;
   if (config.split_k > 1) {
     // split-k is always the first logical dimension.
-    out_split_k_dim_idx = 0;
+    matmul_dims.out_split_k_dim_idx = 0;
   }
 
   int64_t num_split_k_dims = config.split_k > 1 ? 1 : 0;
   const auto& dims = dot.dot_dimension_numbers();
-  lhs_contracting_dim_idx = dims.lhs_contracting_dimensions(0);
-  lhs_noncontracting_dim_idx =
+  matmul_dims.lhs_contracting_dim_idx = dims.lhs_contracting_dimensions(0);
+  matmul_dims.lhs_noncontracting_dim_idx =
       GetNonContractingDims(dot.operand(0)->shape(),
                             dims.lhs_batch_dimensions(),
                             dims.lhs_contracting_dimensions())
           .value()[0];
-  rhs_contracting_dim_idx = dims.rhs_contracting_dimensions(0);
-  rhs_noncontracting_dim_idx =
+  matmul_dims.rhs_contracting_dim_idx = dims.rhs_contracting_dimensions(0);
+  matmul_dims.rhs_noncontracting_dim_idx =
       GetNonContractingDims(dot.operand(1)->shape(),
                             dims.rhs_batch_dimensions(),
                             dims.rhs_contracting_dimensions())
           .value()[0];
 
   if (dims.lhs_batch_dimensions_size() > num_split_k_dims) {
-    lhs_batch_dim_idx = *dims.lhs_batch_dimensions().rbegin();
-    rhs_batch_dim_idx = *dims.rhs_batch_dimensions().rbegin();
+    matmul_dims.lhs_batch_dim_idx = *dims.lhs_batch_dimensions().rbegin();
+    matmul_dims.rhs_batch_dim_idx = *dims.rhs_batch_dimensions().rbegin();
     // The batch dimension (if present) comes after the split-k dimension (if
     // present, otherwise it's the first dimension).
-    out_batch_dim_idx = num_split_k_dims;
+    matmul_dims.out_batch_dim_idx = num_split_k_dims;
   }
 
   // Logical output dimensions are always ordered as:
   //   split-K, batch, non-contracting LHS, non-contracting RHS,
   // where split-K and batch are optional.
-  out_rhs_noncontracting_dim_idx = dot.shape().rank() - 1;
-  out_lhs_noncontracting_dim_idx = dot.shape().rank() - 2;
+  matmul_dims.out_rhs_noncontracting_dim_idx = dot.shape().rank() - 1;
+  matmul_dims.out_lhs_noncontracting_dim_idx = dot.shape().rank() - 2;
 
   auto* root = dot.parent()->root_instruction();
-  n = analysis
-          .IterSpec(TritonFusionAnalysis::Scope::OUTPUT, root,
-                    out_rhs_noncontracting_dim_idx)
-          ->at(0)
-          .count;
+  auto iter_spec =
+      analysis.IterSpec(TritonFusionAnalysis::Scope::OUTPUT, root,
+                        matmul_dims.out_rhs_noncontracting_dim_idx);
+  TF_RET_CHECK(iter_spec != nullptr);
+  matmul_dims.n = iter_spec->at(0).count;
   // Contracting dimension length.
   if (config.split_k > 1 &&
       dot.operand(0)->operand(0)->opcode() == HloOpcode::kPad) {
     // Unpadded LHS shape:  [..., k, ...]
     // Padded LHS shape:    [..., padded_k, ...]
     // Bitcasted LHS shape: [..., split_k, padded_k / split_k, ...]
-    CHECK_EQ(dot.operand(0)->opcode(), HloOpcode::kBitcast);
+    TF_RET_CHECK(dot.operand(0)->opcode() == HloOpcode::kBitcast);
     const Shape& unpadded_lhs_shape =
         dot.operand(0)->operand(0)->operand(0)->shape();
-    k = unpadded_lhs_shape.dimensions(dims.lhs_contracting_dimensions(0) - 1);
+    matmul_dims.k =
+        unpadded_lhs_shape.dimensions(dims.lhs_contracting_dimensions(0) - 1);
   } else {
-    k = dot.operand(0)->shape().dimensions(dims.lhs_contracting_dimensions(0)) *
+    matmul_dims.k =
+        dot.operand(0)->shape().dimensions(dims.lhs_contracting_dimensions(0)) *
         config.split_k;
   }
 
-  auto* lhs_noncontracting_split_spec =
-      GetLhsNoncontractingSplitSpec(analysis, lhs_noncontracting_dim_idx);
+  auto* lhs_noncontracting_split_spec = GetLhsNoncontractingSplitSpec(
+      analysis, matmul_dims.lhs_noncontracting_dim_idx);
   if (lhs_noncontracting_split_spec != nullptr) {
     // Just the fastest-varying part of it if the dimension is split.
-    m = lhs_noncontracting_split_spec->at(0).count;
-    lhs_noncontracting_split = lhs_noncontracting_split_spec->at(1).count;
+    matmul_dims.m = lhs_noncontracting_split_spec->at(0).count;
+    matmul_dims.lhs_noncontracting_split =
+        lhs_noncontracting_split_spec->at(1).count;
   } else {
-    m = analysis
-            .IterSpec(TritonFusionAnalysis::Scope::OUTPUT, root,
-                      out_lhs_noncontracting_dim_idx)
-            ->at(0)
-            .count;
+    matmul_dims.m = analysis
+                        .IterSpec(TritonFusionAnalysis::Scope::OUTPUT, root,
+                                  matmul_dims.out_lhs_noncontracting_dim_idx)
+                        ->at(0)
+                        .count;
   }
 
   // For now split non-contracting and batch are not supported
   // simultaneously because they are implemented via same mechanism.
-  CHECK(
-      !(out_batch_dim_idx.has_value() && lhs_noncontracting_split.has_value()));
+  TF_RET_CHECK(!(matmul_dims.out_batch_dim_idx.has_value() &&
+                 matmul_dims.lhs_noncontracting_split.has_value()));
 
-  CHECK_GE(m, 1);
-  CHECK_GE(n, 1);
+  TF_RET_CHECK(matmul_dims.m >= 1);
+  TF_RET_CHECK(matmul_dims.n >= 1);
+  return std::move(matmul_dims);
 }
 
 MatMulLaunchConfig::MatMulLaunchConfig(const TritonGemmConfig& config,
@@ -1072,38 +1277,39 @@ MatMulLaunchConfig::MatMulLaunchConfig(const TritonGemmConfig& config,
   }
 }
 
-void ValidateMatMulConfig(const TritonGemmConfig& config,
-                          const HloDotInstruction& dot) {
-  CHECK_GE(config.split_k, 1);
-  CHECK_GE(config.block_m, 16);
-  CHECK_GE(config.block_k, 16);
-  CHECK_GE(config.block_n, 16);
+absl::Status ValidateMatMulConfig(const TritonGemmConfig& config,
+                                  const HloDotInstruction& dot) {
+  TF_RET_CHECK(config.split_k >= 1);
+  TF_RET_CHECK(config.block_m >= 16);
+  TF_RET_CHECK(config.block_k >= 16);
+  TF_RET_CHECK(config.block_n >= 16);
 
   const auto& dims = dot.dot_dimension_numbers();
   int num_batch_dims =
       dims.lhs_batch_dimensions_size() - (config.split_k > 1 ? 1 : 0);
-  CHECK_LE(num_batch_dims, 1);
+  TF_RET_CHECK(num_batch_dims <= 1);
   if (config.split_k > 1) {
     // Split-K dimension has to be the first batch one and have an index
     // just before the contracting one.
     const int lhs_split_k_dim_idx = dims.lhs_contracting_dimensions(0) - 1;
     const int rhs_split_k_dim_idx = dims.rhs_contracting_dimensions(0) - 1;
     // Size of this dimension has to match the split_k value.
-    CHECK_EQ(dims.lhs_batch_dimensions(0), lhs_split_k_dim_idx);
-    CHECK_EQ(dims.rhs_batch_dimensions(0), rhs_split_k_dim_idx);
-    CHECK_EQ(config.split_k,
-             dot.operand(0)->shape().dimensions(lhs_split_k_dim_idx));
-    CHECK_EQ(config.split_k,
-             dot.operand(1)->shape().dimensions(rhs_split_k_dim_idx));
+    TF_RET_CHECK(dims.lhs_batch_dimensions(0) == lhs_split_k_dim_idx);
+    TF_RET_CHECK(dims.rhs_batch_dimensions(0) == rhs_split_k_dim_idx);
+    TF_RET_CHECK(config.split_k ==
+                 dot.operand(0)->shape().dimensions(lhs_split_k_dim_idx));
+    TF_RET_CHECK(config.split_k ==
+                 dot.operand(1)->shape().dimensions(rhs_split_k_dim_idx));
   }
 
   // Rely on dot decomposer: there is just one contracting and one
   // non-contracting dimension on each side + batch ones optionally.
-  CHECK_EQ(dims.lhs_contracting_dimensions_size(), 1);
-  CHECK_EQ(dims.rhs_contracting_dimensions_size(), 1);
+  TF_RET_CHECK(dims.lhs_contracting_dimensions_size() == 1);
+  TF_RET_CHECK(dims.rhs_contracting_dimensions_size() == 1);
 
-  CHECK_EQ(dot.operand(0)->shape().rank(),
-           2 + (config.split_k > 1 ? 1 : 0) + num_batch_dims);
+  TF_RET_CHECK(dot.operand(0)->shape().rank() ==
+               2 + (config.split_k > 1 ? 1 : 0) + num_batch_dims);
+  return absl::OkStatus();
 }
 
 struct Side {
@@ -1121,9 +1327,9 @@ struct Side {
 // } else {
 //   return choices.back();
 // }
-Value EmitMultiSelect(ImplicitLocOpBuilder b, Value index, ValueRange limits,
-                      ValueRange choices) {
-  CHECK_EQ(choices.size() - 1, limits.size());
+absl::StatusOr<Value> EmitMultiSelect(ImplicitLocOpBuilder b, Value index,
+                                      ValueRange limits, ValueRange choices) {
+  TF_RET_CHECK(choices.size() - 1 == limits.size());
   Value result = choices[0];
   for (int i = 0; i < choices.size() - 1; ++i) {
     result = b.create<ma::SelectOp>(
@@ -1281,7 +1487,7 @@ class MatMulEmitterHelper {
         }
         LOG(FATAL) << "Missing dimension.";
       }();
-      CHECK_EQ(bases.size(), hlo->operand_count());
+      TF_RET_CHECK(bases.size() == hlo->operand_count());
 
       concat_boundaries.reserve(hlo->operand_count() - 1);
       for (int i = 0; i < hlo->operand_count() - 1; ++i) {
@@ -1298,16 +1504,16 @@ class MatMulEmitterHelper {
 
       concat_dim_pid_offset =
           b_.create<ma::MulIOp>(properties.pid, Cst32(properties.block_size));
-      base =
-          EmitMultiSelect(b_, concat_dim_pid_offset, concat_boundaries, bases);
+      TF_ASSIGN_OR_RETURN(base, EmitMultiSelect(b_, concat_dim_pid_offset,
+                                                concat_boundaries, bases));
     } else {
       concat_dim_idx = -1;
       base = bases[0];
     }
 
-    auto add_dim = [&](const DimProperties& properties) {
+    auto add_dim = [&](const DimProperties& properties) -> absl::Status {
       if (analysis_.IterSpec(side.scope, hlo, properties.index) == nullptr) {
-        return;
+        return absl::OkStatus();
       }
       Value pid_offset =
           (properties.pid == nullptr)
@@ -1337,13 +1543,19 @@ class MatMulEmitterHelper {
             pid_offset, Cst32(specs.back()->at(0).slice_start)));
         input_bounds.push_back(Cst64(specs.back()->at(0).count));
       }
-      strides.push_back(EmitMultiSelect(b_, concat_dim_pid_offset,
-                                        concat_boundaries, input_strides));
+      TF_ASSIGN_OR_RETURN(Value select_value,
+                          EmitMultiSelect(b_, concat_dim_pid_offset,
+                                          concat_boundaries, input_strides));
+      strides.push_back(select_value);
       if (properties.index == concat_dim_idx) {
-        block_offsets.push_back(
+        TF_ASSIGN_OR_RETURN(
+            select_value,
             EmitMultiSelect(b_, pid_offset, concat_boundaries, input_offsets));
-        bounds.push_back(
+        block_offsets.push_back(select_value);
+        TF_ASSIGN_OR_RETURN(
+            select_value,
             EmitMultiSelect(b_, pid_offset, concat_boundaries, input_bounds));
+        bounds.push_back(select_value);
       } else {
         block_offsets.push_back(pid_offset);
         int64_t count = specs.front()->at(0).count;
@@ -1363,10 +1575,11 @@ class MatMulEmitterHelper {
       tensor_offsets.push_back(Cst32(specs.front()->at(0).slice_start));
       block_dims.push_back(properties.block_size);
       dim_order.emplace(dim_order.begin(), dim_order.size());
+      return absl::OkStatus();
     };
 
     for (const DimProperties& dim : side.tiled_dims) {
-      add_dim(dim);
+      TF_RETURN_IF_ERROR(add_dim(dim));
     }
 
     int64_t offset_batch = 0;
@@ -1376,8 +1589,9 @@ class MatMulEmitterHelper {
     // Return the batch stride of the HLO passed as a parameter. If the
     // parameter HLO has no batch dimension, a zero stride is returned.
     // Also sets offset_batch and updates has_batch_offset as a side effect.
-    auto get_batch_stride = [this, &side, &offset_batch, &has_batch_offset](
-                                const HloInstruction* hlo_param) -> Value {
+    auto get_batch_stride =
+        [this, &side, &offset_batch, &has_batch_offset](
+            const HloInstruction* hlo_param) -> absl::StatusOr<Value> {
       int64_t stride_batch = 0;
       if (side.scope != TritonFusionAnalysis::Scope::RHS &&
           dims_.lhs_noncontracting_split) {
@@ -1396,7 +1610,7 @@ class MatMulEmitterHelper {
                 spec->at(0).stride *
                 (spec->at(0).count / *dims_.lhs_noncontracting_split);
           }
-          CHECK_NE(stride_batch, 0);
+          TF_RET_CHECK(stride_batch != 0);
         }
       } else if (side.batch_dim_idx.has_value()) {
         const TensorIterationSpec::DimIterationSpec* spec =
@@ -1404,7 +1618,7 @@ class MatMulEmitterHelper {
         if (spec != nullptr) {
           stride_batch = spec->at(0).stride;
           offset_batch = spec->at(0).slice_start;
-          CHECK_NE(stride_batch, 0);
+          TF_RET_CHECK(stride_batch != 0);
         }
       }
 
@@ -1416,12 +1630,14 @@ class MatMulEmitterHelper {
       std::vector<Value> batch_strides;
       batch_strides.reserve(hlo->operands().size());
       for (const HloInstruction* operand : hlo->operands()) {
-        batch_strides.push_back(get_batch_stride(operand));
+        TF_ASSIGN_OR_RETURN(Value op_stride, get_batch_stride(operand));
+        batch_strides.push_back(op_stride);
       }
-      batch_stride = EmitMultiSelect(b_, concat_dim_pid_offset,
-                                     concat_boundaries, batch_strides);
+      TF_ASSIGN_OR_RETURN(batch_stride,
+                          EmitMultiSelect(b_, concat_dim_pid_offset,
+                                          concat_boundaries, batch_strides));
     } else {
-      batch_stride = get_batch_stride(hlo);
+      TF_ASSIGN_OR_RETURN(batch_stride, get_batch_stride(hlo));
     }
 
     // Avoid generating logic to compute batch offset if unnecessary.
@@ -1438,7 +1654,7 @@ class MatMulEmitterHelper {
       const TensorIterationSpec::DimIterationSpec* spec = analysis_.IterSpec(
           TritonFusionAnalysis::Scope::OUTPUT, hlo, *dims_.out_split_k_dim_idx);
       if (spec != nullptr) {
-        CHECK(pid_k != nullptr);
+        TF_RET_CHECK(pid_k != nullptr);
         base = AddPtr(b_, base,
                       b_.create<ma::MulIOp>(ConvertScalar(pid_k),
                                             Cst(spec->at(0).stride)));
@@ -1486,16 +1702,17 @@ class MatMulEmitterHelper {
 
 }  // namespace
 
-LaunchDimensions GetMatMulLaunchDimensions(const TritonFusionAnalysis& analysis,
-                                           const HloFusionAdaptor& fusion,
-                                           const TritonGemmConfig& config) {
+absl::StatusOr<LaunchDimensions> GetMatMulLaunchDimensions(
+    const TritonFusionAnalysis& analysis, const HloFusionAdaptor& fusion,
+    const TritonGemmConfig& config) {
   auto dot = HloFindIf(fusion.GetRoots(), fusion, [](auto node) {
     return node.opcode() == HloOpcode::kDot;
   });
-  CHECK(dot != std::nullopt);
+  TF_RET_CHECK(dot != std::nullopt);
   const auto& dot_instr =
       *static_cast<const HloDotInstruction*>(&dot->instruction());
-  MatMulDims dims(config, dot_instr, analysis);
+  TF_ASSIGN_OR_RETURN(MatMulDims dims,
+                      MatMulDims::Create(config, dot_instr, analysis));
   MatMulLaunchConfig launch_config(config, dot_instr, dims);
   return launch_config.launch_dims;
 }
@@ -1692,9 +1909,7 @@ bool Is6xBfloat16MatMul(const HloDotInstruction* dot_instr,
   if (algorithm == PrecisionConfig::ALG_UNSET) {
     const HloModule* hlo_module = dot_instr->GetModule();
     Type f32 = builder.getF32Type();
-    // BF16 datatype is not supported before Ampere.
-    return device_info.cuda_compute_capability().IsAtLeastAmpere() &&
-           hlo_module->config()
+    return hlo_module->config()
                .debug_options()
                .xla_gpu_enable_bf16_6way_gemm() &&
            dot_input_lhs.getType().cast<ShapedType>().getElementType() == f32 &&
@@ -1714,9 +1929,7 @@ bool Is3xBfloat16MatMul(const HloDotInstruction* dot_instr,
   if (algorithm == PrecisionConfig::ALG_UNSET) {
     const HloModule* hlo_module = dot_instr->GetModule();
     Type f32 = builder.getF32Type();
-    // BF16 datatype is not supported before Ampere.
-    return device_info.cuda_compute_capability().IsAtLeastAmpere() &&
-           hlo_module->config()
+    return hlo_module->config()
                .debug_options()
                .xla_gpu_enable_bf16_3way_gemm() &&
            dot_input_lhs.getType().cast<ShapedType>().getElementType() == f32 &&
@@ -1726,6 +1939,43 @@ bool Is3xBfloat16MatMul(const HloDotInstruction* dot_instr,
   return algorithm == PrecisionConfig::ALG_DOT_BF16_BF16_F32_X3;
 }
 
+// This is a heuristic that serves as a proxy for register usage and code size.
+//
+// We have noticed that tilings with very long LLVM IR code are both slow to
+// compile and slow to run. This can be for example due to register spills. So
+// we should skip these tilings to save time. But it's better to skip them
+// before the LLVM IR is generated. To do that, we came up with a formula that
+// strongly correlates with the LLVM IR size. The formula is the size of the two
+// input and the output thread block tiles divided by the number of warps. We
+// read https://developer.nvidia.com/blog/cutlass-linear-algebra-cuda/ as a
+// reference, and found the formula by trial and error.
+//
+// To regenerate the limit, we have to run an exhaustive search on all tilings
+// for a few different HLOs, printing the runtimes and the heuristic values.
+//
+// From that, we can find a limit, such that all tilings within alpha *
+// optimal_runtime have a heuristic value less than or equal to the limit.
+//
+// In our measurements, all tilings which were within 1.13 * optimal_runtime had
+// a complexity_heuristic_value <= kComplexityHeuristicLimit.
+//
+// See go/tiling-heuristic for more details.
+absl::Status CheckGemmTilingComplexityHeuristic(
+    const TritonGemmConfig& config) {
+  constexpr int64_t kComplexityHeuristicLimit = 9000;
+  int64_t complexity_heuristic_value =
+      (config.block_m * config.block_n +
+       (config.block_m + config.block_n) * config.block_k) /
+      config.num_warps;
+  VLOG(2) << "Complexity heuristic: " << complexity_heuristic_value;
+  if (complexity_heuristic_value > kComplexityHeuristicLimit) {
+    return ResourceExhausted("Tiling complexity heuristic exceeded: %d > %d",
+                             complexity_heuristic_value,
+                             kComplexityHeuristicLimit);
+  }
+  return absl::OkStatus();
+}
+
 }  // namespace
 
 // Variable naming: lhs [m, k] x rhs [k, n] -> out [m, n].
@@ -1736,9 +1986,11 @@ absl::Status EmitMatMul(mlir::OpBuilder builder,
                         const HloComputation* computation,
                         mlir::triton::FuncOp fn,
                         const TritonGemmConfig& config) {
+  TF_RETURN_IF_ERROR(CheckGemmTilingComplexityHeuristic(config));
+
   const HloDotInstruction* dot_instr = DynCast<HloDotInstruction>(
       hlo_query::GetFirstInstructionWithOpcode(*computation, HloOpcode::kDot));
-  CHECK(!dot_instr->sparse_operands());
+  TF_RET_CHECK(!dot_instr->sparse_operands());
   // Use 32-bit indexing if addressing any of the inputs or the output (which
   // could grow if split_k is set) does not cross the INT_MAX boundary.
   // Otherwise, fall back to 64-bit indexing, which is slower.
@@ -1749,7 +2001,7 @@ absl::Status EmitMatMul(mlir::OpBuilder builder,
   Type index_ty = builder.getIntegerType(use_64bit_indexing ? 64 : 32);
 
   const HloInstruction* root = dot_instr->parent()->root_instruction();
-  CHECK(!root->shape().IsTuple());
+  TF_RET_CHECK(!root->shape().IsTuple());
 
   // We'll be creating a lot of instructions from a single dot, use an
   // implicit loc builder so we don't have to pass around the location all the
@@ -1758,13 +2010,14 @@ absl::Status EmitMatMul(mlir::OpBuilder builder,
   ImplicitLocOpBuilder b(loc, builder);
   Type i32_ty = b.getI32Type();
 
-  ValidateMatMulConfig(config, *dot_instr);
+  TF_RETURN_IF_ERROR(ValidateMatMulConfig(config, *dot_instr));
   const int split_k = config.split_k;
   const int block_m = config.block_m;
   const int block_k = config.block_k;
   const int block_n = config.block_n;
 
-  const MatMulDims dims(config, *dot_instr, analysis);
+  TF_ASSIGN_OR_RETURN(const MatMulDims dims,
+                      MatMulDims::Create(config, *dot_instr, analysis));
   const MatMulLaunchConfig launch_config(config, *dot_instr, dims);
   VLOG(6) << analysis.ToString();
 
@@ -1833,6 +2086,8 @@ absl::Status EmitMatMul(mlir::OpBuilder builder,
     iter_args_next.reserve(iter_args.size());
     absl::flat_hash_map<const HloInstruction*, Value> values_lhs;
     absl::flat_hash_map<const HloInstruction*, Value> values_rhs;
+    bool has_8_bit_input = false;
+
     // Load tiles of all parameters of LHS and RHS scopes and advance pointers.
     for (int i = 0; i < iter_args.size() - 1; ++i) {
       const bool is_lhs =
@@ -1850,6 +2105,10 @@ absl::Status EmitMatMul(mlir::OpBuilder builder,
         param_value = Cast(b, param_value, param_ty);
       }
 
+      if (param_ty.getIntOrFloatBitWidth() <= 8) {
+        has_8_bit_input = true;
+      }
+
       CHECK(values.insert({param_hlo, param_value}).second);
       SmallVector<Value> increments;
       for (const DimProperties& dim : side.tiled_dims) {
@@ -1911,6 +2170,10 @@ absl::Status EmitMatMul(mlir::OpBuilder builder,
       dot_input_rhs = apply_mask(1, dot_input_rhs);
     }
 
+    // TODO(b/320659359) Allow TF32 for 8-bit types with F32.
+    bool has_convert_8_bit_to_f32 =
+        has_8_bit_input && getElementTypeOrSelf(dot_input_lhs).isF32();
+
     const HloModule* hlo_module = dot_instr->GetModule();
     if (hlo_module->config().debug_options().xla_gpu_enable_bf16_3way_gemm() &&
         hlo_module->config().debug_options().xla_gpu_enable_bf16_6way_gemm()) {
@@ -1921,7 +2184,6 @@ absl::Status EmitMatMul(mlir::OpBuilder builder,
     Value accumulator_next;
     if (Is6xBfloat16MatMul(dot_instr, b, dot_input_lhs, dot_input_rhs,
                            device_info)) {
-      CHECK(device_info.cuda_compute_capability().IsAtLeastAmpere());
       absl::StatusOr<Value> accumulator_next_or = Emit6xBfloat16MatMul(
           b, dot_input_lhs, dot_input_rhs, iter_args.back());
       TF_CHECK_OK(accumulator_next_or.status());
@@ -1938,10 +2200,10 @@ absl::Status EmitMatMul(mlir::OpBuilder builder,
       // maxNumImpreciseAcc flag was introduced for Hopper to accumulate in a
       // lower precision than the output type. The change was introduced here:
       // https://github.com/openai/triton/commit/31b0c521427109a8eda609b58d756c380b21599a
-      accumulator_next =
-          b.create<mt::DotOp>(dot_input_lhs, dot_input_rhs, iter_args.back(),
-                              /*allowTF32=*/IsTf32Allowed(dot_instr),
-                              /*maxNumImpreciseAcc=*/0);
+      accumulator_next = b.create<mt::DotOp>(
+          dot_input_lhs, dot_input_rhs, iter_args.back(),
+          /*allowTF32=*/IsTf32Allowed(dot_instr) && !has_convert_8_bit_to_f32,
+          /*maxNumImpreciseAcc=*/0);
     }
     iter_args_next.push_back(accumulator_next);
 
@@ -1957,7 +2219,8 @@ absl::Status EmitMatMul(mlir::OpBuilder builder,
 
   for (const Side& side : {lhs, rhs}) {
     for (const HloInstruction* input : ScopeInputs(analysis, side.scope)) {
-      CHECK(iter_args_to_inputs.insert({iter_args.size(), input}).second);
+      TF_RET_CHECK(
+          iter_args_to_inputs.insert({iter_args.size(), input}).second);
       TF_ASSIGN_OR_RETURN(Value tensor_ptr,
                           emitter.EmitTensorPointer(
                               input, side, GetArguments(fn, *input), pid_k,
@@ -1988,10 +2251,10 @@ absl::Status EmitMatMul(mlir::OpBuilder builder,
           Value tensor_pointer,
           emitter.EmitTensorPointer(input, out, GetArguments(fn, *input), pid_k,
                                     boundary_checks));
-      CHECK(values_out
-                .insert({input,
-                         EmitParameterLoad(b, tensor_pointer, boundary_checks)})
-                .second);
+      TF_RET_CHECK(values_out
+                       .insert({input, EmitParameterLoad(b, tensor_pointer,
+                                                         boundary_checks)})
+                       .second);
     }
     TF_RETURN_IF_ERROR(EmitScope(b, libdevice_path, device_info, &analysis,
                                  TritonFusionAnalysis::Scope::OUTPUT,
@@ -2017,6 +2280,216 @@ absl::Status EmitMatMul(mlir::OpBuilder builder,
   return absl::OkStatus();
 }
 
+// Computes indexing map from program id into the tile offset for the given
+// shape and tile sizes.
+IndexingMap ComputeProgramIdToOutputTileIndexing(
+    absl::Span<const int64_t> dimensions, absl::Span<const int64_t> tile_sizes,
+    mlir::MLIRContext* mlir_context) {
+  CHECK_EQ(dimensions.size(), tile_sizes.size());
+
+  int num_tiles = 1;
+  std::vector<int64_t> outer_loop_bounds;
+  outer_loop_bounds.reserve(dimensions.size());
+  for (auto [dim_size, tile_size] : llvm::zip(dimensions, tile_sizes)) {
+    int num_tiles_per_dim = (dim_size + tile_size - 1) / tile_size;
+
+    num_tiles *= num_tiles_per_dim;
+    outer_loop_bounds.push_back(num_tiles_per_dim);
+  }
+
+  mlir::AffineExpr program_id = mlir::getAffineDimExpr(0, mlir_context);
+
+  // Delinearize the block id.
+  auto tile_exprs =
+      DelinearizeIndex(outer_loop_bounds, program_id, mlir_context);
+
+  // Scale each index by the tile size to produce tile offset.
+  for (auto [tile_expr, tile_size] : llvm::zip(tile_exprs, tile_sizes)) {
+    tile_expr = tile_expr * tile_size;
+  }
+
+  return IndexingMap::FromTensorSizes(
+      mlir::AffineMap::get(
+          /*dimCount=*/1, /*symbolCount=*/0, tile_exprs, mlir_context),
+      /*dim_upper_bounds=*/{num_tiles}, /*symbol_upper_bounds=*/{});
+}
+
+// Computes the base pointer offset for the given pid and shape.
+// `tile_offset_indexing` is a mapping from
+// (program_id) -> [tile_offset0, ..., tile_offsetN]
+StatusOr<Value> ComputeBasePtrOffset(ImplicitLocOpBuilder b, Value pid,
+                                     const Shape& shape,
+                                     const IndexingMap& tile_offset_indexing) {
+  ArrayRef<mlir::AffineExpr> dimension_exprs =
+      tile_offset_indexing.GetAffineMap().getResults();
+
+  mlir::AffineExpr linear_index =
+      mlir::getAffineConstantExpr(0, b.getContext());
+  int64_t stride = 1;
+  for (int i : shape.layout().minor_to_major()) {
+    linear_index = linear_index + dimension_exprs[i] * stride;
+    stride *= shape.dimensions(i);
+  }
+
+  // A symbol in an indexing map means that to produce on element of output, we
+  // need to read all elements of input in the symbol range. Since this function
+  // computes start of the tile, we need to substitute each symbol with its
+  // lower bound value. We assume here the iteration order is normalized.
+  // TODO(b/330906085): Support cases when tile offsets are not 0.
+  for (const Interval& symbol_bound : tile_offset_indexing.GetSymbolBounds()) {
+    if (symbol_bound.lower != 0) {
+      return absl::FailedPreconditionError(absl::StrCat(
+          "Symbol lower bound is not zero. ", tile_offset_indexing.ToString()));
+    }
+  }
+
+  std::vector<Value> symbol_lower_bounds(
+      tile_offset_indexing.GetSymbolCount(),
+      b.create<ma::ConstantOp>(b.getIndexAttr(0)));
+
+  return b.create<ma::IndexCastUIOp>(
+      b.getI64Type(), mlir_converter::ApplyAffineExpr(linear_index, pid,
+                                                      symbol_lower_bounds, b));
+}
+
+absl::Status EmitTiledSoftMax(mlir::OpBuilder builder,
+                              absl::string_view libdevice_path,
+                              const se::DeviceDescription& device_info,
+                              SymbolicTileAnalysis* analysis,
+                              const HloComputation* computation,
+                              mlir::triton::FuncOp fn) {
+  mlir::MLIRContext* mlir_context = analysis->GetMLIRContext();
+
+  const HloInstruction* root = computation->root_instruction();
+  auto loc = mlir::NameLoc::get(builder.getStringAttr(root->name()));
+  ImplicitLocOpBuilder b(loc, builder);
+
+  // Assumptions we make about the matcher:
+  //   * matches Softmax "diamonds" on the last axis, along with any number of
+  //     elementwise operations/bitcasts on any edge
+  //   * within a given fusion, every argument to a Softmax diamond has the same
+  //     shape
+  //   * every reduction is on the last axis
+  //   * the last axis of every reduction parameter has the same length
+  //   * reductions only reduce a single operand
+  //   * all the shapes have canonical layout (logical layout = physical layout)
+  //   * the computation has a single output
+  //   * we tile along a single dimension
+
+  const HloInstruction* reduce = hlo_query::GetFirstInstructionWithOpcode(
+      *computation, HloOpcode::kReduce);
+
+  if (reduce == nullptr) {
+    return absl::InvalidArgumentError("No reduce instruction found.");
+  }
+
+  const Shape& reduce_input_shape = reduce->operand(0)->shape();
+
+  if (reduce->dimensions().size() != 1 ||
+      reduce->dimensions(0) != reduce_input_shape.rank() - 1) {
+    return absl::InvalidArgumentError(
+        absl::StrCat("Reduce instruction must reduce inner-most dimension. ",
+                     reduce->ToString()));
+  }
+
+  const Shape& root_shape = computation->root_instruction()->shape();
+  if (!root_shape.IsArray() ||
+      LayoutUtil::IsMonotonicWithDim0Minor(root_shape.layout())) {
+    return absl::InvalidArgumentError(
+        absl::StrCat("Root shape is not supported. ", root_shape.ToString()));
+  }
+
+  int row_len = reduce_input_shape.dimensions_minor(0);
+
+  Value pid = b.create<ma::IndexCastUIOp>(
+      b.getIndexType(), b.create<mt::GetProgramIdOp>(mt::ProgramIDDim::X));
+
+  std::vector<int64_t> output_tile_sizes(
+      computation->root_instruction()->shape().rank(), 1);
+  output_tile_sizes.back() = row_len;
+
+  analysis->SetTileSizes(output_tile_sizes);
+
+  IndexingMap program_id_to_output_tile_indexing =
+      ComputeProgramIdToOutputTileIndexing(root_shape.dimensions(),
+                                           output_tile_sizes, mlir_context);
+
+  // block_size must be a power of two.
+  int result_block_size = llvm::PowerOf2Ceil(row_len);
+
+  std::vector<int32_t> boundary_checks;
+  if (result_block_size != row_len) {
+    boundary_checks.push_back(0);
+  }
+
+  // Emits load instructions
+  auto emit_param_load = [&](const TiledHloInstruction& tiled_hlo_instruction)
+      -> absl::StatusOr<Value> {
+    std::vector<Value> tile_sizes, tile_strides, tile_offsets;
+    for (auto [size, stride, offset] :
+         llvm::zip(analysis->TileSizes(tiled_hlo_instruction),
+                   analysis->TileStrides(tiled_hlo_instruction),
+                   analysis->TileOffsets(tiled_hlo_instruction))) {
+      if (size == 1) continue;
+
+      tile_sizes.push_back(CreateConst(b, b.getI64Type(), size));
+      tile_strides.push_back(CreateConst(b, b.getI64Type(), stride));
+      tile_offsets.push_back(CreateConst(b, b.getI32Type(), offset));
+    }
+
+    IndexingMap program_id_to_input_tile_indexing = ComposeIndexingMaps(
+        program_id_to_output_tile_indexing, tiled_hlo_instruction.indexing_map);
+    program_id_to_input_tile_indexing.Simplify(GetIndexingMapForInstruction);
+
+    // Manually compute pointer offset to avoid materialized fully parallel
+    // dimensions in the tile. Current codegen tried to avoid size-1 dims.
+    TF_ASSIGN_OR_RETURN(
+        Value ptr_offset,
+        ComputeBasePtrOffset(b, pid, tiled_hlo_instruction.hlo->shape(),
+                             program_id_to_input_tile_indexing));
+
+    auto fn_arg = fn.getArgument(tiled_hlo_instruction.hlo->parameter_number());
+    auto tile_ptr = AddPtr(b, fn_arg, ptr_offset);
+
+    if (tile_sizes.empty()) {
+      return EmitParameterLoad(b, tile_ptr, boundary_checks);
+    }
+
+    Value emitted_tensor = b.create<mt::MakeTensorPtrOp>(
+        /*base=*/tile_ptr,
+        /*shape=*/tile_sizes,
+        /*strides=*/tile_strides,
+        /*offsets=*/tile_offsets,
+        /*tensorShape=*/std::vector<int32_t>{result_block_size},
+        /*order=*/std::vector<int32_t>{0});
+
+    return EmitParameterLoad(b, emitted_tensor, boundary_checks);
+  };
+
+  absl::flat_hash_map<const TiledHloInstruction*, Value> values_out;
+  TF_ASSIGN_OR_RETURN(Value result,
+                      EmitTiledScope(b, libdevice_path, device_info, *analysis,
+                                     emit_param_load, values_out));
+
+  TF_ASSIGN_OR_RETURN(Value ptr_offset,
+                      ComputeBasePtrOffset(b, pid, root_shape,
+                                           program_id_to_output_tile_indexing));
+
+  Value store_tensor = b.create<mt::MakeTensorPtrOp>(
+      /*base=*/AddPtr(b, fn.getArgument(computation->num_parameters()),
+                      ptr_offset),
+      /*shape=*/ValueRange{CreateConst(b, b.getI64Type(), row_len)},
+      /*strides=*/ValueRange{CreateConst(b, b.getI64Type(), 1)},
+      /*offsets=*/ValueRange{CreateConst(b, b.getI32Type(), 0)},
+      /*tensorShape=*/std::vector<int32_t>{result_block_size},
+      /*order=*/std::vector<int32_t>{0});
+
+  b.create<mt::StoreOp>(store_tensor, result, std::vector<int32_t>{0},
+                        mt::CacheModifier::NONE, mt::EvictionPolicy::NORMAL);
+
+  return absl::OkStatus();
+}
+
 absl::Status EmitSoftMax(mlir::OpBuilder builder,
                          absl::string_view libdevice_path,
                          const se::DeviceDescription& device_info,
@@ -2024,6 +2497,15 @@ absl::Status EmitSoftMax(mlir::OpBuilder builder,
                          const HloComputation* computation,
                          mlir::triton::FuncOp fn,
                          const TritonGemmConfig& config) {
+  SymbolicTileAnalysisOrError symbolic_tile_analysis_or =
+      SymbolicTileAnalysis::AnalyzeComputation(*computation,
+                                               builder.getContext());
+  if (auto* symbolic_tile_analysis =
+          std::get_if<SymbolicTileAnalysis>(&symbolic_tile_analysis_or)) {
+    return EmitTiledSoftMax(builder, libdevice_path, device_info,
+                            symbolic_tile_analysis, computation, fn);
+  }
+
   const HloInstruction* root = computation->root_instruction();
   auto loc = mlir::NameLoc::get(builder.getStringAttr(root->name()));
   ImplicitLocOpBuilder b(loc, builder);
@@ -2046,13 +2528,13 @@ absl::Status EmitSoftMax(mlir::OpBuilder builder,
   const HloInstruction* reduce = hlo_query::GetFirstInstructionWithOpcode(
       *computation, HloOpcode::kReduce);
 
-  CHECK_NE(reduce, nullptr);
+  TF_RET_CHECK(reduce != nullptr);
 
   Shape reduce_input_shape = reduce->operand(0)->shape();
 
-  CHECK_EQ(reduce->opcode(), HloOpcode::kReduce);
-  CHECK_EQ(reduce->dimensions().size(), 1);
-  CHECK_EQ(reduce->dimensions()[0], reduce_input_shape.rank() - 1);
+  TF_RET_CHECK(reduce->opcode() == HloOpcode::kReduce);
+  TF_RET_CHECK(reduce->dimensions().size() == 1);
+  TF_RET_CHECK(reduce->dimensions()[0] == reduce_input_shape.rank() - 1);
 
   int row_len = reduce_input_shape.dimensions_minor(0);
 
@@ -2086,9 +2568,8 @@ absl::Status EmitSoftMax(mlir::OpBuilder builder,
                                             param, /*dimension=*/1);
 
     // Make sure only batch and reduce dims are present in tiling
-    CHECK_EQ(analysis.IterSpec(TritonFusionAnalysis::Scope::OUTPUT, param,
-                               /*dimension=*/2),
-             nullptr);
+    TF_RET_CHECK(analysis.IterSpec(TritonFusionAnalysis::Scope::OUTPUT, param,
+                                   /*dimension=*/2) == nullptr);
 
     if (!reduce_iterspec) {
       // This parameter's broadcast is along the reduce dimension, and so
@@ -2104,8 +2585,8 @@ absl::Status EmitSoftMax(mlir::OpBuilder builder,
       continue;
     }
 
-    CHECK_NE(reduce_iterspec, nullptr);
-    CHECK_EQ(reduce_iterspec->size(), 1);
+    TF_RET_CHECK(reduce_iterspec != nullptr);
+    TF_RET_CHECK(reduce_iterspec->size() == 1);
 
     // TODO(b/310721908): The below assumes that we tile along a single dim.
     int reduce_dim_len = reduce_iterspec->front().count;
@@ -2119,13 +2600,13 @@ absl::Status EmitSoftMax(mlir::OpBuilder builder,
 
     // We assume that the reduced axis of this parameter has length row_len.
     // TODO(b/316637896): Relax assumption that param reduce_dim_len == row_len.
-    CHECK_EQ(reduce_dim_len, row_len);
+    TF_RET_CHECK(reduce_dim_len == row_len);
 
     // block_size must be a power of two.
     int block_size = pow(2, ceil(log(reduce_dim_len) / log(2)));
 
     // Verify that this param contains a single contiguous fragment.
-    CHECK_EQ(reduce_iterspec->front().subfragments.size(), 1);
+    TF_RET_CHECK(reduce_iterspec->front().subfragments.size() == 1);
 
     Value emitted_tensor = b.create<mt::MakeTensorPtrOp>(
         /*base=*/AddPtr(b, fn.getArgument(param_idx), base_offset),
@@ -2210,7 +2691,9 @@ absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> CreateTritonModule(
     const HloComputation* hlo_computation,
     const se::DeviceDescription& device_info, const TritonGemmConfig& config,
     TritonIrEmitter ir_emitter, mlir::MLIRContext& mlir_context) {
-  mlir_context.loadDialect<mt::TritonDialect>();
+  mlir_context.loadDialect<mt::TritonDialect, mlir::arith::ArithDialect,
+                           mlir::affine::AffineDialect>();
+
   mlir::OpBuilder b(&mlir_context);
   auto loc = mlir::NameLoc::get(b.getStringAttr(hlo_computation->name()));
   mlir::OwningOpRef<mlir::ModuleOp> triton_module =
@@ -2246,23 +2729,32 @@ absl::StatusOr<mlir::OwningOpRef<mlir::ModuleOp>> CreateTritonModule(
 
   b.create<mt::ReturnOp>(loc);
 
+  mlir::PassManager pm(&mlir_context);
+  pm.addPass(mlir::createCanonicalizerPass());
+  pm.addPass(mlir::createCSEPass());
+  TF_RET_CHECK(pm.run(triton_module.get()).succeeded());
+
   VLOG(6) << llvm_ir::DumpToString(*triton_module);
   if (DumpingEnabledForHloModule(*hlo_computation->parent())) {
     DumpToFileInDirOrStdout(*hlo_computation->parent(), "triton_ir", "ttir",
                             llvm_ir::DumpToString(*triton_module));
   }
 
-  CHECK(mlir::succeeded(mlir::verify(*triton_module)));
+  TF_RET_CHECK(mlir::succeeded(mlir::verify(*triton_module)));
   return std::move(triton_module);
 }
 
 absl::StatusOr<TritonWrapperResult> TritonWrapper(
     const TritonFusionAnalysis& analysis, absl::string_view fn_name,
-    const HloComputation* hlo_computation, absl::string_view fusion_kind,
-    const se::CudaComputeCapability& cc,
+    const HloComputation* hlo_computation, const se::CudaComputeCapability& cc,
     const se::DeviceDescription& device_info, const TritonGemmConfig& config,
     llvm::Module* llvm_module, TritonIrEmitter ir_emitter,
     mlir::MLIRContext& mlir_context) {
+  if (!cc.IsAtLeastAmpere()) {
+    return absl::FailedPreconditionError(
+        "Triton support is only enabled for Ampere GPUs and up.");
+  }
+
   auto debug_options = GetDebugOptionsFromFlags();
   if (debug_options.xla_gpu_enable_triton_hopper()) {
     // Set environment variables for consumption by Triton.
@@ -2270,43 +2762,6 @@ absl::StatusOr<TritonWrapperResult> TritonWrapper(
     tsl::setenv("ENABLE_PIPELINING", "true", true /*overwrite*/);
   }
 
-  if (fusion_kind == kTritonGemmFusionKind) {
-    // This is a heuristic that serves as a proxy for register usage and code
-    // size.
-    //
-    // We have noticed that tilings with very long LLVM IR code are both slow to
-    // compile and slow to run. This can be for example due to register spills.
-    // So we should skip these tilings to save time. But it's better to skip
-    // them before the LLVM IR is generated. To do that, we came up with a
-    // formula that strongly correlates with the LLVM IR size. The formula is
-    // the size of the two input and the output thread block tiles divided by
-    // the number of warps. We read
-    // https://developer.nvidia.com/blog/cutlass-linear-algebra-cuda/ as a
-    // reference, and found the formula by trial and error.
-    //
-    // To regenerate the limit, we have to run an exhaustive search on all
-    // tilings for a few different HLOs, printing the runtimes and the heuristic
-    // values.
-    // From that, we can find a limit, such that all tilings within alpha *
-    // optimal_runtime have a heuristic value less than or equal to the limit.
-    //
-    // In our measurements, all tilings which were within 1.13 * optimal_runtime
-    // had a complexity_heuristic_value <= kComplexityHeuristicLimit.
-    //
-    // See go/tiling-heuristic for more details.
-    constexpr int64_t kComplexityHeuristicLimit = 9000;
-    int64_t complexity_heuristic_value =
-        (config.block_m * config.block_n +
-         (config.block_m + config.block_n) * config.block_k) /
-        config.num_warps;
-    VLOG(2) << "Complexity heuristic: " << complexity_heuristic_value;
-    if (complexity_heuristic_value > kComplexityHeuristicLimit) {
-      return ResourceExhausted("Tiling complexity heuristic exceeded: %d > %d",
-                               complexity_heuristic_value,
-                               kComplexityHeuristicLimit);
-    }
-  }
-
   TF_ASSIGN_OR_RETURN(
       auto triton_module,
       CreateTritonModule(analysis, fn_name, hlo_computation, device_info,
@@ -2329,6 +2784,11 @@ absl::StatusOr<TritonWrapperResult> CompileTritonToLLVM(
     const se::DeviceDescription& device_info, const TritonGemmConfig& config,
     mlir::ModuleOp triton_module, llvm::Module* llvm_module,
     mlir::MLIRContext& mlir_context) {
+  if (!cc.IsAtLeastAmpere()) {
+    return absl::FailedPreconditionError(
+        "Triton support is only enabled for Ampere GPUs and up.");
+  }
+
   bool should_verify =
       (hlo_config.debug_options().xla_gpu_llvm_verification_level() >= 1);
 #ifndef NDEBUG
@@ -2369,6 +2829,9 @@ absl::StatusOr<TritonWrapperResult> CompileTritonToLLVM(
     }
   }
 
+  // Lower affine expressions into arithmetic ops.
+  pm.addPass(mlir::createLowerAffinePass());
+
   mlir::triton::nvidia_gpu::ClusterInfo cluster_info;
   if (!CreateTritonPipeline(pm, cc, config, /*out*/ cluster_info).ok()) {
     return Internal("Failed to create Triton pipeline.");
@@ -2418,8 +2881,9 @@ absl::StatusOr<TritonWrapperResult> CompileTritonToLLVM(
   ll_triton_module->setDataLayout(llvm_module->getDataLayout());
   ll_triton_module->setTargetTriple(llvm_module->getTargetTriple());
   // Use override flag because libdevice functions can be present in both.
-  CHECK(!llvm::Linker::linkModules(*llvm_module, std::move(ll_triton_module),
-                                   llvm::Linker::Flags::OverrideFromSrc));
+  TF_RET_CHECK(
+      !llvm::Linker::linkModules(*llvm_module, std::move(ll_triton_module),
+                                 llvm::Linker::Flags::OverrideFromSrc));
   VLogModule(5, *llvm_module);
   if (should_verify) {
     VerifyModule(*llvm_module);
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton.h b/third_party/xla/xla/service/gpu/ir_emitter_triton.h
index 801c648fafd91c..96ca55139bb196 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton.h
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton.h
@@ -33,6 +33,7 @@ limitations under the License.
 #include "xla/service/gpu/hlo_traversal.h"
 #include "xla/service/gpu/launch_dimensions.h"
 #include "xla/service/gpu/matmul_utils.h"
+#include "xla/service/gpu/model/indexing_map.h"
 #include "xla/service/gpu/triton_fusion_analysis.h"
 #include "xla/service/hlo_module_config.h"
 #include "xla/status.h"
@@ -48,10 +49,16 @@ struct TritonWrapperResult {
   std::optional<se::ClusterDim> cluster_dim;
 };
 
+// Computes indexing map from program id into the tile offset for the given
+// shape and tile sizes.
+IndexingMap ComputeProgramIdToOutputTileIndexing(
+    absl::Span<const int64_t> dimensions, absl::Span<const int64_t> tile_sizes,
+    mlir::MLIRContext* mlir_context);
+
 // Compute the launch dimensions for the given Triton MatMul.
-LaunchDimensions GetMatMulLaunchDimensions(const TritonFusionAnalysis& analysis,
-                                           const HloFusionAdaptor& fusion,
-                                           const TritonGemmConfig& config);
+absl::StatusOr<LaunchDimensions> GetMatMulLaunchDimensions(
+    const TritonFusionAnalysis& analysis, const HloFusionAdaptor& fusion,
+    const TritonGemmConfig& config);
 // Use tiling and execution parameters from 'config'.
 absl::Status EmitMatMul(mlir::OpBuilder b, absl::string_view libdevice_path,
                         const se::DeviceDescription& device_info,
@@ -82,8 +89,7 @@ using TritonIrEmitter = std::function<Status(
 // MatMul and SoftMax above are some such IR generators.
 absl::StatusOr<TritonWrapperResult> TritonWrapper(
     const TritonFusionAnalysis& analysis, absl::string_view fn_name,
-    const HloComputation* hlo_computation, absl::string_view fusion_kind,
-    const se::CudaComputeCapability& cc,
+    const HloComputation* hlo_computation, const se::CudaComputeCapability& cc,
     const se::DeviceDescription& device_info, const TritonGemmConfig& config,
     llvm::Module* llvm_module, TritonIrEmitter ir_emitter,
     mlir::MLIRContext& mlir_context);
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
index a23f4b68c48c90..3be360ae554379 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_parametrized_test.cc
@@ -71,11 +71,6 @@ class MixedTypeTest : public GpuCodegenTest,
 
 TEST_P(MixedTypeTest, MixedTypeDotProducesCorrectResult) {
   MixTypeParams params = GetParam();
-  if ((params.lhs_ty == BF16 || params.rhs_ty == BF16) &&
-      !GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const std::string hlo_string_template = R"(
 HloModule m
 
@@ -136,7 +131,7 @@ INSTANTIATE_TEST_SUITE_P(RewriteTestSuite, MixedTypeTest,
                              //  TritonRewriteTest2Params{F32, F16},
                              //  TritonRewriteTest2Params{F32, BF16},
                              MixTypeParams{S8, BF16, 24, 40, 8},
-                             MixTypeParams{S8, F16, 80, 16, 32},
+                             MixTypeParams{S8, F16, 80, 16, 32, 1e-3, 1e-6},
                              MixTypeParams{F16, F32, 127, 3, 300, 1e-2, 1e-2},
                              MixTypeParams{F16, BF16, 544, 96, 16, 1e-3, 1e-3},
                              MixTypeParams{BF16, F32, 77, 500, 333, 3e-3, 3e-3},
@@ -799,10 +794,6 @@ TEST_P(TritonSoftmaxTest, CanFuseAndEmitExactSoftmax) {
 
   if (data_type == F16) {
     GTEST_SKIP() << "Exponential op does not support F16.";
-  } else if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                                      se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
   }
 
   const std::string hlo_text_template = R"(
@@ -887,14 +878,7 @@ ENTRY main {
   const std::string hlo_text = absl::Substitute(
       hlo_text_template, primitive_util::LowercasePrimitiveTypeName(data_type));
 
-  std::string hlo_ref_template;
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    hlo_ref_template = R"(
-; CHECK-NOT: triton
-)";
-  } else {
-    hlo_ref_template = R"(
+  std::string hlo_ref_template = R"(
 ; CHECK:    ENTRY
 ; CHECK:      %[[P0:.*]] = $0[127,125]{1,0} parameter(0)
 ; CHECK:      ROOT
@@ -902,7 +886,6 @@ ENTRY main {
 ; CHECK-SAME:   kind=kCustom
 ; CHECK-SAME:   __triton_softmax
 )";
-  }
 
   const std::string hlo_ref = absl::Substitute(
       hlo_ref_template, primitive_util::LowercasePrimitiveTypeName(data_type));
@@ -927,12 +910,6 @@ ENTRY main {
 
 TEST_P(TritonSoftmaxTest, CanFuseAndEmitSoftmaxDiamondWithSmallRows) {
   PrimitiveType data_type = GetParam();
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   constexpr absl::string_view kHloTextTemplate = R"(
 HloModule softmax
 min_computation {
@@ -969,12 +946,6 @@ ENTRY main {
 }
 
 TEST_F(TritonSoftmaxTest, CanFuseAndEmitDiamondWithBF16Converts) {
-  if (!GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text = R"(
 HloModule softmax
 max_computation {
@@ -1016,10 +987,6 @@ TEST_P(
 
   if (data_type == F16) {
     GTEST_SKIP() << "Exponential op does not support F16.";
-  } else if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                                      se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
   }
 
   const std::string hlo_text_template = R"(
@@ -1089,12 +1056,6 @@ TEST_P(TritonSoftmaxTest,
        CanFuseAndEmitDiamondWithMultipleBroadcastDimensions) {
   PrimitiveType data_type = GetParam();
 
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text_template = R"(
 HloModule softmax
 max_computation {
@@ -1151,12 +1112,7 @@ TEST_P(TritonSoftmaxTest,
 
   if (data_type == F16) {
     GTEST_SKIP() << "Exponential op does not support F16.";
-  } else if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                                      se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
   }
-
   const std::string hlo_text_template = R"(
 HloModule softmax
 max_computation {
@@ -1220,12 +1176,6 @@ TEST_P(
     CanFuseAndEmitTwoDiamondsWithSecondDiamondProducerEqualToFirstDiamondRoot) {
   PrimitiveType data_type = GetParam();
 
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text_template = R"(
 HloModule softmax
 max_computation {
@@ -1289,12 +1239,6 @@ TEST_P(TritonSoftmaxTest,
        CanFuseAndEmitDiamondWithTrailingUnaryElementwiseAtTheRoot) {
   PrimitiveType data_type = GetParam();
 
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text_template = R"(
 HloModule softmax
 max_computation {
@@ -1349,12 +1293,6 @@ ENTRY main {
 TEST_P(TritonSoftmaxTest, CanFuseAndEmitDiamondWithUnaryElementwisePrefix) {
   PrimitiveType data_type = GetParam();
 
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text_template = R"(
 HloModule softmax
 max_computation {
@@ -1410,12 +1348,6 @@ TEST_P(TritonSoftmaxTest,
        CanFuseAndEmitSoftmaxDiamondWithLastDimensionBitcastAfterReduce) {
   PrimitiveType data_type = GetParam();
 
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text_template = R"(
 HloModule softmax
 max_computation {
@@ -1469,17 +1401,10 @@ ENTRY main {
                             ErrorSpec(/*aabs=*/tolerance, /*arel=*/tolerance)));
 }
 
-TEST_P(
-    TritonSoftmaxTest,
-    CanFuseAndEmitConvertInvolvingBF16InputIntoSoftmaxDiamondCorrectlyForAmpereAndVoltaComputeCapability) {  // NOLINT(whitespace/line_length)
+TEST_P(TritonSoftmaxTest,
+       CanFuseAndEmitConvertInvolvingBF16InputIntoSoftmaxDiamondCorrectly) {
   PrimitiveType data_type = GetParam();
 
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text_template = R"(
 HloModule softmax
 max_computation {
@@ -1499,8 +1424,7 @@ ENTRY main {
   const std::string hlo_text = absl::Substitute(
       hlo_text_template, primitive_util::LowercasePrimitiveTypeName(data_type));
 
-  if (GetCudaComputeCapability().IsAtLeast(se::CudaComputeCapability::AMPERE)) {
-    const std::string hlo_ref = R"(
+  const std::string hlo_ref = R"(
 ; CHECK:    ENTRY
 ; CHECK:      %[[P0:.*]] = bf16[127,125]{1,0} parameter(0)
 ; CHECK:      ROOT
@@ -1509,23 +1433,7 @@ ENTRY main {
 ; CHECK-SAME:   __triton_softmax
 )";
 
-    MatchOptimizedHlo(hlo_text, hlo_ref);
-  } else {
-    const std::string hlo_ref_template = R"(
-; CHECK:    ENTRY
-; CHECK:      %[[P0:.*]] = bf16[127,125]{1,0} parameter(0)
-; CHECK:      %[[CONVERT:.*]] = $0[127,125]{1,0} convert(%[[P0]])
-; CHECK:      ROOT
-; CHECK-SAME: fusion(%[[CONVERT]])
-; CHECK-SAME:   kind=kCustom
-; CHECK-SAME:   __triton_softmax
-)";
-
-    const std::string hlo_ref =
-        absl::Substitute(hlo_ref_template,
-                         primitive_util::LowercasePrimitiveTypeName(data_type));
-    MatchOptimizedHlo(hlo_text, hlo_ref);
-  }
+  MatchOptimizedHlo(hlo_text, hlo_ref);
 
   float tolerance;
   switch (data_type) {
@@ -1550,12 +1458,6 @@ TEST_P(
     CanFuseAndEmitBinaryElementwiseProducerIntoDiamondWhenBothOperandsAreTheSame) {  // NOLINT(whitespace/line_length)
   PrimitiveType data_type = GetParam();
 
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text_template = R"(
 HloModule fusible_diamond
 max_computation {
@@ -1612,12 +1514,6 @@ TEST_P(
     CanFuseAndEmitIntermediateBinaryElementwiseWithinDiamondWhenBothOperandsAreTheSame) {  // NOLINT(whitespace/line_length)
   PrimitiveType data_type = GetParam();
 
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text_template = R"(
 HloModule fusible_diamond
 max_computation {
@@ -1674,12 +1570,6 @@ TEST_P(
     CanFuseAndEmitBinaryElementwiseWhenBothOperandsAreTheSameBetweenDiamonds) {  // NOLINT(whitespace/line_length)
   PrimitiveType data_type = GetParam();
 
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text_template = R"(
 HloModule fusible_diamonds
 max_computation {
@@ -1745,12 +1635,6 @@ TEST_P(
     CanFuseAndEmitBinaryElementwiseConsumerWhereBothOperandsAreTheSameIntoDiamond) {  // NOLINT(whitespace/line_length)
   PrimitiveType data_type = GetParam();
 
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text_template = R"(
 HloModule fusible_diamond
 max_computation {
@@ -1813,12 +1697,6 @@ TEST_P(
     CanFuseAndEmitTwoBinaryElementwiseWhereBothOperandsAreTheSameBetweenDiamonds) {  // NOLINT(whitespace/line_length)
   PrimitiveType data_type = GetParam();
 
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text_template = R"(
 HloModule fusible_diamonds
 max_computation {
@@ -1919,10 +1797,6 @@ TEST_P(TritonSoftmaxTest, CanFuseAndEmitRMSNormDiamond) {
 
   if (data_type == F16) {
     GTEST_SKIP() << "rsqrt op does not support F16.";
-  } else if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                                      se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
   }
 
   const std::string hlo_text_template = R"(
@@ -1989,12 +1863,6 @@ TEST_P(
     CanFuseAndEmitBinaryElementwiseWhereTheFirstOperandIsASplatConstantBetweenDiamonds) {  // NOLINT(whitespace/line_length)
   PrimitiveType data_type = GetParam();
 
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text_template = R"(
 HloModule fusible_diamonds
 add_computation {
@@ -2058,12 +1926,6 @@ TEST_P(
     CanFuseAndEmitBinaryElementwiseWhereTheSecondOperandIsASplatConstantBetweenDiamonds) {  // NOLINT(whitespace/line_length)
   PrimitiveType data_type = GetParam();
 
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text_template = R"(
 HloModule fusible_diamonds
 add_computation {
@@ -2127,12 +1989,6 @@ TEST_P(
     CanFuseAndEmitBinaryElementwiseWhereTheFirstOperandIsASplatConstantWithinDiamond) {  // NOLINT(whitespace/line_length)
   PrimitiveType data_type = GetParam();
 
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text_template = R"(
 HloModule fusible_diamond
 max_computation {
@@ -2192,12 +2048,6 @@ TEST_P(
     CanFuseAndEmitBinaryElementwiseConsumerWhereTheFirstOperandIsASplatConstantIntoDiamond) {  // NOLINT(whitespace/line_length)
   PrimitiveType data_type = GetParam();
 
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text_template = R"(
 HloModule fusible_diamond
 add_computation {
@@ -2256,12 +2106,6 @@ TEST_P(
     CanFuseAndEmitBinaryElementwiseProducerWhereTheFirstOperandIsASplatConstantIntoDiamond) {  // NOLINT(whitespace/line_length)
   PrimitiveType data_type = GetParam();
 
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text_template = R"(
 HloModule fusible_diamond
 add_computation {
@@ -2321,12 +2165,6 @@ TEST_P(
     CanFuseAndEmitBinaryElementwiseOperationWhereOneOperandIsASharedSplatProducerIntoDiamond) {  // NOLINT(whitespace/line_length)
   PrimitiveType data_type = GetParam();
 
-  if (data_type == BF16 && !GetCudaComputeCapability().IsAtLeast(
-                               se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << R"(No BF16 before Ampere. Pre-Ampere BF16 behavior is tested
-    in CanFuseAndEmitFirstSoftmaxDiamond, and in SoftmaxRewriterTritonTest.)";
-  }
-
   const std::string hlo_text_template = R"(
 HloModule nonfusible_diamond
 max_computation {
@@ -2393,17 +2231,16 @@ ENTRY main {
 }
 )";
 
-  // Param order is arbitrary. We test that only param_1 is in the fused root
-  // instruction below.
   const std::string hlo_ref = R"(
 ; CHECK:    ENTRY
 ; CHECK-DAG:    %[[param_0:.*]] = f32[125,127]{1,0} parameter(0)
 ; CHECK-DAG:    %[[param_1:.*]] = f32[127]{0} parameter(1)
 ; CHECK:      ROOT
 ; CHECK-SAME:   f32[125,127]{1,0} fusion
-; CHECK-SAME:   %[[param_1]]
-; CHECK-SAME:   kind=kCustom
-; CHECK-SAME:   triton_softmax
+; CHECK-SAME:    %[[param_0]]
+; CHECK-SAME:    %[[param_1]]
+; CHECK-SAME:          kind=kCustom
+; CHECK-SAME:          triton_softmax
 )";
   MatchOptimizedHlo(hlo_text, hlo_ref);
 
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
index 11a7c5bf59e089..9a2b5a4e8b1d25 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_triton_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "xla/service/gpu/ir_emitter_triton.h"
 
+#include <cstdint>
 #include <iterator>
 #include <limits>
 #include <memory>
@@ -27,11 +28,11 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/status/status.h"
 #include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/raw_ostream.h"
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
-#include "mlir/Transforms/Passes.h"  // from @llvm-project
 #include "xla/autotuning.pb.h"
 #include "xla/error_spec.h"
 #include "xla/hlo/ir/hlo_computation.h"
@@ -40,8 +41,8 @@ limitations under the License.
 #include "xla/literal_util.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/gpu_device_info_for_tests.h"
-#include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/matmul_utils.h"
+#include "xla/service/gpu/model/indexing_test_utils.h"
 #include "xla/service/gpu/tests/gpu_codegen_test.h"
 #include "xla/service/gpu/triton_fusion_analysis.h"
 #include "xla/service/pattern_matcher.h"
@@ -139,9 +140,6 @@ absl::Status TritonFilecheckTest::CreateTritonIrAndFileCheck(
       auto module, CreateTritonModule(analysis, "triton_fn", computation,
                                       TestGpuDeviceInfo::RTXA6000DeviceInfo(),
                                       config, emitter, context));
-  mlir::PassManager pm(&context);
-  pm.addPass(mlir::createCanonicalizerPass());
-  TF_RET_CHECK(pm.run(module.get()).succeeded());
 
   std::string out;
   llvm::raw_string_ostream os(out);
@@ -153,6 +151,31 @@ absl::Status TritonFilecheckTest::CreateTritonIrAndFileCheck(
   return absl::OkStatus();
 }
 
+TEST_F(TritonTest, ComputeProgramIdToOutputTileIndexing) {
+  mlir::MLIRContext context;
+
+  auto compute_map = [&](absl::Span<const int64_t> dimensions,
+                         absl::Span<const int64_t> tile_sizes) {
+    return ComputeProgramIdToOutputTileIndexing(dimensions, tile_sizes,
+                                                &context);
+  };
+
+  EXPECT_THAT(compute_map(/*dimensions=*/{9, 17}, /*tile_sizes=*/{5, 10}),
+              MatchIndexingMap(R"(
+    (d0) -> ((d0 floordiv 2) * 5, (d0 mod 2) * 10)
+    domain:
+    d0 in [0, 3]
+  )"));
+
+  EXPECT_THAT(
+      compute_map(/*dimensions=*/{8, 16, 32}, /*tile_sizes=*/{1, 1, 32}),
+      MatchIndexingMap(R"(
+    (d0) -> (d0 floordiv 16, d0 mod 16, 0)
+    domain:
+    d0 in [0, 127]
+  )"));
+}
+
 TEST_F(TritonFilecheckTest, TestGemm) {
   const std::string kHloText = R"(
 HloModule t, is_scheduled=true
@@ -231,16 +254,98 @@ CHECK:        %[[RHS_MASKED:.*]] = arith.select %[[RHS_INBOUNDS_KN]], %[[RHS_TIL
 CHECK:        %[[ACC_NEXT:.*]] = tt.dot %[[LHS_MASKED]], %[[RHS_MASKED]], %[[ACC]]
 CHECK:        scf.yield %[[LHS_ITER_PTR_NEXT]], %[[RHS_ITER_PTR_NEXT]], %[[ACC_NEXT]] : !tt.ptr<tensor<16x32xi8>, 1>, !tt.ptr<tensor<32x64xf32>, 1>, tensor<16x64xf32>
 CHECK:      }
-CHECK:      %[[TILE_OFFSET_M_OUT:.*]] = arith.muli %[[TILE_INDEX_M]], %[[TILE_SIZE_M]]
-CHECK:      %[[TILE_OFFSET_N_OUT:.*]] = arith.muli %[[TILE_INDEX_N]], %[[TILE_SIZE_N]]
 CHECK:      %[[OUT_PTR:.*]] = tt.make_tensor_ptr %[[OUT]], [%[[C80]], %[[SIZE_M]]], [%[[SIZE_M]], %[[C1]]], [%[[C0]], %[[C0]]] {order = array<i32: 1, 0>} : <tensor<16x64xf32>, 1>
-CHECK:      %[[OUT_OFFSET:.*]] = tt.advance %[[OUT_PTR]], [%[[TILE_OFFSET_M_OUT]], %[[TILE_OFFSET_N_OUT]]] : <tensor<16x64xf32>, 1>
+CHECK:      %[[OUT_OFFSET:.*]] = tt.advance %[[OUT_PTR]], [%[[TILE_OFFSET_M_LHS]], %[[TILE_OFFSET_N_RHS]]] : <tensor<16x64xf32>, 1>
 CHECK:      tt.store %[[OUT_OFFSET]], %[[FOR]]#2 {boundaryCheck = array<i32: 1>, cache = 1 : i32, evict = 1 : i32} : !tt.ptr<tensor<16x64xf32>, 1>, tensor<16x64xf32>
 CHECK:      tt.return
 CHECK:    }
 )"));
 }
 
+TEST_F(TritonFilecheckTest, TestGemmWithTrivialNonContractingDimension) {
+  const std::string kHloText = R"(
+HloModule t, is_scheduled=true
+
+triton_dot {
+  param_0.1 = f32[137,115]{1,0} parameter(0)
+  param_1.1 = f32[1,115]{1,0} parameter(1)
+  ROOT dot = f32[137,1]{1,0} dot(param_0.1, param_1.1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+}
+
+ENTRY e {
+  p0 = f32[137,115]{1,0} parameter(0)
+  p1 = f32[1,115]{1,0} parameter(1)
+  ROOT custom-call = f32[137,1]{1,0} fusion(p0, p1), kind=kCustom,
+    calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":16,"block_n":64,"block_k":32,
+                         "split_k":1,"num_stages":1,"num_warps":2,
+                         "num_ctas":1}}}
+})";
+
+  TritonGemmConfig config(16, 16, 32, 1, 1, 1);
+  EXPECT_OK(
+      CreateTritonIrAndFileCheck(kHloText, config, EmitMatMul, "triton_dot", R"(
+CHECK:    tt.func @triton_fn(%[[LHS:.*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %[[RHS:.*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %[[OUT:.*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}) {
+CHECK-DAG:  %[[ZERO_KN:.*]] = arith.constant dense<0.000000e+00> : tensor<32x16xf32>
+CHECK-DAG:  %[[ZERO_MK:.*]] = arith.constant dense<0.000000e+00> : tensor<16x32xf32>
+CHECK-DAG:  %[[ZERO_MN:.*]] = arith.constant dense<0.000000e+00> : tensor<16x16xf32>
+CHECK-DAG:  %[[SIZE_K:.*]] = arith.constant 115 : i32
+CHECK-DAG:  %[[SIZE_M:.*]] = arith.constant 137 : i64
+CHECK-DAG:  %[[C1:.*]] = arith.constant 1 : i64
+CHECK-DAG:  %[[C0:.*]] = arith.constant 0 : i32
+CHECK-DAG:  %[[C115:.*]] = arith.constant 115 : i64
+CHECK-DAG:  %[[TILE_SIZE_K:.*]] = arith.constant 32 : i32
+CHECK-DAG:  %[[TILE_SIZE_M:.*]] = arith.constant 16 : i32
+CHECK-DAG:  %[[C8:.*]] = arith.constant 8 : i32
+CHECK-DAG:  %[[NUM_TILES_M:.*]] = arith.constant 9 : i32
+CHECK:    %[[PID_NC:.*]] = tt.get_program_id x : i32
+CHECK:    %[[GROUP_ID:.*]] = arith.divsi %[[PID_NC]], %[[C8]]
+CHECK:    %[[FIRST_PID_M:.*]] = arith.muli %[[GROUP_ID]], %[[C8]]
+CHECK:    %[[MAX_M:.*]] = arith.subi %[[NUM_TILES_M]], %[[FIRST_PID_M]]
+CHECK:    %[[CMP:.*]] = arith.cmpi slt, %[[MAX_M]], %[[C8]]
+CHECK:    %[[GROUP_SIZE:.*]] = arith.select %[[CMP]], %[[MAX_M]], %[[C8]]
+CHECK:    %[[PID_M:.*]] = arith.remsi %[[PID_NC]], %[[GROUP_SIZE]]
+CHECK:    %[[TILE_INDEX_M:.*]] = arith.addi %[[FIRST_PID_M]], %[[PID_M]]
+CHECK:    %[[TMP:.*]] = arith.remsi %[[PID_NC]], %[[C8]]
+CHECK:    %[[TILE_INDEX_N:.*]] = arith.divsi %[[TMP]], %[[GROUP_SIZE]]
+CHECK:    %[[TILE_OFFSET_M_LHS:.*]] = arith.muli %[[TILE_INDEX_M]], %[[TILE_SIZE_M]]
+CHECK:    %[[LHS_PTR:.*]] = tt.make_tensor_ptr %[[LHS]]
+CHECK:    %[[LHS_TILE_PTR:.*]] = tt.advance %[[LHS_PTR]], [%[[TILE_OFFSET_M_LHS]], %[[C0]]]
+CHECK:    %[[TILE_OFFSET_N_RHS:.*]] = arith.muli %[[TILE_INDEX_N]], %[[TILE_SIZE_M]]
+CHECK:    %[[RHS_PTR:.*]] = tt.make_tensor_ptr %[[RHS]]
+CHECK:    %[[RHS_TILE_PTR:.*]] = tt.advance %[[RHS_PTR]], [%[[C0]], %[[TILE_OFFSET_N_RHS]]]
+CHECK:    %[[FOR:.*]]:3 = scf.for %[[BLOCK_K:.*]] = %[[C0]] to %[[SIZE_K]] step %[[TILE_SIZE_K]]
+CHECK-SAME:       iter_args(%[[LHS_ITER_PTR:.*]] = %[[LHS_TILE_PTR]], %[[RHS_ITER_PTR:.*]] = %[[RHS_TILE_PTR]], %[[ACC:.*]] = %[[ZERO_MN]])
+CHECK:      %[[LHS_TILE:.*]] = tt.load %[[LHS_ITER_PTR]] {boundaryCheck = array<i32: 0, 1>
+CHECK:      %[[LHS_ITER_PTR_NEXT:.*]] = tt.advance %[[LHS_ITER_PTR]], [%[[C0]], %[[TILE_SIZE_K]]]
+CHECK:      %[[RHS_TILE:.*]] = tt.load %[[RHS_ITER_PTR]] {boundaryCheck = array<i32: 0, 1>
+CHECK:      %[[RHS_ITER_PTR_NEXT:.*]] = tt.advance %[[RHS_ITER_PTR]], [%[[TILE_SIZE_K]], %[[C0]]]
+CHECK:      %[[TILE_K_LIMIT:.*]] = arith.subi %[[SIZE_K]], %[[BLOCK_K]] : i32
+CHECK:      %[[K_TILE_IOTA:.*]] = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32>
+CHECK:      %[[K_OFFSETS_1K:.*]] = tt.expand_dims %[[K_TILE_IOTA]] {axis = 0 : i32} : tensor<32xi32> -> tensor<1x32xi32>
+CHECK:      %[[TILE_K_LIMIT_1K:.*]] = tt.splat %[[TILE_K_LIMIT]] : i32 -> tensor<1x32xi32>
+CHECK:      %[[LHS_INBOUNDS_1K:.*]] = arith.cmpi slt, %[[K_OFFSETS_1K]], %[[TILE_K_LIMIT_1K]] : tensor<1x32xi32>
+CHECK:      %[[LHS_INBOUNDS_MK:.*]] = tt.broadcast %[[LHS_INBOUNDS_1K]] : tensor<1x32xi1> -> tensor<16x32xi1>
+CHECK:      %[[LHS_MASKED:.*]] = arith.select %[[LHS_INBOUNDS_MK]], %[[LHS_TILE]], %[[ZERO_MK]]
+CHECK:      %[[K_OFFSETS_K1:.*]] = tt.expand_dims %[[K_TILE_IOTA]] {axis = 1 : i32} : tensor<32xi32> -> tensor<32x1xi32>
+CHECK:      %[[TILE_K_LIMIT_K1:.*]] = tt.splat %[[TILE_K_LIMIT]] : i32 -> tensor<32x1xi32>
+CHECK:      %[[RHS_INBOUNDS_K1:.*]] = arith.cmpi slt, %[[K_OFFSETS_K1]], %[[TILE_K_LIMIT_K1]] : tensor<32x1xi32>
+CHECK:      %[[RHS_INBOUNDS_KN:.*]] = tt.broadcast %[[RHS_INBOUNDS_K1]] : tensor<32x1xi1> -> tensor<32x16xi1>
+CHECK:      %[[RHS_MASKED:.*]] = arith.select %[[RHS_INBOUNDS_KN]], %[[RHS_TILE]], %[[ZERO_KN]] : tensor<32x16xi1>, tensor<32x16xf32>
+CHECK:      %[[ACC_NEXT:.*]] = tt.dot %[[LHS_MASKED]], %[[RHS_MASKED]], %[[ACC]]
+CHECK:      scf.yield %[[LHS_ITER_PTR_NEXT]], %[[RHS_ITER_PTR_NEXT]], %[[ACC_NEXT]] : !tt.ptr<tensor<16x32xf32>, 1>, !tt.ptr<tensor<32x16xf32>, 1>, tensor<16x16xf32>
+CHECK:    }
+
+CHECK:    %[[OUT_PTR:.*]] = tt.make_tensor_ptr %[[OUT]], [%[[SIZE_M]], %[[C1]]], [%[[C1]], %[[C1]]], [%[[C0]], %[[C0]]] {order = array<i32: 1, 0>} : <tensor<16x16xf32>, 1>
+CHECK:    %[[OUT_OFFSET:.*]] = tt.advance %[[OUT_PTR]], [%[[TILE_OFFSET_M_LHS]], %[[TILE_OFFSET_N_RHS]]] : <tensor<16x16xf32>, 1>
+CHECK:    tt.store %[[OUT_OFFSET]], %[[FOR]]#2 {boundaryCheck = array<i32: 0, 1>, cache = 1 : i32, evict = 1 : i32} : !tt.ptr<tensor<16x16xf32>, 1>, tensor<16x16xf32>
+CHECK:    tt.return
+CHECK:  }
+)"));
+}
+
 TEST_F(TritonFilecheckTest, TestSoftmaxEmitterWithSingleParameter) {
   const std::string kHloText = R"(
 HloModule t
@@ -266,9 +371,10 @@ ENTRY main {
   TritonGemmConfig config(16, 64, 32, 1, 1, 1);
   TF_EXPECT_OK(CreateTritonIrAndFileCheck(kHloText, config, EmitSoftMax,
                                           "triton_softmax_computation", R"(
+CHECK:        #[[MAP:.*]] = affine_map<()[s0] -> (s0 * 127)>
 CHECK:        tt.func @triton_fn(%[[P0:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %[[P1:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}) {
 CHECK:            %[[PID:.*]] = tt.get_program_id x : i32
-CHECK:            arith.extsi %[[PID]] : i32 to i64
+CHECK:            arith.index_castui %[[PID]] : i32 to index
 CHECK:            tt.addptr %[[P0]]
 CHECK-NEXT:       tt.make_tensor_ptr
 CHECK-SAME:       <tensor<128xf32>, 1>
@@ -318,9 +424,10 @@ ENTRY main {
   TritonGemmConfig config(16, 64, 32, 1, 1, 1);
   TF_EXPECT_OK(CreateTritonIrAndFileCheck(kHloText, config, EmitSoftMax,
                                           "triton_softmax_computation", R"(
+CHECK:        #[[MAP:.*]] = affine_map<()[s0] -> (s0 * 127)>
 CHECK:        tt.func @triton_fn(%[[P0:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %[[P1:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}) {
 CHECK-DAG:            %[[PID:.*]] = tt.get_program_id x : i32
-CHECK-DAG:            arith.extsi %[[PID]] : i32 to i64
+CHECK-DAG:            arith.index_castui %[[PID]] : i32 to index
 CHECK-DAG:            %[[ZERO_OFFSET:.*]] = arith.constant 0 : i64
 CHECK-DAG:            %[[ARG_0:.*]] = tt.addptr %[[P0]], %[[ZERO_OFFSET]] : !tt.ptr<f32, 1>, i64
 CHECK:            tt.load %[[ARG_0]] {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
@@ -373,12 +480,14 @@ ENTRY main {
   TritonGemmConfig config(16, 64, 32, 1, 1, 1);
   TF_EXPECT_OK(CreateTritonIrAndFileCheck(kHloText, config, EmitSoftMax,
                                           "triton_softmax_computation", R"(
+CHECK:        #[[MAP:.*]] = affine_map<()[s0] -> (s0 * 127)>
 CHECK:        tt.func @triton_fn(%[[P0:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %[[P1:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %[[P2:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}) {
 CHECK-DAG:        %[[PID:.*]] = tt.get_program_id x : i32
-CHECK-DAG:        %[[PID_i64:.*]] = arith.extsi %[[PID]] : i32 to i64
+CHECK-DAG:        %[[PID_INDEX:.*]] = arith.index_castui %[[PID]] : i32 to index
 CHECK-DAG:        %[[C127_i64:.*]] = arith.constant 127 : i64
 CHECK-DAG:        %[[ZERO_OFFSET:.*]] = arith.constant 0 : i64
-CHECK:            %[[ROW_OFFSET:.*]] = arith.muli %[[PID_i64]], %[[C127_i64]] : i64
+CHECK:            %[[ROW_OFFSET_INDEX:.*]] = affine.apply #[[MAP]]()[%[[PID_INDEX]]]
+CHECK:            %[[ROW_OFFSET:.*]] = arith.index_castui %[[ROW_OFFSET_INDEX]] : index to i64
 CHECK:            %[[ARG0:.*]] = tt.addptr %[[P0]], %[[ROW_OFFSET]] : !tt.ptr<f32, 1>, i64
 CHECK-NEXT:       tt.make_tensor_ptr
 CHECK-SAME:       <tensor<128xf32>, 1>
@@ -437,18 +546,20 @@ ENTRY main {
   TritonGemmConfig config(16, 64, 32, 1, 1, 1);
   TF_EXPECT_OK(CreateTritonIrAndFileCheck(kHloText, config, EmitSoftMax,
                                           "triton_softmax_computation", R"(
+CHECK:        #[[MAP:.*]] = affine_map<()[s0] -> (s0 * 127)>
 CHECK:        tt.func @triton_fn(%[[P0:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %[[P1:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %[[P2:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}) {
 CHECK-DAG:        %[[PID:.*]] = tt.get_program_id x : i32
-CHECK-DAG:        %[[PID_i64:.*]] = arith.extsi %[[PID]] : i32 to i64
+CHECK-DAG:        %[[PID_INDEX:.*]] = arith.index_castui %[[PID]] : i32 to index
 CHECK-DAG:        %[[C127_i64:.*]] = arith.constant 127 : i64
 CHECK-DAG:        %[[ZERO_OFFSET:.*]] = arith.constant 0 : i64
-CHECK:            %[[ROW_OFFSET:.*]] = arith.muli %[[PID_i64]], %[[C127_i64]] : i64
-CHECK:            %[[ARG0:.*]] = tt.addptr %[[P0]], %[[ZERO_OFFSET]] : !tt.ptr<f32, 1>, i64
+CHECK:            %[[ROW_OFFSET_INDEX:.*]] = affine.apply #[[MAP]]()[%[[PID_INDEX]]]
+CHECK:            %[[ROW_OFFSET:.*]] = arith.index_castui %[[ROW_OFFSET_INDEX]] : index to i64
+CHECK:            %[[ARG1:.*]] = tt.addptr %[[P1]], %[[ROW_OFFSET]] : !tt.ptr<f32, 1>, i64
 CHECK-NEXT:       tt.make_tensor_ptr
 CHECK-SAME:       <tensor<128xf32>, 1>
 CHECK-NEXT:       tt.load
 CHECK-SAME:       {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<128xf32>, 1> -> tensor<128xf32>
-CHECK:            %[[ARG1:.*]] = tt.addptr %[[P1]], %[[ROW_OFFSET]] : !tt.ptr<f32, 1>, i64
+CHECK:            %[[ARG0:.*]] = tt.addptr %[[P0]], %[[ZERO_OFFSET]] : !tt.ptr<f32, 1>, i64
 CHECK-NEXT:       tt.make_tensor_ptr
 CHECK-SAME:       <tensor<128xf32>, 1>
 CHECK-NEXT:       tt.load
@@ -499,27 +610,29 @@ ENTRY main {
   TritonGemmConfig config(16, 64, 32, 1, 1, 1);
   TF_EXPECT_OK(CreateTritonIrAndFileCheck(kHloText, config, EmitSoftMax,
                                           "triton_softmax_computation", R"(
+CHECK:        #[[MAP:.*]] = affine_map<()[s0] -> (s0 * 127)>
 CHECK:        tt.func @triton_fn(%[[P0:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %[[P1:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %[[P2:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}) {
 CHECK-DAG:        %[[PID:.*]] = tt.get_program_id x : i32
-CHECK-DAG:        %[[PID_i64:.*]] = arith.extsi %[[PID]] : i32 to i64
+CHECK-DAG:        %[[PID_INDEX:.*]] = arith.index_castui %[[PID]] : i32 to index
 CHECK-DAG:        %[[C127_i64:.*]] = arith.constant 127 : i64
 CHECK-DAG:        %[[ZERO_OFFSET:.*]] = arith.constant 0 : i64
-CHECK:            %[[ROW_OFFSET:.*]] = arith.muli %[[PID_i64]], %[[C127_i64]] : i64
+CHECK:            %[[ROW_OFFSET_INDEX:.*]] = affine.apply #[[MAP]]()[%[[PID_INDEX]]]
+CHECK:            %[[ROW_OFFSET:.*]] = arith.index_castui %[[ROW_OFFSET_INDEX]] : index to i64
 CHECK:            %[[ARG0:.*]] = tt.addptr %[[P0]], %[[ROW_OFFSET]] : !tt.ptr<f32, 1>, i64
 CHECK-NEXT:       tt.make_tensor_ptr
 CHECK-SAME:       <tensor<128xf32>, 1>
 CHECK-NEXT:       tt.load
 CHECK-SAME:       {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<128xf32>, 1> -> tensor<128xf32>
-CHECK:            %[[ARG1:.*]] = tt.addptr %[[P1]], %[[ZERO_OFFSET]] : !tt.ptr<f32, 1>, i64
-CHECK-NEXT:       tt.make_tensor_ptr
-CHECK-SAME:       <tensor<128xf32>, 1>
-CHECK-NEXT:       tt.load
-CHECK-SAME:       {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<128xf32>, 1> -> tensor<128xf32>
 CHECK:            tt.reduce
 CHECK-NEXT:       ^bb0(%[[ARG3:[^:]*]]: f32, %[[ARG4:[^:]*]]: f32):
 CHECK-NEXT:           %[[ADD:.*]] = arith.addf %[[ARG3]], %[[ARG4]] : f32
 CHECK-NEXT:           tt.reduce.return %[[ADD]] : f32
 CHECK-NEXT:       }) : (tensor<128xf32>) -> f32
+CHECK:            %[[ARG1:.*]] = tt.addptr %[[P1]], %[[ZERO_OFFSET]] : !tt.ptr<f32, 1>, i64
+CHECK-NEXT:       tt.make_tensor_ptr
+CHECK-SAME:       <tensor<128xf32>, 1>
+CHECK-NEXT:       tt.load
+CHECK-SAME:       {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<128xf32>, 1> -> tensor<128xf32>
 CHECK:            tt.addptr %[[P2]]
 CHECK-NEXT:       tt.make_tensor_ptr
 CHECK-SAME:       <tensor<128xf32>, 1>
@@ -565,12 +678,14 @@ ENTRY main {
   TritonGemmConfig config(16, 64, 32, 1, 1, 1);
   TF_EXPECT_OK(CreateTritonIrAndFileCheck(kHloText, config, EmitSoftMax,
                                           "triton_softmax_computation", R"(
+CHECK:        #[[MAP:.*]] = affine_map<()[s0] -> (s0 * 127)>
 CHECK:        tt.func @triton_fn(%[[P0:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %[[P1:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %[[P2:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %[[P3:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}) {
-CHECK-DAG:        %[[PID:.*]] = tt.get_program_id x : i32
-CHECK-DAG:        %[[PID_i64:.*]] = arith.extsi %[[PID]] : i32 to i64
 CHECK-DAG:        %[[C127_i64:.*]] = arith.constant 127 : i64
 CHECK-DAG:        %[[ZERO_OFFSET:.*]] = arith.constant 0 : i64
-CHECK:            %[[ROW_OFFSET:.*]] = arith.muli %[[PID_i64]], %[[C127_i64]] : i64
+CHECK-DAG:        %[[PID:.*]] = tt.get_program_id x : i32
+CHECK-DAG:        %[[PID_INDEX:.*]] = arith.index_castui %[[PID]] : i32 to index
+CHECK:            %[[ROW_OFFSET_INDEX:.*]] = affine.apply #[[MAP]]()[%[[PID_INDEX]]]
+CHECK:            %[[ROW_OFFSET:.*]] = arith.index_castui %[[ROW_OFFSET_INDEX]] : index to i64
 CHECK:            %[[ARG0:.*]] = tt.addptr %[[P0]], %[[ROW_OFFSET]] : !tt.ptr<f32, 1>, i64
 CHECK-NEXT:       tt.make_tensor_ptr
 CHECK-SAME:       <tensor<128xf32>, 1>
@@ -581,6 +696,7 @@ CHECK-NEXT:       tt.make_tensor_ptr
 CHECK-SAME:       <tensor<128xf32>, 1>
 CHECK-NEXT:       tt.load
 CHECK-SAME:       {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<128xf32>, 1> -> tensor<128xf32>
+CHECK:            %[[PID_i64:.*]] = arith.index_castui %[[PID_INDEX]] : index to i64
 CHECK:            %[[ARG2:.*]] = tt.addptr %[[P2]], %[[PID_i64]] : !tt.ptr<f32, 1>, i64
 CHECK-NEXT:       tt.load %[[ARG2]] {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
 CHECK:            tt.reduce
@@ -635,12 +751,14 @@ ENTRY main {
   TritonGemmConfig config(16, 64, 32, 1, 1, 1);
   TF_EXPECT_OK(CreateTritonIrAndFileCheck(kHloText, config, EmitSoftMax,
                                           "triton_softmax_computation", R"(
+CHECK:        #[[MAP:.*]] = affine_map<()[s0] -> (s0 * 127)>
 CHECK:        tt.func @triton_fn(%[[P0:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %[[P1:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %[[P2:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}, %[[P3:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}) {
 CHECK-DAG:        %[[PID:.*]] = tt.get_program_id x : i32
-CHECK-DAG:        %[[PID_i64:.*]] = arith.extsi %[[PID]] : i32 to i64
+CHECK-DAG:        %[[PID_INDEX:.*]] = arith.index_castui %[[PID]] : i32 to index
 CHECK-DAG:        %[[C127_i64:.*]] = arith.constant 127 : i64
 CHECK-DAG:        %[[ZERO_OFFSET:.*]] = arith.constant 0 : i64
-CHECK:            %[[ROW_OFFSET:.*]] = arith.muli %[[PID_i64]], %[[C127_i64]] : i64
+CHECK-DAG:        %[[ROW_OFFSET_INDEX:.*]] = affine.apply #[[MAP]]()[%[[PID_INDEX]]]
+CHECK-DAG:        %[[ROW_OFFSET:.*]] = arith.index_castui %[[ROW_OFFSET_INDEX]] : index to i64
 CHECK:            %[[ARG0:.*]] = tt.addptr %[[P0]], %[[ROW_OFFSET]] : !tt.ptr<f32, 1>, i64
 CHECK-NEXT:       tt.make_tensor_ptr
 CHECK-SAME:       <tensor<128xf32>, 1>
@@ -651,6 +769,7 @@ CHECK-NEXT:       tt.make_tensor_ptr
 CHECK-SAME:       <tensor<128xf32>, 1>
 CHECK-NEXT:       tt.load
 CHECK-SAME:       {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<128xf32>, 1> -> tensor<128xf32>
+CHECK:            %[[PID_i64:.*]] = arith.index_castui %[[PID_INDEX]] : index to i64
 CHECK:            %[[ARG2:.*]] = tt.addptr %[[P2]], %[[PID_i64]] : !tt.ptr<f32, 1>, i64
 CHECK-NEXT:       tt.load %[[ARG2]] {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
 CHECK:            tt.reduce
@@ -706,6 +825,7 @@ ENTRY main {
   TritonGemmConfig config(16, 64, 32, 1, 1, 1);
   TF_ASSERT_OK(CreateTritonIrAndFileCheck(kHloText, config, EmitSoftMax,
                                           "triton_softmax_computation", R"(
+CHECK: #[[MAP:.*]] = affine_map<()[s0] -> (s0 * 16)>
 CHECK-LABEL:   tt.func @triton_fn(
 CHECK-SAME:        %[[P0:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32},
 CHECK-SAME:        %[[P1:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32},
@@ -714,16 +834,16 @@ CHECK-DAG:       %[[ZERO_OFFSET:.*]] = arith.constant 0 : i32
 CHECK-DAG:       %[[C1_i64:.*]] = arith.constant 1 : i64
 CHECK-DAG:       %[[C16_i64:.*]] = arith.constant 16 : i64
 CHECK-DAG:       %[[PID:.*]] = tt.get_program_id x : i32
-CHECK:           %[[PID_i64:.*]] = arith.extsi %[[PID]] : i32 to i64
-CHECK:           %[[ROW_OFFSET:.*]] = arith.muli %[[PID_i64]], %[[C16_i64]] : i64
+CHECK:           %[[PID_INDEX:.*]] = arith.index_castui %[[PID]] : i32 to index
+CHECK:           %[[PID_i64:.*]] = arith.index_castui %[[PID_INDEX]] : index to i64
+CHECK:           tt.addptr %[[P1]], %[[PID_i64]] : !tt.ptr<f32, 1>, i64
+CHECK:           tt.splat
+CHECK:           %[[ROW_OFFSET_INDEX:.*]] = affine.apply #[[MAP]]()[%[[PID_INDEX]]]
+CHECK:           %[[ROW_OFFSET:.*]] = arith.index_castui %[[ROW_OFFSET_INDEX]] : index to i64
 CHECK:           tt.addptr %[[P0]], %[[ROW_OFFSET]] : !tt.ptr<f32, 1>, i64
 CHECK:           tt.make_tensor_ptr
 CHECK-SAME:      <tensor<16xf32>, 1>
 CHECK:           tt.load
-CHECK-SAME:      !tt.ptr<tensor<16xf32>, 1> -> tensor<16xf32>
-CHECK:           tt.addptr %[[P1]], %[[PID_i64]] : !tt.ptr<f32, 1>, i64
-CHECK:           tt.load
-CHECK-SAME:      {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
 CHECK:           tt.reduce
 CHECK-NEXT:      ^bb0(%[[ARG3:.*]]: f32, %[[ARG4:.*]]: f32):
 CHECK:             %[[MAX:.*]] = arith.maximumf %[[ARG3]], %[[ARG4]] : f32
@@ -740,6 +860,49 @@ CHECK-SAME:      !tt.ptr<tensor<16xf32>, 1>, tensor<16xf32>
                                                 /*arel=*/0}));
 }
 
+TEST_F(TritonFilecheckTest, NestedReducerFusionGetsCodegenedCorrectly) {
+  // TODO(b/327336797): remove filter once V100 codegen in Triton is removed.
+  if (!GetCudaComputeCapability().IsAtLeast(
+          se::CudaComputeCapability::AMPERE)) {
+    GTEST_SKIP() << "Doesn't pass on pre-Ampere GPUs.";
+  }
+
+  const std::string kHloText = R"(
+HloModule softmax
+
+fused_convert {
+  p0 = f32[] parameter(0)
+  p1 = f32[] parameter(1)
+  convert0 = bf16[] convert(p0)
+  convert1 = bf16[] convert(p1)
+  add = bf16[] add(convert0, convert1)
+  ROOT output = f32[] convert(add)
+}
+
+add_computation {
+  p0 = f32[] parameter(0)
+  p1 = f32[] parameter(1)
+  ROOT fusion = f32[] fusion(p0, p1), kind=kLoop, calls=fused_convert
+}
+
+triton_softmax_computation {
+  p0 = pred[10,128]{1,0} parameter(0)
+  p0_f32 = f32[10,128]{1,0} convert(p0)
+  zero = f32[] constant(0)
+  reduce = f32[10]{0} reduce(p0_f32, zero), dimensions={1}, to_apply=add_computation
+  broadcast = f32[10,128]{1,0} broadcast(reduce), dimensions={0}
+  ROOT add = f32[10,128]{1,0} add(p0_f32, broadcast)
+}
+
+ENTRY main {
+  p0 = pred[10,128]{1,0} parameter(0)
+  ROOT softmax = f32[10,128] fusion(p0), kind=kCustom, calls=triton_softmax_computation, backend_config={"fusion_backend_config":{"kind":"__triton_softmax"}}
+})";
+
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloText, ErrorSpec{/*aabs=*/0,
+                                                           /*arel=*/0}));
+}
+
 TEST_F(
     TritonFilecheckTest,
     DiamondWithAdditionalDiamondParameterBroadcastedAlongBatchDimProducesAccurateResults) {  // NOLINT(whitespace/line_length)
@@ -775,23 +938,24 @@ ENTRY main {
   TritonGemmConfig config(16, 64, 32, 1, 1, 1);
   TF_ASSERT_OK(CreateTritonIrAndFileCheck(kHloText, config, EmitSoftMax,
                                           "triton_softmax_computation", R"(
+CHECK: #[[MAP:.*]] = affine_map<()[s0] -> (s0 * 32)>
 CHECK-LABEL:   tt.func @triton_fn(
 CHECK-SAME:        %[[P0:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32},
 CHECK-SAME:        %[[P1:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32},
 CHECK-SAME:        %[[P2:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}) {
-CHECK-DAG:       %[[ZERO_OFFSET_i32:.*]] = arith.constant 0 : i32
+CHECK-DAG:       %[[ZERO_OFFSET:.*]] = arith.constant 0 : i32
+CHECK-DAG:       %[[C0_i64:.*]] = arith.constant 0 : i64
 CHECK-DAG:       %[[C1_i64:.*]] = arith.constant 1 : i64
-CHECK-DAG:       %[[C32_i64:.*]] = arith.constant 32 : i64
-CHECK-DAG:       %[[ZERO_OFFSET:.*]] = arith.constant 0 : i64
 CHECK-DAG:       %[[PID:.*]] = tt.get_program_id x : i32
-CHECK:           %[[PID_i64:.*]] = arith.extsi %[[PID]] : i32 to i64
-CHECK:           %[[ROW_OFFSET:.*]] = arith.muli %[[PID_i64]], %[[C32_i64]] : i64
-CHECK:           tt.addptr %[[P0]], %[[ROW_OFFSET]] : !tt.ptr<f32, 1>, i64
+CHECK:           %[[PID_INDEX:.*]] = arith.index_castui %[[PID]] : i32 to index
+CHECK:           tt.addptr %[[P1]], %[[C0_i64]] : !tt.ptr<f32, 1>, i64
 CHECK:           tt.make_tensor_ptr
 CHECK-SAME:      <tensor<32xf32>, 1>
 CHECK:           tt.load
 CHECK-SAME:      !tt.ptr<tensor<32xf32>, 1> -> tensor<32xf32>
-CHECK:           tt.addptr %[[P1]], %[[ZERO_OFFSET]] : !tt.ptr<f32, 1>, i64
+CHECK:           %[[ROW_OFFSET_INDEX:.*]] = affine.apply #[[MAP]]()[%[[PID_INDEX]]]
+CHECK:           %[[ROW_OFFSET:.*]] = arith.index_castui %[[ROW_OFFSET_INDEX]] : index to i64
+CHECK:           tt.addptr %[[P0]], %[[ROW_OFFSET]] : !tt.ptr<f32, 1>, i64
 CHECK-NEXT:      tt.make_tensor_ptr
 CHECK-SAME:      <tensor<32xf32>, 1>
 CHECK:           tt.load
@@ -847,6 +1011,7 @@ ENTRY main {
   TritonGemmConfig config(16, 64, 32, 1, 1, 1);
   TF_ASSERT_OK(CreateTritonIrAndFileCheck(kHloText, config, EmitSoftMax,
                                           "triton_softmax_computation", R"(
+// CHECK:         #[[MAP:.*]] = affine_map<()[s0] -> (s0 * 16)>
 // CHECK-LABEL:   tt.func @triton_fn(
 // CHECK-SAME:                       %[[P0:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32},
 // CHECK-SAME:                       %[[P1:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32},
@@ -856,16 +1021,17 @@ ENTRY main {
 // CHECK-DAG:       %[[C16_i64:.*]] = arith.constant 16 : i64
 // CHECK-DAG:       %[[ZERO_OFFSET_i64:.*]] = arith.constant 0 : i64
 // CHECK-DAG:       %[[PID:.*]] = tt.get_program_id x : i32
-// CHECK:           %[[PID_i64:.*]] = arith.extsi %[[PID]] : i32 to i64
-// CHECK:           %[[ROW_OFFSET:.*]] = arith.muli %[[PID_i64]], %[[C16_i64]] : i64
+// CHECK:           %[[PID_INDEX:.*]] = arith.index_castui %[[PID]] : i32 to index
+// CHECK:           tt.addptr %[[P1]], %[[ZERO_OFFSET_i64]] : !tt.ptr<f32, 1>, i64
+// CHECK-NEXT:      tt.load
+// CHECK-SAME:      {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
+// CHECK:           %[[ROW_OFFSET_INDEX:.*]] = affine.apply #[[MAP]]()[%[[PID_INDEX]]]
+// CHECK:           %[[ROW_OFFSET:.*]] = arith.index_castui %[[ROW_OFFSET_INDEX]] : index to i64
 // CHECK:           tt.addptr %[[P0]], %[[ROW_OFFSET]] : !tt.ptr<f32, 1>, i64
 // CHECK:           tt.make_tensor_ptr
 // CHECK-SAME:      <tensor<16xf32>, 1>
 // CHECK:           tt.load
 // CHECK-SAME:      !tt.ptr<tensor<16xf32>, 1> -> tensor<16xf32>
-// CHECK:           tt.addptr %[[P1]], %[[ZERO_OFFSET_i64]] : !tt.ptr<f32, 1>, i64
-// CHECK-NEXT:      tt.load
-// CHECK-SAME:      {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
 // CHECK:           tt.reduce
 // CHECK:           ^bb0(%[[ARG3:.*]]: f32, %[[ARG4:.*]]: f32):
 // CHECK:             %[[MAX:.*]] = arith.maximumf %[[ARG3]], %[[ARG4]] : f32
@@ -917,23 +1083,25 @@ ENTRY main {
   TritonGemmConfig config(16, 64, 32, 1, 1, 1);
   ASSERT_OK(CreateTritonIrAndFileCheck(kHloText, config, EmitSoftMax,
                                        "triton_softmax_computation", R"(
+// CHECK: #[[MAP:.*]] = affine_map<()[s0] -> (s0 * 16)>
 // CHECK-LABEL:   tt.func @triton_fn(
 // CHECK-SAME:                       %[[P0:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32},
 // CHECK-SAME:                       %[[P1:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32},
 // CHECK-SAME:                       %[[P2:[^:]*]]: !tt.ptr<f32, 1> {tt.divisibility = 16 : i32}) {
 // CHECK-DAG:       %[[ZERO_OFFSET_i32:.*]] = arith.constant 0 : i32
+// CHECK-DAG:       %[[C0_i64:.*]] = arith.constant 0 : i64
 // CHECK-DAG:       %[[C1_i64:.*]] = arith.constant 1 : i64
 // CHECK-DAG:       %[[C16_i64:.*]] = arith.constant 16 : i64
-// CHECK-DAG:       %[[ZERO_OFFSET_i64:.*]] = arith.constant 0 : i64
 // CHECK-DAG:       %[[PID:.*]] = tt.get_program_id x : i32
-// CHECK:           %[[PID_i64:.*]] = arith.extsi %[[PID]] : i32 to i64
-// CHECK:           %[[ROW_OFFSET:.*]] = arith.muli %[[PID_i64]], %[[C16_i64]] : i64
-// CHECK:           tt.addptr %[[P0]], %[[ROW_OFFSET]] : !tt.ptr<f32, 1>, i64
+// CHECK:           %[[PID_INDEX:.*]] = arith.index_castui %[[PID]] : i32 to index
+// CHECK:           tt.addptr %[[P1]], %[[C0_i64]] : !tt.ptr<f32, 1>, i64
 // CHECK-NEXT:      tt.make_tensor_ptr
 // CHECK-SAME:      <tensor<16xf32>, 1>
 // CHECK:           tt.load
 // CHECK-SAME:      !tt.ptr<tensor<16xf32>, 1> -> tensor<16xf32>
-// CHECK:           tt.addptr %[[P1]], %[[ZERO_OFFSET_i64]] : !tt.ptr<f32, 1>, i64
+// CHECK:           %[[ROW_OFFSET_INDEX:.*]] = affine.apply #[[MAP]]()[%[[PID_INDEX]]]
+// CHECK:           %[[ROW_OFFSET:.*]] = arith.index_castui %[[ROW_OFFSET_INDEX]] : index to i64
+// CHECK:           tt.addptr %[[P0]], %[[ROW_OFFSET]] : !tt.ptr<f32, 1>, i64
 // CHECK-NEXT:      tt.make_tensor_ptr
 // CHECK-SAME:      <tensor<16xf32>, 1>
 // CHECK-NEXT:      tt.load
@@ -1032,11 +1200,7 @@ CHECK-DAG: %[[ARG_PTR:.*]] = arith.select %[[CONCAT_COND:.*]], %[[P1]], %[[P2]]
 CHECK-DAG: %[[BATCH_STRIDE_P1:.*]] = arith.constant 1280
 CHECK-DAG: %[[BATCH_STRIDE_P2:.*]] = arith.constant 2560
 CHECK-DAG: %[[BATCH_STRIDE:.*]] = arith.select %[[CONCAT_COND_2:.*]], %[[BATCH_STRIDE_P1]], %[[BATCH_STRIDE_P2]]
-COM:       -- Note: we use "CHECK" below voluntarily because the current codegen
-COM:       -- does not do any kind of CSE before returning. This causes
-COM:       -- PID_BATCH to be constructed several times, and by construction,
-COM:       -- the second one is the relevant one.
-CHECK:     %[[PID_BATCH:.*]] = tt.get_program_id y
+CHECK-DAG: %[[PID_BATCH:.*]] = tt.get_program_id y
 CHECK-DAG: %[[OFFSET:.*]] = arith.muli %[[PID_BATCH]], %[[BATCH_STRIDE]]
 CHECK:     %[[BLOCK_BASE_PTR:.*]] = tt.addptr %[[ARG_PTR]], %[[OFFSET]]
 )"));
@@ -1104,7 +1268,7 @@ ENTRY e {
 TEST_F(TritonGemmTest, UseTensorCoresForF32OnAmpere) {
   const std::string kHloText = R"(
 triton_gemm_r {
-  parameter_0 = s8[80,15]{1,0} parameter(0)
+  parameter_0 = f16[80,15]{1,0} parameter(0)
   convert.3 = f32[80,15]{1,0} convert(parameter_0)
   parameter_1 = f32[16,15]{1,0} parameter(1)
   ROOT r.1 = f32[80,16]{1,0} dot(convert.3, parameter_1),
@@ -1113,7 +1277,7 @@ triton_gemm_r {
 
 ENTRY e {
   p1 = f32[16,15]{1,0} parameter(1)
-  p0 = s8[80,15]{1,0} parameter(0)
+  p0 = f16[80,15]{1,0} parameter(0)
   ROOT triton_gemm_r = f32[80,16]{1,0} fusion(p0, p1), kind=kCustom,
     calls=triton_gemm_r,
     backend_config={"fusion_backend_config": {kind: "__triton_gemm", triton_gemm_config:
@@ -1124,17 +1288,10 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> verified_module,
                           ParseAndReturnVerifiedModule(kHloText));
 
-  if (GetCudaComputeCapability().IsAtLeast(se::CudaComputeCapability::AMPERE)) {
-    CompileAndOptionallyVerifyPtx(std::move(verified_module),
-                                  R"(
+  CompileAndOptionallyVerifyPtx(std::move(verified_module),
+                                R"(
 CHECK: mma
 )");
-  } else {
-    CompileAndOptionallyVerifyPtx(std::move(verified_module),
-                                  R"(
-CHECK: fma
-)");
-  }
 }
 
 TEST_F(TritonGemmTest, FailIfTooMuchShmem) {
@@ -1170,7 +1327,7 @@ ENTRY entry {
   TritonGemmConfig config(16, 32, 512, 1, 4, 8);
   EXPECT_THAT(
       TritonWrapper(*TritonFusionAnalysis::Execute(*triton_dot_computation),
-                    "test_fn", triton_dot_computation, kTritonGemmFusionKind,
+                    "test_fn", triton_dot_computation,
                     se::CudaComputeCapability{se::CudaComputeCapability::AMPERE,
                                               /*minor=*/0},
                     dev_info, config, &llvm_module, &EmitMatMul, mlir_context),
@@ -1185,7 +1342,7 @@ ENTRY entry {
   TF_ASSERT_OK_AND_ASSIGN(
       const auto result,
       TritonWrapper(*TritonFusionAnalysis::Execute(*triton_dot_computation),
-                    "test_fn", triton_dot_computation, kTritonGemmFusionKind,
+                    "test_fn", triton_dot_computation,
                     se::CudaComputeCapability{se::CudaComputeCapability::AMPERE,
                                               /*minor=*/0},
                     dev_info, config, &llvm_module, &EmitMatMul, mlir_context));
@@ -1673,7 +1830,7 @@ ENTRY entry {
   TritonGemmConfig config(512, 512, 32, 1, 1, 2);
   EXPECT_THAT(
       TritonWrapper(*TritonFusionAnalysis::Execute(*triton_dot_computation),
-                    "test_fn", triton_dot_computation, kTritonGemmFusionKind,
+                    "test_fn", triton_dot_computation,
                     se::CudaComputeCapability{se::CudaComputeCapability::AMPERE,
                                               /*minor=*/0},
                     dev_info, config, &llvm_module, &EmitMatMul, mlir_context),
@@ -1687,7 +1844,7 @@ ENTRY entry {
   config.block_k = 32;
   TF_CHECK_OK(
       TritonWrapper(*TritonFusionAnalysis::Execute(*triton_dot_computation),
-                    "test_fn", triton_dot_computation, kTritonGemmFusionKind,
+                    "test_fn", triton_dot_computation,
                     se::CudaComputeCapability{se::CudaComputeCapability::AMPERE,
                                               /*minor=*/0},
                     dev_info, config, &llvm_module, &EmitMatMul, mlir_context)
@@ -1856,12 +2013,6 @@ ENTRY e {
 
 class TritonGemmLevel2Test : public TritonGemmTest {
  public:
-  void SetUp() override {
-    if (!GetCudaComputeCapability().IsAtLeast(
-            se::CudaComputeCapability::AMPERE)) {
-      GTEST_SKIP() << "Triton fusion on pre-Ampere GPUs is limited.";
-    }
-  }
   DebugOptions GetDebugOptionsForTest() override {
     DebugOptions debug_options = TritonGemmTest::GetDebugOptionsForTest();
     debug_options.set_xla_gpu_triton_fusion_level(2);
@@ -2855,12 +3006,47 @@ ENTRY e {
                                       /*run_hlo_passes=*/false));
 }
 
-TEST_F(CompareTest, BF16TransposedLHS) {
-  if (!GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
+TEST_F(CompareTest, F32WithTrivialNonContractingDimension) {
+  const char* hlo_text_ref = R"(
+HloModule r
+
+ENTRY e {
+  arg0 = f32[5,7] parameter(0)
+  arg1 = f32[1,7] parameter(1)
+  gemm = (f32[5,1], s8[0]{0}) custom-call(arg0, arg1),
+    custom_call_target="__cublas$gemm",
+    backend_config={"gemm_backend_config": {"alpha_real":1,"beta":0,"dot_dimension_numbers":{"lhs_contracting_dimensions":[1],"rhs_contracting_dimensions":[1],"lhs_batch_dimensions":[],"rhs_batch_dimensions":[]},"alpha_imag":0,"precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},"epilogue":"DEFAULT"}}
+  ROOT get-tuple-element = f32[5,1]{1,0} get-tuple-element((f32[5,1]{1,0}, s8[0]{0}) gemm), index=0
+}
+)";
+
+  const char* hlo_text_triton = R"(
+HloModule t
+
+triton_dot {
+  p0 = f32[5,7] parameter(0)
+  p1 = f32[1,7] parameter(1)
+  ROOT dot = f32[5,1] dot(p0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+}
 
+ENTRY e {
+  p0 = f32[5,7]{1,0} parameter(0)
+  p1 = f32[1,7]{1,0} parameter(1)
+  ROOT _ = f32[5,1] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":32,"block_n":32,"block_k":32,
+                         "split_k":1,"num_stages":1,"num_warps":1,
+                         "num_ctas":1}}}
+}
+)";
+
+  EXPECT_TRUE(RunAndCompareTwoModules(hlo_text_ref, hlo_text_triton,
+                                      ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3},
+                                      /*run_hlo_passes=*/false));
+}
+
+TEST_F(CompareTest, BF16TransposedLHS) {
   const char* hlo_text_ref = R"(
 HloModule r
 
@@ -2901,11 +3087,6 @@ ENTRY e {
 }
 
 TEST_F(CompareTest, UsingOptinSharedMemoryOnAmpereProducesSameResult) {
-  // On pre-Ampere GPUs the test would use a different amount of shared memory.
-  if (!GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "This test is for Ampere+ GPUs.";
-  }
   const se::DeviceDescription dev_info =
       backend().default_stream_executor()->GetDeviceDescription();
   constexpr int kBytesOfSharedMemoryTested = 64 * 1024;
@@ -2954,7 +3135,7 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(
       const auto result,
       TritonWrapper(*TritonFusionAnalysis::Execute(*triton_dot_computation),
-                    "test_fn", triton_dot_computation, kTritonGemmFusionKind,
+                    "test_fn", triton_dot_computation,
                     GetCudaComputeCapability(), dev_info, triton_gemm_config,
                     &llvm_module, &EmitMatMul, mlir_context));
   // The config is chosen so that the used memory size is slightly above the
@@ -3071,10 +3252,6 @@ ENTRY e {
 }
 
 TEST_F(CompareTest, S8BF16) {
-  if (!GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const char* hlo_text_ref = R"(
 HloModule r
 
@@ -3122,10 +3299,6 @@ ENTRY e {
 }
 
 TEST_F(CompareTest, SplitK) {
-  if (!GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const std::string hlo_text_ref = R"(
 HloModule t, is_scheduled=true
 
@@ -3199,10 +3372,6 @@ ENTRY e {
 }
 
 TEST_F(CompareTest, SplitKBatch) {
-  if (!GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const std::string kHloTextRef = R"(
 HloModule m, is_scheduled=true
 
@@ -3265,10 +3434,6 @@ ENTRY e {
 }
 
 TEST_F(CompareTest, SplitKNontrivialBitcast) {
-  if (!GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const std::string kHloTextRef = R"(
 HloModule module, is_scheduled=true
 
@@ -3843,10 +4008,6 @@ ENTRY e {
 }
 
 TEST_F(CompareTest, PredToBF16ConversionWorks) {
-  if (!GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const std::string kHloTextTest = R"(
 HloModule m, is_scheduled=true
 
@@ -3967,10 +4128,6 @@ class TritonGemmContractionDims : public TritonGemmTest {
 };
 
 TEST_F(TritonGemmContractionDims, TritonDotForceContractionDims_1_0) {
-  if (!GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const std::string kHloText = R"(
 HloModule m
 
@@ -3993,10 +4150,6 @@ ENTRY e {
 }
 
 TEST_F(TritonGemmContractionDims, TritonDotForceContractionDims_1_2_1_2) {
-  if (!GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const std::string kHloText = R"(
 HloModule m
 
@@ -4019,10 +4172,6 @@ ENTRY e {
 }
 
 TEST_F(TritonGemmContractionDims, TritonDotForceContractionDims_1_2_0_1) {
-  if (!GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const std::string kHloText = R"(
 HloModule m
 
@@ -4046,10 +4195,6 @@ ENTRY e {
 }
 
 TEST_F(TritonGemmContractionDims, TritonDotForceContractionDims_1_1) {
-  if (!GetCudaComputeCapability().IsAtLeast(
-          se::CudaComputeCapability::AMPERE)) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const std::string kHloText = R"(
 HloModule m
 
@@ -4113,9 +4258,6 @@ class Triton6xBF16GemmTestWithFlag : public TritonFilecheckTest {
 };
 
 TEST_F(Triton6xBF16GemmTest, Emit6xBF16GemmWhenBothInputsAreF32) {
-  if (!GetCudaComputeCapability().IsAtLeastAmpere()) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const char* kHloText = R"(
 HloModule t
 
@@ -4159,9 +4301,6 @@ CHECK:          %[[ACC:.*]] = arith.addf %[[DOT_LAST]], %[[C0]] : tensor<32x32xf
 }
 
 TEST_F(Triton6xBF16GemmTestWithFlag, Emit6xBF16GemmWhenBothInputsAreF32) {
-  if (!GetCudaComputeCapability().IsAtLeastAmpere()) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const char* kHloText = R"(
 HloModule t
 
@@ -4204,9 +4343,6 @@ CHECK:          %[[ACC:.*]] = arith.addf %[[DOT_LAST]], %[[C0]] : tensor<32x32xf
 }
 
 TEST_F(Triton6xBF16GemmTest, Triton6xBF16GemmWorksForLongContractingDimension) {
-  if (!GetCudaComputeCapability().IsAtLeastAmpere()) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const char* kHloText = R"(
 HloModule t
 
@@ -4237,9 +4373,6 @@ CHECK-COUNT-6:  %{{.*}} = tt.dot %{{.*}}, %{{.*}}, %{{.*}} {allowTF32 = false, m
 }
 
 TEST_F(Triton6xBF16GemmTest, Triton6xBF16GemmCanHandleInfinity) {
-  if (!GetCudaComputeCapability().IsAtLeastAmpere()) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const char* kHloText = R"(
 HloModule t
 
@@ -4284,9 +4417,6 @@ CHECK-COUNT-6:  %{{.*}} = tt.dot %{{.*}}, %{{.*}}, %{{.*}} {allowTF32 = false, m
 }
 
 TEST_F(Triton6xBF16GemmTest, Triton6xBF16GemmCanHandleNaN) {
-  if (!GetCudaComputeCapability().IsAtLeastAmpere()) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const char* kHloText = R"(
 HloModule t
 
@@ -4343,9 +4473,6 @@ CHECK-COUNT-6:  %{{.*}} = tt.dot %{{.*}}, %{{.*}}, %{{.*}} {allowTF32 = false, m
 //   x_lo:  5.17201445e+33
 // The result of x*x would be NaN instead of positive infinity.
 TEST_F(Triton6xBF16GemmTest, Triton6xBF16GemmWorksForInputsWithLargeExponent) {
-  if (!GetCudaComputeCapability().IsAtLeastAmpere()) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const char* kHloText = R"(
 HloModule t
 
@@ -4389,42 +4516,6 @@ CHECK-COUNT-6:  %{{.*}} = tt.dot %{{.*}}, %{{.*}}, %{{.*}} {allowTF32 = false, m
                                ErrorSpec{/*aabs=*/1e-6, /*arel=*/1e-6}));
 }
 
-TEST_F(Triton6xBF16GemmTestWithFlag, ShouldNotEmit6xBF16GemmForPreAmpere) {
-  if (GetCudaComputeCapability().IsAtLeastAmpere()) {
-    GTEST_SKIP() << "6xBF16Gemm should be emitted post-Ampere.";
-  }
-  const char* kHloText = R"(
-HloModule t
-
-triton_dot {
-  p0 = f32[5,7] parameter(0)
-  p1 = f32[7,33] parameter(1)
-  ROOT dot = f32[5,33] dot(p0, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-}
-
-ENTRY e {
-  p0 = f32[5,7]{1,0} parameter(0)
-  p1 = f32[7,33]{1,0} parameter(1)
-  ROOT _ = f32[5,33] fusion(p0, p1), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config:
-    {"block_m":32,"block_n":32,"block_k":32,"split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}
-}
-)";
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> verified_module,
-                          ParseAndReturnVerifiedModule(kHloText));
-
-  CompileAndOptionallyVerifyPtx(std::move(verified_module),
-                                R"(
-CHECK-NOT: mma
-CHECK: selp.f32
-CHECK: st.shared{{(\.v[24])?}}.f32
-CHECK: ld.shared{{(\.v[24])?}}.f32
-CHECK: fma.rn.f32
-CHECK: st.shared{{(\.v[24])?}}.f32
-)");
-}
 
 TEST_F(Triton6xBF16GemmTest, Emit6xBF16GemmEndToEnd) {
   const char* kHloText = R"(
@@ -4440,18 +4531,13 @@ ENTRY e {
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> verified_module,
                           ParseAndReturnVerifiedModule(kHloText));
-  if (GetCudaComputeCapability().IsAtLeastAmpere()) {
-    CompileAndOptionallyVerifyPtx(std::move(verified_module),
-                                  R"(
+  CompileAndOptionallyVerifyPtx(std::move(verified_module),
+                                R"(
 CHECK: mma.sync.aligned.{{.*}}.row.col.f32.bf16.bf16.f32
 CHECK-NOT: mma.sync.aligned.{{.*}}.row.col.f32.tf32.tf32.f32
 )");
-    EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/1e-6,
-                                                  /*arel=*/1e-6}));
-  } else {
-    EXPECT_THAT(CompileToExecutable(std::move(verified_module)),
-                tsl::testing::StatusIs(absl::StatusCode::kUnimplemented));
-  }
+  EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/1e-6,
+                                                /*arel=*/1e-6}));
 }
 
 // In these tests, we depend on "algorithm" annotations for selecting the 3XBF16
@@ -4497,9 +4583,6 @@ class Triton3xBF16GemmTestWithFlag : public TritonFilecheckTest {
 };
 
 TEST_F(Triton3xBF16GemmTest, Emit3xBF16GemmWhenBothInputsAreF32) {
-  if (!GetCudaComputeCapability().IsAtLeastAmpere()) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const char* kHloText = R"(
 HloModule t
 
@@ -4543,9 +4626,6 @@ CHECK:          %[[ACC:.*]] = arith.addf %[[DOT_LAST]], %[[C0]] : tensor<32x32xf
 }
 
 TEST_F(Triton3xBF16GemmTestWithFlag, Emit3xBF16GemmWhenBothInputsAreF32) {
-  if (!GetCudaComputeCapability().IsAtLeastAmpere()) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const char* kHloText = R"(
 HloModule t
 
@@ -4617,9 +4697,6 @@ CHECK-NOT:  tt.dot
 }
 
 TEST_F(Triton3xBF16GemmTest, Triton3xBF16GemmWorksForLongContractingDimension) {
-  if (!GetCudaComputeCapability().IsAtLeastAmpere()) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const char* kHloText = R"(
 HloModule t
 
@@ -4650,9 +4727,6 @@ CHECK-COUNT-3:  %{{.*}} = tt.dot %{{.*}}, %{{.*}}, %{{.*}} {allowTF32 = false, m
 }
 
 TEST_F(Triton3xBF16GemmTest, Triton3xBF16GemmCanHandleInfinity) {
-  if (!GetCudaComputeCapability().IsAtLeastAmpere()) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const char* kHloText = R"(
 HloModule t
 
@@ -4697,9 +4771,6 @@ CHECK-COUNT-3:  %{{.*}} = tt.dot %{{.*}}, %{{.*}}, %{{.*}} {allowTF32 = false, m
 }
 
 TEST_F(Triton3xBF16GemmTest, Triton3xBF16GemmCanHandleNaN) {
-  if (!GetCudaComputeCapability().IsAtLeastAmpere()) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const char* kHloText = R"(
 HloModule t
 
@@ -4746,9 +4817,6 @@ CHECK-COUNT-3:  %{{.*}} = tt.dot %{{.*}}, %{{.*}}, %{{.*}} {allowTF32 = false, m
 }
 
 TEST_F(Triton3xBF16GemmTest, Triton3xBF16GemmWorksForInputsWithLargeExponent) {
-  if (!GetCudaComputeCapability().IsAtLeastAmpere()) {
-    GTEST_SKIP() << "No BF16 before Ampere.";
-  }
   const char* kHloText = R"(
 HloModule t
 
@@ -4806,19 +4874,57 @@ ENTRY e {
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> verified_module,
                           ParseAndReturnVerifiedModule(kHloText));
-  if (GetCudaComputeCapability().IsAtLeastAmpere()) {
-    CompileAndOptionallyVerifyPtx(std::move(verified_module),
-                                  R"(
+  CompileAndOptionallyVerifyPtx(std::move(verified_module),
+                                R"(
 CHECK: mma.sync.aligned.{{.*}}.row.col.f32.bf16.bf16.f32
 CHECK-NOT: mma.sync.aligned.{{.*}}.row.col.f32.tf32.tf32.f32
 )");
-    EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/1e-5,
-                                                  /*arel=*/1e-5}));
+  EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/1e-5,
+                                                /*arel=*/1e-5}));
+}
 
-  } else {
-    EXPECT_THAT(CompileToExecutable(std::move(verified_module)),
-                tsl::testing::StatusIs(absl::StatusCode::kUnimplemented));
-  }
+using TritonEmitterTest = TritonGemmTest;
+
+TEST_F(TritonEmitterTest, EmitterFailsIfComputeCapabilityIsBelowAmpere) {
+  const std::string kHloText = R"(
+HloModule module, is_scheduled=true
+
+triton_gemm_dot {
+  p0 = f32[10,20] parameter(0)
+  p1 = f32[20,30] parameter(1)
+  ROOT dot = f32[10,30] dot(p0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY entry {
+  p0 = f32[10,20] parameter(0)
+  p1 = f32[20,30] parameter(1)
+  ROOT r = f32[10,30] fusion(p0, p1),
+    kind=kCustom, calls=triton_gemm_dot
+})";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(kHloText));
+  const HloComputation* triton_dot_computation =
+      hlo_module->entry_computation()
+          ->root_instruction()
+          ->fused_instructions_computation();
+  const se::DeviceDescription dev_info =
+      TestGpuDeviceInfo::RTXA6000DeviceInfo();
+  llvm::LLVMContext llvm_ctx;
+  llvm::Module llvm_module("module", llvm_ctx);
+  mlir::MLIRContext mlir_context;
+
+  EXPECT_THAT(
+      TritonWrapper(*TritonFusionAnalysis::Execute(*triton_dot_computation),
+                    "test_fn", triton_dot_computation,
+                    se::CudaComputeCapability{se::CudaComputeCapability::VOLTA,
+                                              /*minor=*/0},
+                    dev_info, TritonGemmConfig{}, &llvm_module, &EmitMatMul,
+                    mlir_context),
+      tsl::testing::StatusIs(
+          absl::StatusCode::kFailedPrecondition,
+          ::testing::StrEq(
+              "Triton support is only enabled for Ampere GPUs and up.")));
 }
 
 }  // namespace
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc b/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc
index 1826b5723b27f0..62a695dd4af293 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc
+++ b/third_party/xla/xla/service/gpu/ir_emitter_unnested.cc
@@ -110,7 +110,6 @@ limitations under the License.
 #include "xla/service/gpu/launch_dimensions.h"
 #include "xla/service/gpu/matmul_utils.h"
 #include "xla/service/gpu/nccl_api.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
 #include "xla/service/gpu/parallel_loop_emitter.h"
 #include "xla/service/gpu/runtime/command_buffer_cmd.h"
 #include "xla/service/gpu/runtime/command_buffer_cmd_emitter.h"
@@ -129,6 +128,7 @@ limitations under the License.
 #include "xla/service/gpu/runtime/nccl_all_to_all_thunk.h"
 #include "xla/service/gpu/runtime/nccl_collective_broadcast_thunk.h"
 #include "xla/service/gpu/runtime/nccl_collective_permute_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/service/gpu/runtime/nccl_recv_thunk.h"
 #include "xla/service/gpu/runtime/nccl_send_thunk.h"
 #include "xla/service/gpu/runtime/norm_thunk.h"
@@ -136,9 +136,9 @@ limitations under the License.
 #include "xla/service/gpu/runtime/replica_id_thunk.h"
 #include "xla/service/gpu/runtime/send_recv_thunk.h"
 #include "xla/service/gpu/runtime/sequential_thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/gpu/runtime/wait_for_streams_thunk.h"
 #include "xla/service/gpu/runtime/while_thunk.h"
-#include "xla/service/gpu/thunk.h"
 #include "xla/service/gpu/triton_call.h"
 #include "xla/service/llvm_ir/buffer_assignment_util.h"
 #include "xla/service/llvm_ir/ir_array.h"
@@ -1303,57 +1303,6 @@ absl::Status IrEmitterUnnested::EmitCholeskyThunk(const HloInstruction* instr) {
 }
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
-// Converts MLIR dictionary attribute attached to a custom call operation to a
-// custom call thunk attributes that are forwarded to the FFI handler.
-static absl::StatusOr<CustomCallThunk::AttributesMap> BuildAttributesMap(
-    mlir::DictionaryAttr dict) {
-  CustomCallThunk::AttributesMap attributes;
-  for (auto& kv : dict) {
-    std::string_view name = kv.getName().strref();
-
-    auto integer = [&](mlir::IntegerAttr integer) {
-      switch (integer.getType().getIntOrFloatBitWidth()) {
-        case 32:
-          attributes[name] = static_cast<int32_t>(integer.getInt());
-          return absl::OkStatus();
-        case 64:
-          attributes[name] = static_cast<int64_t>(integer.getInt());
-          return absl::OkStatus();
-        default:
-          return absl::InvalidArgumentError(absl::StrCat(
-              "Unsupported integer attribute bit width for attribute: ", name));
-      }
-    };
-
-    auto fp = [&](mlir::FloatAttr fp) {
-      switch (fp.getType().getIntOrFloatBitWidth()) {
-        case 32:
-          attributes[name] = static_cast<float>(fp.getValue().convertToFloat());
-          return absl::OkStatus();
-        default:
-          return absl::InvalidArgumentError(absl::StrCat(
-              "Unsupported float attribute bit width for attribute: ", name));
-      }
-    };
-
-    auto str = [&](mlir::StringAttr str) {
-      attributes[name] = str.getValue().str();
-      return absl::OkStatus();
-    };
-
-    TF_RETURN_IF_ERROR(
-        llvm::TypeSwitch<mlir::Attribute, Status>(kv.getValue())
-            .Case<mlir::IntegerAttr>(integer)
-            .Case<mlir::FloatAttr>(fp)
-            .Case<mlir::StringAttr>(str)
-            .Default([&](mlir::Attribute) {
-              return absl::InvalidArgumentError(absl::StrCat(
-                  "Unsupported attribute type for attribute: ", name));
-            }));
-  }
-  return attributes;
-}
-
 absl::Status IrEmitterUnnested::EmitCustomCallThunk(
     const HloCustomCallInstruction* instr) {
   const std::string call_target_name = instr->custom_call_target();
@@ -1366,12 +1315,12 @@ absl::Status IrEmitterUnnested::EmitCustomCallThunk(
   void* call_target = CustomCallTargetRegistry::Global()->Lookup(
       call_target_name, std::string(platform_name()));
 
-  absl::StatusOr<XLA_FFI_Handler*> handler =
+  absl::StatusOr<ffi::HandlerRegistration> registration =
       ffi::FindHandler(call_target_name, platform_name());
 
   // At least one implementation should be available at run time.
   bool found_custom_call = !is_ffi_custom_call && call_target != nullptr;
-  bool found_ffi_handler = is_ffi_custom_call && handler.ok();
+  bool found_ffi_handler = is_ffi_custom_call && registration.ok();
 
   if (!found_custom_call && !found_ffi_handler) {
     auto& debug_options = ir_emitter_context_->debug_options();
@@ -1503,7 +1452,7 @@ absl::Status IrEmitterUnnested::EmitCustomCallThunk(
   auto ffi_thunk = [&] {
     auto& called_computations = instr->called_computations();
     return std::make_unique<CustomCallThunk>(
-        Thunk::ThunkInfo::WithProfileAnnotation(instr), *handler,
+        Thunk::ThunkInfo::WithProfileAnnotation(instr), registration->handler,
         std::move(operands), std::move(results), std::move(attributes),
         called_computations.empty() ? nullptr : called_computations[0]);
   };
@@ -1704,7 +1653,8 @@ absl::Status IrEmitterUnnested::EmitTritonCustomCall(
     TF_ASSIGN_OR_RETURN(
         auto kernel_arguments,
         KernelArguments::Create(ir_emitter_context_->buffer_assignment(), instr,
-                                instr->operands()));
+                                instr->operands(),
+                                /*dedup=*/false));
     auto launch_dimensions =
         LaunchDimensions(se::BlockDim(call.grid_x, call.grid_y, call.grid_z),
                          se::ThreadDim(call.num_warps * 32));
@@ -1747,7 +1697,8 @@ absl::Status IrEmitterUnnested::EmitTritonCustomCall(
   TF_ASSIGN_OR_RETURN(
       auto kernel_arguments,
       KernelArguments::Create(ir_emitter_context_->buffer_assignment(), instr,
-                              instr->operands()));
+                              instr->operands(),
+                              /*dedup=*/false));
 
   AddThunkToThunkSequence(std::make_unique<KernelThunk>(
       instr, entry->kernel_name, kernel_arguments.args(),
@@ -2230,7 +2181,8 @@ Status IrEmitterUnnested::EmitCollectivePermute(
   // First output is aliased.
   TF_RET_CHECK(
       instr->shape().IsTuple() && instr->shape().tuple_shapes_size() == 2 &&
-      instr->shape().tuple_shapes(0) == instr->shape().tuple_shapes(1));
+      Shape::Equal().IgnoreMemorySpaceInLayout()(
+          instr->shape().tuple_shapes(0), instr->shape().tuple_shapes(1)));
   TF_ASSIGN_OR_RETURN(BufferAllocation::Slice result_slice,
                       GetAllocationSliceForHlo(instr, {1}));
 
@@ -2238,6 +2190,9 @@ Status IrEmitterUnnested::EmitCollectivePermute(
   const auto& hlo_config = ir_emitter_context_->hlo_module().config();
   const int64_t replica_count = hlo_config.replica_count();
   const int64_t partition_count = hlo_config.num_partitions();
+  const int64_t src_memory_space = shape.layout().memory_space();
+  const int64_t dst_memory_space =
+      instr->shape().tuple_shapes(1).layout().memory_space();
 
   if (NcclCollectivePermuteStartThunk::IsDegenerate(instr, replica_count,
                                                     partition_count)) {
@@ -2253,7 +2208,9 @@ Status IrEmitterUnnested::EmitCollectivePermute(
     const NcclCollectiveThunk::Buffer buffer = {
         /*element_count=*/ShapeUtil::ElementsIn(shape),
         /*source_buffer=*/source_slice,
-        /*destination_buffer=*/result_slice};
+        /*destination_buffer=*/result_slice,
+        /*source_memory_space=*/src_memory_space,
+        /*destination_memory_space=*/dst_memory_space};
     auto thunk = std::make_unique<NcclCollectivePermuteStartThunk>(
         Thunk::ThunkInfo::WithProfileAnnotation(instr), NcclApi::Default(),
         instr, replica_count, partition_count, buffer);
@@ -2670,10 +2627,16 @@ absl::Status IrEmitterUnnested::EmitSendThunk(const HloSendInstruction* instr) {
     const auto& hlo_config = ir_emitter_context_->hlo_module().config();
     const int64_t replica_count = hlo_config.replica_count();
     const int64_t partition_count = hlo_config.num_partitions();
+    const int64_t memory_space =
+        instr->shape().IsTuple()
+            ? instr->shape().tuple_shapes(0).layout().memory_space()
+            : instr->shape().layout().memory_space();
     const NcclCollectiveThunk::Buffer nccl_buffer = {
         /*element_count=*/ShapeUtil::ElementsIn(src->shape()),
         /*source_buffer=*/buffer,
-        /*destination_buffer=*/buffer};
+        /*destination_buffer=*/buffer,
+        /*source_memory_space=*/memory_space,
+        /*destination_memory_space=*/memory_space};
     auto thunk = std::make_unique<NcclSendThunk>(
         Thunk::ThunkInfo::WithProfileAnnotation(instr), NcclApi::Default(),
         instr, replica_count, partition_count, nccl_buffer);
@@ -2732,14 +2695,23 @@ absl::Status IrEmitterUnnested::EmitRecvThunk(const HloRecvInstruction* instr) {
   TF_RET_CHECK(instr->shape().IsTuple());
   TF_ASSIGN_OR_RETURN(BufferAllocation::Slice buffer,
                       GetAllocationSliceForHlo(instr, {0}));
+
   if (!instr->is_host_transfer()) {
     const auto& hlo_config = ir_emitter_context_->hlo_module().config();
     const int64_t replica_count = hlo_config.replica_count();
     const int64_t partition_count = hlo_config.num_partitions();
+
+    const int64_t memory_space =
+        instr->shape().IsTuple()
+            ? instr->shape().tuple_shapes(0).layout().memory_space()
+            : instr->shape().layout().memory_space();
+
     const NcclCollectiveThunk::Buffer nccl_buffer = {
         /*element_count=*/ShapeUtil::ElementsIn(instr->shape().tuple_shapes(0)),
         /*source_buffer=*/buffer,
-        /*destination_buffer=*/buffer};
+        /*destination_buffer=*/buffer,
+        /*source_memory_space=*/memory_space,
+        /*destination_memory_space=*/memory_space};
     auto thunk = std::make_unique<NcclRecvThunk>(
         Thunk::ThunkInfo::WithProfileAnnotation(instr), NcclApi::Default(),
         instr, replica_count, partition_count, nccl_buffer);
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_unnested.h b/third_party/xla/xla/service/gpu/ir_emitter_unnested.h
index f378c07872c00f..f91db2db5ecfeb 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_unnested.h
+++ b/third_party/xla/xla/service/gpu/ir_emitter_unnested.h
@@ -43,7 +43,7 @@ limitations under the License.
 #include "xla/service/gpu/ir_emitter_context.h"
 #include "xla/service/gpu/launch_dimensions.h"
 #include "xla/service/gpu/runtime/send_recv_thunk.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/llvm_ir/ir_array.h"
 #include "xla/service/llvm_ir/llvm_util.h"
 #include "xla/service/llvm_ir/loop_emitter.h"
diff --git a/third_party/xla/xla/service/gpu/kernel_arguments.cc b/third_party/xla/xla/service/gpu/kernel_arguments.cc
index 80f98779a3a93b..ebdfbd7946cad3 100644
--- a/third_party/xla/xla/service/gpu/kernel_arguments.cc
+++ b/third_party/xla/xla/service/gpu/kernel_arguments.cc
@@ -60,11 +60,11 @@ absl::StatusOr<KernelArguments> KernelArguments::Create(
         return absl::OkStatus();
       }));
 
-  return KernelArguments{std::move(kernel_arguments)};
+  return KernelArguments{std::move(kernel_arguments), /*dedup=*/true};
 }
 
 std::vector<KernelArgument> KernelArguments::ProcessArguments(
-    std::vector<KernelArgument> kernel_arguments) {
+    std::vector<KernelArgument> kernel_arguments, bool dedup) {
   absl::flat_hash_set<BufferAllocation::Slice> buffers_written;
   for (const KernelArgument& kernel_argument : kernel_arguments) {
     if (kernel_argument.written()) {
@@ -79,7 +79,7 @@ std::vector<KernelArgument> KernelArguments::ProcessArguments(
     KernelArgument& kernel_argument = kernel_arguments[i];
 
     auto& first_index = first_indices_for_slices[kernel_argument.slice_];
-    if (first_index) {
+    if (dedup && first_index) {
       const KernelArgument& same = kernel_arguments[*first_index];
       kernel_argument.first_with_same_slice_ = first_index;
       kernel_argument.alignment_ = same.alignment_;
@@ -128,7 +128,7 @@ std::vector<KernelArgument> KernelArguments::ProcessArguments(
 absl::StatusOr<KernelArguments> KernelArguments::Create(
     const BufferAssignment& buffer_assignment,
     const HloInstruction* non_fusion_hlo,
-    absl::Span<const HloInstruction* const> needed_operands) {
+    absl::Span<const HloInstruction* const> needed_operands, bool dedup) {
   std::vector<KernelArgument> kernel_arguments;
   for (const HloInstruction* operand : needed_operands) {
     TF_ASSIGN_OR_RETURN(BufferAllocation::Slice slice,
@@ -151,7 +151,7 @@ absl::StatusOr<KernelArguments> KernelArguments::Create(
         return absl::OkStatus();
       }));
 
-  return KernelArguments{std::move(kernel_arguments)};
+  return KernelArguments{std::move(kernel_arguments), dedup};
 }
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/service/gpu/kernel_arguments.h b/third_party/xla/xla/service/gpu/kernel_arguments.h
index 2b303383440675..eeff9720ec7fce 100644
--- a/third_party/xla/xla/service/gpu/kernel_arguments.h
+++ b/third_party/xla/xla/service/gpu/kernel_arguments.h
@@ -70,16 +70,17 @@ class KernelArguments {
   static absl::StatusOr<KernelArguments> Create(
       const BufferAssignment& buffer_assignment,
       const HloInstruction* non_fusion_hlo,
-      absl::Span<const HloInstruction* const> needed_operands);
+      absl::Span<const HloInstruction* const> needed_operands,
+      bool dedup = true);
 
   const std::vector<KernelArgument>& args() const { return args_; }
 
  private:
-  explicit KernelArguments(std::vector<KernelArgument> args)
-      : args_(ProcessArguments(std::move(args))) {}
+  explicit KernelArguments(std::vector<KernelArgument> args, bool dedup = true)
+      : args_(ProcessArguments(std::move(args), dedup)) {}
 
   static std::vector<KernelArgument> ProcessArguments(
-      std::vector<KernelArgument> kernel_arguments);
+      std::vector<KernelArgument> kernel_arguments, bool dedup);
 
   std::vector<KernelArgument> args_;
 };
diff --git a/third_party/xla/xla/service/gpu/kernels/BUILD b/third_party/xla/xla/service/gpu/kernels/BUILD
index d5a8e83080855b..324286a7dcb77c 100644
--- a/third_party/xla/xla/service/gpu/kernels/BUILD
+++ b/third_party/xla/xla/service/gpu/kernels/BUILD
@@ -1,11 +1,11 @@
-load("//xla/tests:build_defs.bzl", "xla_test")
 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library")
-load("//xla:xla.bzl", "xla_cc_test")
-load("//xla/service/gpu:build_defs.bzl", "gpu_kernel_library")
-load("//xla/stream_executor:build_defs.bzl", "if_gpu_is_configured")
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
 load("@local_tsl//tsl/platform:build_config_root.bzl", "tf_gpu_tests_tags")
 load("@local_tsl//tsl/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured")
+load("//xla:xla.bzl", "xla_cc_test")
+load("//xla/service/gpu:build_defs.bzl", "gpu_kernel_library")
+load("//xla/stream_executor:build_defs.bzl", "if_gpu_is_configured")
+load("//xla/tests:build_defs.bzl", "xla_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_fusion.cc b/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_fusion.cc
index a200803400223d..9fdbcd6790633b 100644
--- a/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_fusion.cc
+++ b/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_fusion.cc
@@ -170,7 +170,7 @@ static absl::StatusOr<GemmWithDynamicSlice> MatchGemmWithDynamicUpdateSlice(
     HloDynamicUpdateSliceInstruction* update_slice) {
   GemmWithDynamicSlice match(update_slice);
 
-  if (!Match(const_cast<HloInstruction*>(update_slice->operand(1)),
+  if (!Match(const_cast<HloInstruction*>(update_slice->update()),
              OptionalBitcast(&match.bitcast,
                              m::Dot(&match.dot, m::Op(), m::Op())))) {
     return absl::InternalError("failed to match update slice instr");
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD b/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD
index 44c1ca0b3edf85..d1b47ba1bbe021 100644
--- a/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD
@@ -1,9 +1,9 @@
-load("//xla:xla.bzl", "xla_cc_test")
 load(
     "@local_config_rocm//rocm:build_defs.bzl",
     "if_rocm_is_configured",
 )
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -38,6 +38,7 @@ cc_library(
         "//xla/service/llvm_ir:llvm_command_line_options",
         "//xla/service/llvm_ir:llvm_type_conversion_util",
         "//xla/stream_executor:device_description",
+        "//xla/tsl/util:env_var",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/status",
@@ -66,7 +67,6 @@ cc_library(
         "@local_tsl//tsl/platform:random",
         "@local_tsl//tsl/platform:rocm_rocdl_path",
         "@local_tsl//tsl/profiler/lib:traceme",
-        "@local_tsl//tsl/util:env_var",
     ] + if_rocm_is_configured([
         "@local_config_rocm//rocm:rocm_headers",
         "@llvm-project//llvm:AMDGPUCodeGen",
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index e1a636da51a265..581a9353f5baa2 100644
--- a/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -61,6 +61,7 @@ limitations under the License.
 #include "xla/service/llvm_ir/llvm_type_conversion_util.h"
 #include "xla/status_macros.h"
 #include "xla/stream_executor/device_description.h"
+#include "xla/tsl/util/env_var.h"
 #include "xla/types.h"
 #include "xla/util.h"
 #include "tsl/platform/cuda_libdevice_path.h"
@@ -70,7 +71,6 @@ limitations under the License.
 #include "tsl/platform/random.h"
 #include "tsl/platform/rocm_rocdl_path.h"
 #include "tsl/profiler/lib/traceme.h"
-#include "tsl/util/env_var.h"
 
 #if !defined(PLATFORM_GOOGLE) && TENSORFLOW_USE_ROCM
 #include "rocm/rocm_config.h"
diff --git a/third_party/xla/xla/service/gpu/matmul_utils.cc b/third_party/xla/xla/service/gpu/matmul_utils.cc
index c4e675a9d6e132..78bb545b920991 100644
--- a/third_party/xla/xla/service/gpu/matmul_utils.cc
+++ b/third_party/xla/xla/service/gpu/matmul_utils.cc
@@ -85,26 +85,28 @@ const tsl::protobuf::RepeatedField<int64_t>& BatchDimensionsForOperand(
   return dimension_numbers.rhs_batch_dimensions();
 }
 
-int64_t ContractingDimensionIndex(const HloInstruction& dot,
-                                  const int operand_number) {
+absl::StatusOr<int64_t> ContractingDimensionIndex(const HloInstruction& dot,
+                                                  const int operand_number) {
   const DotDimensionNumbers& dimension_numbers = dot.dot_dimension_numbers();
   if (operand_number == 0) {
-    CHECK_EQ(dimension_numbers.lhs_contracting_dimensions().size(), 1);
+    TF_RET_CHECK(dimension_numbers.lhs_contracting_dimensions().size() == 1);
     return dimension_numbers.lhs_contracting_dimensions(0);
   }
-  CHECK_EQ(dimension_numbers.rhs_contracting_dimensions().size(), 1);
+  TF_RET_CHECK(dimension_numbers.rhs_contracting_dimensions().size() == 1);
   return dimension_numbers.rhs_contracting_dimensions(0);
 }
 
-int64_t NonContractingDimensionIndex(const HloInstruction& dot,
-                                     const int operand_number) {
-  absl::StatusOr<std::vector<int64_t>> non_contracting_dims =
+absl::StatusOr<int64_t> NonContractingDimensionIndex(const HloInstruction& dot,
+                                                     const int operand_number) {
+  TF_ASSIGN_OR_RETURN(int64_t contracting_dim,
+                      ContractingDimensionIndex(dot, operand_number));
+  TF_ASSIGN_OR_RETURN(
+      std::vector<int64_t> non_contracting_dims,
       GetNonContractingDims(dot.operand(operand_number)->shape(),
                             BatchDimensionsForOperand(dot, operand_number),
-                            {ContractingDimensionIndex(dot, operand_number)});
-  TF_CHECK_OK(non_contracting_dims.status());
-  CHECK_EQ(non_contracting_dims->size(), 1);
-  return non_contracting_dims->front();
+                            {contracting_dim}));
+  TF_RET_CHECK(non_contracting_dims.size() == 1);
+  return non_contracting_dims.front();
 }
 
 absl::StatusOr<Shape> GetBatchRowColumnShape(
diff --git a/third_party/xla/xla/service/gpu/matmul_utils.h b/third_party/xla/xla/service/gpu/matmul_utils.h
index dd1710d2a979a5..22d7f178133835 100644
--- a/third_party/xla/xla/service/gpu/matmul_utils.h
+++ b/third_party/xla/xla/service/gpu/matmul_utils.h
@@ -54,12 +54,12 @@ const tsl::protobuf::RepeatedField<int64_t>& BatchDimensionsForOperand(
     const HloInstruction& dot, int operand_number);
 
 // Index of the only contracting dimension of dot instruction operand.
-int64_t ContractingDimensionIndex(const HloInstruction& dot,
-                                  int operand_number);
+absl::StatusOr<int64_t> ContractingDimensionIndex(const HloInstruction& dot,
+                                                  int operand_number);
 
 // Index of the only non-contracting dimension of dot instruction operand.
-int64_t NonContractingDimensionIndex(const HloInstruction& dot,
-                                     int operand_number);
+absl::StatusOr<int64_t> NonContractingDimensionIndex(const HloInstruction& dot,
+                                                     int operand_number);
 
 // Normalize shape to (batch, rows, columns) logical dimensions.
 absl::StatusOr<Shape> GetBatchRowColumnShape(
diff --git a/third_party/xla/xla/service/gpu/mock_nccl_utils.cc b/third_party/xla/xla/service/gpu/mock_nccl_utils.cc
index bca0bc01bcef3f..911346cc5435fb 100644
--- a/third_party/xla/xla/service/gpu/mock_nccl_utils.cc
+++ b/third_party/xla/xla/service/gpu/mock_nccl_utils.cc
@@ -53,7 +53,6 @@ limitations under the License.
 #include "third_party/gpus/nccl/include/info.h"
 #include "third_party/gpus/nccl/include/nccl_common.h"
 #include "third_party/nccl/nccl.h"
-#include "third_party/gpus/nccl/src/include/device.h"
 #include "xla/debug_options_flags.h"
 #include "xla/executable_run_options.h"
 #include "xla/primitive_util.h"
@@ -65,10 +64,10 @@ limitations under the License.
 #include "xla/service/gpu/nccl_api.h"
 #include "xla/service/gpu/nccl_clique.h"
 #include "xla/service/gpu/nccl_clique_key.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/service/gpu/runtime/nccl_p2p_thunk_common.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/gpu/sleep_kernel.h"
-#include "xla/service/gpu/thunk.h"
 #include "xla/service/lockable.h"
 #include "xla/service/rendezvous.h"
 #include "xla/shape_util.h"
diff --git a/third_party/xla/xla/service/gpu/mock_nccl_utils.h b/third_party/xla/xla/service/gpu/mock_nccl_utils.h
index 3a2ec6cde1c534..1f1db47bfce802 100644
--- a/third_party/xla/xla/service/gpu/mock_nccl_utils.h
+++ b/third_party/xla/xla/service/gpu/mock_nccl_utils.h
@@ -31,9 +31,9 @@ limitations under the License.
 #include "xla/service/gpu/gpu_executable_run_options.h"
 #include "xla/service/gpu/nccl_api.h"
 #include "xla/service/gpu/nccl_clique_key.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/service/gpu/runtime/nccl_p2p_thunk_common.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/lockable.h"
 #include "xla/stream_executor/stream.h"
 #include "tsl/lib/gtl/int_type.h"
diff --git a/third_party/xla/xla/service/gpu/mock_nccl_utils_default.cc b/third_party/xla/xla/service/gpu/mock_nccl_utils_default.cc
index 0d6f24226f4b8f..856b3a4b871a77 100644
--- a/third_party/xla/xla/service/gpu/mock_nccl_utils_default.cc
+++ b/third_party/xla/xla/service/gpu/mock_nccl_utils_default.cc
@@ -27,9 +27,9 @@ limitations under the License.
 #include "xla/service/gpu/mock_nccl_utils.h"
 #include "xla/service/gpu/nccl_api.h"
 #include "xla/service/gpu/nccl_clique_key.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/service/gpu/runtime/nccl_p2p_thunk_common.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/util.h"
 
diff --git a/third_party/xla/xla/service/gpu/model/BUILD b/third_party/xla/xla/service/gpu/model/BUILD
index 24cc1fe66d6324..34f4547becfc59 100644
--- a/third_party/xla/xla/service/gpu/model/BUILD
+++ b/third_party/xla/xla/service/gpu/model/BUILD
@@ -1,12 +1,13 @@
-# Libraries for performance modeling of HLO.
-load("//xla/tests:build_defs.bzl", "xla_test")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
-load("//xla:xla.bzl", "xla_cc_test", "xla_nvml_deps")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load("@local_tsl//tsl:tsl.default.bzl", "get_compatible_with_portable")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
 load("@local_tsl//tsl/platform:build_config_root.bzl", "tf_cuda_tests_tags")
 load("@local_tsl//tsl/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured")
+load("//xla:xla.bzl", "xla_cc_test", "xla_nvml_deps")
+
+# Libraries for performance modeling of HLO.
+load("//xla/tests:build_defs.bzl", "xla_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -422,6 +423,7 @@ cc_library(
     hdrs = ["indexing_map.h"],
     deps = [
         ":affine_map_printer",
+        "//xla/hlo/ir:hlo",
         "@com_google_absl//absl/types:span",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
@@ -435,8 +437,12 @@ xla_cc_test(
     srcs = ["indexing_map_test.cc"],
     deps = [
         ":affine_map_printer",
+        ":indexing_analysis",
         ":indexing_map",
         ":indexing_test_utils",
+        "//xla:literal_util",
+        "//xla:shape_util",
+        "//xla/hlo/ir:hlo",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",
         "@com_google_googletest//:gtest",
@@ -479,6 +485,7 @@ cc_library(
         "//xla:util",
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
+        "//xla/service:gather_simplifier",
         "//xla/service/gpu:hlo_traversal",
         "//xla/service/gpu:matmul_utils",
         "//xla/service/gpu/fusions:tiling_util",
@@ -499,6 +506,7 @@ xla_cc_test(
     srcs = ["indexing_analysis_test.cc"],
     deps = [
         ":indexing_analysis",
+        ":indexing_map",
         ":indexing_test_utils",
         "//xla/hlo/ir:hlo",
         "//xla/service/gpu:hlo_traversal",
@@ -513,30 +521,30 @@ xla_cc_test(
 )
 
 cc_library(
-    name = "tile_analysis",
-    srcs = ["tile_analysis.cc"],
-    hdrs = ["tile_analysis.h"],
+    name = "symbolic_tile",
+    srcs = ["symbolic_tile.cc"],
+    hdrs = ["symbolic_tile.h"],
     deps = [
         ":affine_map_printer",
         ":indexing_map",
-        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Support",
-        "@local_tsl//tsl/platform:status",
     ],
 )
 
 xla_cc_test(
-    name = "tile_analysis_test",
-    srcs = ["tile_analysis_test.cc"],
+    name = "symbolic_tile_test",
+    srcs = ["symbolic_tile_test.cc"],
     deps = [
         ":affine_map_printer",
         ":indexing_analysis",
         ":indexing_test_utils",
-        ":tile_analysis",
+        ":symbolic_tile",
         "//xla/hlo/ir:hlo",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",
@@ -548,6 +556,43 @@ xla_cc_test(
     ],
 )
 
+cc_library(
+    name = "symbolic_tile_analysis",
+    srcs = ["symbolic_tile_analysis.cc"],
+    hdrs = ["symbolic_tile_analysis.h"],
+    deps = [
+        ":indexing_analysis",
+        ":indexing_map",
+        ":symbolic_tile",
+        "//xla:status",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:instruction_fusion",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/types:span",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:IR",
+    ],
+)
+
+xla_cc_test(
+    name = "symbolic_tile_analysis_test",
+    srcs = ["symbolic_tile_analysis_test.cc"],
+    deps = [
+        ":indexing_map",
+        ":symbolic_tile_analysis",
+        "//xla/tests:hlo_test_base",
+        "//xla/tests:verified_hlo_module",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+        "@com_google_googletest//:gtest_main",
+        "@llvm-project//mlir:IR",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
 cc_library(
     name = "coalescing_analysis",
     srcs = ["coalescing_analysis.cc"],
@@ -576,6 +621,7 @@ xla_cc_test(
     srcs = ["coalescing_analysis_test.cc"],
     deps = [
         ":coalescing_analysis",
+        ":indexing_map",
         "//xla:shape_util",
         "//xla/hlo/ir:hlo",
         "//xla/service:hlo_module_config",
@@ -698,6 +744,7 @@ cc_library(
             "//xla/service:hlo_runner",
             "//xla/service:platform_util",
             "//xla/stream_executor:device_description",
+            "//xla/tsl/util:command_line_flags",
             "@com_google_absl//absl/log",
             "@com_google_absl//absl/strings",
             "@com_google_absl//absl/strings:str_format",
@@ -705,7 +752,6 @@ cc_library(
             "@local_tsl//tsl/platform:path",
             "@local_tsl//tsl/platform:platform_port",
             "@local_tsl//tsl/platform:status",
-            "@local_tsl//tsl/util:command_line_flags",
         ],
     )
     for sm in [
diff --git a/third_party/xla/xla/service/gpu/model/coalescing_analysis.cc b/third_party/xla/xla/service/gpu/model/coalescing_analysis.cc
index 2eed7c5ad26826..e7ad4bd6c01a93 100644
--- a/third_party/xla/xla/service/gpu/model/coalescing_analysis.cc
+++ b/third_party/xla/xla/service/gpu/model/coalescing_analysis.cc
@@ -115,7 +115,7 @@ void FindAllIndices(const IndexingMap& thread_id_to_physical_index,
                     std::vector<AffineExpr>* symbols,
                     std::vector<int64_t>* indices) {
   if (dim_id < thread_id_to_physical_index.GetDimensionCount()) {
-    Interval dim_range = thread_id_to_physical_index.GetDimensionRange(dim_id);
+    Interval dim_range = thread_id_to_physical_index.GetDimensionBound(dim_id);
     for (int64_t dim_value = dim_range.lower; dim_value <= dim_range.upper;
          ++dim_value) {
       dimensions->push_back(getAffineConstantExpr(dim_value, mlir_context));
@@ -127,7 +127,7 @@ void FindAllIndices(const IndexingMap& thread_id_to_physical_index,
   }
   if (symbol_id < thread_id_to_physical_index.GetSymbolCount()) {
     Interval symbol_range =
-        thread_id_to_physical_index.GetSymbolRange(symbol_id);
+        thread_id_to_physical_index.GetSymbolBound(symbol_id);
     for (int64_t symbol_value = symbol_range.lower;
          symbol_value <= symbol_range.upper; ++symbol_value) {
       symbols->push_back(getAffineConstantExpr(symbol_value, mlir_context));
@@ -229,11 +229,12 @@ bool IsCoalesced(const IndexingMap& thread_id_to_input_indexing_map,
   AffineExpr c0 = mlir::getAffineConstantExpr(0, mlir_context);
   IndexingMap thread_x_first_32_elements{
       AffineMap::get(1, 0, {thread_x_dim, c0, c0, c0, c0, c0}, mlir_context),
-      {Interval{0, 31}},
-      {}};
+      {DimVar{{0, 31}}},
+      /*range_vars=*/{},
+      /*rt_vars=*/{}};
   IndexingMap thread_x_to_linearized_input =
       thread_x_first_32_elements * thread_id_to_input_indexing_map;
-  thread_x_to_linearized_input.Simplify();
+  thread_x_to_linearized_input.Simplify(GetIndexingMapForInstruction);
   thread_x_to_linearized_input.RemoveUnusedSymbols();
   return EstimateCoalescingViaMemoryTransactionsCount(
       FindContiguousIntervals(thread_x_to_linearized_input), element_type);
@@ -299,7 +300,8 @@ std::optional<GroupedByOpIndexingMap> GetThreadIdToInputMemoryLayoutsMaps(
         IndexingMap operand_logical_to_linearized_physical_shape =
             operand_logical_to_physical_map *
             operand_physical_to_linearized_shape;
-        operand_logical_to_linearized_physical_shape.Simplify();
+        operand_logical_to_linearized_physical_shape.Simplify(
+            GetIndexingMapForInstruction);
 
         for (const IndexingMap& operand_indexing_map :
              operand_indexing_maps_it->second) {
@@ -315,7 +317,8 @@ std::optional<GroupedByOpIndexingMap> GetThreadIdToInputMemoryLayoutsMaps(
           IndexingMap thread_id_to_linearized_physical_input_map =
               *thread_id_to_hero_operand_map *
               logical_output_to_linearized_physical_input_map;
-          thread_id_to_linearized_physical_input_map.Simplify();
+          thread_id_to_linearized_physical_input_map.Simplify(
+              GetIndexingMapForInstruction);
           result[operand].insert(thread_id_to_linearized_physical_input_map);
         }
       }
diff --git a/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.cc b/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.cc
index 58325183af50e6..6e8b57d62dcbd3 100644
--- a/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.cc
+++ b/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.cc
@@ -97,8 +97,8 @@ int64_t GetIterationSpaceSize(const IndexingMap& indexing_map,
         return num_iters;
       };
 
-  return get_ranges_iteration_space_size(indexing_map.GetSymbolRanges()) *
-         get_ranges_iteration_space_size(indexing_map.GetDimensionRanges());
+  return get_ranges_iteration_space_size(indexing_map.GetSymbolBounds()) *
+         get_ranges_iteration_space_size(indexing_map.GetDimensionBounds());
 }
 
 EstimateRunTimeData
diff --git a/third_party/xla/xla/service/gpu/model/hlo_op_profiler_run.cc b/third_party/xla/xla/service/gpu/model/hlo_op_profiler_run.cc
index 38479bbc982a38..b71dc91505dbd6 100644
--- a/third_party/xla/xla/service/gpu/model/hlo_op_profiler_run.cc
+++ b/third_party/xla/xla/service/gpu/model/hlo_op_profiler_run.cc
@@ -28,12 +28,12 @@ limitations under the License.
 #include "xla/service/hlo_runner.h"
 #include "xla/service/platform_util.h"
 #include "xla/stream_executor/device_description.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/init_main.h"
 #include "tsl/platform/path.h"
 #include "tsl/platform/status.h"
-#include "tsl/util/command_line_flags.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/service/gpu/model/indexing_analysis.cc b/third_party/xla/xla/service/gpu/model/indexing_analysis.cc
index eca6ccd0dc067b..b0deed5f9f8952 100644
--- a/third_party/xla/xla/service/gpu/model/indexing_analysis.cc
+++ b/third_party/xla/xla/service/gpu/model/indexing_analysis.cc
@@ -48,6 +48,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/layout.h"
 #include "xla/permutation_util.h"
+#include "xla/service/gather_simplifier.h"
 #include "xla/service/gpu/fusions/tiling_util.h"
 #include "xla/service/gpu/hlo_traversal.h"
 #include "xla/service/gpu/matmul_utils.h"
@@ -77,19 +78,6 @@ HloInstructionIndexing CreateUnknownIndexing(int64_t count = 1) {
   return indexing;
 }
 
-IndexingMap CreateIdentityMap(const Shape& shape, MLIRContext* ctx) {
-  if (shape.IsTuple()) {
-    // Should happen only for variadic reduce. In that case all tuple shapes are
-    // equal.
-    return CreateIdentityMap(shape.tuple_shapes(0), ctx);
-  }
-
-  auto dims = shape.dimensions();
-  IndexingMap identity_map = IndexingMap::FromTensorSizes(
-      AffineMap::getMultiDimIdentityMap(dims.size(), ctx), dims, {});
-  return identity_map;
-}
-
 HloInstructionIndexing ComputeOutputToInputCwiseOpIndexing(
     const HloInstruction* instr, MLIRContext* mlir_context) {
   IndexingMap identity_map = CreateIdentityMap(instr->shape(), mlir_context);
@@ -156,15 +144,6 @@ HloInstructionIndexing ComputeInputToOutputBroadcastOpIndexing(
   return HloInstructionIndexing::FromIndexingMaps({indexing_map});
 }
 
-std::vector<Interval> RangesFromUpperBounds(absl::Span<const int64_t> bounds) {
-  std::vector<Interval> dim_ranges;
-  dim_ranges.reserve(bounds.size());
-  for (int64_t dim : bounds) {
-    dim_ranges.push_back(Interval{0, dim - 1});
-  }
-  return dim_ranges;
-}
-
 HloInstructionIndexing ComputeOutputToInputConcatenateOpIndexing(
     const HloConcatenateInstruction* concat, MLIRContext* mlir_context) {
   const auto& operand_0_dims = concat->operand(0)->shape().dimensions();
@@ -173,7 +152,7 @@ HloInstructionIndexing ComputeOutputToInputConcatenateOpIndexing(
   // be adjusted for a particular operand_id.
   mlir::MutableAffineMap affine_map =
       AffineMap::getMultiDimIdentityMap(operand_0_dims.size(), mlir_context);
-  std::vector<Interval> dim_ranges = RangesFromUpperBounds(operand_0_dims);
+  std::vector<DimVar> dim_vars = DimVarsFromTensorSizes(operand_0_dims);
 
   HloInstructionIndexing concat_indexing;
   concat_indexing.indexing_maps.resize(concat->operand_count());
@@ -183,10 +162,10 @@ HloInstructionIndexing ComputeOutputToInputConcatenateOpIndexing(
   for (const auto [operand_id, operand] : llvm::enumerate(concat->operands())) {
     affine_map.setResult(concat_dim, concat_dim_expr - offset);
     int64_t operand_concat_dim = operand->shape().dimensions()[concat_dim];
-    dim_ranges[concat_dim] = Interval{offset, offset + operand_concat_dim - 1};
+    dim_vars[concat_dim] = DimVar{{offset, offset + operand_concat_dim - 1}};
     concat_indexing.indexing_maps[operand_id].insert(
-        IndexingMap(affine_map.getAffineMap(), dim_ranges,
-                    /*symbol_ranges=*/{}));
+        IndexingMap(affine_map.getAffineMap(), dim_vars,
+                    /*range_vars=*/{}, /*rt_vars=*/{}));
     offset += operand_concat_dim;
   }
   return concat_indexing;
@@ -309,6 +288,151 @@ HloInstructionIndexing ComputeOutputToInputDotOpIndexing(
       {lhs_indexing_map, rhs_indexing_map});
 }
 
+HloInstructionIndexing ComputeOutputToInputDynamicSliceOpIndexing(
+    const HloDynamicSliceInstruction* dynamic_slice,
+    MLIRContext* mlir_context) {
+  const Shape& input_shape = dynamic_slice->operand(0)->shape();
+  const Shape& output_shape = dynamic_slice->shape();
+  int64_t rank = output_shape.rank();
+  const int64_t first_index_num = dynamic_slice->first_index_operand_number();
+
+  CHECK(dynamic_slice->operand(first_index_num)->shape().rank() == 0)
+      << "b/118437727: Old form, not supported.";
+  // A map from tensor iteration space to (), because index operands are 0d
+  // tensors.
+  AffineMap empty_results_affine_map = AffineMap::get(
+      /*dimCount=*/rank, /*symbolCount=*/0, /*results=*/{}, mlir_context);
+  IndexingMap start_indices_map = IndexingMap::FromTensorSizes(
+      empty_results_affine_map, output_shape.dimensions(), {});
+
+  std::vector<RTVar> offsets_rt_vars;
+  offsets_rt_vars.reserve(rank);
+  std::vector<AffineExpr> exprs;
+  exprs.reserve(rank);
+  for (auto [dim, slice_size] :
+       llvm::enumerate(dynamic_slice->dynamic_slice_sizes())) {
+    exprs.push_back(getAffineDimExpr(dim, mlir_context) +
+                    getAffineSymbolExpr(dim, mlir_context));
+    offsets_rt_vars.push_back(
+        RTVar{Interval{0, input_shape.dimensions(dim) - slice_size},
+              dynamic_slice->operand(dim + first_index_num),
+              empty_results_affine_map});
+  }
+  std::vector<IndexingMap> indexing_maps(dynamic_slice->operand_count(),
+                                         start_indices_map);
+  indexing_maps.front() =
+      IndexingMap{AffineMap::get(/*dimCount=*/rank, /*symbolCount=*/rank, exprs,
+                                 mlir_context),
+                  start_indices_map.GetDimVars(), /*range_vars=*/{},
+                  std::move(offsets_rt_vars)};
+  return HloInstructionIndexing::FromIndexingMaps(indexing_maps);
+}
+
+HloInstructionIndexing ComputeOutputToInputDynamicUpdateSliceOpIndexing(
+    const HloDynamicUpdateSliceInstruction* dus, MLIRContext* mlir_context) {
+  const Shape& update_shape = dus->update()->shape();
+  const Shape& output_shape = dus->shape();
+  int64_t rank = output_shape.rank();
+
+  // operand: (d0, ... d_{N-1}) -> (d0, ... d_{N-1})
+  std::vector<AffineExpr> identity;
+  for (int64_t dim = 0; dim < rank; ++dim) {
+    identity.push_back(getAffineDimExpr(dim, mlir_context));
+  }
+  IndexingMap operand_map = IndexingMap::FromTensorSizes(
+      AffineMap::get(/*dimCount=*/rank, /*symbolCount=*/0, /*results=*/identity,
+                     mlir_context),
+      output_shape.dimensions(), {});
+
+  // start_indices: (d0, ... d_{N-1}) -> ()
+  AffineMap empty_results_affine_map = AffineMap::get(
+      /*dimCount=*/rank, /*symbolCount=*/0, /*results=*/{}, mlir_context);
+  IndexingMap start_indices_map = IndexingMap::FromTensorSizes(
+      empty_results_affine_map, output_shape.dimensions(), {});
+
+  // update: (d_0 - s_0, ..., d_{N-1} - s_{N-1})
+  std::vector<AffineExpr> exprs;
+  exprs.reserve(rank);
+  std::vector<RTVar> rt_vars;
+  rt_vars.reserve(rank);
+  for (auto [dim, slice_size] : llvm::enumerate(update_shape.dimensions())) {
+    exprs.push_back(getAffineDimExpr(dim, mlir_context) -
+                    getAffineSymbolExpr(dim, mlir_context));
+    Interval feasible_values{0, output_shape.dimensions(dim) - slice_size};
+    rt_vars.push_back(RTVar{feasible_values, dus->operand(2 + dim),
+                            empty_results_affine_map});
+  }
+  IndexingMap update_map{AffineMap::get(/*dimCount=*/rank, /*symbolCount=*/rank,
+                                        /*results=*/exprs, mlir_context),
+                         operand_map.GetDimVars(),
+                         /*range_vars=*/{}, rt_vars};
+
+  std::vector<IndexingMap> indexing_maps(dus->operand_count(),
+                                         start_indices_map);
+  indexing_maps[0] = std::move(operand_map);
+  indexing_maps[1] = std::move(update_map);
+  return HloInstructionIndexing::FromIndexingMaps(indexing_maps);
+}
+
+HloInstructionIndexing ComputeOutputToInputGatherOpIndexing(
+    const HloGatherInstruction* gather, MLIRContext* mlir_context) {
+  CHECK(GatherSimplifier::IsSimplifiedGather(gather))
+      << "Non-simplified HLO Gather is not supported.";
+  const Shape& operand_shape = gather->operand(0)->shape();
+  const Shape& indices_shape = gather->operand(1)->shape();
+
+  const GatherDimensionNumbers& dimension_numbers =
+      gather->gather_dimension_numbers();
+  int64_t index_vector_length =
+      indices_shape.dimensions(dimension_numbers.index_vector_dim());
+
+  const Shape& output_shape = gather->shape();
+  int64_t output_rank = output_shape.rank();
+
+  // A map for the `indices` operand of gather. It is always
+  // (d_0, ... d_{rank - 1}) -> (d_0, s_0),
+  // where 0 <= s_0 <= indices_shape[1] - 1.
+  AffineExpr indices_id_dim = getAffineDimExpr(0, mlir_context);
+  std::vector<DimVar> dim_vars =
+      DimVarsFromTensorSizes(output_shape.dimensions());
+  IndexingMap indices_map{
+      AffineMap::get(output_rank, 1,
+                     {indices_id_dim, getAffineSymbolExpr(0, mlir_context)},
+                     mlir_context),
+      dim_vars,
+      {RangeVar{{0, index_vector_length - 1}}},
+      /*rt_vars=*/{}};
+
+  // A map for the `operand` operand of gather, from which we extract slices.
+  // (d_0, ... d_{rank - 1}) -> (d_1 + s0, d_2 + s_1, ...),
+  // where s_i are RTVars that extract indices from the `indices` operand.
+  std::vector<RTVar> rt_vars;
+  std::vector<AffineExpr> exprs;
+  exprs.reserve(operand_shape.rank());
+  for (auto [operand_dim_id, slice_size] :
+       llvm::enumerate(gather->gather_slice_sizes())) {
+    int64_t output_dim_id = dimension_numbers.offset_dims(operand_dim_id);
+    exprs.push_back(getAffineDimExpr(output_dim_id, mlir_context));
+
+    if (operand_dim_id >= index_vector_length) continue;
+
+    rt_vars.push_back(RTVar{
+        Interval{0, operand_shape.dimensions(operand_dim_id) - slice_size},
+        gather->operand(1),
+        AffineMap::get(output_rank, /*symbolCount=*/0,
+                       {indices_id_dim,
+                        getAffineConstantExpr(operand_dim_id, mlir_context)},
+                       mlir_context)});
+    exprs.back() =
+        exprs.back() + getAffineSymbolExpr(operand_dim_id, mlir_context);
+  }
+  IndexingMap operand_map = {
+      AffineMap::get(/*dimCount=*/output_rank,
+                     /*symbolCount=*/index_vector_length, exprs, mlir_context),
+      std::move(dim_vars), /*range_vars=*/{}, std::move(rt_vars)};
+  return HloInstructionIndexing::FromIndexingMaps({operand_map, indices_map});
+}
+
 IndexingMap ComputeOutputToInputPadOpIndexingImpl(
     absl::Span<const int64_t> output_dims,
     absl::Span<const int64_t> padding_low,
@@ -318,16 +442,16 @@ IndexingMap ComputeOutputToInputPadOpIndexingImpl(
 
   std::vector<AffineExpr> exprs;
   std::vector<std::pair<AffineExpr, Interval>> constraints;
-  std::vector<Interval> dimension_ranges;
+  std::vector<DimVar> dim_vars;
   exprs.reserve(output_rank);
   constraints.reserve(output_rank);
   int64_t output_dim_id = 0;
   for (const auto [output_dim, pad_low, pad_high, pad_interior] :
        llvm::zip(output_dims, padding_low, padding_high, padding_interior)) {
     AffineExpr dim_expr = getAffineDimExpr(output_dim_id, mlir_context);
-    dimension_ranges.push_back(
-        Interval{std::max(int64_t{0}, pad_low),
-                 std::min(output_dim - 1, output_dim - 1 - pad_high)});
+    dim_vars.push_back(
+        {Interval{std::max(int64_t{0}, pad_low),
+                  std::min(output_dim - 1, output_dim - 1 - pad_high)}});
     if (pad_interior == 0) {
       exprs.push_back(dim_expr - pad_low);
     } else {
@@ -339,7 +463,10 @@ IndexingMap ComputeOutputToInputPadOpIndexingImpl(
   }
   return IndexingMap{
       AffineMap::get(output_rank, /*symbolCount=*/0, exprs, mlir_context),
-      dimension_ranges, /*symbol_ranges = */ {}, absl::MakeSpan(constraints)};
+      std::move(dim_vars),
+      /*range_vars = */ {},
+      /*rt_vars = */ {},
+      absl::MakeSpan(constraints)};
 }
 
 HloInstructionIndexing ComputeOutputToInputPadOpIndexing(
@@ -447,15 +574,11 @@ HloInstructionIndexing ComputeInputToOutputReduceOpIndexing(
   return instr_indexing;
 }
 
-// Indexing for reduce-window with dilations and non-trivial padding can be
-// represented as a composition of pad op and reduce-window that never goes out
-// of bounds.
-HloInstructionIndexing ComputeOutputToInputReduceWindowOpIndexing(
-    const HloReduceWindowInstruction* reduce_window, int output_id,
+IndexingMap ComposeIndexingMapsForWindow(
+    absl::Span<const int64_t> input_dimensions,
+    absl::Span<const int64_t> output_dimensions, const Window& window,
     MLIRContext* mlir_context) {
-  const Shape& input_shape = reduce_window->operand(0)->shape();
-  const Shape& output_shape = GetOutputShape(reduce_window, 0);
-  int64_t rank = input_shape.rank();
+  size_t rank = input_dimensions.size();
 
   // Compute shape of the padded input and the indexing map of pad op required
   // to pad the input.
@@ -466,43 +589,60 @@ HloInstructionIndexing ComputeOutputToInputReduceWindowOpIndexing(
   padding_interior.reserve(rank);
   padded_input_dimensions.reserve(rank);
   SmallVector<AffineExpr, 4> exprs;
-  std::vector<Interval> dim_ranges, symbol_ranges;
+  std::vector<DimVar> dim_vars;
+  std::vector<RangeVar> range_vars;
   exprs.reserve(rank);
-  dim_ranges.reserve(rank);
-  symbol_ranges.reserve(rank);
+  dim_vars.reserve(rank);
+  range_vars.reserve(rank);
   for (const auto& [dim_id, window_config] :
-       llvm::enumerate(reduce_window->window().dimensions())) {
+       llvm::enumerate(window.dimensions())) {
     padding_low.push_back(window_config.padding_low());
     padding_high.push_back(window_config.padding_high());
     // For some reason interior_padding in HLO pad is offset from base_dilations
     // in HLO reduce-window by 1.
     padding_interior.push_back(window_config.base_dilation() - 1);
-    padded_input_dimensions.push_back(input_shape.dimensions(dim_id) +
-                                      window_config.padding_low() +
-                                      window_config.padding_high() +
-                                      (input_shape.dimensions(dim_id) - 1) *
-                                          (window_config.base_dilation() - 1));
+    padded_input_dimensions.push_back(
+        input_dimensions[dim_id] + window_config.padding_low() +
+        window_config.padding_high() +
+        (input_dimensions[dim_id] - 1) * (window_config.base_dilation() - 1));
     AffineExpr dim_expr = getAffineDimExpr(dim_id, mlir_context);
     AffineExpr symbol_expr = getAffineSymbolExpr(dim_id, mlir_context);
 
-    exprs.push_back(symbol_expr + window_config.stride() * dim_expr);
-    dim_ranges.push_back(Interval{0, output_shape.dimensions(dim_id) - 1});
-    symbol_ranges.push_back(Interval{0, window_config.size() - 1});
+    exprs.push_back(symbol_expr * window_config.window_dilation() +
+                    window_config.stride() * dim_expr);
+    dim_vars.push_back({Interval{0, output_dimensions[dim_id] - 1}});
+    range_vars.push_back({Interval{0, window_config.size() - 1}});
   }
   // Indexing map for pad op that pads the input.
   IndexingMap padded_input_indexing = ComputeOutputToInputPadOpIndexingImpl(
       padded_input_dimensions, padding_low, padding_high, padding_interior,
       mlir_context);
   // Indexing map for reduce-window, that does not do any padding.
-  IndexingMap reduce_window_indexing_no_padding(
-      AffineMap::get(rank, rank, exprs, mlir_context), dim_ranges,
-      symbol_ranges);
+  IndexingMap input_indexing_no_padding(
+      AffineMap::get(rank, rank, exprs, mlir_context), dim_vars, range_vars,
+      /*rt_vars=*/{});
 
   // Composed indexing.
-  IndexingMap inputs_indexing = ComposeIndexingMaps(
-      reduce_window_indexing_no_padding, padded_input_indexing);
-  inputs_indexing.Simplify();
-  inputs_indexing.RemoveUnusedSymbols();
+  IndexingMap result =
+      ComposeIndexingMaps(input_indexing_no_padding, padded_input_indexing);
+  result.Simplify(GetIndexingMapForInstruction);
+  result.RemoveUnusedSymbols();
+  return result;
+}
+
+// Indexing for reduce-window with dilations and non-trivial padding can be
+// represented as a composition of pad op and reduce-window that never goes out
+// of bounds.
+HloInstructionIndexing ComputeOutputToInputReduceWindowOpIndexing(
+    const HloReduceWindowInstruction* reduce_window, int output_id,
+    MLIRContext* mlir_context) {
+  const Shape& input_shape = reduce_window->operand(0)->shape();
+  const Shape& output_shape = GetOutputShape(reduce_window, 0);
+
+  // Indexing map for the input value.
+  IndexingMap inputs_indexing = ComposeIndexingMapsForWindow(
+      input_shape.dimensions(), output_shape.dimensions(),
+      reduce_window->window(), mlir_context);
 
   // Indexing map for the init value.
   IndexingMap inits_indexing_map = IndexingMap::FromTensorSizes(
@@ -521,6 +661,119 @@ HloInstructionIndexing ComputeOutputToInputReduceWindowOpIndexing(
   return instr_indexing;
 }
 
+HloInstructionIndexing ComputeOutputToInputConvolutionOpIndexing(
+    const HloConvolutionInstruction* convolution, MLIRContext* mlir_context) {
+  const Shape& input_shape = convolution->operand(0)->shape();
+  const Shape& kernel_shape = convolution->operand(1)->shape();
+  const Shape& output_shape = convolution->shape();
+  const ConvolutionDimensionNumbers& dnums =
+      convolution->convolution_dimension_numbers();
+  size_t rank = output_shape.rank();
+
+  // Collect sizes for input/output spatial dimensions.
+  size_t spatial_rank = rank - 2;
+  std::vector<int64_t> input_spatial_sizes(spatial_rank);
+  std::vector<int64_t> kernel_spatial_sizes(spatial_rank);
+  std::vector<int64_t> output_spatial_sizes(spatial_rank);
+  for (int i = 0; i < spatial_rank; ++i) {
+    input_spatial_sizes[i] =
+        input_shape.dimensions(dnums.input_spatial_dimensions(i));
+    kernel_spatial_sizes[i] =
+        kernel_shape.dimensions(dnums.kernel_spatial_dimensions(i));
+    output_spatial_sizes[i] =
+        output_shape.dimensions(dnums.output_spatial_dimensions(i));
+  }
+
+  // Indexing map for the input value (spatial dimensions only).
+  // The dimension numbers in the resulting affine expressions have to be
+  // remapped to correspond to the correct output dimensions.
+  IndexingMap input_spatial_indexing =
+      ComposeIndexingMapsForWindow(input_spatial_sizes, output_spatial_sizes,
+                                   convolution->window(), mlir_context);
+  std::vector<AffineExpr> replacement_dims(spatial_rank);
+  for (int i = 0; i < spatial_rank; ++i) {
+    replacement_dims[i] =
+        getAffineDimExpr(dnums.output_spatial_dimensions(i), mlir_context);
+  }
+
+  // Build affine expressions and constraints for input spatial dimensions.
+  std::vector<AffineExpr> input_exprs(rank);
+  for (int i = 0; i < spatial_rank; ++i) {
+    input_exprs[dnums.input_spatial_dimensions(i)] =
+        input_spatial_indexing.GetAffineMap().getResult(i).replaceDims(
+            replacement_dims);
+  }
+  llvm::DenseMap<AffineExpr, Interval> input_constraints;
+  for (const auto& [key, val] : input_spatial_indexing.GetConstraints()) {
+    input_constraints[key.replaceDims(replacement_dims)] = val;
+  }
+
+  // Build affine expressions for kernel spatial and output dimensions.
+  std::vector<AffineExpr> kernel_exprs(rank);
+  for (int i = 0; i < spatial_rank; ++i) {
+    kernel_exprs[dnums.kernel_spatial_dimensions(i)] =
+        getAffineSymbolExpr(i, mlir_context);
+  }
+  kernel_exprs[dnums.kernel_output_feature_dimension()] =
+      getAffineDimExpr(dnums.output_feature_dimension(), mlir_context);
+
+  // Build initial symbol ranges.
+  std::vector<RangeVar> input_symbols = input_spatial_indexing.GetRangeVars();
+  std::vector<RangeVar> kernel_symbols =
+      RangeVarsFromTensorSizes(kernel_spatial_sizes);
+
+  // Add symbol for input feature dimension.
+  input_exprs[dnums.input_feature_dimension()] =
+      getAffineSymbolExpr(input_symbols.size(), mlir_context);
+  kernel_exprs[dnums.kernel_input_feature_dimension()] =
+      getAffineSymbolExpr(kernel_symbols.size(), mlir_context);
+
+  int64_t input_group_size =
+      kernel_shape.dimensions(dnums.kernel_input_feature_dimension());
+  Interval input_feature_range{0, input_group_size - 1};
+  input_symbols.push_back({input_feature_range});
+  kernel_symbols.push_back({input_feature_range});
+
+  // With multiple feature groups, the input feature dimension is equally split.
+  if (convolution->feature_group_count() > 1) {
+    AffineExpr& input_feature = input_exprs[dnums.input_feature_dimension()];
+    AffineExpr dim_expr =
+        getAffineDimExpr(dnums.output_feature_dimension(), mlir_context);
+    input_feature =
+        dim_expr.floorDiv(input_group_size) * input_group_size + input_feature;
+  }
+
+  // With multiple batch groups, the input batch dimension is equally split.
+  AffineExpr batch_dim_expr =
+      getAffineDimExpr(dnums.output_batch_dimension(), mlir_context);
+  if (convolution->batch_group_count() > 1) {
+    int64_t batch_group_size =
+        output_shape.dimensions(dnums.output_batch_dimension());
+    AffineExpr batch_group_expr =
+        getAffineSymbolExpr(input_symbols.size(), mlir_context);
+    input_symbols.push_back({{0, convolution->batch_group_count() - 1}});
+    input_exprs[dnums.input_batch_dimension()] =
+        batch_group_expr * batch_group_size + batch_dim_expr;
+  } else {
+    input_exprs[dnums.input_batch_dimension()] = batch_dim_expr;
+  }
+
+  // Indexing map for the input value.
+  IndexingMap inputs_indexing(
+      AffineMap::get(rank, input_symbols.size(), input_exprs, mlir_context),
+      DimVarsFromTensorSizes(output_shape.dimensions()), input_symbols,
+      /*rt_vars=*/{}, input_constraints);
+
+  // Indexing map for the kernel value.
+  IndexingMap kernel_indexing(
+      AffineMap::get(rank, kernel_symbols.size(), kernel_exprs, mlir_context),
+      DimVarsFromTensorSizes(output_shape.dimensions()), kernel_symbols,
+      /*rt_vars=*/{});
+
+  return HloInstructionIndexing::FromIndexingMaps(
+      {inputs_indexing, kernel_indexing});
+}
+
 // Computes strides for a shape.
 std::vector<int64_t> ComputeStrides(absl::Span<const int64_t> dims) {
   int rank = static_cast<int>(dims.size());
@@ -531,7 +784,8 @@ std::vector<int64_t> ComputeStrides(absl::Span<const int64_t> dims) {
   return strides;
 }
 
-// Computes 1D index given a shape and N-d indexing expressions.
+}  // namespace
+
 AffineExpr LinearizeShape(absl::Span<const int64_t> dims,
                           absl::Span<const AffineExpr> dimension_exprs,
                           MLIRContext* mlir_context) {
@@ -544,7 +798,6 @@ AffineExpr LinearizeShape(absl::Span<const int64_t> dims,
   return linear_index;
 }
 
-// Computes N-d indexing expressions given a linear index and a shape.
 std::vector<AffineExpr> DelinearizeIndex(absl::Span<const int64_t> dims,
                                          AffineExpr linear_index,
                                          MLIRContext* mlir_context) {
@@ -559,6 +812,8 @@ std::vector<AffineExpr> DelinearizeIndex(absl::Span<const int64_t> dims,
   return multi_index;
 }
 
+namespace {
+
 // Computes indexing for "minimal" reshapes, i.e. reshapes that cannot be
 // represented by a series of composed reshapes, i.e. when there are no
 // subshapes in input and output that have the same number of elements.
@@ -684,7 +939,7 @@ HloInstructionIndexing ComputeOutputToInputReshapeOpIndexing(
   IndexingMap reshape_indexing_map = IndexingMap::FromTensorSizes(
       ComputeReshapeIndexingMap(input, output, mlir_context),
       output.dimensions(), {});
-  reshape_indexing_map.Simplify();
+  reshape_indexing_map.Simplify(GetIndexingMapForInstruction);
   return HloInstructionIndexing::FromIndexingMaps({reshape_indexing_map});
 }
 HloInstructionIndexing ComputeInputToOutputReshapeOpIndexing(
@@ -695,7 +950,7 @@ HloInstructionIndexing ComputeInputToOutputReshapeOpIndexing(
   IndexingMap reshape_indexing_map = IndexingMap::FromTensorSizes(
       ComputeReshapeIndexingMap(output, input, mlir_context),
       input.dimensions(), {});
-  reshape_indexing_map.Simplify();
+  reshape_indexing_map.Simplify(GetIndexingMapForInstruction);
   return HloInstructionIndexing::FromIndexingMaps({reshape_indexing_map});
 }
 
@@ -767,7 +1022,7 @@ HloInstructionIndexing ComputeInputToOutputTransposeOpIndexing(
 }  // namespace
 
 IndexingMap GetBitcastMap(const Shape& input_shape, const Shape& output_shape,
-                          MLIRContext* ctx) {
+                          MLIRContext* mlir_context) {
   ShapeUtil::BitcastDecomposition decomposed_bitcast =
       ShapeUtil::DecomposeBitcast(input_shape, output_shape);
 
@@ -779,7 +1034,7 @@ IndexingMap GetBitcastMap(const Shape& input_shape, const Shape& output_shape,
         << "Failed to deduce permutation for a bitcast.";
 
     return IndexingMap::FromTensorSizes(
-        ComputeTransposeIndexingMap(permutation.value(), ctx),
+        ComputeTransposeIndexingMap(permutation.value(), mlir_context),
         input_shape.dimensions(), {});
   }
   if (std::holds_alternative<ShapeUtil::BitcastDecompositionReshape>(
@@ -787,15 +1042,17 @@ IndexingMap GetBitcastMap(const Shape& input_shape, const Shape& output_shape,
     // Note: ComputeReshapeIndexingMap assumes it's computing an output->input
     // indexing, so input and output are reversed.
     return IndexingMap::FromTensorSizes(
-        ComputeReshapeIndexingMap(output_shape, input_shape, ctx),
+        ComputeReshapeIndexingMap(output_shape, input_shape, mlir_context),
         input_shape.dimensions(), {});
   }
   // `trt` stands for transpose-reshape-transpose decomposition of bitcast.
   auto trt = std::get<ShapeUtil::BitcastDecompositionTrt>(decomposed_bitcast);
-  auto transpose_map_1 = ComputeTransposeIndexingMap(trt.transpose1_dims, ctx);
-  auto reshape_map =
-      ComputeReshapeIndexingMap(trt.reshape_shape, trt.transpose1_shape, ctx);
-  auto transpose_map_2 = ComputeTransposeIndexingMap(trt.transpose2_dims, ctx);
+  auto transpose_map_1 =
+      ComputeTransposeIndexingMap(trt.transpose1_dims, mlir_context);
+  auto reshape_map = ComputeReshapeIndexingMap(
+      trt.reshape_shape, trt.transpose1_shape, mlir_context);
+  auto transpose_map_2 =
+      ComputeTransposeIndexingMap(trt.transpose2_dims, mlir_context);
   auto bitcast_map =
       transpose_map_2.compose(reshape_map).compose(transpose_map_1);
   return IndexingMap::FromTensorSizes(bitcast_map, input_shape.dimensions(),
@@ -808,8 +1065,7 @@ HloInstructionIndexing ComputeOutputToInputBitcastOpIndexing(
     const HloInstruction* bitcast, MLIRContext* mlir_context) {
   auto bitcast_map = GetBitcastMap(bitcast->shape(),
                                    bitcast->operand(0)->shape(), mlir_context);
-  bitcast_map.Simplify();
-
+  bitcast_map.Simplify(GetIndexingMapForInstruction);
   return HloInstructionIndexing::FromIndexingMaps({bitcast_map});
 }
 
@@ -817,8 +1073,7 @@ HloInstructionIndexing ComputeInputToOutputBitcastOpIndexing(
     const HloInstruction* bitcast, MLIRContext* mlir_context) {
   auto bitcast_map = GetBitcastMap(bitcast->operand(0)->shape(),
                                    bitcast->shape(), mlir_context);
-  bitcast_map.Simplify();
-
+  bitcast_map.Simplify(GetIndexingMapForInstruction);
   return HloInstructionIndexing::FromIndexingMaps({bitcast_map});
 }
 
@@ -840,6 +1095,20 @@ AffineMap GetTilingAffineMap(llvm::ArrayRef<AffineExpr> exprs,
 
 }  // namespace
 
+IndexingMap CreateIdentityMap(const Shape& shape, MLIRContext* mlir_context) {
+  if (shape.IsTuple()) {
+    // Should happen only for variadic reduce. In that case all tuple shapes are
+    // equal.
+    return CreateIdentityMap(shape.tuple_shapes(0), mlir_context);
+  }
+
+  auto dimensions = shape.dimensions();
+  IndexingMap identity_map = IndexingMap::FromTensorSizes(
+      AffineMap::getMultiDimIdentityMap(dimensions.size(), mlir_context),
+      dimensions, {});
+  return identity_map;
+}
+
 llvm::SmallVector<AffineExpr, 4> DelinearizeInBoundsIndex(
     AffineExpr linear, absl::Span<const int64_t> sizes,
     absl::Span<const int64_t> strides) {
@@ -867,32 +1136,36 @@ llvm::SmallVector<AffineExpr, 4> DelinearizeInBoundsIndex(
   return result;
 }
 
-IndexingMap GetIndexingMapFromPhysicalLayoutToLogical(const Shape& shape,
-                                                      MLIRContext* ctx) {
+IndexingMap GetIndexingMapFromPhysicalLayoutToLogical(
+    const Shape& shape, MLIRContext* mlir_context) {
   if (shape.rank() == 0) {
-    return IndexingMap(AffineMap::get(ctx), {}, {});
+    return IndexingMap(AffineMap::get(mlir_context),
+                       /*dim_vars=*/{}, /*range vars=*/{}, /*rt_vars=*/{});
   }
   return IndexingMap::FromTensorSizes(
       ComputeTransposeIndexingMap(
-          InversePermutation(ToTransposeDimensions(shape.layout())), ctx),
+          InversePermutation(ToTransposeDimensions(shape.layout())),
+          mlir_context),
       ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(shape)
           .dimensions(),
       {});
 }
 
-IndexingMap GetIndexingMapFromLogicalToPhysicalLayout(const Shape& shape,
-                                                      MLIRContext* ctx) {
+IndexingMap GetIndexingMapFromLogicalToPhysicalLayout(
+    const Shape& shape, MLIRContext* mlir_context) {
   if (shape.rank() == 0) {
-    return IndexingMap(AffineMap::get(ctx), {}, {});
+    return IndexingMap(AffineMap::get(mlir_context),
+                       /*dim_vars=*/{}, /*range vars=*/{}, /*rt_vars=*/{});
   }
   return IndexingMap::FromTensorSizes(
-      ComputeTransposeIndexingMap(ToTransposeDimensions(shape.layout()), ctx),
+      ComputeTransposeIndexingMap(ToTransposeDimensions(shape.layout()),
+                                  mlir_context),
       shape.dimensions(), {});
 }
 
 AffineMap GetBlockOffsetsForTiling(const Tiling& tiling,
-                                   mlir::MLIRContext* ctx) {
-  auto offsets = DelinearizeInBoundsIndex(getAffineDimExpr(3, ctx),
+                                   MLIRContext* mlir_context) {
+  auto offsets = DelinearizeInBoundsIndex(getAffineDimExpr(3, mlir_context),
                                           tiling.GetBlockCounts(),
                                           tiling.GetBlockStrides());
   for (auto&& [offset, tile_size] :
@@ -903,13 +1176,13 @@ AffineMap GetBlockOffsetsForTiling(const Tiling& tiling,
 }
 
 AffineMap GetThreadOffsetsForTiling(const Tiling& tiling,
-                                    mlir::MLIRContext* ctx) {
-  auto offsets = DelinearizeInBoundsIndex(getAffineDimExpr(0, ctx),
+                                    MLIRContext* mlir_context) {
+  auto offsets = DelinearizeInBoundsIndex(getAffineDimExpr(0, mlir_context),
                                           tiling.GetThreadsPerBlock(),
                                           tiling.GetThreadStrides());
   for (int dim = 0; dim < tiling.GetShape().size(); ++dim) {
     if (tiling.GetThreadTileSize()[dim] > 1) {
-      offsets[dim] = offsets[dim] + getAffineSymbolExpr(dim, ctx) *
+      offsets[dim] = offsets[dim] + getAffineSymbolExpr(dim, mlir_context) *
                                         tiling.GetThreadsPerBlock()[dim];
     }
   }
@@ -917,11 +1190,12 @@ AffineMap GetThreadOffsetsForTiling(const Tiling& tiling,
 }
 
 IndexingMap GetIndexingMapForTiling(const Tiling& tiling,
-                                    mlir::MLIRContext* ctx) {
+                                    MLIRContext* mlir_context) {
   return GetIndexingMapForTiling(
-      GetBlockOffsetsForTiling(tiling, ctx),
-      GetThreadOffsetsForTiling(tiling, ctx), tiling.GetNumThreadsPerBlock(),
-      tiling.GetNumBlocks(), tiling.GetThreadTileSize(), tiling.GetShape());
+      GetBlockOffsetsForTiling(tiling, mlir_context),
+      GetThreadOffsetsForTiling(tiling, mlir_context),
+      tiling.GetNumThreadsPerBlock(), tiling.GetNumBlocks(),
+      tiling.GetThreadTileSize(), tiling.GetShape());
 }
 
 IndexingMap GetIndexingMapForTiling(AffineMap block_offsets,
@@ -930,20 +1204,21 @@ IndexingMap GetIndexingMapForTiling(AffineMap block_offsets,
                                     int64_t num_blocks,
                                     absl::Span<const int64_t> thread_tile_sizes,
                                     absl::Span<const int64_t> tiled_shape) {
+  auto* mlir_context = block_offsets.getContext();
   llvm::SmallVector<AffineExpr, 4> offsets;
   offsets.reserve(block_offsets.getNumResults());
   for (auto [block, thread] :
        llvm::zip(block_offsets.getResults(), thread_offsets.getResults())) {
     offsets.push_back(block + thread);
   }
-  std::vector<Interval> dimension_ranges{
-      {0, threads_per_block - 1}, {}, {}, {0, num_blocks - 1}, {}, {},
+  std::vector<DimVar> dimension_ranges{
+      {{0, threads_per_block - 1}}, {}, {}, {{0, num_blocks - 1}}, {}, {},
   };
   auto affine_map = mlir::AffineMap::get(block_offsets.getNumDims(),
                                          block_offsets.getNumSymbols(), offsets,
-                                         offsets[0].getContext());
+                                         mlir_context);
   IndexingMap map{affine_map, dimension_ranges,
-                  RangesFromUpperBounds(thread_tile_sizes)};
+                  RangeVarsFromTensorSizes(thread_tile_sizes), /*rt_vars=*/{}};
   for (int i = 0; i < tiled_shape.size(); ++i) {
     map.AddConstraint(affine_map.getResult(i), {0, tiled_shape[i] - 1});
   }
@@ -958,7 +1233,7 @@ bool HloInstructionIndexing::Simplify() {
       to_remove.push_back(map);
       if (map.IsUndefined()) {
         to_add.push_back(map);
-      } else if (map.Simplify()) {
+      } else if (map.Simplify(GetIndexingMapForInstruction)) {
         map.RemoveUnusedSymbols();
       } else {
         to_remove.pop_back();
@@ -1073,7 +1348,7 @@ GroupedByOpIndexingMap ComputeGroupedOutputToInputIndexing(
       for (const IndexingMap& producer_map : producer_operand_indexing) {
         for (const IndexingMap& consumer_map : consumer_indexing_maps_copy) {
           auto composed_map = ComposeIndexingMaps(consumer_map, producer_map);
-          composed_map.Simplify();
+          composed_map.Simplify(GetIndexingMapForInstruction);
           composed_map.RemoveUnusedSymbols();
           grouped_indexing_maps[&producer_operand_adaptor.instruction()].insert(
               composed_map);
@@ -1128,9 +1403,18 @@ HloInstructionIndexing ComputeOutputToInputIndexing(const HloInstruction* instr,
   if (auto dot = DynCast<HloDotInstruction>(instr)) {
     return ComputeOutputToInputDotOpIndexing(dot, ctx);
   }
+  if (auto dynamic_slice = DynCast<HloDynamicSliceInstruction>(instr)) {
+    return ComputeOutputToInputDynamicSliceOpIndexing(dynamic_slice, ctx);
+  }
+  if (auto dus = DynCast<HloDynamicUpdateSliceInstruction>(instr)) {
+    return ComputeOutputToInputDynamicUpdateSliceOpIndexing(dus, ctx);
+  }
   if (auto fusion = DynCast<HloFusionInstruction>(instr)) {
     return ComputeOutputToInputFusionOpIndexing(fusion, output_id, ctx);
   }
+  if (auto gather = DynCast<HloGatherInstruction>(instr)) {
+    return ComputeOutputToInputGatherOpIndexing(gather, ctx);
+  }
   if (auto iota = DynCast<HloIotaInstruction>(instr)) {
     return HloInstructionIndexing{};
   }
@@ -1144,6 +1428,9 @@ HloInstructionIndexing ComputeOutputToInputIndexing(const HloInstruction* instr,
     return ComputeOutputToInputReduceWindowOpIndexing(reduce_window, output_id,
                                                       ctx);
   }
+  if (auto convolution = DynCast<HloConvolutionInstruction>(instr)) {
+    return ComputeOutputToInputConvolutionOpIndexing(convolution, ctx);
+  }
   if (auto reshape = DynCast<HloReshapeInstruction>(instr)) {
     return ComputeOutputToInputReshapeOpIndexing(reshape, ctx);
   }
@@ -1188,6 +1475,10 @@ HloInstructionIndexing ComputeInputToOutputIndexing(const HloInstruction* instr,
   if (auto transpose = DynCast<HloTransposeInstruction>(instr)) {
     return ComputeInputToOutputTransposeOpIndexing(transpose, ctx);
   }
+  if (instr->opcode() == HloOpcode::kTuple) {
+    return HloInstructionIndexing::FromIndexingMaps(
+        {CreateIdentityMap(instr->shape().tuple_shapes(input_id), ctx)});
+  }
   // If we cannot compute input-to-output indexing, we return std::nullopt for
   // every op result.
   int64_t num_results =
@@ -1196,21 +1487,29 @@ HloInstructionIndexing ComputeInputToOutputIndexing(const HloInstruction* instr,
 }
 
 IndexingMap ComputeEpilogueInputToOutputIndexing(
-    const HloInstruction* epilogue_root, mlir::MLIRContext* ctx,
+    const HloInstruction* epilogue_root, MLIRContext* mlir_context,
     std::function<bool(const HloInstruction*)> is_root) {
   auto* instr = epilogue_root;
-  auto root_indexing = CreateIdentityMap(instr->shape(), ctx);
+  auto root_indexing = CreateIdentityMap(instr->shape(), mlir_context);
   while (!is_root(instr)) {
     // There can be multiple users, but they must have compatible indexing maps.
     auto* user = instr->users().front();
-    auto user_indexing =
-        ComputeInputToOutputIndexing(user, user->operand_index(instr), ctx);
+    auto user_indexing = ComputeInputToOutputIndexing(
+        user, user->operand_index(instr), mlir_context);
     root_indexing = root_indexing * *user_indexing.indexing_maps[0].begin();
-    root_indexing.Simplify();
+    root_indexing.Simplify(GetIndexingMapForInstruction);
     instr = user;
   }
   return root_indexing;
 }
 
+IndexingMap GetIndexingMapForInstruction(const HloInstruction* instr,
+                                         int64_t operand_idx,
+                                         mlir::MLIRContext* mlir_context) {
+  HloInstructionIndexing indexing =
+      ComputeOutputToInputIndexing(instr, operand_idx, mlir_context);
+  return *indexing.indexing_maps[0].begin();
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/model/indexing_analysis.h b/third_party/xla/xla/service/gpu/model/indexing_analysis.h
index 59a56ae750a03d..22012ea472f887 100644
--- a/third_party/xla/xla/service/gpu/model/indexing_analysis.h
+++ b/third_party/xla/xla/service/gpu/model/indexing_analysis.h
@@ -9,6 +9,7 @@ You may obtain a copy of the License at
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
@@ -160,10 +161,30 @@ IndexingMap GetIndexingMapForTiling(mlir::AffineMap block_offsets,
 // Returns the shape of the output of the instruction.
 const Shape& GetOutputShape(const HloInstruction* instr, int64_t output_id);
 
+// Computes 1D index given a shape and N-d indexing expressions.
+mlir::AffineExpr LinearizeShape(
+    absl::Span<const int64_t> dims,
+    absl::Span<const mlir::AffineExpr> dimension_exprs,
+    mlir::MLIRContext* mlir_context);
+
+// Computes N-d indexing expressions given a linear index and a shape.
+std::vector<mlir::AffineExpr> DelinearizeIndex(absl::Span<const int64_t> dims,
+                                               mlir::AffineExpr linear_index,
+                                               mlir::MLIRContext* mlir_context);
+
+// Creates an identity indexing map corresponding to the parameter shape.
+IndexingMap CreateIdentityMap(const Shape& shape,
+                              mlir::MLIRContext* mlir_context);
+
 llvm::SmallVector<mlir::AffineExpr, 4> DelinearizeInBoundsIndex(
     mlir::AffineExpr linear, absl::Span<const int64_t> sizes,
     absl::Span<const int64_t> strides);
 
+// Returns the output-to-input indexing map of the first output of `instr`
+IndexingMap GetIndexingMapForInstruction(const HloInstruction* instr,
+                                         int64_t operand_idx,
+                                         mlir::MLIRContext* mlir_context);
+
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/third_party/xla/xla/service/gpu/model/indexing_analysis_test.cc b/third_party/xla/xla/service/gpu/model/indexing_analysis_test.cc
index 7388d194d3eaa4..1ae80e76fa0907 100644
--- a/third_party/xla/xla/service/gpu/model/indexing_analysis_test.cc
+++ b/third_party/xla/xla/service/gpu/model/indexing_analysis_test.cc
@@ -626,6 +626,104 @@ TEST_F(IndexingAnalysisTest, ConcatenateOp) {
                           )"))));
 }
 
+TEST_F(IndexingAnalysisTest, DynamicSliceOp) {
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      %src = s32[2,2,258] parameter(0)
+      %of1 = s32[] parameter(1)
+      %of2 = s32[] parameter(2)
+      %of3 = s32[] parameter(3)
+      ROOT %ds = s32[1,2,32] dynamic-slice(s32[2,2,258] %src,
+        s32[] %of1, s32[] %of2, s32[] %of3),
+        dynamic_slice_sizes={1, 2, 32}
+    }
+  )"));
+  EXPECT_THAT(input_indexing.indexing_maps,
+              ElementsAre(ElementsAre(MatchIndexingMap(R"(
+                (d0, d1, d2)[s0, s1, s2] -> (d0 + s0, d1 + s1, d2 + s2)
+                domain:
+                d0 in [0, 0]
+                d1 in [0, 1]
+                d2 in [0, 31]
+                s0 in [0, 1]
+                  hlo: %of1 = s32[] parameter(1)
+                  (d0, d1, d2)  -> ()
+                s1 in [0, 0]
+                  hlo: %of2 = s32[] parameter(2)
+                  (d0, d1, d2)  -> ()
+                s2 in [0, 226]
+                  hlo: %of3 = s32[] parameter(3)
+                  (d0, d1, d2) -> ()
+              )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                (d0, d1, d2)  -> ()
+                domain:
+                d0 in [0, 0]
+                d1 in [0, 1]
+                d2 in [0, 31]
+              )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                (d0, d1, d2)  -> ()
+                domain:
+                d0 in [0, 0]
+                d1 in [0, 1]
+                d2 in [0, 31]
+              )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                (d0, d1, d2)  -> ()
+                domain:
+                d0 in [0, 0]
+                d1 in [0, 1]
+                d2 in [0, 31]
+              )"))));
+}
+
+TEST_F(IndexingAnalysisTest, DynamicUpdateSliceOp) {
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      %src = s32[20,30] parameter(0)
+      %upd = s32[5,10] parameter(1)
+      %of1 = s32[] parameter(2)
+      %of2 = s32[] parameter(3)
+      ROOT %dus = s32[20,30] dynamic-update-slice(
+          s32[20,30] %src, s32[5,10] %upd, s32[] %of1, s32[] %of2)
+    }
+  )"));
+  EXPECT_THAT(input_indexing.indexing_maps,
+              ElementsAre(ElementsAre(MatchIndexingMap(R"(
+                (d0, d1) -> (d0, d1)
+                domain:
+                d0 in [0, 19]
+                d1 in [0, 29]
+              )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                (d0, d1)[s0, s1]  -> (d0 - s0, d1 - s1)
+                domain:
+                d0 in [0, 19]
+                d1 in [0, 29]
+                s0 in [0, 15]
+                  hlo: %of1 = s32[] parameter(2)
+                  (d0, d1)  -> ()
+                s1 in [0, 20]
+                  hlo: %of2 = s32[] parameter(3)
+                  (d0, d1)  -> ()
+              )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                (d0, d1)  -> ()
+                domain:
+                d0 in [0, 19]
+                d1 in [0, 29]
+              )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                (d0, d1)  -> ()
+                domain:
+                d0 in [0, 19]
+                d1 in [0, 29]
+              )"))));
+}
+
 TEST_F(IndexingAnalysisTest, FusionOpWithSingleBinaryOp) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
@@ -927,6 +1025,43 @@ TEST_F(IndexingAnalysisTest, FusionExponentialDuplication) {
                           )"))));
 }
 
+TEST_F(IndexingAnalysisTest, GatherOp) {
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY main {
+      operand = f32[33,76,70] parameter(0)
+      indices = s32[1806,2] parameter(1)
+      ROOT r = f32[1806,7,8,4] gather(operand, indices), offset_dims={1,2,3},
+                                 collapsed_slice_dims={}, start_index_map={0,1},
+                                 index_vector_dim=1, slice_sizes={7,8,4}
+    }
+  )"));
+  EXPECT_THAT(input_indexing.indexing_maps,
+              ElementsAre(ElementsAre(MatchIndexingMap(R"(
+                (d0, d1, d2, d3)[s0, s1] -> (d1 + s0, d2 + s1, d3)
+                domain:
+                d0 in [0, 1805]
+                d1 in [0, 6]
+                d2 in [0, 7]
+                d3 in [0, 3]
+                s0 in [0, 26]
+                  hlo: %indices = s32[1806,2]{1,0} parameter(1)
+                  (d0, d1, d2, d3) -> (d0, 0)
+                s1 in [0, 68]
+                  hlo: %indices = s32[1806,2]{1,0} parameter(1)
+                  (d0, d1, d2, d3) -> (d0, 1)
+              )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                (d0, d1, d2, d3)[s0] -> (d0, s0)
+                domain:
+                d0 in [0, 1805]
+                d1 in [0, 6]
+                d2 in [0, 7]
+                d3 in [0, 3]
+                s0 in [0, 1]
+              )"))));
+}
+
 TEST_F(IndexingAnalysisTest, FusionOpWithReduceOfReduce) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
@@ -1168,6 +1303,77 @@ TEST_F(IndexingAnalysisTest, FusionOpWithSliceOfSlice) {
                           )"))));
 }
 
+TEST_F(IndexingAnalysisTest, FusionOpWithDynSliceOfDynSlice) {
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    f {
+      %src = s32[150, 64] parameter(0)
+      %of11 = s32[] parameter(1)
+      %of12 = s32[] parameter(2)
+      %of21 = s32[] parameter(3)
+      %of22 = s32[] parameter(4)
+
+      %ds1 = s32[50, 32] dynamic-slice(s32[150, 64] %src,
+        s32[] %of11, s32[] %of12), dynamic_slice_sizes={50, 32}
+
+      ROOT %ds2 = s32[25, 16] dynamic-slice(s32[50, 32] %ds1,
+        s32[] %of21, s32[] %of22), dynamic_slice_sizes={25, 16}
+    }
+    ENTRY e {
+      %p0 = s32[150, 64] parameter(0)
+      %p1 = s32[] parameter(1)
+      %p2 = s32[] parameter(2)
+      %p3 = s32[] parameter(3)
+      %p4 = s32[] parameter(4)
+      ROOT fusion = s32[25, 16] fusion(p0, p1, p2, p3, p4),
+        kind=kLoop, calls=f
+    }
+  )"));
+  EXPECT_THAT(input_indexing.indexing_maps,
+              ElementsAre(ElementsAre(MatchIndexingMap(R"(
+                (d0, d1)[s0, s1, s2, s3] -> (d0 + s0 + s2, d1 + s1 + s3)
+                domain:
+                d0 in [0, 24]
+                d1 in [0, 15]
+                s0 in [0, 100]
+                  hlo: %of11 = s32[] parameter(1)
+                  (d0, d1) -> ()
+                s1 in [0, 32]
+                  hlo: %of12 = s32[] parameter(2)
+                  (d0, d1) -> ()
+                s2 in [0, 25]
+                  hlo: %of21 = s32[] parameter(3)
+                  (d0, d1) -> ()
+                s3 in [0, 16]
+                  hlo: %of22 = s32[] parameter(4)
+                  (d0, d1) -> ()
+                )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                  (d0, d1) -> ()
+                  domain:
+                  d0 in [0, 24]
+                  d1 in [0, 15]
+                )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                  (d0, d1) -> ()
+                  domain:
+                  d0 in [0, 24]
+                  d1 in [0, 15]
+                )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                  (d0, d1) -> ()
+                  domain:
+                  d0 in [0, 24]
+                  d1 in [0, 15]
+                )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                  (d0, d1) -> ()
+                  domain:
+                  d0 in [0, 24]
+                  d1 in [0, 15]
+                )"))));
+}
+
 TEST_F(IndexingAnalysisTest, FusionOpSliceOfAllConcatenateOpInputs) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
@@ -1739,7 +1945,7 @@ TEST_F(IndexingAnalysisTest, ReduceWindowOp_PaddingAndWindowStride) {
                           )"))));
 }
 
-TEST_F(IndexingAnalysisTest, ReduceWindowOp_Dilation) {
+TEST_F(IndexingAnalysisTest, ReduceWindowOp_BaseDilation) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     max {
@@ -1772,6 +1978,38 @@ TEST_F(IndexingAnalysisTest, ReduceWindowOp_Dilation) {
                           )"))));
 }
 
+TEST_F(IndexingAnalysisTest, ReduceWindowOp_WindowDilation) {
+  auto root = ParseAndGetRoot(R"(
+    HloModule m
+    max {
+      p0 = f32[] parameter(0)
+      p1 = f32[] parameter(1)
+      ROOT max = f32[] maximum(p0, p1)
+    }
+    ENTRY e {
+      c_inf = f32[] constant(-inf)
+      p0 = f32[7, 3] parameter(0)
+      ROOT reduce-window = f32[4, 3] reduce-window(p0, c_inf),
+       window={size=2x1 pad=0_0x0_0 rhs_dilate=3x1}, to_apply=max
+    }
+  )");
+  auto input_indexing = GetOutputToInputIndexing(root);
+  EXPECT_THAT(input_indexing.indexing_maps,
+              ElementsAre(ElementsAre(MatchIndexingMap(R"(
+                            (d0, d1)[s0] -> (d0 + s0 * 3, d1)
+                            domain:
+                            d0 in [0, 3]
+                            d1 in [0, 2]
+                            s0 in [0, 1]
+                          )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                            (d0, d1) -> ()
+                            domain:
+                            d0 in [0, 3]
+                            d1 in [0, 2]
+                          )"))));
+}
+
 TEST_F(IndexingAnalysisTest, ReduceWindowOp_Variadic) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
@@ -1856,6 +2094,227 @@ TEST_F(IndexingAnalysisTest, ReduceWindowOp_Variadic) {
                           )"))));
 }
 
+TEST_F(IndexingAnalysisTest, ConvolutionOp_NoPadding) {
+  auto root = ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      p0 = f32[1,12,10,4] parameter(0)
+      p1 = f32[4,3,5,8] parameter(1)
+      ROOT conv = f32[1,10,6,8] convolution(p0, p1),
+        window={size=3x5 pad=0_0x0_0}, dim_labels=b01f_i01o->b01f
+    }
+  )");
+  auto input_indexing = GetOutputToInputIndexing(root);
+  EXPECT_THAT(input_indexing.indexing_maps,
+              ElementsAre(ElementsAre(MatchIndexingMap(R"(
+                            (d0, d1, d2, d3)[s0, s1, s2] -> (d0, d1 + s0, d2 + s1, s2)
+                            domain:
+                            d0 in [0, 0]
+                            d1 in [0, 9]
+                            d2 in [0, 5]
+                            d3 in [0, 7]
+                            s0 in [0, 2]
+                            s1 in [0, 4]
+                            s2 in [0, 3]
+                          )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                            (d0, d1, d2, d3)[s0, s1, s2] -> (s2, s0, s1, d3)
+                            domain:
+                            d0 in [0, 0]
+                            d1 in [0, 9]
+                            d2 in [0, 5]
+                            d3 in [0, 7]
+                            s0 in [0, 2]
+                            s1 in [0, 4]
+                            s2 in [0, 3]
+                          )"))));
+}
+
+TEST_F(IndexingAnalysisTest, ConvolutionOp_PaddingAndWindowStride) {
+  auto root = ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      p0 = f32[1,12,10,4] parameter(0)
+      p1 = f32[4,3,5,8] parameter(1)
+      ROOT conv = f32[1,6,5,8] convolution(p0, p1),
+        window={size=3x5 stride=2x2 pad=1_1x2_2}, dim_labels=b01f_i01o->b01f
+    }
+  )");
+  auto input_indexing = GetOutputToInputIndexing(root);
+  EXPECT_THAT(input_indexing.indexing_maps,
+              ElementsAre(ElementsAre(MatchIndexingMap(R"(
+                            (d0, d1, d2, d3)[s0, s1, s2] -> (d0, d1 * 2 + s0 - 1, d2 * 2 + s1 - 2, s2)
+                            domain:
+                            d0 in [0, 0]
+                            d1 in [0, 5]
+                            d2 in [0, 4]
+                            d3 in [0, 7]
+                            s0 in [0, 2]
+                            s1 in [0, 4]
+                            s2 in [0, 3]
+                            d1 * 2 + s0 in [1, 12]
+                            d2 * 2 + s1 in [2, 11]
+                          )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                            (d0, d1, d2, d3)[s0, s1, s2] -> (s2, s0, s1, d3)
+                            domain:
+                            d0 in [0, 0]
+                            d1 in [0, 5]
+                            d2 in [0, 4]
+                            d3 in [0, 7]
+                            s0 in [0, 2]
+                            s1 in [0, 4]
+                            s2 in [0, 3]
+                          )"))));
+}
+
+TEST_F(IndexingAnalysisTest, ConvolutionOp_LhsDilation) {
+  auto root = ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      p0 = f32[1,12,10,4] parameter(0)
+      p1 = f32[4,3,5,8] parameter(1)
+      ROOT conv = f32[1,21,15,8] convolution(p0, p1),
+        window={size=3x5 pad=0_0x0_0 lhs_dilate=2x2}, dim_labels=b01f_i01o->b01f
+    }
+  )");
+  auto input_indexing = GetOutputToInputIndexing(root);
+  EXPECT_THAT(input_indexing.indexing_maps,
+              ElementsAre(ElementsAre(MatchIndexingMap(R"(
+                            (d0, d1, d2, d3)[s0, s1, s2] -> (d0, (d1 + s0) floordiv 2, (d2 + s1) floordiv 2, s2)
+                            domain:
+                            d0 in [0, 0]
+                            d1 in [0, 20]
+                            d2 in [0, 14]
+                            d3 in [0, 7]
+                            s0 in [0, 2]
+                            s1 in [0, 4]
+                            s2 in [0, 3]
+                            (d1 + s0) mod 2 in [0, 0]
+                            (d2 + s1) mod 2 in [0, 0]
+                          )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                            (d0, d1, d2, d3)[s0, s1, s2] -> (s2, s0, s1, d3)
+                            domain:
+                            d0 in [0, 0]
+                            d1 in [0, 20]
+                            d2 in [0, 14]
+                            d3 in [0, 7]
+                            s0 in [0, 2]
+                            s1 in [0, 4]
+                            s2 in [0, 3]
+                          )"))));
+}
+
+TEST_F(IndexingAnalysisTest, ConvolutionOp_RhsDilation) {
+  auto root = ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      p0 = f32[1,12,10,4] parameter(0)
+      p1 = f32[4,3,5,8] parameter(1)
+      ROOT conv = f32[1,8,2,8] convolution(p0, p1),
+        window={size=3x5 pad=0_0x0_0 rhs_dilate=2x2}, dim_labels=b01f_i01o->b01f
+    }
+  )");
+  auto input_indexing = GetOutputToInputIndexing(root);
+  EXPECT_THAT(input_indexing.indexing_maps,
+              ElementsAre(ElementsAre(MatchIndexingMap(R"(
+                            (d0, d1, d2, d3)[s0, s1, s2] -> (d0, d1 + s0 * 2, d2 + s1 * 2, s2)
+                            domain:
+                            d0 in [0, 0]
+                            d1 in [0, 7]
+                            d2 in [0, 1]
+                            d3 in [0, 7]
+                            s0 in [0, 2]
+                            s1 in [0, 4]
+                            s2 in [0, 3]
+                          )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                            (d0, d1, d2, d3)[s0, s1, s2] -> (s2, s0, s1, d3)
+                            domain:
+                            d0 in [0, 0]
+                            d1 in [0, 7]
+                            d2 in [0, 1]
+                            d3 in [0, 7]
+                            s0 in [0, 2]
+                            s1 in [0, 4]
+                            s2 in [0, 3]
+                          )"))));
+}
+
+TEST_F(IndexingAnalysisTest, ConvolutionOp_FeatureGroups) {
+  auto root = ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      p0 = f32[1,12,10,24] parameter(0)
+      p1 = f32[4,3,5,48] parameter(1)
+      ROOT conv = f32[1,10,6,48] convolution(p0, p1),
+        window={size=3x5 pad=0_0x0_0}, dim_labels=b01f_i01o->b01f, feature_group_count=6
+    }
+  )");
+  auto input_indexing = GetOutputToInputIndexing(root);
+  EXPECT_THAT(input_indexing.indexing_maps,
+              ElementsAre(ElementsAre(MatchIndexingMap(R"(
+                            (d0, d1, d2, d3)[s0, s1, s2] -> (d0, d1 + s0, d2 + s1, (d3 floordiv 4) * 4 + s2)
+                            domain:
+                            d0 in [0, 0]
+                            d1 in [0, 9]
+                            d2 in [0, 5]
+                            d3 in [0, 47]
+                            s0 in [0, 2]
+                            s1 in [0, 4]
+                            s2 in [0, 3]
+                          )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                            (d0, d1, d2, d3)[s0, s1, s2] -> (s2, s0, s1, d3)
+                            domain:
+                            d0 in [0, 0]
+                            d1 in [0, 9]
+                            d2 in [0, 5]
+                            d3 in [0, 47]
+                            s0 in [0, 2]
+                            s1 in [0, 4]
+                            s2 in [0, 3]
+                          )"))));
+}
+
+TEST_F(IndexingAnalysisTest, ConvolutionOp_BatchGroups) {
+  auto root = ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      p0 = f32[14,12,10,4] parameter(0)
+      p1 = f32[4,3,5,21] parameter(1)
+      ROOT conv = f32[2,10,6,21] convolution(p0, p1),
+        window={size=3x5 pad=0_0x0_0}, dim_labels=b01f_i01o->b01f, batch_group_count=7
+    }
+  )");
+  auto input_indexing = GetOutputToInputIndexing(root);
+  EXPECT_THAT(input_indexing.indexing_maps,
+              ElementsAre(ElementsAre(MatchIndexingMap(R"(
+                            (d0, d1, d2, d3)[s0, s1, s2, s3] -> (d0 + s3 * 2, d1 + s0, d2 + s1, s2)
+                            domain:
+                            d0 in [0, 1]
+                            d1 in [0, 9]
+                            d2 in [0, 5]
+                            d3 in [0, 20]
+                            s0 in [0, 2]
+                            s1 in [0, 4]
+                            s2 in [0, 3]
+                            s3 in [0, 6]
+                          )")),
+                          ElementsAre(MatchIndexingMap(R"(
+                            (d0, d1, d2, d3)[s0, s1, s2] -> (s2, s0, s1, d3)
+                            domain:
+                            d0 in [0, 1]
+                            d1 in [0, 9]
+                            d2 in [0, 5]
+                            d3 in [0, 20]
+                            s0 in [0, 2]
+                            s1 in [0, 4]
+                            s2 in [0, 3]
+                          )"))));
+}
+
 TEST_F(IndexingAnalysisTest, ReverseOp) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
@@ -2023,18 +2482,20 @@ TEST_F(IndexingAnalysisTest, UnsupportedOps) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
-      input = s32[1,1,25,1] parameter(0)
-      update = s32[1,1,2,1] parameter(1)
-      start_indices = s32[4] parameter(2)
-      ROOT dyn-update = s32[1,1,25,1] dynamic-update-slice(
-        input, update, start_indices)
+      p0 = f32[20, 20] parameter(0)
+      p1 = f32[4,4] parameter(1)
+      p2 = f32[4,3] parameter(2)
+      ROOT out = f32[4,3] triangular-solve(f32[4,4] p1, f32[4,3] p2),
+        left_side=true,
+        lower=true,
+        transpose_a=NO_TRANSPOSE,
+        unit_diagonal=true
     }
   )");
   auto input_indexing = GetOutputToInputIndexing(root);
   EXPECT_THAT(
       input_indexing.indexing_maps,
-      ElementsAre(ElementsAre(UndefinedMap()), ElementsAre(UndefinedMap()),
-                  ElementsAre(UndefinedMap())));
+      ElementsAre(ElementsAre(UndefinedMap()), ElementsAre(UndefinedMap())));
 
   auto output_indexing_0 = GetInputToOutputIndexing(root, 0);
   EXPECT_THAT(output_indexing_0.indexing_maps,
@@ -2043,40 +2504,41 @@ TEST_F(IndexingAnalysisTest, UnsupportedOps) {
   auto output_indexing_1 = GetInputToOutputIndexing(root, 1);
   EXPECT_THAT(output_indexing_1.indexing_maps,
               ElementsAre(ElementsAre(UndefinedMap())));
-
-  auto output_indexing_2 = GetInputToOutputIndexing(root, 2);
-  EXPECT_THAT(output_indexing_2.indexing_maps,
-              ElementsAre(ElementsAre(UndefinedMap())));
 }
 
 TEST_F(IndexingAnalysisTest, FusionWithUnsupportedOp) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     fused_computation {
-      input = f32[20, 20] parameter(0)
-      start_indices = s32[2] parameter(1)
-      lhs = f32[5, 5] dynamic-slice(f32[20,20] input, s32[2] start_indices),
-          dynamic_slice_sizes={5, 5}
-      rhs = f32[5, 5] slice(f32[20, 20] input),
-          slice={[0:20:4], [0:5:1]}
-      ROOT add = f32[5, 5] add(lhs, rhs)
+      p0 = f32[20, 20] parameter(0)
+      p1 = f32[4,4] parameter(1)
+      p2 = f32[4,3] parameter(2)
+      lhs =  f32[4,3] triangular-solve(f32[4,4] p1, f32[4,3] p2),
+        left_side=true,
+        lower=true,
+        transpose_a=NO_TRANSPOSE,
+        unit_diagonal=true
+      rhs = f32[4, 3] slice(f32[20, 20] p0),
+          slice={[0:20:6], [0:5:2]}
+      ROOT add = f32[4, 3] add(lhs, rhs)
     }
     ENTRY e {
       p0 = f32[20, 20] parameter(0)
-      p1 = s32[2] parameter(1)
-      ROOT fusion = f32[5, 5] fusion(p0, p1), kind=kLoop,
+      p1 = f32[4, 4] parameter(1)
+      p2 = f32[4, 3] parameter(2)
+      ROOT fusion = f32[4, 3] fusion(p0, p1, p2), kind=kLoop,
           calls=fused_computation
     }
   )"));
-  EXPECT_THAT(input_indexing.indexing_maps,
-              ElementsAre(UnorderedElementsAre(MatchIndexingMap(R"(
-                            (d0, d1) -> (d0 * 4, d1)
+  EXPECT_THAT(
+      input_indexing.indexing_maps,
+      ElementsAre(UnorderedElementsAre(MatchIndexingMap(R"(
+                            (d0, d1) -> (d0 * 6, d1 * 2)
                             domain:
-                            d0 in [0, 4]
-                            d1 in [0, 4]
-                          )"),
-                                               UndefinedMap()),
-                          ElementsAre(UndefinedMap())));
+                            d0 in [0, 3]
+                            d1 in [0, 2]
+                          )")),
+                  ElementsAre(UndefinedMap()), ElementsAre(UndefinedMap())));
 }
 
 TEST_F(IndexingAnalysisTest, TilingIndexing) {
@@ -2084,7 +2546,7 @@ TEST_F(IndexingAnalysisTest, TilingIndexing) {
                 /*tile_sizes=*/{8, 1, 4},
                 /*num_threads=*/{1, 4, 4}};
   auto indexing_map = GetIndexingMapForTiling(tiling, &mlir_context_);
-  indexing_map.Simplify();
+  indexing_map.Simplify(GetIndexingMapForInstruction);
   EXPECT_THAT(indexing_map.ToString(), MatchIndexingString(R"(
         (d0, d1, d2, d3, d4, d5)[s0, s1, s2] -> (
           (d3 floordiv 64) * 8 + s0,
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel_stub.cc b/third_party/xla/xla/service/gpu/model/indexing_context.h
similarity index 52%
rename from third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel_stub.cc
rename to third_party/xla/xla/service/gpu/model/indexing_context.h
index 5286b5445b8b56..e5dfc6adb7d3c4 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel_stub.cc
+++ b/third_party/xla/xla/service/gpu/model/indexing_context.h
@@ -13,10 +13,27 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/stream_executor/gpu/gpu_timer_kernel.h"
+#ifndef XLA_SERVICE_GPU_MODEL_INDEXING_CONTEXT_H_
+#define XLA_SERVICE_GPU_MODEL_INDEXING_CONTEXT_H_
 
-namespace stream_executor::gpu {
-namespace delay_kernel {
-void* kernel() { return nullptr; }
-}  // namespace delay_kernel
-}  // namespace stream_executor::gpu
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "xla/service/gpu/model/indexing_map.h"
+
+namespace xla {
+namespace gpu {
+
+class IndexingContext {
+ public:
+  explicit IndexingContext(mlir::MLIRContext* mlir_context)
+      : mlir_context_(mlir_context) {}
+
+  mlir::MLIRContext* GetMLIRContext() const { return mlir_context_; }
+
+ private:
+  mlir::MLIRContext* mlir_context_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // XLA_SERVICE_GPU_MODEL_INDEXING_CONTEXT_H_
diff --git a/third_party/xla/xla/service/gpu/model/indexing_map.cc b/third_party/xla/xla/service/gpu/model/indexing_map.cc
index 27ec08b41c58bc..a740243bc2e795 100644
--- a/third_party/xla/xla/service/gpu/model/indexing_map.cc
+++ b/third_party/xla/xla/service/gpu/model/indexing_map.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "xla/service/gpu/model/indexing_map.h"
 
 #include <algorithm>
+#include <cstddef>
 #include <cstdint>
 #include <functional>
 #include <numeric>
@@ -24,6 +25,7 @@ limitations under the License.
 #include <sstream>
 #include <string>
 #include <utility>
+#include <variant>
 #include <vector>
 
 #include "absl/types/span.h"
@@ -34,6 +36,10 @@ limitations under the License.
 #include "mlir/IR/AffineMap.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "xla/hlo/ir/hlo_casting_utils.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/service/gpu/model/affine_map_printer.h"
 #include "tsl/platform/logging.h"  // IWYU pragma: keep
 
@@ -376,7 +382,7 @@ AffineExpr AffineExprSimplifier::SimplifyOnce(AffineExpr expr) {
       auto rhs = SimplifyOnce(binop.getRHS());
 
       // Rewrite `(x // c) * c + (x % c)` to `x`.
-      // TODO(jreiffers): This should also work with (a+b)+c.
+      // This should also work with (a+b)+c.
       auto rewrite_add = [&](AffineExpr a, AffineExpr b) -> AffineExpr {
         if (auto mod = GetConstantRhs(a, AffineExprKind::Mod)) {
           if (auto mul = GetConstantRhs(b, AffineExprKind::Mul); mod == mul) {
@@ -564,6 +570,59 @@ bool SimplifyConstraintRange(AffineExpr* expr, Interval* range) {
   return is_simplified;
 }
 
+// Computes the symbols list replacement to go from
+// [range_vars(second)|rt_vars(second)|range_vars(first)|rt_vars(first)]
+// to
+// [range_vars(second)|range_vars(first)|rt_vars(second)|rt_vars(first)].
+SmallVector<AffineExpr, 4> GetComposedSymbolsPermutationToCorrectOrder(
+    const IndexingMap& first, const IndexingMap& second) {
+  SmallVector<AffineExpr, 4> symbol_replacements;
+  MLIRContext* mlir_context = first.GetMLIRContext();
+  for (int id = 0; id < second.GetRangeVarsCount(); ++id) {
+    symbol_replacements.push_back(getAffineSymbolExpr(id, mlir_context));
+  }
+  int64_t rt_vars_second_start =
+      first.GetRangeVarsCount() + second.GetRangeVarsCount();
+  for (int64_t id = 0; id < second.GetRTVarsCount(); ++id) {
+    symbol_replacements.push_back(
+        getAffineSymbolExpr(rt_vars_second_start++, mlir_context));
+  }
+  int64_t range_vars_first_start = second.GetRangeVarsCount();
+  for (int64_t id = 0; id < first.GetRangeVarsCount(); ++id) {
+    symbol_replacements.push_back(
+        getAffineSymbolExpr(range_vars_first_start++, mlir_context));
+  }
+  int64_t rt_vars_first_start = rt_vars_second_start + second.GetRTVarsCount();
+  for (int64_t id = 0; id < first.GetRTVarsCount(); ++id) {
+    symbol_replacements.push_back(
+        getAffineSymbolExpr(rt_vars_first_start++, mlir_context));
+  }
+  return symbol_replacements;
+}
+
+// Computes the symbols list mapping to go from
+// [range_vars(map)|rt_vars(map)]
+// to
+// [range_vars(second)|range_vars(first)|rt_vars(second)|rt_vars(first)].
+SmallVector<AffineExpr, 4> MapSymbolsToComposedSymbolsList(
+    const IndexingMap& map, const IndexingMap& composed) {
+  SmallVector<AffineExpr, 4> symbol_replacements;
+
+  MLIRContext* mlir_context = map.GetMLIRContext();
+  int64_t range_vars_start =
+      composed.GetRangeVarsCount() - map.GetRangeVarsCount();
+  for (int64_t id = 0; id < map.GetRangeVarsCount(); ++id) {
+    symbol_replacements.push_back(
+        getAffineSymbolExpr(range_vars_start++, mlir_context));
+  }
+  int64_t rt_vars_start = composed.GetSymbolCount() - map.GetRTVarsCount();
+  for (int64_t id = 0; id < map.GetRTVarsCount(); ++id) {
+    symbol_replacements.push_back(
+        getAffineSymbolExpr(rt_vars_start++, mlir_context));
+  }
+  return symbol_replacements;
+}
+
 }  // namespace
 
 std::string Interval::ToString() const {
@@ -585,12 +644,35 @@ bool operator==(const Interval& lhs, const Interval& rhs) {
   return lhs.lower == rhs.lower && lhs.upper == rhs.upper;
 }
 
-std::vector<Interval> RangesFromTensorSizes(
+bool operator==(const DimVar& lhs, const DimVar& rhs) {
+  return lhs.bounds == rhs.bounds;
+}
+
+bool operator==(const RangeVar& lhs, const RangeVar& rhs) {
+  return lhs.range == rhs.range;
+}
+
+bool operator==(const RTVar& lhs, const RTVar& rhs) {
+  return lhs.feasible_values == rhs.feasible_values && lhs.hlo == rhs.hlo &&
+         lhs.map == rhs.map;
+}
+
+std::vector<DimVar> DimVarsFromTensorSizes(
     absl::Span<const int64_t> tensor_sizes) {
-  std::vector<Interval> ranges;
+  std::vector<DimVar> ranges;
   ranges.reserve(tensor_sizes.size());
   for (int64_t size : tensor_sizes) {
-    ranges.push_back(Interval{0, size - 1});
+    ranges.push_back({Interval{0, size - 1}});
+  }
+  return ranges;
+}
+
+std::vector<RangeVar> RangeVarsFromTensorSizes(
+    absl::Span<const int64_t> tensor_sizes) {
+  std::vector<RangeVar> ranges;
+  ranges.reserve(tensor_sizes.size());
+  for (int64_t size : tensor_sizes) {
+    ranges.push_back({Interval{0, size - 1}});
   }
   return ranges;
 }
@@ -598,18 +680,66 @@ std::vector<Interval> RangesFromTensorSizes(
 IndexingMap IndexingMap::FromTensorSizes(
     AffineMap affine_map, absl::Span<const int64_t> dim_upper_bounds,
     absl::Span<const int64_t> symbol_upper_bounds) {
-  return IndexingMap{affine_map, RangesFromTensorSizes(dim_upper_bounds),
-                     RangesFromTensorSizes(symbol_upper_bounds)};
+  return IndexingMap{affine_map, DimVarsFromTensorSizes(dim_upper_bounds),
+                     RangeVarsFromTensorSizes(symbol_upper_bounds),
+                     /*rt_vars=*/{}};
+}
+
+const Interval& IndexingMap::GetDimensionBound(int64_t dim_id) const {
+  return dim_vars_[dim_id].bounds;
+}
+
+Interval& IndexingMap::GetMutableDimensionBound(int64_t dim_id) {
+  return dim_vars_[dim_id].bounds;
+}
+
+std::vector<Interval> IndexingMap::GetDimensionBounds() const {
+  std::vector<Interval> bounds;
+  bounds.reserve(affine_map_.getNumDims());
+  for (const auto& dim : dim_vars_) {
+    bounds.push_back(dim.bounds);
+  }
+  return bounds;
+}
+
+const Interval& IndexingMap::GetSymbolBound(int64_t symbol_id) const {
+  // Because affine map symbols are packed like [range_vars, rt_vars],
+  // we have to pick the correct bounds.
+  int64_t range_var_count = GetRangeVarsCount();
+  return symbol_id < range_var_count
+             ? range_vars_[symbol_id].range
+             : rt_vars_[symbol_id - range_var_count].feasible_values;
+}
+
+Interval& IndexingMap::GetMutableSymbolBound(int64_t symbol_id) {
+  // Because affine map symbols are packed like [range_vars, rt_vars],
+  // we have to pick the correct bounds.
+  int64_t range_var_count = GetRangeVarsCount();
+  return symbol_id < range_var_count
+             ? range_vars_[symbol_id].range
+             : rt_vars_[symbol_id - range_var_count].feasible_values;
+}
+
+std::vector<Interval> IndexingMap::GetSymbolBounds() const {
+  std::vector<Interval> bounds;
+  bounds.reserve(affine_map_.getNumSymbols());
+  for (const auto& range_var : range_vars_) {
+    bounds.push_back(range_var.range);
+  }
+  for (const auto& rt_var : rt_vars_) {
+    bounds.push_back(rt_var.feasible_values);
+  }
+  return bounds;
 }
 
 void IndexingMap::AddConstraint(mlir::AffineExpr expr, Interval range) {
   if (auto dim_expr = mlir::dyn_cast<AffineDimExpr>(expr)) {
-    Interval& current_range = dim_ranges_[dim_expr.getPosition()];
+    Interval& current_range = GetMutableDimensionBound(dim_expr.getPosition());
     current_range = Intersect(current_range, range);
     return;
   }
   if (auto symbol_expr = mlir::dyn_cast<AffineSymbolExpr>(expr)) {
-    Interval& current_range = symbol_ranges_[symbol_expr.getPosition()];
+    Interval& current_range = GetMutableSymbolBound(symbol_expr.getPosition());
     current_range = Intersect(current_range, range);
     return;
   }
@@ -626,8 +756,8 @@ void IndexingMap::AddConstraint(mlir::AffineExpr expr, Interval range) {
 bool IndexingMap::ConstraintsSatisfied(
     ArrayRef<AffineExpr> dim_const_exprs,
     ArrayRef<AffineExpr> symbol_const_exprs) const {
-  CHECK(dim_const_exprs.size() == GetDimensionCount());
-  CHECK(symbol_const_exprs.size() == GetSymbolCount());
+  CHECK(dim_const_exprs.size() == affine_map_.getNumDims());
+  CHECK(symbol_const_exprs.size() == affine_map_.getNumSymbols());
   if (IsKnownEmpty()) {
     return false;
   }
@@ -655,14 +785,17 @@ SmallVector<int64_t, 4> IndexingMap::Evaluate(
 }
 
 bool IndexingMap::IsKnownEmpty() const {
-  auto is_infeasible = [](const Interval& range) {
-    return range.lower > range.upper;
-  };
-  return llvm::any_of(dim_ranges_, is_infeasible) ||
-         llvm::any_of(symbol_ranges_, is_infeasible) ||
+  return llvm::any_of(dim_vars_,
+                      [](const DimVar& dim_var) {
+                        return dim_var.bounds.lower > dim_var.bounds.upper;
+                      }) ||
+         llvm::any_of(range_vars_,
+                      [](const RangeVar& range_var) {
+                        return range_var.range.lower > range_var.range.upper;
+                      }) ||
          llvm::any_of(constraints_,
                       [&](const std::pair<AffineExpr, Interval>& item) {
-                        return is_infeasible(item.second);
+                        return item.second.lower > item.second.upper;
                       });
 }
 
@@ -749,14 +882,24 @@ void IndexingMap::Print(std::ostream& out,
                         const AffineMapPrinter& printer) const {
   printer.Print(out, affine_map_);
   out << "\ndomain:\n";
-  for (const auto& [index, range] : llvm::enumerate(dim_ranges_)) {
+  for (const auto& [index, dim_var] : llvm::enumerate(dim_vars_)) {
     out << printer.GetDimensionName(static_cast<int64_t>(index)) << " in ";
-    range.Print(out);
+    dim_var.bounds.Print(out);
     out << '\n';
   }
-  for (const auto& [index, range] : llvm::enumerate(symbol_ranges_)) {
+  for (const auto& [index, range_var] : llvm::enumerate(range_vars_)) {
     out << printer.GetSymbolName(static_cast<int64_t>(index)) << " in ";
-    range.Print(out);
+    range_var.range.Print(out);
+    out << '\n';
+  }
+  int64_t range_vars_count = GetRangeVarsCount();
+  for (const auto& [index, rt_var] : llvm::enumerate(rt_vars_)) {
+    out << printer.GetSymbolName(static_cast<int64_t>(range_vars_count + index))
+        << " in ";
+    rt_var.feasible_values.Print(out);
+    out << "\n  hlo: "
+        << (rt_var.hlo == nullptr ? "NULL" : rt_var.hlo->ToString()) << "\n  ";
+    printer.Print(out, rt_var.map);
     out << '\n';
   }
   std::vector<std::string> expr_range_strings;
@@ -774,6 +917,10 @@ void IndexingMap::Print(std::ostream& out,
   }
 }
 
+MLIRContext* IndexingMap::GetMLIRContext() const {
+  return IsUndefined() ? nullptr : affine_map_.getContext();
+}
+
 std::ostream& operator<<(std::ostream& out, const IndexingMap& indexing_map) {
   AffineMapPrinter printer;
   indexing_map.Print(out, printer);
@@ -782,8 +929,9 @@ std::ostream& operator<<(std::ostream& out, const IndexingMap& indexing_map) {
 
 bool operator==(const IndexingMap& lhs, const IndexingMap& rhs) {
   return lhs.GetAffineMap() == rhs.GetAffineMap() &&
-         lhs.GetDimensionRanges() == rhs.GetDimensionRanges() &&
-         lhs.GetSymbolRanges() == rhs.GetSymbolRanges();
+         lhs.GetDimVars() == rhs.GetDimVars() &&
+         lhs.GetRangeVars() == rhs.GetRangeVars() &&
+         lhs.GetRTVars() == rhs.GetRTVars();
 }
 
 IndexingMap operator*(const IndexingMap& lhs, const IndexingMap& rhs) {
@@ -803,9 +951,11 @@ IndexingMap operator*(const IndexingMap& lhs, const IndexingMap& rhs) {
 // RangeEvaluator for every constraint. Note that we start with "expr"
 // simplification, because the ranges of constraints were already optimized once
 // when IndexingMap was constructed.
-bool IndexingMap::Simplify() {
+bool IndexingMap::Simplify(IndexingMapProvider indexing_map_provider) {
   if (IsUndefined()) return false;
 
+  bool rtvars_were_eliminated = ReplaceConstantRTVars(indexing_map_provider);
+
   // Simplify constraints to shrink the lower/upper bounds of dims and symbols.
   bool constraints_were_simplified = false;
   while (true) {
@@ -813,21 +963,26 @@ bool IndexingMap::Simplify() {
     constraints_were_simplified = true;
     if (!SimplifyConstraintRanges()) break;
   }
+  // Simplify dependent constraints.
+  MergeModConstraints();
   // Simplify affine_map using the optimized ranges.
   // Potentially, we can be smarter about recreating the range_evaluator.
-  RangeEvaluator range_evaluator(dim_ranges_, symbol_ranges_, GetMLIRContext());
+  RangeEvaluator range_evaluator(GetDimensionBounds(), GetSymbolBounds(),
+                                 GetMLIRContext());
   AffineMap simplified_affine_map =
       AffineExprSimplifier(&range_evaluator).Simplify(affine_map_);
   bool affine_map_was_simplified = simplified_affine_map != affine_map_;
   if (affine_map_was_simplified) {
     affine_map_ = simplified_affine_map;
   }
-  return affine_map_was_simplified || constraints_were_simplified;
+  return affine_map_was_simplified || constraints_were_simplified ||
+         rtvars_were_eliminated;
 }
 
 bool IndexingMap::SimplifyConstraintExprs() {
   // Simplify affine expression in the constraints_.
-  RangeEvaluator range_evaluator(dim_ranges_, symbol_ranges_, GetMLIRContext());
+  RangeEvaluator range_evaluator(GetDimensionBounds(), GetSymbolBounds(),
+                                 GetMLIRContext());
   AffineExprSimplifier simplifier(&range_evaluator);
   std::vector<AffineExpr> to_remove;
   std::vector<std::pair<AffineExpr, Interval>> to_add;
@@ -963,19 +1118,26 @@ void IndexingMap::RemoveUnusedSymbols() {
   unsigned num_symbols_after = affine_map_.getNumSymbols();
   if (num_symbols_after == num_symbols_before) return;
 
-  std::vector<Interval> compressed_symbol_ranges_;
+  std::vector<RangeVar> compressed_range_vars;
+  std::vector<RTVar> compressed_rt_vars;
   MLIRContext* mlir_context = GetMLIRContext();
   int64_t used_symbols_count = 0;
   std::vector<AffineExpr> symbol_replacements(
       num_symbols_before, getAffineConstantExpr(0, mlir_context));
+  auto range_vars_count = range_vars_.size();
   for (int i = 0; i < unused_symbols_bit_vector.size(); ++i) {
     if (!unused_symbols_bit_vector[i]) {
-      compressed_symbol_ranges_.push_back(symbol_ranges_[i]);
+      if (i < range_vars_count) {
+        compressed_range_vars.push_back(range_vars_[i]);
+      } else {
+        compressed_rt_vars.push_back(rt_vars_[i - range_vars_count]);
+      }
       symbol_replacements[i] =
           getAffineSymbolExpr(used_symbols_count++, mlir_context);
     }
   }
-  symbol_ranges_ = std::move(compressed_symbol_ranges_);
+  range_vars_ = std::move(compressed_range_vars);
+  rt_vars_ = std::move(compressed_rt_vars);
   std::vector<AffineExpr> to_remove;
   std::vector<std::pair<AffineExpr, Interval>> to_add;
   for (const auto& [expr, range] : constraints_) {
@@ -992,6 +1154,67 @@ void IndexingMap::RemoveUnusedSymbols() {
   }
 }
 
+void IndexingMap::MergeModConstraints() {
+  RangeEvaluator range_evaluator(GetDimensionBounds(), GetSymbolBounds(),
+                                 GetMLIRContext());
+
+  // Group constraints by LHS.
+  llvm::DenseMap<AffineExpr, llvm::SmallVector<AffineBinaryOpExpr, 2>>
+      grouped_constraints;
+  for (const auto& [expr, _] : constraints_) {
+    if (expr.getKind() != AffineExprKind::Mod) continue;
+    auto binop = mlir::cast<AffineBinaryOpExpr>(expr);
+    grouped_constraints[binop.getLHS()].push_back(binop);
+  }
+
+  // Merge constraints of type MOD.
+  // (X mod 3 == 0) & (X mod 2 == 0) => (X mod 6 == 0)
+  for (const auto& [lhs, binops] : grouped_constraints) {
+    llvm::DenseMap<int64_t, llvm::SmallVector<AffineBinaryOpExpr, 2>>
+        mod_groups;
+    for (const auto& binop : binops) {
+      Interval mod_result = constraints_[binop];
+      if (mod_result.IsPoint()) {
+        mod_groups[mod_result.lower].push_back(binop);
+      }
+    }
+    if (mod_groups.empty()) continue;
+
+    // Update domain for dimensions and symbols only.
+    Interval* update = nullptr;
+    if (lhs.getKind() == AffineExprKind::DimId) {
+      update = &GetMutableDimensionBound(
+          mlir::cast<AffineDimExpr>(lhs).getPosition());
+    } else if (lhs.getKind() == AffineExprKind::SymbolId) {
+      update = &GetMutableSymbolBound(
+          mlir::cast<AffineSymbolExpr>(lhs).getPosition());
+    }
+    for (const auto& [res, ops] : mod_groups) {
+      // Calculate least common multiple for the divisors.
+      int64_t div = 1;
+      for (const auto& op : ops) {
+        int64_t rhs_value =
+            range_evaluator.ComputeExpressionRange(op.getRHS()).lower;
+        div = std::lcm(div, rhs_value);
+      }
+      // Replace multiple constraints with a merged one.
+      if (ops.size() > 1) {
+        for (const auto& op : ops) {
+          constraints_.erase(op);
+        }
+        constraints_[lhs % div] = Interval{res, res};
+      }
+      // Update dimension and symbol bounds.
+      if (update != nullptr) {
+        int64_t l = (update->lower / div) * div + res;
+        update->lower = l >= update->lower ? l : l + div;
+        int64_t h = (update->upper / div) * div + res;
+        update->upper = h <= update->upper ? h : h - div;
+      }
+    }
+  }
+}
+
 IndexingMap ComposeIndexingMaps(const IndexingMap& first,
                                 const IndexingMap& second) {
   if (second.IsUndefined() || first.IsUndefined()) {
@@ -1001,18 +1224,29 @@ IndexingMap ComposeIndexingMaps(const IndexingMap& first,
   AffineMap composed_map = producer_affine_map.compose(first.GetAffineMap());
 
   // The symbols in the composed map, i.e. combined
-  // producer_map.compose(consumer_map) are packed as [symbols(producer_map) |
-  // symbols(consumer_map)].
-  std::vector<Interval> combined_symbol_ranges;
-  combined_symbol_ranges.reserve(second.GetSymbolCount() +
-                                 first.GetSymbolCount());
-  for (const Interval& symbol_range : llvm::concat<const Interval>(
-           second.GetSymbolRanges(), first.GetSymbolRanges())) {
-    combined_symbol_ranges.push_back(symbol_range);
-  }
-
-  IndexingMap composed_indexing_map(composed_map, first.GetDimensionRanges(),
-                                    std::move(combined_symbol_ranges));
+  // producer_map.compose(consumer_map) are packed as
+  // [range_vars(second)|rt_vars(second)|range_vars(first)|rt_vars(first)].
+  std::vector<RangeVar> combined_range_vars;
+  combined_range_vars.reserve(second.GetRangeVarsCount() +
+                              first.GetRangeVarsCount());
+  for (const RangeVar& range_var : llvm::concat<const RangeVar>(
+           second.GetRangeVars(), first.GetRangeVars())) {
+    combined_range_vars.push_back(range_var);
+  }
+  std::vector<RTVar> combined_rt_vars;
+  combined_rt_vars.reserve(second.GetRTVarsCount() + first.GetRTVarsCount());
+  for (const RTVar& rt_var :
+       llvm::concat<const RTVar>(second.GetRTVars(), first.GetRTVars())) {
+    combined_rt_vars.push_back(rt_var);
+  }
+  // The symbols in the composed map have to be permuted to keep the invariant
+  // that range_vars go before rt_vars in the composed affine map symbols list.
+  SmallVector<AffineExpr, 4> symbol_replacements =
+      GetComposedSymbolsPermutationToCorrectOrder(first, second);
+  IndexingMap composed_indexing_map(composed_map, first.GetDimVars(),
+                                    std::move(combined_range_vars),
+                                    std::move(combined_rt_vars));
+
   // Add constraints that are already present in the producer_map. We have to
   // compute consumer_map(producer_constraints). To keep all symbols and
   // dimension IDs the same as in the `composed_indexing_map.affine_map`, we
@@ -1028,29 +1262,183 @@ IndexingMap ComposeIndexingMaps(const IndexingMap& first,
   auto constraints_map = AffineMap::get(
       producer_affine_map.getNumDims(), producer_affine_map.getNumSymbols(),
       constraints, producer_affine_map.getContext());
-  auto remapped_constraints = constraints_map.compose(first.GetAffineMap());
+  auto remapped_constraints =
+      constraints_map.compose(first.GetAffineMap())
+          .replaceDimsAndSymbols(/*dimReplacements=*/{}, symbol_replacements,
+                                 composed_indexing_map.GetDimensionCount(),
+                                 composed_indexing_map.GetSymbolCount());
   for (const auto& [expr, range] :
        llvm::zip(remapped_constraints.getResults(), constraints_ranges)) {
     composed_indexing_map.AddConstraint(expr, range);
   }
   // Remap symbol ids and add constraints that are already present in the
   // consumer_map.
+  SmallVector<AffineExpr, 4> first_map_symbols_to_composed_symbols =
+      MapSymbolsToComposedSymbolsList(first, composed_indexing_map);
   for (const auto& [expr, range] : first.GetConstraints()) {
     composed_indexing_map.AddConstraint(
-        expr.shiftSymbols(first.GetSymbolCount(), second.GetSymbolCount()),
-        range);
+        expr.replaceSymbols(first_map_symbols_to_composed_symbols), range);
   }
   // Add constraints for consumer's codomain w.r.t. producer's domain.
   for (auto [index, expr] :
        llvm::enumerate(first.GetAffineMap().getResults())) {
     Interval producer_dim_range =
-        second.GetDimensionRange(static_cast<int64_t>(index));
+        second.GetDimensionBound(static_cast<int64_t>(index));
     composed_indexing_map.AddConstraint(
-        expr.shiftSymbols(first.GetSymbolCount(), second.GetSymbolCount()),
+        expr.replaceSymbols(first_map_symbols_to_composed_symbols),
         producer_dim_range);
   }
   return composed_indexing_map;
 }
 
+bool IndexingMap::RescaleSymbols() {
+  MergeModConstraints();
+
+  std::vector<AffineExpr> to_delete;
+
+  for (const auto& [expr, range] : constraints_) {
+    if (range.lower != range.upper) continue;
+    auto shift_value = range.lower;
+
+    if (expr.getKind() != AffineExprKind::Mod) continue;
+    auto mod_expr = mlir::cast<AffineBinaryOpExpr>(expr);
+
+    auto constant_expr = mlir::dyn_cast<AffineConstantExpr>(mod_expr.getRHS());
+    if (!constant_expr) continue;
+
+    // We don't rescale mod expressions with non-positive divisors.
+    if (constant_expr.getValue() <= 0) continue;
+    auto scaling_factor = constant_expr.getValue();
+
+    if (mod_expr.getLHS().getKind() != AffineExprKind::SymbolId) continue;
+    auto symbol_expr = mlir::cast<AffineSymbolExpr>(mod_expr.getLHS());
+
+    affine_map_ = affine_map_.replace(
+        symbol_expr, constant_expr * symbol_expr + shift_value,
+        affine_map_.getNumDims(), affine_map_.getNumSymbols());
+
+    for (auto& [other_expr, other_range] : constraints_) {
+      if (other_expr == expr) continue;
+      if (!other_expr.isFunctionOfSymbol(symbol_expr.getPosition())) continue;
+
+      other_expr = other_expr.replace(
+          symbol_expr, constant_expr * symbol_expr + shift_value);
+    }
+
+    auto& symbol_range = range_vars_[symbol_expr.getPosition()].range;
+    symbol_range.lower = (symbol_range.lower - shift_value) / scaling_factor;
+    symbol_range.upper = (symbol_range.upper - shift_value) / scaling_factor;
+
+    to_delete.emplace_back(expr);
+  }
+
+  for (const auto& expr : to_delete) {
+    constraints_.erase(expr);
+  }
+
+  return !to_delete.empty();
+}
+
+// Returns either:
+// 1. an AffineExpr if the RTVar folds entirely into a constant expression
+// 2. an updated RTVar if some partial optimization was possible
+// 3. an unchanged RTVar if no optimization was possible
+static std::variant<AffineExpr, RTVar> OptimizeRTVar(
+    RTVar rt_var, MLIRContext* mlir_context,
+    IndexingMap::IndexingMapProvider indexing_map_provider) {
+  while (true) {
+    if (auto constant_expr = DynCast<HloConstantInstruction>(rt_var.hlo)) {
+      if (rt_var.map.isConstant()) {
+        const auto idx = rt_var.map.getConstantResults();
+        return getAffineConstantExpr(
+            constant_expr->literal().GetIntegralAsS64(idx).value(),
+            mlir_context);
+      }
+      return rt_var;
+    }
+
+    if (auto iota_expr = DynCast<HloIotaInstruction>(rt_var.hlo)) {
+      auto iota_dimension = iota_expr->iota_dimension();
+      CHECK(iota_dimension < rt_var.map.getNumResults());
+      return rt_var.map.getResults()[iota_dimension];
+    }
+
+    auto is_indexing_transformation = [](const HloInstruction* instr) {
+      return instr->opcode() == HloOpcode::kBitcast ||
+             instr->opcode() == HloOpcode::kBroadcast ||
+             instr->opcode() == HloOpcode::kReshape ||
+             instr->opcode() == HloOpcode::kReverse ||
+             instr->opcode() == HloOpcode::kSlice ||
+             instr->opcode() == HloOpcode::kTranspose;
+    };
+
+    if (is_indexing_transformation(rt_var.hlo)) {
+      auto instr_indexing_map =
+          indexing_map_provider(rt_var.hlo, 0, mlir_context);
+
+      rt_var.hlo = rt_var.hlo->operand(0);
+      rt_var.map = instr_indexing_map.GetAffineMap().compose(rt_var.map);
+      continue;
+    }
+
+    return rt_var;
+  }
+}
+
+bool IndexingMap::ReplaceConstantRTVars(
+    IndexingMap::IndexingMapProvider indexing_map_provider) {
+  if (rt_vars_.empty()) return false;
+
+  std::vector<size_t> to_delete;
+
+  for (auto index = 0; index < rt_vars_.size(); ++index) {
+    auto& rt_var = rt_vars_[index];
+    auto result =
+        OptimizeRTVar(rt_var, GetMLIRContext(), indexing_map_provider);
+
+    // If we got an RTVar back, then we just replace it and move on.
+    if (std::holds_alternative<RTVar>(result)) {
+      rt_var = std::get<RTVar>(std::move(result));
+      continue;
+    }
+
+    // But if we received an AffineExpr we can eliminate the RTVar from
+    // all expressions in the indexing map.
+    auto folded_expr = std::get<AffineExpr>(std::move(result));
+
+    // range_vars and rt_vars share the symbol space, with the rt_vars coming
+    // after the range_vars.
+    auto symbol_index = range_vars_.size() + index;
+    affine_map_ = affine_map_.replace(
+        {{mlir::getAffineSymbolExpr(symbol_index, GetMLIRContext()),
+          folded_expr}});
+
+    llvm::DenseMap<AffineExpr, AffineExpr> replacements;
+
+    for (const auto& [constraint, interval] : constraints_) {
+      auto modified_constraint = constraint.replace(
+          mlir::getAffineSymbolExpr(symbol_index, GetMLIRContext()),
+          folded_expr);
+
+      if (constraint == modified_constraint) continue;
+      replacements[constraint] = modified_constraint;
+    }
+
+    for (const auto& [old_expr, new_expr] : replacements) {
+      auto interval = constraints_.at(old_expr);
+      constraints_.erase(old_expr);
+      constraints_[new_expr] = interval;
+    }
+
+    to_delete.emplace_back(index);
+  }
+
+  for (auto index : llvm::reverse(to_delete)) {
+    rt_vars_.erase(rt_vars_.begin() + index);
+  }
+
+  return !to_delete.empty();
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/model/indexing_map.h b/third_party/xla/xla/service/gpu/model/indexing_map.h
index 655d877745860f..bfc8abf30bdd39 100644
--- a/third_party/xla/xla/service/gpu/model/indexing_map.h
+++ b/third_party/xla/xla/service/gpu/model/indexing_map.h
@@ -31,6 +31,8 @@ limitations under the License.
 #include "mlir/IR/AffineExpr.h"  // from @llvm-project
 #include "mlir/IR/AffineMap.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/service/gpu/model/affine_map_printer.h"
 
 namespace xla {
@@ -135,7 +137,53 @@ class RangeEvaluator {
   llvm::DenseMap<mlir::AffineExpr, Interval> expression_ranges_cache_;
 };
 
-std::vector<Interval> RangesFromTensorSizes(
+// Dimension variable represents a dimension of a tensor or a GPU grid.
+// Dimensions correspond to the dimension parameter of `affine_map_`.
+struct DimVar {
+  Interval bounds;
+};
+bool operator==(const DimVar& lhs, const DimVar& rhs);
+
+template <typename H>
+H AbslHashValue(H h, const DimVar& dimension) {
+  return H::combine(std::move(h), dimension.bounds);
+}
+
+// RangeSymbol variable represents a range of values, e.g. to compute a single
+// element of the reduction's result we need a range of values from the input
+// tensor. RangeSymbol variables correspond to the front portion of the
+// symbols in `affine_map_`.
+struct RangeVar {
+  Interval range;
+};
+bool operator==(const RangeVar& lhs, const RangeVar& rhs);
+
+template <typename H>
+H AbslHashValue(H h, const RangeVar& range_var) {
+  return H::combine(std::move(h), range_var.range);
+}
+
+// RTSymbol variable represents a runtime symbol, e.g. a dynamic offset in
+// HLO dynamic-update-slice op. RTSymbol variables correspond to the back
+// portion of the symbols in `affine_map_`.
+struct RTVar {
+  Interval feasible_values;
+  const HloInstruction* hlo;
+  mlir::AffineMap map;
+};
+bool operator==(const RTVar& lhs, const RTVar& rhs);
+
+template <typename H>
+H AbslHashValue(H h, const RTVar& rt_var) {
+  llvm::hash_code map_hash = llvm::hash_combine(rt_var.map);
+  return H::combine(std::move(h), rt_var.feasible_values, rt_var.hlo,
+                    static_cast<size_t>(map_hash));
+}
+
+std::vector<DimVar> DimVarsFromTensorSizes(
+    absl::Span<const int64_t> tensor_sizes);
+
+std::vector<RangeVar> RangeVarsFromTensorSizes(
     absl::Span<const int64_t> tensor_sizes);
 
 // Contains an affine map with N dimension expressions and M symbols:
@@ -166,23 +214,24 @@ std::vector<Interval> RangesFromTensorSizes(
 class IndexingMap {
  public:
   IndexingMap(
-      mlir::AffineMap affine_map, std::vector<Interval> dim_ranges,
-      std::vector<Interval> symbol_ranges,
+      mlir::AffineMap affine_map, std::vector<DimVar> dimensions,
+      std::vector<RangeVar> range_vars, std::vector<RTVar> rt_vars,
       absl::Span<std::pair<mlir::AffineExpr, Interval>> constraints = {})
       : affine_map_(affine_map),
-        dim_ranges_(std::move(dim_ranges)),
-        symbol_ranges_(std::move(symbol_ranges)) {
+        dim_vars_(std::move(dimensions)),
+        range_vars_(std::move(range_vars)),
+        rt_vars_(std::move(rt_vars)) {
     for (const auto& [expr, range] : constraints) {
       AddConstraint(expr, range);
     }
   }
-
-  IndexingMap(mlir::AffineMap affine_map, std::vector<Interval> dim_ranges,
-              std::vector<Interval> symbol_ranges,
+  IndexingMap(mlir::AffineMap affine_map, std::vector<DimVar> dimensions,
+              std::vector<RangeVar> range_vars, std::vector<RTVar> rt_vars,
               const llvm::DenseMap<mlir::AffineExpr, Interval>& constraints)
       : affine_map_(affine_map),
-        dim_ranges_(std::move(dim_ranges)),
-        symbol_ranges_(std::move(symbol_ranges)),
+        dim_vars_(std::move(dimensions)),
+        range_vars_(std::move(range_vars)),
+        rt_vars_(std::move(rt_vars)),
         constraints_(constraints) {}
 
   static IndexingMap GetUndefined() { return IndexingMap(); }
@@ -196,28 +245,46 @@ class IndexingMap {
 
   void Print(std::ostream& out, const AffineMapPrinter& printer) const;
 
+  // TODO(hebecker): Rearrange code structure so that we can call
+  // `ComputeInputToOutputIndexing` from `:indexing_analysis` directly.
+  using IndexingMapProvider = llvm::function_ref<IndexingMap(
+      const HloInstruction*, int64_t /*operand id*/, mlir::MLIRContext*)>;
+
   // Returns true if the map was simplified.
-  bool Simplify();
+  bool Simplify(IndexingMapProvider indexing_map_provider);
 
   // Return MLIRContext.
-  mlir::MLIRContext* GetMLIRContext() const { return affine_map_.getContext(); }
+  mlir::MLIRContext* GetMLIRContext() const;
 
   // Returns the affine map.
   mlir::AffineMap GetAffineMap() const { return affine_map_; }
 
-  // Getters for dimension ranges.
-  Interval GetDimensionRange(int64_t id) const { return dim_ranges_[id]; }
-  const std::vector<Interval>& GetDimensionRanges() const {
-    return dim_ranges_;
-  }
-  int64_t GetDimensionCount() const { return dim_ranges_.size(); }
-
-  // Getters for symbol ranges.
-  Interval GetSymbolRange(int64_t id) const { return symbol_ranges_[id]; }
-  const std::vector<Interval>& GetSymbolRanges() const {
-    return symbol_ranges_;
-  }
-  int64_t GetSymbolCount() const { return symbol_ranges_.size(); }
+  // Getters for dimension vars.
+  const DimVar& GetDimVars(int64_t id) const { return dim_vars_[id]; }
+  const std::vector<DimVar>& GetDimVars() const { return dim_vars_; }
+  int64_t GetDimVarsCount() const { return dim_vars_.size(); }
+
+  // Getters for range vars.
+  const RangeVar& GetRangeVar(int64_t id) const { return range_vars_[id]; }
+  const std::vector<RangeVar>& GetRangeVars() const { return range_vars_; }
+  int64_t GetRangeVarsCount() const { return range_vars_.size(); }
+
+  // Getters for runtime vars.
+  const RTVar& GetRTVar(int64_t id) const { return rt_vars_[id]; }
+  const std::vector<RTVar>& GetRTVars() const { return rt_vars_; }
+  int64_t GetRTVarsCount() const { return rt_vars_.size(); }
+
+  // Gets bounds of `affine_map_` dimensions.
+  const Interval& GetDimensionBound(int64_t dim_id) const;
+  Interval& GetMutableDimensionBound(int64_t dim_id);
+  std::vector<Interval> GetDimensionBounds() const;
+  int64_t GetDimensionCount() const { return affine_map_.getNumDims(); }
+
+  // Gets bounds of `affine_map_` symbols.
+  const Interval& GetSymbolBound(int64_t symbol_id) const;
+  Interval& GetMutableSymbolBound(int64_t symbol_id);
+  std::vector<Interval> GetSymbolBounds() const;
+  int64_t GetSymbolCount() const { return affine_map_.getNumSymbols(); }
 
   // Getters for affine expression constraints.
   const llvm::DenseMap<mlir::AffineExpr, Interval>& GetConstraints() const {
@@ -254,6 +321,10 @@ class IndexingMap {
   // Removes unused symbols from the `affine_map_` and constraints.
   void RemoveUnusedSymbols();
 
+  // Rescales all symbols that are sufficiently constrained through `s? mod x =
+  // [N, N]` constraints. Returns true if a rescale took place, otherwise false.
+  bool RescaleSymbols();
+
  private:
   IndexingMap() = default;
 
@@ -265,9 +336,17 @@ class IndexingMap {
   // Returns true if simplification was performed.
   bool SimplifyConstraintRanges();
 
+  // Merges "mod" constraints for the same AffineExpr.
+  void MergeModConstraints();
+
+  // Replace RTVars that yield constants by indexing expressions.
+  // Returns true if a replacement was performed, otherwise false.
+  bool ReplaceConstantRTVars(IndexingMapProvider indexing_map_provider);
+
   mlir::AffineMap affine_map_;
-  std::vector<Interval> dim_ranges_;
-  std::vector<Interval> symbol_ranges_;
+  std::vector<DimVar> dim_vars_;
+  std::vector<RangeVar> range_vars_;
+  std::vector<RTVar> rt_vars_;
   // Inequality constraints for affine expressions. They restrict the feasible
   // set for the domain of the indexing map. It contains affine expressions
   // other than AffineDimExpr and AffineSymbolExpr.
@@ -286,8 +365,8 @@ H AbslHashValue(H h, const IndexingMap& indexing_map) {
   llvm::hash_code affine_map_hash =
       llvm::hash_combine(indexing_map.GetAffineMap());
   return H::combine(std::move(h), static_cast<size_t>(affine_map_hash),
-                    indexing_map.GetDimensionRanges(),
-                    indexing_map.GetSymbolRanges(),
+                    indexing_map.GetDimVars(), indexing_map.GetRangeVars(),
+                    indexing_map.GetRTVars(),
                     indexing_map.GetConstraintsCount());
 }
 
diff --git a/third_party/xla/xla/service/gpu/model/indexing_map_test.cc b/third_party/xla/xla/service/gpu/model/indexing_map_test.cc
index 2e7cbd309b1489..2b250db7871099 100644
--- a/third_party/xla/xla/service/gpu/model/indexing_map_test.cc
+++ b/third_party/xla/xla/service/gpu/model/indexing_map_test.cc
@@ -15,15 +15,22 @@ limitations under the License.
 
 #include "xla/service/gpu/model/indexing_map.h"
 
+#include <cstdint>
 #include <optional>
+#include <utility>
+#include <vector>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "mlir/IR/AffineExpr.h"  // from @llvm-project
 #include "mlir/IR/AffineMap.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/literal_util.h"
 #include "xla/service/gpu/model/affine_map_printer.h"
+#include "xla/service/gpu/model/indexing_analysis.h"
 #include "xla/service/gpu/model/indexing_test_utils.h"
+#include "xla/shape_util.h"
 #include "xla/tests/hlo_test_base.h"
 #include "tsl/platform/test.h"
 
@@ -31,6 +38,7 @@ namespace xla {
 namespace gpu {
 namespace {
 
+using ::mlir::AffineMap;
 using ::testing::ElementsAre;
 
 class IndexingMapTest : public HloTestBase {
@@ -39,6 +47,36 @@ class IndexingMapTest : public HloTestBase {
   AffineMapPrinter printer_;
 };
 
+TEST_F(IndexingMapTest, RTVar) {
+  auto zero_dim_map = AffineMap::get(&mlir_context_);
+  std::vector<RTVar> rt_vars{RTVar{Interval{0, 2},
+                                   /*instr=*/nullptr, zero_dim_map},
+                             RTVar({Interval{0, 7},
+                                    /*instr=*/nullptr, zero_dim_map})};
+
+  IndexingMap indexing_map(
+      ParseAffineMap("(d0, d1)[s0, s1, s2] -> (d1, d0, s0 + s1, s1)",
+                     &mlir_context_),
+      {DimVar{{0, 99}}, DimVar{{0, 43}}}, {RangeVar{{-99, 99}}},
+      std::move(rt_vars));
+  printer_.SetSymbolName(0, "range");
+  printer_.SetSymbolName(1, "rt_0");
+  printer_.SetSymbolName(2, "rt_1");
+  EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
+              (d0, d1)[range, rt_0, rt_1] -> (d1, d0, range + rt_0, rt_0)
+              domain:
+              d0 in [0, 99]
+              d1 in [0, 43]
+              range in [-99, 99]
+              rt_0 in [0, 2]
+                hlo: NULL
+                () -> ()
+              rt_1 in [0, 7]
+                hlo: NULL
+                () -> ()
+              )"));
+}
+
 TEST_F(IndexingMapTest, Evaluation) {
   IndexingMap indexing_map = IndexingMap::FromTensorSizes(
       ParseAffineMap("(d0, d1)[s0, s1] -> (d1, d0, s1, s0)", &mlir_context_),
@@ -130,14 +168,14 @@ TEST_F(IndexingMapTest, Composition_ProducerAndConsumerHaveConstraints) {
                           s0 mod 3 in [1, 1]
                           s2 mod 4 in [0, 0]
                         )"));
-  composed.Simplify();
+  composed.Simplify(GetIndexingMapForInstruction);
   EXPECT_THAT(composed, MatchIndexingMap(R"(
                           (d0)[s0, s1, s2] -> (s2, d0, s1, s0)
                           domain:
-                          d0 in [0, 9]
-                          s0 in [0, 69]
+                          d0 in [0, 8]
+                          s0 in [1, 67]
                           s1 in [0, 19]
-                          s2 in [0, 7]
+                          s2 in [0, 4]
                           d0 mod 8 in [0, 0]
                           s0 mod 3 in [1, 1]
                           s2 mod 4 in [0, 0]
@@ -202,6 +240,32 @@ TEST_F(IndexingMapTest, RemoveUnusedSymbols_ConstraintsWithManySymbols) {
                             )"));
 }
 
+TEST_F(IndexingMapTest, RemoveUnusedSymbols_ConstraintsWithRTVars) {
+  auto zero_dim_map = AffineMap::get(&mlir_context_);
+  IndexingMap indexing_map(
+      ParseAffineMap("(d0)[s0, s1, s2, s3, s4] -> (d0 * 4 + s1 + s3 - 42)",
+                     &mlir_context_),
+      {DimVar{{0, 31}}}, {RangeVar{{0, 0}}, RangeVar{{0, 1}}, RangeVar{{0, 2}}},
+      {RTVar{Interval{0, 3},
+             /*instr=*/nullptr, zero_dim_map},
+       RTVar{Interval{0, 4},
+             /*instr=*/nullptr, zero_dim_map}});
+  indexing_map.AddConstraint(
+      ParseAffineExpr("d0 * 4 + s1 + s3", &mlir_context_), Interval{24, 459});
+  indexing_map.RemoveUnusedSymbols();
+  // Symbols s0, s2, s4 will be removed and s1 and s3 will become s0 and s1.
+  EXPECT_THAT(indexing_map, MatchIndexingMap(R"(
+                              (d0)[s0, s1] -> (d0 * 4 + s0 + s1 - 42)
+                              domain:
+                              d0 in [0, 31]
+                              s0 in [0, 1]
+                              s1 in [0, 3]
+                                hlo: NULL
+                                () -> ()
+                              d0 * 4 + s0 + s1 in [24, 459]
+                            )"));
+}
+
 TEST_F(IndexingMapTest, ConstraintIntervalSimplification_Sum) {
   IndexingMap indexing_map = IndexingMap::FromTensorSizes(
       ParseAffineMap("(d0) -> (d0)", &mlir_context_), {100}, {});
@@ -235,7 +299,7 @@ TEST_F(IndexingMapTest,
        ConstraintIntervalSimplification_FloorDivPositiveDivisorNegativeBounds) {
   IndexingMap indexing_map =
       IndexingMap(ParseAffineMap("(d0)[s0] -> (d0)", &mlir_context_),
-                  {Interval{0, 99}}, {Interval{-99, 99}});
+                  {DimVar{{0, 99}}}, {RangeVar{{-99, 99}}}, /*rt_vars=*/{});
 
   indexing_map.AddConstraint(ParseAffineExpr("s0 floordiv 3", &mlir_context_),
                              Interval{-11, -5});
@@ -251,7 +315,7 @@ TEST_F(IndexingMapTest,
        ConstraintIntervalSimplification_FloorDivNegativeDivisorNegativeBounds) {
   IndexingMap indexing_map =
       IndexingMap(ParseAffineMap("(d0)[s0] -> (d0)", &mlir_context_),
-                  {Interval{0, 99}}, {Interval{-99, 99}});
+                  {DimVar{{0, 99}}}, {RangeVar{{-99, 99}}}, /*rt_vars=*/{});
 
   indexing_map.AddConstraint(ParseAffineExpr("s0 floordiv -3", &mlir_context_),
                              Interval{-11, -5});
@@ -281,7 +345,7 @@ TEST_F(IndexingMapTest,
        ConstraintIntervalSimplification_MulPositiveMultiplierNegativeBounds) {
   IndexingMap indexing_map =
       IndexingMap(ParseAffineMap("(d0)[s0] -> (d0)", &mlir_context_),
-                  {Interval{0, 99}}, {Interval{-99, 99}});
+                  {DimVar{{0, 99}}}, {RangeVar{{-99, 99}}}, /*rt_vars=*/{});
 
   indexing_map.AddConstraint(ParseAffineExpr("s0 * 3", &mlir_context_),
                              Interval{-11, -5});
@@ -297,7 +361,7 @@ TEST_F(IndexingMapTest,
        ConstraintIntervalSimplification_MulNegativeMultiplierNegativeBounds) {
   IndexingMap indexing_map =
       IndexingMap(ParseAffineMap("(d0)[s0] -> (d0)", &mlir_context_),
-                  {Interval{0, 99}}, {Interval{-99, 99}});
+                  {DimVar{{0, 99}}}, {RangeVar{{-99, 99}}}, /*rt_vars=*/{});
 
   indexing_map.AddConstraint(ParseAffineExpr("s0 * -3", &mlir_context_),
                              Interval{-11, -5});
@@ -309,10 +373,38 @@ TEST_F(IndexingMapTest,
                         )"));
 }
 
+TEST_F(IndexingMapTest, ConstraintMerge_Mod) {
+  IndexingMap indexing_map(
+      ParseAffineMap("(d0)[s0, s1] -> (d0, s1, s0)", &mlir_context_),
+      {DimVar{{0, 4}}}, {RangeVar{{-21, -1}}, RangeVar{{0, 10}}},
+      /*rt_vars=*/{});
+  indexing_map.AddConstraint(ParseAffineExpr("d0 mod 3", &mlir_context_),
+                             Interval{0, 0});
+  indexing_map.AddConstraint(ParseAffineExpr("s0 mod 2", &mlir_context_),
+                             Interval{0, 0});
+  indexing_map.AddConstraint(ParseAffineExpr("s0 mod 3", &mlir_context_),
+                             Interval{0, 0});
+  indexing_map.AddConstraint(ParseAffineExpr("s1 mod 5", &mlir_context_),
+                             Interval{1, 1});
+  indexing_map.Simplify(GetIndexingMapForInstruction);
+
+  EXPECT_THAT(indexing_map.ToString(), MatchIndexingString(R"(
+                          (d0)[s0, s1] -> (d0, s1, s0)
+                          domain:
+                          d0 in [0, 3]
+                          s0 in [-18, -6]
+                          s1 in [1, 6]
+                          d0 mod 3 in [0, 0]
+                          s0 mod 6 in [0, 0]
+                          s1 mod 5 in [1, 1]
+                        )"));
+}
+
 TEST_F(IndexingMapTest, AffineMapSimplification_ConstantDims) {
-  IndexingMap indexing_map = IndexingMap(
-      ParseAffineMap("(d0) -> (d0)", &mlir_context_), {Interval{5, 5}}, {});
-  indexing_map.Simplify();
+  IndexingMap indexing_map =
+      IndexingMap(ParseAffineMap("(d0) -> (d0)", &mlir_context_),
+                  {DimVar{{5, 5}}}, /*range_vars=*/{}, /*rt_vars=*/{});
+  indexing_map.Simplify(GetIndexingMapForInstruction);
   EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
                                                   (d0) -> (5)
                                                   domain:
@@ -325,7 +417,7 @@ TEST_F(IndexingMapTest,
   auto serialized_map = "(d0, d1) -> (d0 + d1 floordiv 16, d1 mod 16)";
   IndexingMap indexing_map = IndexingMap::FromTensorSizes(
       ParseAffineMap(serialized_map, &mlir_context_), {8, 16}, {});
-  indexing_map.Simplify();
+  indexing_map.Simplify(GetIndexingMapForInstruction);
   EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
                                                   (d0, d1) -> (d0, d1)
                                                   domain:
@@ -342,7 +434,7 @@ TEST_F(IndexingMapTest, AffineMapSimplification_DivsAndModsWithMultipliers) {
 
   IndexingMap indexing_map = IndexingMap::FromTensorSizes(
       ParseAffineMap(serialized_map, &mlir_context_), {9, 9, 9}, {});
-  indexing_map.Simplify();
+  indexing_map.Simplify(GetIndexingMapForInstruction);
 
   EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
                                                   (d0, d1, d2) -> (d0, d1, d2)
@@ -361,7 +453,7 @@ TEST_F(IndexingMapTest,
 
   IndexingMap indexing_map = IndexingMap::FromTensorSizes(
       ParseAffineMap(serialized_map, &mlir_context_), {10, 10, 10}, {});
-  indexing_map.Simplify();
+  indexing_map.Simplify(GetIndexingMapForInstruction);
   EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
     (d0, d1, d2) -> (d0 * 2 + (d1 + d2 floordiv 4) floordiv 2,
                      (d1 * 4 + d2) mod 8)
@@ -378,7 +470,7 @@ TEST_F(IndexingMapTest, AffineMapSimplification_DivsAndModsWithReverse) {
       "d0 * 11 + d1 + ((d0 * -11 - d1 + 109) floordiv 11) * 11 - 99)";
   IndexingMap indexing_map = IndexingMap::FromTensorSizes(
       ParseAffineMap(serialized_map, &mlir_context_), {8, 9}, {});
-  indexing_map.Simplify();
+  indexing_map.Simplify(GetIndexingMapForInstruction);
   EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
                                                  (d0, d1) -> (d0, d1)
                                                  domain:
@@ -392,7 +484,7 @@ TEST_F(IndexingMapTest, AffineMapSimplification_SimplifyReshape) {
       "()[s0] -> ((s0 * 128) mod 715 + ((s0 * 128) floordiv 715) * 715)";
   IndexingMap indexing_map = IndexingMap::FromTensorSizes(
       ParseAffineMap(serialized_map, &mlir_context_), {}, {128});
-  indexing_map.Simplify();
+  indexing_map.Simplify(GetIndexingMapForInstruction);
   EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
       ()[s0] -> (s0 * 128)
       domain: s0 in [0, 127]
@@ -405,7 +497,7 @@ TEST_F(IndexingMapTest, AffineMapSimplification_SimplifyReshape_Regression) {
       "()[s0] -> ((s0 * 128) mod 715 + ((s0 * 64) floordiv 715) * 715)";
   IndexingMap indexing_map = IndexingMap::FromTensorSizes(
       ParseAffineMap(serialized_map, &mlir_context_), {}, {128});
-  indexing_map.Simplify();
+  indexing_map.Simplify(GetIndexingMapForInstruction);
   EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
       ()[s0] -> ((s0 * 128) mod 715 + ((s0 * 64) floordiv 715) * 715)
       domain: s0 in [0, 127]
@@ -418,7 +510,7 @@ TEST_F(IndexingMapTest, AffineMapSimplification_DivsInSequence) {
       "14)";
   IndexingMap indexing_map = IndexingMap::FromTensorSizes(
       ParseAffineMap(serialized_map, &mlir_context_), {}, {1234});
-  indexing_map.Simplify();
+  indexing_map.Simplify(GetIndexingMapForInstruction);
   EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
                                                  ()[s0] -> (s0)
                                                  domain:
@@ -432,7 +524,7 @@ TEST_F(IndexingMapTest, AffineMapSimplification_DivGcdGreater1) {
       "floordiv 3) * 768 + ((s0 * 128 + s1) floordiv 192) * 768)";
   IndexingMap indexing_map = IndexingMap::FromTensorSizes(
       ParseAffineMap(serialized_map, &mlir_context_), {}, {1234, 128, 4});
-  indexing_map.Simplify();
+  indexing_map.Simplify(GetIndexingMapForInstruction);
   EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
       ()[s0, s1, s2] -> (s0 * 512 + s1 * 4 + s2)
       domain:
@@ -448,7 +540,7 @@ TEST_F(IndexingMapTest, AffineMapSimplification_ExtractFromMod) {
       "20000)";
   IndexingMap indexing_map = IndexingMap::FromTensorSizes(
       ParseAffineMap(serialized_map, &mlir_context_), {}, {872, 4, 128, 896});
-  indexing_map.Simplify();
+  indexing_map.Simplify(GetIndexingMapForInstruction);
   EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
       ()[s0, s1, s2, s3] -> (
         s1 + (s0 * 458752 + s2 * 4 + s3 * 512) mod 20000
@@ -468,7 +560,7 @@ TEST_F(IndexingMapTest,
       "* 2) floordiv 4)";
   IndexingMap indexing_map = IndexingMap::FromTensorSizes(
       ParseAffineMap(serialized_map, &mlir_context_), {}, {2, 128});
-  indexing_map.Simplify();
+  indexing_map.Simplify(GetIndexingMapForInstruction);
   EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
       ()[s0, s1] -> (
         s0 * 4 + s1 floordiv 32
@@ -479,6 +571,85 @@ TEST_F(IndexingMapTest,
     )"));
 }
 
+TEST_F(IndexingMapTest, RescaleSymbols_Simple) {
+  auto serialized_map = "(d0)[s0, s1, s2] -> (s2, d0, s1, s0 floordiv 6)";
+  IndexingMap indexing_map = IndexingMap::FromTensorSizes(
+      ParseAffineMap(serialized_map, &mlir_context_), {4}, {7, 2, 6});
+  indexing_map.AddConstraint(ParseAffineExpr("s0 mod 6", &mlir_context_),
+                             Interval{0, 0});
+
+  EXPECT_TRUE(indexing_map.RescaleSymbols());
+  EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
+      (d0)[s0, s1, s2] -> (s2, d0, s1, s0)
+      domain:
+        d0 in [0, 3]
+        s0 in [0, 1]
+        s1 in [0, 1]
+        s2 in [0, 5]
+    )"));
+}
+
+TEST_F(IndexingMapTest, RescaleSymbols_WithShift) {
+  auto serialized_map = "(d0)[s0, s1, s2] -> (s2, d0, s1, s0)";
+  IndexingMap indexing_map = IndexingMap::FromTensorSizes(
+      ParseAffineMap(serialized_map, &mlir_context_), {4}, {42, 2, 6});
+  indexing_map.AddConstraint(ParseAffineExpr("s0 mod 6", &mlir_context_),
+                             Interval{3, 3});
+
+  // [BEFORE] Allowed values for s0: 3, 9, 15, ..., 39 = (6 * 6 + 3)
+  // [AFTER] Allowed values for s0: 0, 1, 2, ..., 6
+  EXPECT_TRUE(indexing_map.RescaleSymbols());
+  EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
+      (d0)[s0, s1, s2] -> (s2, d0, s1, s0 * 6 + 3)
+      domain:
+        d0 in [0, 3]
+        s0 in [0, 6]
+        s1 in [0, 1]
+        s2 in [0, 5]
+    )"));
+}
+
+TEST_F(IndexingMapTest, RescaleSymbols_TwoModConstraints) {
+  auto serialized_map = "(d0)[s0, s1, s2] -> (s2, d0, s1, s0 floordiv 6)";
+  IndexingMap indexing_map = IndexingMap::FromTensorSizes(
+      ParseAffineMap(serialized_map, &mlir_context_), {4}, {7, 2, 6});
+  indexing_map.AddConstraint(ParseAffineExpr("s0 mod 2", &mlir_context_),
+                             Interval{0, 0});
+  indexing_map.AddConstraint(ParseAffineExpr("s0 mod 3", &mlir_context_),
+                             Interval{0, 0});
+
+  EXPECT_TRUE(indexing_map.RescaleSymbols());
+  EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
+      (d0)[s0, s1, s2] -> (s2, d0, s1, s0)
+      domain:
+        d0 in [0, 3]
+        s0 in [0, 1]
+        s1 in [0, 1]
+        s2 in [0, 5]
+    )"));
+}
+
+TEST_F(IndexingMapTest, RescaleSymbols_RescaledSymbolInOtherConstraint) {
+  auto serialized_map = "(d0)[s0, s1, s2] -> (s2, d0, s1, s0)";
+  IndexingMap indexing_map = IndexingMap::FromTensorSizes(
+      ParseAffineMap(serialized_map, &mlir_context_), {4}, {10, 2, 6});
+  indexing_map.AddConstraint(ParseAffineExpr("s0 mod 6", &mlir_context_),
+                             Interval{3, 3});
+  indexing_map.AddConstraint(ParseAffineExpr("s0 * s2", &mlir_context_),
+                             Interval{0, 28});
+
+  EXPECT_TRUE(indexing_map.RescaleSymbols());
+  EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
+      (d0)[s0, s1, s2] -> (s2, d0, s1, s0 * 6 + 3)
+      domain:
+        d0 in [0, 3]
+        s0 in [0, 1]
+        s1 in [0, 1]
+        s2 in [0, 5]
+        (s0 * 6 + 3) * s2 in [0, 28]
+    )"));
+}
+
 TEST_F(IndexingMapTest, RangeEvaluatorTest) {
   RangeEvaluator range_evaluator(
       {Interval{0, 9}, Interval{-10, -1}, Interval{-1, 2}, Interval{0, 0}}, {},
@@ -538,6 +709,211 @@ TEST(IntervalComparisionTest, Comparisons) {
   EXPECT_EQ(point != 16, true);
 }
 
+TEST_F(IndexingMapTest, ReplaceConstantRTVars_ScalarConstant) {
+  // auto zero_dim_map = AffineMap::get(&mlir_context_);
+  auto constant =
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32_t>(42));
+
+  IndexingMap indexing_map(ParseAffineMap("()[s0] -> (s0)", &mlir_context_),
+                           /*dimensions=*/{},
+                           /*range_vars=*/{},
+                           {RTVar{Interval{42, 42}, constant.get(),
+                                  AffineMap::get(0, 0, {}, &mlir_context_)}});
+
+  EXPECT_TRUE(indexing_map.Simplify(GetIndexingMapForInstruction));
+
+  EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
+              () -> (42)
+              domain:
+              )"));
+}
+
+TEST_F(IndexingMapTest, ReplaceConstantRTVars_StaticIndexIntoTensorConstant) {
+  // auto zero_dim_map = AffineMap::get(&mlir_context_);
+  auto constant = HloInstruction::CreateConstant(
+      LiteralUtil::CreateR2<uint32_t>({{1, 2, 3, 4}, {11, 12, 13, 14}}));
+
+  IndexingMap indexing_map(
+      ParseAffineMap("()[s0] -> (s0)", &mlir_context_),
+      /*dimensions=*/{},
+      /*range_vars=*/{},
+      {RTVar{Interval{1, 14}, constant.get(),
+             ParseAffineMap("() -> (1,2)", &mlir_context_)}});
+
+  EXPECT_TRUE(indexing_map.Simplify(GetIndexingMapForInstruction));
+
+  EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
+              () -> (13)
+              domain:
+              )"));
+}
+
+TEST_F(IndexingMapTest, ReplaceConstantRTVars_NonFoldableTensor) {
+  // auto zero_dim_map = AffineMap::get(&mlir_context_);
+  auto constant = HloInstruction::CreateConstant(
+      LiteralUtil::CreateR2<uint32_t>({{1, 2, 3, 4}, {11, 12, 13, 14}}));
+
+  IndexingMap indexing_map(
+      ParseAffineMap("(d0)[s0] -> (s0)", &mlir_context_),
+      /*dimensions=*/{},
+      /*range_vars=*/{},
+      {RTVar{Interval{1, 14}, constant.get(),
+             ParseAffineMap("(d0) -> (1, d0)", &mlir_context_)}});
+
+  EXPECT_FALSE(indexing_map.Simplify(GetIndexingMapForInstruction));
+}
+
+TEST_F(IndexingMapTest, ReplaceConstantRTVars_Iota) {
+  auto iota = HloInstruction::CreateIota(
+      ShapeUtil::MakeShape(PrimitiveType::S64, {10, 10}), 0);
+
+  IndexingMap indexing_map(
+      ParseAffineMap("(d0)[s0] -> (d0, s0)", &mlir_context_),
+      /*dimensions=*/{{0, 255}},
+      /*range_vars=*/{},
+      {RTVar{Interval{0, 9}, iota.get(),
+             ParseAffineMap("(d0) -> (d0, 7)", &mlir_context_)}});
+
+  EXPECT_TRUE(indexing_map.Simplify(GetIndexingMapForInstruction));
+
+  EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
+              (d0) -> (d0, d0)
+              domain:
+              d0 in [0, 255]
+              )"));
+}
+
+TEST_F(IndexingMapTest, ReplaceConstantRTVars_IotaAsConstant) {
+  auto iota = HloInstruction::CreateIota(
+      ShapeUtil::MakeShape(PrimitiveType::S64, {10, 10}), 1);
+
+  IndexingMap indexing_map(
+      ParseAffineMap("(d0)[s0] -> (d0, s0)", &mlir_context_),
+      /*dimensions=*/{{0, 255}},
+      /*range_vars=*/{},
+      {RTVar{Interval{0, 9}, iota.get(),
+             ParseAffineMap("(d0) -> (d0, 7)", &mlir_context_)}});
+
+  EXPECT_TRUE(indexing_map.Simplify(GetIndexingMapForInstruction));
+
+  EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
+              (d0) -> (d0, 7)
+              domain:
+              d0 in [0, 255]
+              )"));
+}
+
+TEST_F(IndexingMapTest, ReplaceConstantRTVars_ConstraintsGetUpdated) {
+  auto iota = HloInstruction::CreateIota(
+      ShapeUtil::MakeShape(PrimitiveType::S64, {10, 10}), 0);
+
+  IndexingMap indexing_map(
+      ParseAffineMap("(d0)[s0] -> (d0, s0)", &mlir_context_),
+      /*dimensions=*/{{0, 255}},
+      /*range_vars=*/{},
+      {RTVar{Interval{0, 9}, iota.get(),
+             ParseAffineMap("(d0) -> (d0, 7)", &mlir_context_)}});
+  indexing_map.AddConstraint(ParseAffineExpr("s0 mod 2", &mlir_context_),
+                             Interval{0, 0});
+
+  EXPECT_TRUE(indexing_map.Simplify(GetIndexingMapForInstruction));
+
+  EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
+              (d0) -> (d0, d0)
+              domain:
+              d0 in [0, 254]
+              d0 mod 2 in [0, 0]
+              )"));
+}
+
+TEST_F(IndexingMapTest, ReplaceConstantRTVars_Broadcast) {
+  auto iota = HloInstruction::CreateIota(
+      ShapeUtil::MakeShape(PrimitiveType::S64, {12}), 0);
+  auto transpose = HloInstruction::CreateBroadcast(
+      ShapeUtil::MakeShape(PrimitiveType::S64, {32, 12}), iota.get(), {1});
+
+  // (d0, 11): d0 maps into the broadcasted dimension, so it doesn't matter
+  // and 11 maps to 11 in iota.
+  IndexingMap indexing_map(
+      ParseAffineMap("(d0)[s0] -> (d0, s0)", &mlir_context_),
+      /*dimensions=*/{{0, 31}},
+      /*range_vars=*/{},
+      {RTVar{Interval{0, 11}, transpose.get(),
+             ParseAffineMap("(d0) -> (d0, 11)", &mlir_context_)}});
+
+  indexing_map.Simplify(GetIndexingMapForInstruction);
+
+  EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
+              (d0) -> (d0, 11)
+              domain:
+              d0 in [0, 31]
+              )"));
+}
+
+TEST_F(IndexingMapTest, ReplaceConstantRTVars_ChainedNoncomputeOps) {
+  auto iota = HloInstruction::CreateIota(
+      ShapeUtil::MakeShape(PrimitiveType::S64, {12}), 0);
+  auto reverse = HloInstruction::CreateReverse(
+      ShapeUtil::MakeShape(PrimitiveType::S64, {12}), iota.get(), {0});
+  auto reshape = HloInstruction::CreateReshape(
+      ShapeUtil::MakeShape(PrimitiveType::S64, {3, 4}), reverse.get());
+  auto broadcast = HloInstruction::CreateBroadcast(
+      ShapeUtil::MakeShape(PrimitiveType::S64, {36, 3, 4}), reshape.get(),
+      {1, 2});
+
+  // - Iota: [0, 1, ,,,, 11]
+  // - Reverse: [11, 10, ..., 0]
+  // - Reshape: [[11, 10, 9, 8], [7, 6, 5, 4], [3, 2, 1, 0]]
+  // - Coordinates: (d0 floordiv 12, 3)
+  // - y-coordinate=3 means we index into [8, 4, 0]
+  // - x-coordinate=(d0 floordiv 12) means our constant looks like this:
+  //   [8, ..., 8, 4, ..., 4, 0, ..., 0]
+  // - Hence our final expression: (d0 floordiv 12) * -4 + 8
+  IndexingMap indexing_map(
+      ParseAffineMap("(d0)[s0] -> (d0, s0)", &mlir_context_),
+      /*dimensions=*/{{0, 35}},
+      /*range_vars=*/{},
+      {RTVar{
+          Interval{0, 11}, broadcast.get(),
+          ParseAffineMap("(d0) -> (d0, d0 floordiv 12, 3)", &mlir_context_)}});
+
+  indexing_map.Simplify(GetIndexingMapForInstruction);
+
+  EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
+              (d0) -> (d0, (d0 floordiv 12) * -4 + 8)
+              domain:
+              d0 in [0, 35]
+              )"));
+}
+
+TEST_F(IndexingMapTest, ReplaceConstantRTVars_PartialRTVarRemoval) {
+  auto iota = HloInstruction::CreateConstant(
+      LiteralUtil::CreateR1<int64_t>({1, 7, 25, 1, 7, 25, 1, 7, 25, 1, 7, 25}));
+  auto broadcast = HloInstruction::CreateBroadcast(
+      ShapeUtil::MakeShape(PrimitiveType::S64, {24, 12}), iota.get(), {1});
+
+  // (d0, d0 floordiv 2): d0 maps into the broadcasted dimension, so it can't be
+  // removed, but d0 floordiv 2 doesn't yield an affine expression so we need to
+  // keep the RTVar, but can optimize it by removing the broadcast.
+  IndexingMap indexing_map(
+      ParseAffineMap("(d0)[s0] -> (d0, s0)", &mlir_context_),
+      /*dimensions=*/{{0, 23}},
+      /*range_vars=*/{},
+      {RTVar{Interval{0, 512}, broadcast.get(),
+             ParseAffineMap("(d0) -> (d0, d0 floordiv 2)", &mlir_context_)}});
+
+  indexing_map.Simplify(GetIndexingMapForInstruction);
+
+  EXPECT_THAT(indexing_map.ToString(printer_), MatchIndexingString(R"(
+              (d0)[s0] -> (d0, s0)
+              domain:
+              d0 in [0, 23]
+              s0 in [0, 512]
+                hlo: %constant = s64[12]{0} constant({...})
+                (d0) -> (d0 floordiv 2)
+              )"));
+}
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/model/symbolic_tile.cc b/third_party/xla/xla/service/gpu/model/symbolic_tile.cc
new file mode 100644
index 00000000000000..98ed3ecc4dff4f
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/model/symbolic_tile.cc
@@ -0,0 +1,364 @@
+/* Copyright 2023 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/model/symbolic_tile.h"
+
+#include <cstdint>
+#include <optional>
+#include <ostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/types/span.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Casting.h"
+#include "mlir/IR/AffineExpr.h"  // from @llvm-project
+#include "mlir/IR/AffineMap.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "xla/service/gpu/model/affine_map_printer.h"
+#include "xla/service/gpu/model/indexing_map.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+using ::mlir::AffineExpr;
+using ::mlir::AffineExprKind;
+using ::mlir::AffineMap;
+using ::mlir::getAffineConstantExpr;
+using ::mlir::getAffineDimExpr;
+using ::mlir::MLIRContext;
+
+// Internal helper that checks whether an affine map describes a tileable space.
+// In simple terms, this currently returns true if "dimensions don't mix", i.e.,
+// every result expression only refers to a single dimension (or symbol).
+//
+// TODO(b/328427138): this is too restrictive for expressions involving e.g.
+// (output-to-input) split reshapes, where several symbols may appear within the
+// same expression but still yield a tileable space. This will be handled in a
+// forthcoming change.
+bool IndexingMapDescribesTileableSpace(const IndexingMap& indexing_map) {
+  for (AffineExpr result_expr : indexing_map.GetAffineMap().getResults()) {
+    // Using a simple integer here might be overly restrictive, since there may
+    // be cases where the same symbol appears in several places within the
+    // expression. It is a bit unclear whether this is a case that would happen
+    // in practice and whether we would be able to handle it well in all cases
+    // if it did. For that reason, we err on the side of conservatism and
+    // explicitly do not support such cases.
+    int64_t num_hits = 0;
+    result_expr.walk([&num_hits](AffineExpr expr) {
+      if (expr.getKind() == AffineExprKind::SymbolId ||
+          expr.getKind() == AffineExprKind::DimId) {
+        ++num_hits;
+      }
+    });
+
+    if (num_hits > 1) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Helper to perform function application to using the same parameter for every
+// dimension and symbol parameter.
+AffineMap SubstituteAllIndicesAndKnownSymbolsWithSameValue(AffineMap affine_map,
+                                                           AffineExpr value) {
+  MLIRContext* mlir_context = affine_map.getContext();
+  int64_t num_dims = affine_map.getNumDims();
+  int64_t num_symbols = affine_map.getNumSymbols();
+  llvm::DenseMap<AffineExpr, AffineExpr> indices;
+
+  for (int64_t i = 0; i < num_dims; ++i) {
+    indices[getAffineDimExpr(i, mlir_context)] = value;
+  }
+
+  for (int64_t i = 0; i < num_symbols; ++i) {
+    indices[getAffineSymbolExpr(i, mlir_context)] = value;
+  }
+
+  return simplifyAffineMap(affine_map.replace(indices, num_dims, num_symbols));
+}
+
+struct SizeAndStrideExpression {
+  AffineExpr size;
+  AffineExpr stride;
+};
+
+// Converts a dimension expression to a symbol expression with the corresponding
+// index.
+AffineExpr ToSymbol(mlir::AffineDimExpr dim_expr) {
+  return mlir::getAffineSymbolExpr(dim_expr.getPosition(),
+                                   dim_expr.getContext());
+}
+
+// Extracts size and stride expressions from the operands to a modulo
+// expression.
+//
+// TODO(b/326998704): Currently, this fails when the stride is not exactly unit.
+std::optional<SizeAndStrideExpression> ExtractSizeAndStrideFromMod(
+    AffineExpr lhs, AffineExpr modulus) {
+  // TODO(b/326998704): derive constraints here, as well as the non-one stride
+  // case, both in the code and in the proof.
+  // Let f(d0) = d0 mod c. Then, given an input tile size n,
+  // {f(x) | x in Fin(n)} contains:
+  //   * n elements if n < c (and we add a constraint such that c | n);
+  //   * c elements if n >= c (and we add a constraint such that n | c).
+  // Given these constraints and assumptions, we derive
+  //   card({f(x) | x in Fin(n)}) = n - ((n - 1) floordiv n) * n.
+  // Proof:
+  //   * n < c (and c | n):
+  //       n - ((n - 1) floordiv c) * c
+  //     = n - 0 * c               (n < c => n floordiv c == 0)
+  //     = n
+  //   * n >= c (and n | c):
+  //       n - ((n - 1) floordiv c) * c
+  //     = n - (n / c - 1) * c     (n | c => (n - 1) floordiv c = n / c - 1)
+  //     = n - (n - c)
+  //     = c
+  CHECK(modulus.getKind() == AffineExprKind::Constant);
+  if (auto dim_expr = llvm::dyn_cast<mlir::AffineDimExpr>(lhs)) {
+    AffineExpr sym = ToSymbol(dim_expr);
+    AffineExpr size = sym - mlir::getAffineBinaryOpExpr(
+                                AffineExprKind::FloorDiv, sym - 1, modulus) *
+                                modulus;
+    // In this case, stride is effectively 1 mod modulus = 1.
+    return SizeAndStrideExpression{
+        size, /*stride=*/getAffineConstantExpr(1, lhs.getContext())};
+  }
+
+  return std::nullopt;
+}
+
+// Extracts size and stride expressions from the operands to a floordiv
+// expression.
+//
+// TODO(b/326998704): Currently, this fails when the numerator of the stride
+// is not exactly unit.
+std::optional<SizeAndStrideExpression> ExtractSizeAndStrideFromFloorDiv(
+    AffineExpr num, AffineExpr den) {
+  if (den.getKind() != AffineExprKind::Constant) {
+    return std::nullopt;
+  }
+
+  if (auto dim_expr = llvm::dyn_cast<mlir::AffineDimExpr>(num)) {
+    // Let f(d0) = d0 floordiv c. Then, given an input tile size n,
+    // {f(x) | x in Fin(n)} contains n ceildiv c elements, with stride
+    // (1 ceildiv c) = 1.
+    //
+    // We represent `a ceildiv b` as `(a + b - 1) floordiv b`, since indexing
+    // maps are not compatible with CeilDiv affine expressions.
+    AffineExpr size = mlir::getAffineBinaryOpExpr(
+        AffineExprKind::FloorDiv, ToSymbol(dim_expr) + (den - 1), den);
+    return SizeAndStrideExpression{
+        size, /*stride=*/getAffineConstantExpr(1, num.getContext())};
+  }
+
+  return std::nullopt;
+}
+
+std::optional<SizeAndStrideExpression> ExtractSizeAndStride(
+    AffineExpr strided_indexing, absl::Span<Interval const> symbol_intervals) {
+  MLIRContext* ctx = strided_indexing.getContext();
+  // Deal with the symbol case (capturing a whole untiled dimension).
+  // TODO(b/330906085): concatenating across a reduction dimension needs to be
+  // handled by this code.
+  if (auto symbol = llvm::dyn_cast<mlir::AffineSymbolExpr>(strided_indexing)) {
+    const Interval& symbol_interval = symbol_intervals[symbol.getPosition()];
+    if (symbol_interval.lower != 0) {
+      return std::nullopt;
+    }
+
+    return SizeAndStrideExpression{
+        /*size=*/getAffineConstantExpr(symbol_interval.upper + 1, ctx),
+        /*stride=*/getAffineConstantExpr(1, ctx)};
+  }
+
+  AffineMapPrinter printer;
+
+  // TODO(b/328427138): support multivariate size expressions.
+  switch (strided_indexing.getKind()) {
+    case AffineExprKind::DimId:
+      return SizeAndStrideExpression{
+          /*size=*/ToSymbol(llvm::cast<mlir::AffineDimExpr>(strided_indexing)),
+          /*stride=*/getAffineConstantExpr(1, ctx)};
+    case mlir::AffineExprKind::Mul: {
+      const auto mul = llvm::cast<mlir::AffineBinaryOpExpr>(strided_indexing);
+      AffineExpr lhs = mul.getLHS();
+      // The stride may not be fully collapsed if it is negative; in that case,
+      // we need to extract the negative multiplier first.
+      if (const auto rhs =
+              llvm::dyn_cast<mlir::AffineConstantExpr>(mul.getRHS());
+          rhs && rhs.getValue() == -1) {
+        std::optional<SizeAndStrideExpression> maybe_size_and_stride =
+            ExtractSizeAndStride(lhs, symbol_intervals);
+        if (!maybe_size_and_stride.has_value()) {
+          return std::nullopt;
+        }
+
+        return SizeAndStrideExpression{
+            /*size=*/maybe_size_and_stride->size,
+            /*stride=*/maybe_size_and_stride->stride * rhs};
+      }
+      CHECK(lhs.getKind() == AffineExprKind::DimId);
+      return SizeAndStrideExpression{
+          /*size=*/ToSymbol(llvm::cast<mlir::AffineDimExpr>(lhs)),
+          /*stride=*/mul.getRHS()};
+    }
+    case mlir::AffineExprKind::Mod: {
+      auto mod = llvm::cast<mlir::AffineBinaryOpExpr>(strided_indexing);
+      return ExtractSizeAndStrideFromMod(mod.getLHS(), mod.getRHS());
+    }
+    case mlir::AffineExprKind::FloorDiv: {
+      auto floor_div = llvm::cast<mlir::AffineBinaryOpExpr>(strided_indexing);
+      return ExtractSizeAndStrideFromFloorDiv(floor_div.getLHS(),
+                                              floor_div.getRHS());
+    };
+    case mlir::AffineExprKind::Constant:
+      return SizeAndStrideExpression{/*size=*/getAffineConstantExpr(1, ctx),
+                                     /*stride=*/getAffineConstantExpr(0, ctx)};
+    case mlir::AffineExprKind::SymbolId:
+      VLOG(1) << "Encountered complex size expression involving symbol "
+              << printer.ToString(strided_indexing);
+      return std::nullopt;
+    case mlir::AffineExprKind::Add:
+      // TODO(b/328427138): this should only be necessary in the multivariate
+      // case, and will be implemented later.
+      VLOG(1) << "Encountered complex strided indexing expression "
+              << printer.ToString(strided_indexing);
+      return std::nullopt;
+    case mlir::AffineExprKind::CeilDiv:
+      break;
+  };
+  LOG(FATAL) << "unreachable";
+}
+
+}  // anonymous namespace
+
+/*static*/ std::optional<SymbolicTile> SymbolicTile::FromIndexingMap(
+    const IndexingMap& indexing_map) {
+  // Bail out on runtime offsets.
+  if (indexing_map.GetRTVarsCount()) {
+    return std::nullopt;
+  }
+  // TODO(b/328427138): handle multiple symbols in a single tile to support
+  // merging dimensions.
+  if (!IndexingMapDescribesTileableSpace(indexing_map)) {
+    return std::nullopt;
+  }
+
+  AffineMap input_affine_map = indexing_map.GetAffineMap();
+  MLIRContext* mlir_context = input_affine_map.getContext();
+
+  // If indexing_map describes a tileable space, then input_affine_map can be
+  // expressed as
+  //   f(dim0, ..., dim{M-1})[sym0, ..., sym{P-1}] = (expr0, ..., expr{N-1})
+  // where the result expressions expr0, ..., expr{N-1} are strided expressions
+  // of the form
+  //     offset_expr{i} + stride_expr{i} * index_expr{i}
+  // with 0 <= i < N.
+  //
+  // We are interested in extracting expressions for offset_expr{i},
+  // stride_expr{i}, and size_expr{i} (the count of different values that
+  // expr{i} can represent).
+  //
+  // We have that the following equations hold:
+  //
+  // (1) f(0, ..., 0)[0, ..., 0]{i}
+  //   = offset_expr{i} + stride_expr{i} * 0
+  //   = offset_expr{i}
+  //
+  // (2) f(x0, ..., x{M-1})[x{M}, ..., x{M+P-1}]{i} - f(0, ..., 0)[0, ..., 0]{i}
+  //   = offset_expr{i} + stride_expr{i} * index_expr{i} - offset_expr{i}
+  //   = stride_expr{i} * index_expr{i}
+  //
+  // offset_expressions = f(0, ..., 0)[0, ..., 0].
+  std::vector<AffineExpr> offset_expressions =
+      SubstituteAllIndicesAndKnownSymbolsWithSameValue(
+          input_affine_map, getAffineConstantExpr(0, mlir_context))
+          .getResults();
+
+  std::vector<AffineExpr> size_expressions;
+  std::vector<AffineExpr> stride_expressions;
+  size_expressions.reserve(offset_expressions.size());
+  stride_expressions.reserve(offset_expressions.size());
+
+  // strided_indexing_expressions =
+  //     f(x0, ..., x{M-1})[x{M}, ..., x{M+P-1}] - offset_expressions
+  for (auto [composite_indexing, offset] :
+       llvm::zip(input_affine_map.getResults(), offset_expressions)) {
+    std::optional<SizeAndStrideExpression> maybe_size_and_stride =
+        ExtractSizeAndStride(composite_indexing - offset,
+                             indexing_map.GetSymbolBounds());
+    if (!maybe_size_and_stride.has_value()) {
+      return std::nullopt;
+    }
+    size_expressions.push_back(maybe_size_and_stride->size);
+    stride_expressions.push_back(maybe_size_and_stride->stride);
+  }
+
+  // Eliminate negative strides and recalculate offsets.
+  std::vector<AffineExpr> dim_replacements, sym_replacements;
+  for (auto [offset, size, stride] :
+       llvm::zip(offset_expressions, size_expressions, stride_expressions)) {
+    auto constant = llvm::dyn_cast<mlir::AffineConstantExpr>(stride);
+    if (!constant) {
+      AffineMapPrinter printer;
+      VLOG(1) << "Unexpected non-constant stride expression: "
+              << printer.ToString(stride);
+      return std::nullopt;
+    }
+    if (constant.getValue() < 0) {
+      offset = offset + size * stride - stride;
+      stride = -stride;
+    }
+  }
+
+  int64_t num_symbols = input_affine_map.getNumDims();
+  AffineMap offset_map =
+      AffineMap::get(0, num_symbols, offset_expressions, mlir_context);
+  AffineMap size_map =
+      AffineMap::get(0, num_symbols, size_expressions, mlir_context);
+  AffineMap stride_map =
+      AffineMap::get(0, num_symbols, stride_expressions, mlir_context);
+
+  return SymbolicTile(offset_map, size_map, stride_map);
+}
+
+std::string SymbolicTile::ToString(const AffineMapPrinter& printer) const {
+  std::string s;
+  std::stringstream ss(s);
+  Print(ss, printer);
+  return ss.str();
+}
+
+void SymbolicTile::Print(std::ostream& out,
+                         const AffineMapPrinter& printer) const {
+  out << "Symbolic tile with \n";
+  out << "\toffset_map: ";
+  printer.Print(out, offset_map_);
+  out << "\n\tsize_map: ";
+  printer.Print(out, size_map_);
+  out << "\n\tstride_map: ";
+  printer.Print(out, stride_map_);
+  out << "\n";
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/model/tile_analysis.h b/third_party/xla/xla/service/gpu/model/symbolic_tile.h
similarity index 75%
rename from third_party/xla/xla/service/gpu/model/tile_analysis.h
rename to third_party/xla/xla/service/gpu/model/symbolic_tile.h
index 5ccbe497aecebe..93dc5ae25b0318 100644
--- a/third_party/xla/xla/service/gpu/model/tile_analysis.h
+++ b/third_party/xla/xla/service/gpu/model/symbolic_tile.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_SERVICE_GPU_MODEL_TILE_ANALYSIS_H_
-#define XLA_SERVICE_GPU_MODEL_TILE_ANALYSIS_H_
+#ifndef XLA_SERVICE_GPU_MODEL_SYMBOLIC_TILE_H_
+#define XLA_SERVICE_GPU_MODEL_SYMBOLIC_TILE_H_
 
 #include <optional>
 #include <ostream>
@@ -32,21 +32,22 @@ namespace gpu {
 // expressed as a strided expression
 //     offset + stride * iota(size)
 // with offset, stride, and size three integers, and iota the usual range
-// function. The size and offsets may never be negative.
+// function. These values may never be negative.
 //
 // A N-dimensional symbolic tile is a function from offsets, strides, and sizes
 // to a N-dimensional tile. It can be represented as three affine maps with
 // domain
-//     ()[offset0, size0, stride0, ... offset{M-1}, size{P-1}, stride{M-1}]
+//     ()[size0, ..., size{M-1}]
 // and respective co-domains
-//     (offset0', ..., offset'{N-1})     (offset_map())
-//     (size0', ..., size'{N-1})         (size_map())
-//     (stride0', ..., stride'{N-1})     (stride_map())
+//     (offset0, ..., offset{N-1})     (offset_map())
+//     (size0', ..., size'{N-1})       (size_map())
+//     (stride0, ..., stride{N-1})     (stride_map())
 // where maps respectively encode the offset, size, and stride component of
 // each strided expression in the tile. The parameters to the maps above are all
-// assumed to be non-negative, but results of stride_map() may be negative.
+// assumed to be strictly positive. The input offsets are assumed to be all 0s,
+// and the input strides are assumed to be all 1s.
 //
-// A symbolic tile with 3*M symbols and N results is constructed using an
+// A symbolic tile with M symbols and N results is constructed using an
 // `IndexingMap` with M input dimensions and N results. The construction of the
 // symbolic tile may fail if any one of the resulting expressions is not a
 // strided expression as described above.
@@ -74,11 +75,7 @@ class SymbolicTile {
       : offset_map_(offset_map), size_map_(size_map), stride_map_(stride_map) {}
 };
 
-// Prints symbolic_tile with triplet labels for each symbol.
-// i.e. a symbol si which corresponds to an offset will be labeled offseti.
-std::ostream& operator<<(std::ostream& out, const SymbolicTile& symbolic_tile);
-
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // XLA_SERVICE_GPU_MODEL_TILE_ANALYSIS_H_
+#endif  // XLA_SERVICE_GPU_MODEL_SYMBOLIC_TILE_H_
diff --git a/third_party/xla/xla/service/gpu/model/symbolic_tile_analysis.cc b/third_party/xla/xla/service/gpu/model/symbolic_tile_analysis.cc
new file mode 100644
index 00000000000000..41852118a2c4c2
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/model/symbolic_tile_analysis.cc
@@ -0,0 +1,215 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/model/symbolic_tile_analysis.h"
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <utility>
+#include <variant>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/log/check.h"
+#include "absl/types/span.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Casting.h"
+#include "mlir/IR/AffineExpr.h"  // from @llvm-project
+#include "mlir/IR/AffineMap.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/service/gpu/model/indexing_analysis.h"
+#include "xla/service/gpu/model/indexing_map.h"
+#include "xla/service/gpu/model/symbolic_tile.h"
+#include "xla/service/instruction_fusion.h"
+#include "xla/status.h"
+
+namespace xla {
+namespace gpu {
+
+namespace {
+
+using ::mlir::AffineExpr;
+using ::mlir::AffineMap;
+using ::mlir::MLIRContext;
+using ::mlir::SmallVector;
+
+}  // namespace
+
+/*static*/ SymbolicTileAnalysisOrError SymbolicTileAnalysis::AnalyzeComputation(
+    const HloComputation& computation, MLIRContext* ctx) {
+  std::vector<std::unique_ptr<TiledHloInstruction>> tiled_hlo_instructions;
+  absl::flat_hash_map<std::pair<const HloInstruction*, IndexingMap>,
+                      TiledHloInstruction*>
+      tiled_hlo_instructions_map;
+
+  absl::flat_hash_map<TiledHloInstruction*, int64_t> topological_order;
+
+  std::function<std::variant<TiledHloInstruction*, FusionDecision>(
+      const HloInstruction*, IndexingMap)>
+      get_tiled_hlo_instruction;
+
+  // Create a new tiled hlo instruction or return existing instruction from
+  // cache for the given hlo and indexing map.
+  get_tiled_hlo_instruction = [&](const HloInstruction* hlo,
+                                  IndexingMap indexing_map)
+      -> std::variant<TiledHloInstruction*, FusionDecision> {
+    auto key = std::make_pair(hlo, indexing_map);
+
+    auto it = tiled_hlo_instructions_map.find(key);
+    if (it != tiled_hlo_instructions_map.end()) {
+      return it->second;
+    }
+
+    // Bail out on instructions that are known to cause problems down the
+    // line. This is not an inherent limitation of the approach, but simply
+    // issues to be resolved in the current implementation.
+    if (hlo->opcode() == HloOpcode::kDot ||
+        hlo->opcode() == HloOpcode::kReshape ||
+        hlo->opcode() == HloOpcode::kBitcast ||
+        hlo->opcode() == HloOpcode::kConcatenate) {
+      return FusionDecision{} << "Bailing out on " << hlo->ToString();
+    }
+
+    // Bail out on instructions that do not output a single array.
+    if (!hlo->shape().IsArray()) {
+      return FusionDecision{} << hlo->ToString()
+                              << " outputs more than a single array";
+    }
+
+    auto symbolic_tile = SymbolicTile::FromIndexingMap(indexing_map);
+    if (!symbolic_tile.has_value()) {
+      return FusionDecision{} << "Failed to compute symbolic tile for "
+                              << indexing_map.ToString() << " for HLO "
+                              << hlo->ToString();
+    }
+
+    tiled_hlo_instructions.push_back(std::make_unique<TiledHloInstruction>(
+        hlo, std::move(indexing_map), std::move(*symbolic_tile)));
+
+    auto tiled_hlo_instruction = tiled_hlo_instructions.back().get();
+
+    std::optional<HloInstructionIndexing> operands_indexing =
+        ComputeOutputToInputIndexing(tiled_hlo_instruction->hlo,
+                                     /*output_id=*/0, ctx);
+
+    if (!operands_indexing.has_value()) {
+      return FusionDecision{} << "Failed to compute operands indexing for "
+                              << tiled_hlo_instruction->hlo->ToString();
+    }
+
+    for (auto [operand, operand_indexing_map_set] :
+         llvm::zip(tiled_hlo_instruction->hlo->operands(),
+                   operands_indexing->indexing_maps)) {
+      CHECK_EQ(operand_indexing_map_set.size(), 1);
+
+      IndexingMap operand_indexing_map =
+          ComposeIndexingMaps(tiled_hlo_instruction->indexing_map,
+                              *operand_indexing_map_set.begin());
+
+      auto tiled_operand_or =
+          get_tiled_hlo_instruction(operand, std::move(operand_indexing_map));
+
+      if (auto fusion_decison =
+              std::get_if<FusionDecision>(&tiled_operand_or)) {
+        return *fusion_decison;
+      }
+
+      tiled_hlo_instruction->operands.push_back(
+          std::get<TiledHloInstruction*>(tiled_operand_or));
+    }
+
+    topological_order[tiled_hlo_instruction] = topological_order.size();
+    tiled_hlo_instructions_map.emplace(key, tiled_hlo_instruction);
+    return tiled_hlo_instruction;
+  };
+
+  const HloInstruction* root = computation.root_instruction();
+  auto tiled_root =
+      get_tiled_hlo_instruction(root, CreateIdentityMap(root->shape(), ctx));
+  if (auto* fusion_decision = std::get_if<FusionDecision>(&tiled_root)) {
+    return *fusion_decision;
+  }
+
+  // Order instructions in def-before-use order.
+  absl::c_sort(tiled_hlo_instructions, [&](const auto& i1, const auto& i2) {
+    return topological_order.at(i1.get()) < topological_order.at(i2.get());
+  });
+
+  return SymbolicTileAnalysis(std::move(tiled_hlo_instructions), ctx);
+}
+
+namespace {
+
+std::vector<int64_t> EvaluateTileMap(AffineMap affine_map,
+                                     absl::Span<int64_t const> parameters) {
+  CHECK_EQ(affine_map.getNumSymbols(), parameters.size());
+  CHECK_EQ(affine_map.getNumDims(), 0);
+
+  SmallVector<AffineExpr> symbol_replacements = llvm::to_vector(
+      llvm::map_range(parameters, [affine_map](const int64_t v) -> AffineExpr {
+        return mlir::getAffineConstantExpr(v, affine_map.getContext());
+      }));
+
+  AffineMap simplified_affine_map =
+      mlir::simplifyAffineMap(affine_map.replaceDimsAndSymbols(
+          /*dimReplacements=*/{}, symbol_replacements, /*numResultDims=*/0,
+          /*numResultSyms=*/0));
+
+  SmallVector<int64_t> results = llvm::to_vector(llvm::map_range(
+      simplified_affine_map.getResults(), [](AffineExpr result) -> int64_t {
+        return llvm::cast<mlir::AffineConstantExpr>(result).getValue();
+      }));
+
+  return std::vector<int64_t>(results.begin(), results.end());
+}
+
+}  // namespace
+
+std::vector<int64_t> SymbolicTileAnalysis::TileOffsets(
+    const TiledHloInstruction& tiled_hlo) const {
+  CHECK(tile_parameters_.has_value());
+  return EvaluateTileMap(tiled_hlo.symbolic_tile.offset_map(),
+                         *tile_parameters_);
+}
+
+// TODO(bchetioui): remove dependency on stride and offset parameters.
+std::vector<int64_t> SymbolicTileAnalysis::TileSizes(
+    const TiledHloInstruction& tiled_hlo) const {
+  CHECK(tile_parameters_.has_value());
+  return EvaluateTileMap(tiled_hlo.symbolic_tile.size_map(), *tile_parameters_);
+}
+
+std::vector<int64_t> SymbolicTileAnalysis::TileStrides(
+    const TiledHloInstruction& tiled_hlo) const {
+  CHECK(tile_parameters_.has_value());
+  return EvaluateTileMap(tiled_hlo.symbolic_tile.stride_map(),
+                         *tile_parameters_);
+}
+
+void SymbolicTileAnalysis::SetTileSizes(std::vector<int64_t> sizes) {
+  // TODO(bchetioui): CHECK num parameters somehow?
+  tile_parameters_ = std::vector(std::move(sizes));
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/model/symbolic_tile_analysis.h b/third_party/xla/xla/service/gpu/model/symbolic_tile_analysis.h
new file mode 100644
index 00000000000000..be9953fa0ff574
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/model/symbolic_tile_analysis.h
@@ -0,0 +1,133 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_SERVICE_GPU_MODEL_SYMBOLIC_TILE_ANALYSIS_H_
+#define XLA_SERVICE_GPU_MODEL_SYMBOLIC_TILE_ANALYSIS_H_
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <utility>
+#include <variant>
+#include <vector>
+
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/service/gpu/model/indexing_map.h"
+#include "xla/service/gpu/model/symbolic_tile.h"
+#include "xla/service/instruction_fusion.h"
+
+namespace xla {
+namespace gpu {
+
+class SymbolicTileAnalysis;
+using SymbolicTileAnalysisOrError =
+    std::variant<SymbolicTileAnalysis, FusionDecision>;
+
+// A node in the tiled representation of an HLO computation. During tiling and
+// codegen an HLO instruction may need to be emitted multiple times with
+// different tiling parameters.
+struct TiledHloInstruction {
+  // Pointer to the original HLO instruction.
+  const HloInstruction* hlo;
+
+  // Indexing map from the computation root to this instruction output.
+  IndexingMap indexing_map;
+
+  // Symbolic tile derived from the indexing map.
+  SymbolicTile symbolic_tile;
+
+  // Operands of the instruction in the tiled computation graph.
+  std::vector<TiledHloInstruction*> operands;
+
+  TiledHloInstruction(const HloInstruction* hlo, IndexingMap indexing_map,
+                      SymbolicTile symbolic_tile)
+      : hlo(hlo),
+        indexing_map(std::move(indexing_map)),
+        symbolic_tile(std::move(symbolic_tile)) {}
+};
+
+// Constructs and holds symbolic tiles for all the instructions within a
+// computation. We may hold several different symbolic tiles for the same
+// instruction if the instruction is indexed in several different ways in order
+// to produce a single chunk of the output. In order to handle this properly,
+// we store a symbolic tile for each possible path starting from the root
+// instruction of the computation to the relevant instruction.
+class SymbolicTileAnalysis {
+ public:
+  // `InstructionPathFromRoot` allows representing a graph path from the root
+  // instruction of a computation up to one of its consumers. Each integer
+  // in the path represents the index of the operand edge to follow to reach
+  // the instruction, starting from the root instruction.
+  using InstructionPathFromRoot = std::vector<int>;
+
+  // Tries to construct a symbolic tile analysis from a computation. Returns
+  // a diagnostic if the construction fails for any reason.
+  static SymbolicTileAnalysisOrError AnalyzeComputation(
+      const HloComputation& computation, mlir::MLIRContext* ctx);
+
+  // Evaluates the tile offsets of an instruction from the analyzed computation
+  // following the provided path from the root. Tile parameters must have been
+  // set before calling this method.
+  std::vector<int64_t> TileOffsets(const TiledHloInstruction& tiled_hlo) const;
+  // Evaluates the tile sizes of an instruction from the analyzed computation
+  // following the provided path from the root. Tile parameters must have been
+  // set before calling this method.
+  std::vector<int64_t> TileSizes(const TiledHloInstruction& tiled_hlo) const;
+  // Evaluates the tile strides of an instruction from the analyzed computation
+  // following the provided path from the root. Tile parameters must have been
+  // set before calling this method.
+  std::vector<int64_t> TileStrides(const TiledHloInstruction& tiled_hlo) const;
+
+  // Populates input tile sizes. This is a prerequisite in order to extract
+  // concrete values using `TileOffsets`, `TileSizes`, and `TileStrides`.
+  void SetTileSizes(std::vector<int64_t> sizes);
+
+  // Returns the tiled root instruction.
+  const TiledHloInstruction* GetRoot() const {
+    return tiled_hlo_instructions_.back().get();
+  }
+
+  // Returns the tiled HLO instructions in def-before-use order.
+  const std::vector<std::unique_ptr<TiledHloInstruction>>&
+  GetTiledHloInstructions() const {
+    return tiled_hlo_instructions_;
+  }
+
+  // Return the underlying MLIRContext.
+  mlir::MLIRContext* GetMLIRContext() const { return context_; };
+
+ private:
+  SymbolicTileAnalysis(
+      std::vector<std::unique_ptr<TiledHloInstruction>> tiled_hlo_instructions,
+      mlir::MLIRContext* context)
+      : tiled_hlo_instructions_(std::move(tiled_hlo_instructions)),
+        context_(context) {}
+
+  // The tiled HLO instructions in def-before-use order.
+  std::vector<std::unique_ptr<TiledHloInstruction>> tiled_hlo_instructions_;
+
+  mlir::MLIRContext* context_;
+  // Optionally set tile parameters. These parameters can be set by calling
+  // `SetTileParameters`, and correspond to the output tile for the analyzed
+  // computation. The order and type of parameters are as explained in the
+  // documentation of `SymbolicTile`.
+  std::optional<std::vector<int64_t>> tile_parameters_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // XLA_SERVICE_GPU_MODEL_SYMBOLIC_TILE_ANALYSIS_H_
diff --git a/third_party/xla/xla/service/gpu/model/symbolic_tile_analysis_test.cc b/third_party/xla/xla/service/gpu/model/symbolic_tile_analysis_test.cc
new file mode 100644
index 00000000000000..324937a4ac8991
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/model/symbolic_tile_analysis_test.cc
@@ -0,0 +1,173 @@
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/model/symbolic_tile_analysis.h"
+
+#include <memory>
+#include <utility>
+#include <variant>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/strings/string_view.h"
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "xla/tests/hlo_test_base.h"
+#include "xla/tests/verified_hlo_module.h"
+#include "tsl/platform/statusor.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+using ::testing::ElementsAre;
+using SymbolicTileAnalysisTest = HloTestBase;
+
+TEST_F(SymbolicTileAnalysisTest, SimpleNormalizationDiamondIsSupported) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(R"(
+max {
+  p1 = f32[] parameter(1)
+  p0 = f32[] parameter(0)
+  ROOT m = f32[] maximum(p0, p1)
+}
+
+ENTRY main {
+  p0 = f32[2,97]{1,0} parameter(0)
+  constant = f32[] constant(-inf)
+  reduce = f32[2] reduce(p0, constant), dimensions={1}, to_apply=max
+  broadcast = f32[2,97]{1,0} broadcast(reduce), dimensions={0}
+  ROOT subtract = f32[2,97]{1,0} subtract(p0, broadcast)
+})"));
+
+  mlir::MLIRContext mlir_ctx;
+
+  SymbolicTileAnalysisOrError analysis_or_error =
+      SymbolicTileAnalysis::AnalyzeComputation(*module->entry_computation(),
+                                               &mlir_ctx);
+
+  ASSERT_TRUE(std::holds_alternative<SymbolicTileAnalysis>(analysis_or_error));
+  SymbolicTileAnalysis analysis =
+      std::get<SymbolicTileAnalysis>(std::move(analysis_or_error));
+
+  analysis.SetTileSizes(/*sizes=*/{1, 10});
+
+  const TiledHloInstruction* root = analysis.GetRoot();
+
+  auto p0_from_subtract0 = root->operands[0];
+  auto p0_from_subtract1 = root->operands[1]->operands[0]->operands[0];
+
+  EXPECT_THAT(analysis.TileOffsets(*p0_from_subtract0), ElementsAre(0, 0));
+  EXPECT_THAT(analysis.TileSizes(*p0_from_subtract0), ElementsAre(1, 10));
+  EXPECT_THAT(analysis.TileStrides(*p0_from_subtract0), ElementsAre(1, 1));
+
+  EXPECT_THAT(analysis.TileOffsets(*p0_from_subtract1), ElementsAre(0, 0));
+  EXPECT_THAT(analysis.TileSizes(*p0_from_subtract1), ElementsAre(1, 97));
+  EXPECT_THAT(analysis.TileStrides(*p0_from_subtract1), ElementsAre(1, 1));
+}
+
+TEST_F(SymbolicTileAnalysisTest, ElementwiseDiamondCSEIsSupported) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(R"(
+ENTRY main {
+  p0 = f32[2,97] parameter(0)
+  exp = f32[2,97] exponential(p0)
+  log = f32[2,97] log(p0)
+  ROOT subtract = f32[2,97] subtract(exp, log)
+})"));
+
+  mlir::MLIRContext mlir_ctx;
+  SymbolicTileAnalysisOrError analysis_or_error =
+      SymbolicTileAnalysis::AnalyzeComputation(*module->entry_computation(),
+                                               &mlir_ctx);
+
+  EXPECT_TRUE(std::holds_alternative<SymbolicTileAnalysis>(analysis_or_error));
+  SymbolicTileAnalysis analysis =
+      std::get<SymbolicTileAnalysis>(std::move(analysis_or_error));
+
+  const TiledHloInstruction* root = analysis.GetRoot();
+
+  auto p0_from_subtract0 = root->operands[0]->operands[0];
+  auto p0_from_subtract1 = root->operands[1]->operands[0];
+
+  EXPECT_EQ(p0_from_subtract0, p0_from_subtract1);
+}
+
+TEST_F(SymbolicTileAnalysisTest, BailOutOnUnsupportedDot) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(R"(
+ENTRY main {
+  p0 = f32[1,2]{1,0} parameter(0)
+  p1 = f32[2,3]{1,0} parameter(1)
+  ROOT dot = f32[1,3]{1,0} dot(p0, p1),
+    lhs_batch_dims={}, rhs_batch_dims={},
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+})"));
+
+  mlir::MLIRContext mlir_ctx;
+  SymbolicTileAnalysisOrError analysis_or_error =
+      SymbolicTileAnalysis::AnalyzeComputation(*module->entry_computation(),
+                                               &mlir_ctx);
+  EXPECT_FALSE(std::holds_alternative<SymbolicTileAnalysis>(analysis_or_error));
+}
+
+TEST_F(SymbolicTileAnalysisTest, BailOutOnUnsupportedReshape) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(R"(
+ENTRY main {
+  p0 = f32[1,2]{1,0} parameter(0)
+  ROOT reshape = f32[2] reshape(p0)
+})"));
+
+  mlir::MLIRContext mlir_ctx;
+  SymbolicTileAnalysisOrError analysis_or_error =
+      SymbolicTileAnalysis::AnalyzeComputation(*module->entry_computation(),
+                                               &mlir_ctx);
+  EXPECT_FALSE(std::holds_alternative<SymbolicTileAnalysis>(analysis_or_error));
+}
+
+TEST_F(SymbolicTileAnalysisTest, BailOutOnUnsupportedBitcast) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(R"(
+ENTRY main {
+  p0 = f32[1,2]{1,0} parameter(0)
+  ROOT bitcast = f32[2] bitcast(p0)
+})"));
+
+  mlir::MLIRContext mlir_ctx;
+  SymbolicTileAnalysisOrError analysis_or_error =
+      SymbolicTileAnalysis::AnalyzeComputation(*module->entry_computation(),
+                                               &mlir_ctx);
+  EXPECT_FALSE(std::holds_alternative<SymbolicTileAnalysis>(analysis_or_error));
+}
+
+TEST_F(SymbolicTileAnalysisTest, BailOutOnUnsupportedConcatenate) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(R"(
+ENTRY main {
+  p0 = f32[1,3]{1,0} parameter(0)
+  p1 = f32[1,3]{1,0} parameter(1)
+  ROOT concatenate = f32[2,3] concatenate(p0, p1), dimensions={0}
+})"));
+
+  mlir::MLIRContext mlir_ctx;
+  SymbolicTileAnalysisOrError analysis_or_error =
+      SymbolicTileAnalysis::AnalyzeComputation(*module->entry_computation(),
+                                               &mlir_ctx);
+  EXPECT_FALSE(std::holds_alternative<SymbolicTileAnalysis>(analysis_or_error));
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/model/symbolic_tile_test.cc b/third_party/xla/xla/service/gpu/model/symbolic_tile_test.cc
new file mode 100644
index 00000000000000..74673bd8f4db19
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/model/symbolic_tile_test.cc
@@ -0,0 +1,439 @@
+/* Copyright 2023 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/model/symbolic_tile.h"
+
+#include <optional>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "xla/service/gpu/model/affine_map_printer.h"
+#include "xla/service/gpu/model/indexing_analysis.h"
+#include "xla/service/gpu/model/indexing_test_utils.h"
+#include "tsl/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+using ::testing::ExplainMatchResult;
+using ::testing::Optional;
+using ::testing::StrEq;
+
+MATCHER_P3(MatchSymbolicTile, offset_map_string, size_map_string,
+           stride_map_string,
+           absl::StrCat(negation
+                            ? "equals "
+                            : "doesn't equal symbolic tile with offset_map_ ",
+                        offset_map_string, " and size_map_ ", size_map_string,
+                        " and stride_map_ ", stride_map_string)) {
+  AffineMapPrinter printer;
+  return ExplainMatchResult(StrEq(offset_map_string),
+                            printer.ToString(arg.offset_map()),
+                            result_listener) &&
+         ExplainMatchResult(StrEq(size_map_string),
+                            printer.ToString(arg.size_map()),
+                            result_listener) &&
+         ExplainMatchResult(StrEq(stride_map_string),
+                            printer.ToString(arg.stride_map()),
+                            result_listener);
+}
+
+using SymbolicTileTest = IndexingTestBase;
+
+TEST_F(SymbolicTileTest, CanPropagateTileFromDotOutputToInputs) {
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      p0 = f32[11, 17, 19] parameter(0)
+      p1 = f32[11, 19, 23] parameter(1)
+      ROOT dot = f32[11, 17, 23] dot(p0, p1),
+        lhs_batch_dims={0}, rhs_batch_dims={0},
+        lhs_contracting_dims={2}, rhs_contracting_dims={1}
+    }
+  )"));
+
+  EXPECT_THAT(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
+      Optional(MatchSymbolicTile("()[s0, s1, s2] -> (0, 0, 0)",
+                                 "()[s0, s1, s2] -> (s0, s1, 19)",
+                                 "()[s0, s1, s2] -> (1, 1, 1)")));
+
+  EXPECT_THAT(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[1].begin()),
+      Optional(MatchSymbolicTile("()[s0, s1, s2] -> (0, 0, 0)",
+                                 "()[s0, s1, s2] -> (s0, 19, s2)",
+                                 "()[s0, s1, s2] -> (1, 1, 1)")));
+}
+
+TEST_F(SymbolicTileTest, CanPropagateTileThroughTrivialReshape) {
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      p0 = f32[11, 17, 19] parameter(0)
+      ROOT reshape = f32[1, 11, 17, 19] reshape(p0)
+    }
+  )"));
+
+  EXPECT_THAT(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
+      Optional(MatchSymbolicTile("()[s0, s1, s2, s3] -> (0, 0, 0)",
+                                 "()[s0, s1, s2, s3] -> (s1, s2, s3)",
+                                 "()[s0, s1, s2, s3] -> (1, 1, 1)")));
+}
+
+TEST_F(SymbolicTileTest,
+       CanPropagateTileThroughNonTrivialMergeReshapeFromOutputToInput) {
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      p0 = f32[1,8,6,4]{3,2,1,0} parameter(0)
+      ROOT bitcast = f32[48,4]{1,0} bitcast(p0)
+    }
+  )"));
+
+  EXPECT_THAT(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
+      Optional(MatchSymbolicTile(
+          "()[s0, s1] -> (0, 0, 0, 0)",
+          "()[s0, s1] -> "
+          "(1, (s0 + 5) floordiv 6, s0 - ((s0 - 1) floordiv 6) * 6, s1)",
+          "()[s0, s1] -> (0, 1, 1, 1)")));
+}
+
+TEST_F(SymbolicTileTest, FailsToPropagateTileThroughNonTrivialReshape) {
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      p0 = f32[12, 4, 19] parameter(0)
+      ROOT reshape = f32[4, 12, 19] reshape(p0)
+    }
+  )"));
+
+  EXPECT_EQ(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
+      std::nullopt);
+}
+
+TEST_F(SymbolicTileTest, CanPropagateTileThroughElementwiseOp) {
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      p0 = f32[150] parameter(0)
+      p1 = f32[150] parameter(1)
+      ROOT add = f32[150] add(p0, p1)
+    }
+  )"));
+
+  EXPECT_THAT(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
+      Optional(MatchSymbolicTile("()[s0] -> (0)", "()[s0] -> (s0)",
+                                 "()[s0] -> (1)")));
+}
+
+TEST_F(SymbolicTileTest, CanPropagateTileFromBroadcastOutputToInput) {
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      p0 = f32[150] parameter(0)
+      ROOT broadcast = f32[157,150] broadcast(p0), dimensions={1}
+    }
+  )"));
+
+  EXPECT_THAT(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
+      Optional(MatchSymbolicTile("()[s0, s1] -> (0)", "()[s0, s1] -> (s1)",
+                                 "()[s0, s1] -> (1)")));
+}
+
+TEST_F(SymbolicTileTest, CanPropagateTileFromReduceOutputToInput) {
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    max {
+      p0 = f32[] parameter(0)
+      p1 = f32[] parameter(1)
+      ROOT max = f32[] maximum(p0, p1)
+    }
+
+    ENTRY e {
+      p0 = f32[125,150] parameter(0)
+      c0 = f32[] constant(-inf)
+      ROOT reduce = f32[150] reduce(p0, c0), dimensions={0}, to_apply=max
+    }
+  )"));
+
+  EXPECT_THAT(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
+      Optional(MatchSymbolicTile("()[s0] -> (0, 0)", "()[s0] -> (125, s0)",
+                                 "()[s0] -> (1, 1)")));
+}
+
+TEST_F(SymbolicTileTest, CanPropagateTileThroughReverse) {
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      p0 = f32[179] parameter(0)
+      ROOT reverse = f32[179] reverse(p0), dimensions={0}
+    }
+  )"));
+
+  EXPECT_THAT(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
+      Optional(MatchSymbolicTile("()[s0] -> (-s0 + 179)", "()[s0] -> (s0)",
+                                 "()[s0] -> (1)")));
+}
+
+TEST_F(SymbolicTileTest, CanPropagateTileFromSliceOutputToInput) {
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      p0 = f32[120,142] parameter(0)
+      ROOT slice = f32[10,21] slice(p0), slice={[40:60:2], [20:104:4]}
+    }
+  )"));
+
+  EXPECT_THAT(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
+      Optional(MatchSymbolicTile("()[s0, s1] -> (40, 20)",
+                                 "()[s0, s1] -> (s0, s1)",
+                                 "()[s0, s1] -> (2, 4)")));
+}
+
+TEST_F(SymbolicTileTest, CanPropagateTileThroughTranspose) {
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      p0 = f32[21,10] parameter(0)
+      ROOT transpose = f32[10,21] transpose(p0), dimensions={1,0}
+    }
+  )"));
+
+  EXPECT_THAT(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
+      Optional(MatchSymbolicTile("()[s0, s1] -> (0, 0)",
+                                 "()[s0, s1] -> (s1, s0)",
+                                 "()[s0, s1] -> (1, 1)")));
+}
+
+TEST_F(SymbolicTileTest, CanPropagateTileThroughConcatenate) {
+  // TODO(b/325488844): Add additional concat test cases with constraints.
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      p0 = f32[2,5,7] parameter(0)
+      p1 = f32[2,11,7] parameter(1)
+      p2 = f32[2,17,7] parameter(2)
+      ROOT concat = f32[2,33,7] concatenate(p0, p1, p2), dimensions={1}
+    }
+  )"));
+
+  EXPECT_THAT(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
+      Optional(MatchSymbolicTile("()[s0, s1, s2] -> (0, 0, 0)",
+                                 "()[s0, s1, s2] -> (s0, s1, s2)",
+                                 "()[s0, s1, s2] -> (1, 1, 1)")));
+  EXPECT_THAT(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[1].begin()),
+      Optional(MatchSymbolicTile("()[s0, s1, s2] -> (0, -5, 0)",
+                                 "()[s0, s1, s2] -> (s0, s1, s2)",
+                                 "()[s0, s1, s2] -> (1, 1, 1)")));
+  EXPECT_THAT(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[2].begin()),
+      Optional(MatchSymbolicTile("()[s0, s1, s2] -> (0, -16, 0)",
+                                 "()[s0, s1, s2] -> (s0, s1, s2)",
+                                 "()[s0, s1, s2] -> (1, 1, 1)")));
+}
+
+TEST_F(SymbolicTileTest, CanPropagateTileThroughPadOpWithoutInteriorPadding) {
+  // TODO(b/325488844): Add pad tests with defined constraints on tile input.
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    ENTRY e {
+      p0 = f32[4, 4] parameter(0)
+      p1 = f32[] parameter(1)
+      ROOT pad = f32[8,8] pad(p0, p1), padding=2_2_0x1_3_0
+    }
+  )"));
+
+  EXPECT_THAT(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
+      Optional(MatchSymbolicTile("()[s0, s1] -> (-2, -1)",
+                                 "()[s0, s1] -> (s0, s1)",
+                                 "()[s0, s1] -> (1, 1)")));
+}
+
+TEST_F(SymbolicTileTest, CanPropagateTileThroughSplitReshapeOfReverse) {
+  // A split reshape of a reverse creates a negative unit stride atop a
+  // floordiv.
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    computation {
+      p0 = f32[1,8,6,4]{3,2,1,0} parameter(0)
+      reverse = f32[1,8,6,4]{3,2,1,0} reverse(p0), dimensions={1,2}
+      ROOT bitcast = f32[48,4]{1,0} bitcast(reverse)
+    }
+
+    ENTRY e {
+      p0 = f32[1,8,6,4]{3,2,1,0} parameter(0)
+      ROOT fusion = f32[48,4]{1,0} fusion(p0), kind=kLoop, calls=computation
+    }
+  )"));
+
+  // TODO(b/331257678): the expected expressions should be simplified.
+  EXPECT_THAT(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
+      Optional(MatchSymbolicTile(
+          "()[s0, s1] -> (0, -((s0 + 5) floordiv 6) + 8, "
+          "-(s0 - ((s0 - 1) floordiv 6) * 6) + 6, 0)",
+          "()[s0, s1] -> "
+          "(1, (s0 + 5) floordiv 6, s0 - ((s0 - 1) floordiv 6) * 6, s1)",
+          "()[s0, s1] -> (0, 1, 1, 1)")));
+}
+
+TEST_F(SymbolicTileTest,
+       FailsGracefullyAtPropagatingTileThroughSliceOfSplitReshape) {
+  // TODO(b/326998704): constraints should allow us to unblock this use case.
+  // A slice of a split reshape creates a non-unit stride atop a floordiv.
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    computation {
+      p0 = f32[1,8,6,4]{3,2,1,0} parameter(0)
+      bitcast = f32[48,4]{1,0} bitcast(p0)
+      ROOT slice = f32[5,2]{1,0} slice(bitcast), slice={[18:43:5], [0:4:2]}
+    }
+
+    ENTRY e {
+      p0 = f32[1,8,6,4]{3,2,1,0} parameter(0)
+      ROOT fusion = f32[5,2]{1,0} fusion(p0), kind=kLoop, calls=computation
+    }
+  )"));
+
+  EXPECT_EQ(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
+      std::nullopt);
+}
+
+TEST_F(SymbolicTileTest,
+       FailsGracefullyAtPropagatingTileThroughMisalignedSliceOfSplitReshape) {
+  // TODO(b/326998704): constraints should allow us to unblock part of this use
+  // case.
+  // TODO(b/331257678): handling correctly cases where offsets don't get
+  // simplified away perfectly will allow us to unblock part of this use case.
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    computation {
+      p0 = f32[1,8,6,4]{3,2,1,0} parameter(0)
+      bitcast = f32[48,4]{1,0} bitcast(p0)
+      ROOT slice = f32[5,2]{1,0} slice(bitcast), slice={[20:45:5], [0:4:2]}
+    }
+
+    ENTRY e {
+      p0 = f32[1,8,6,4]{3,2,1,0} parameter(0)
+      ROOT fusion = f32[5,2]{1,0} fusion(p0), kind=kLoop, calls=computation
+    }
+  )"));
+
+  EXPECT_EQ(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
+      std::nullopt);
+}
+
+TEST_F(SymbolicTileTest,
+       FailsGracefullyAtPropagatingTileThroughSliceOfSplitReshapeOnTranspose) {
+  // TODO(b/326998704): constraints should allow us to unblock this use case.
+  // A slice of a split reshape creates a non-unit stride atop a floordiv.
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    computation {
+      p0 = f32[1,6,8,4]{3,2,1,0} parameter(0)
+      transpose = f32[1,8,6,4]{3,2,1,0} transpose(p0), dimensions={0,2,1,3}
+      bitcast = f32[48,4]{1,0} bitcast(transpose)
+      ROOT slice = f32[5,2]{1,0} slice(bitcast), slice={[18:43:5], [0:4:2]}
+    }
+
+    ENTRY e {
+      p0 = f32[1,6,8,4]{3,2,1,0} parameter(0)
+      ROOT fusion = f32[5,2]{1,0} fusion(p0), kind=kLoop, calls=computation
+    }
+  )"));
+
+  EXPECT_EQ(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
+      std::nullopt);
+}
+
+TEST_F(SymbolicTileTest,
+       FailsGracefullyAtPropagatingTileThroughSliceOfSplitReshapeOfReverse) {
+  // TODO(b/326998704): constraints should allow us to unblock this use case.
+  // A slice of a split reshape of a reverse creates a negative non-unit stride
+  // atop a floordiv.
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    computation {
+      p0 = f32[1,8,6,4]{3,2,1,0} parameter(0)
+      reverse = f32[1,8,6,4]{3,2,1,0} reverse(p0), dimensions={1,2}
+      bitcast = f32[48,4]{1,0} bitcast(reverse)
+      ROOT slice = f32[5,2]{1,0} slice(bitcast), slice={[18:43:5], [0:4:2]}
+    }
+
+    ENTRY e {
+      p0 = f32[1,8,6,4]{3,2,1,0} parameter(0)
+      ROOT fusion = f32[5,2]{1,0} fusion(p0), kind=kLoop, calls=computation
+    }
+  )"));
+
+  EXPECT_EQ(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
+      std::nullopt);
+}
+
+TEST_F(SymbolicTileTest,
+       FailsGracefullyAtPropagatingTileThroughReductionOfConcatenation) {
+  // TODO(b/330906085): concatenating across a reduction dimension needs to be
+  // handled to unblock this.
+  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
+    HloModule m
+    max_computation {
+      p0 = f32[] parameter(0)
+      p1 = f32[] parameter(1)
+      ROOT maximum = f32[] maximum(p0, p1)
+    }
+
+    computation {
+      p0 = f32[10,8]{1,0} parameter(0)
+      p1 = f32[20,8]{1,0} parameter(1)
+      concatenate = f32[30,8]{1,0} concatenate(p0, p1), dimensions={0}
+      neg_inf = f32[] constant(-inf)
+      ROOT reduce = f32[8] reduce(concatenate, neg_inf), dimensions={0},
+        to_apply=max_computation
+    }
+
+    ENTRY e {
+      p0 = f32[10,8]{1,0} parameter(0)
+      p1 = f32[20,8]{1,0} parameter(1)
+      ROOT fusion = f32[8] fusion(p0, p1), kind=kLoop, calls=computation
+    }
+  )"));
+
+  EXPECT_EQ(
+      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[1].begin()),
+      std::nullopt);
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/model/tile_analysis.cc b/third_party/xla/xla/service/gpu/model/tile_analysis.cc
deleted file mode 100644
index 3599d19575e70e..00000000000000
--- a/third_party/xla/xla/service/gpu/model/tile_analysis.cc
+++ /dev/null
@@ -1,353 +0,0 @@
-/* Copyright 2023 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/service/gpu/model/tile_analysis.h"
-
-#include <cstdint>
-#include <iterator>
-#include <optional>
-#include <ostream>
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "absl/algorithm/container.h"
-#include "absl/log/check.h"
-#include "absl/strings/str_cat.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/Casting.h"
-#include "mlir/IR/AffineExpr.h"  // from @llvm-project
-#include "mlir/IR/AffineMap.h"  // from @llvm-project
-#include "mlir/Support/LLVM.h"  // from @llvm-project
-#include "xla/service/gpu/model/affine_map_printer.h"
-#include "xla/service/gpu/model/indexing_map.h"
-
-namespace xla {
-namespace gpu {
-namespace {
-
-using absl::StrCat;
-using mlir::AffineDimExpr;
-using mlir::AffineExpr;
-using mlir::AffineMap;
-using mlir::AffineSymbolExpr;
-using mlir::getAffineConstantExpr;
-using mlir::getAffineDimExpr;
-using mlir::MLIRContext;
-using mlir::simplifyAffineExpr;
-
-// The number of tile parameters that are inserted for each input dimension when
-// constructing a symbolic tile from an indexing map.
-constexpr int kNumTileParametersPerInputDim = 3;
-
-// Internal helper that checks whether an affine map of the form
-//     (index0, ..., index{M-1})
-//       [sym0, ..., sym{P-1}, offset0, size0, stride0, ...,
-//        offset{M-1}, size{M-1}, stride{M-1}]
-//  -> (expr0, ..., expr{N-1})
-//
-// describes a symbolic tile. The documentation for
-// `RawSymbolicTileFromIndexingMap` explains what this means in details.
-bool AffineMapDescribesTile(AffineMap affine_map) {
-  int64_t num_known_symbols =
-      affine_map.getNumSymbols() -
-      kNumTileParametersPerInputDim * affine_map.getNumDims();
-  for (AffineExpr result_expr : affine_map.getResults()) {
-    int64_t num_hits = 0;
-    result_expr.walk([&num_hits, &num_known_symbols](AffineExpr expr) {
-      if (auto symbol_expr = llvm::dyn_cast<AffineSymbolExpr>(expr)) {
-        num_hits += (symbol_expr.getPosition() < num_known_symbols);
-      } else if (auto dim_expr = llvm::dyn_cast<AffineDimExpr>(expr)) {
-        ++num_hits;
-      }
-    });
-
-    if (num_hits > 1) {
-      return false;
-    }
-  }
-  return true;
-}
-
-// Internal helper to construct symbolic tiles. The only difference with an
-// actual symbolic tile is that this structure does not enforce the relevant
-// important invariants by construction.
-struct RawSymbolicTile {
-  AffineMap offset_map;
-  AffineMap size_map;
-  AffineMap stride_map;
-};
-
-// Helper to perform function applications as described in the documentation of
-// `RawSymbolicTileFromIndexingMap`.
-AffineMap SubstituteAllIndicesAndKnownSymbolsWithSameValue(
-    AffineMap affine_map, AffineExpr value, int64_t num_known_symbols) {
-  MLIRContext* mlir_context = affine_map.getContext();
-  int64_t num_input_dims = affine_map.getNumDims();
-  llvm::DenseMap<AffineExpr, AffineExpr> indices;
-
-  for (int64_t i = 0; i < num_input_dims; ++i) {
-    indices[getAffineDimExpr(i, mlir_context)] = value;
-  }
-
-  for (int64_t i = 0; i < num_known_symbols; ++i) {
-    indices[getAffineSymbolExpr(i, mlir_context)] = value;
-  }
-
-  return simplifyAffineMap(affine_map.replace(indices, affine_map.getNumDims(),
-                                              affine_map.getNumSymbols()));
-}
-
-// Extracts non-negative offset, size, and stride expression for each result
-// expression in the parameter affine map if the affine map describes a tile.
-// Returns `std::nullopt` if the parameter affine map does not describe a tile.
-//
-// The parameter affine map f must follow the pattern
-//     (index0, ..., index{M-1})
-//       [sym0, ..., sym{P-1}, offset0, size0, stride0, ...,
-//        offset{M-1}, size{M-1}, stride{M-1}]
-//  -> (expr0, ..., expr{N-1})
-// where the result expressions expr0, ..., expr{N-1} are strided expressions of
-// the form
-//     offset_expr{i} + stride_expr{i} * index_expr{i}
-// with 0 <= i < N, and index_expr{i} is either a symbol with known bound
-// (sym0, ..., sym{P-1}) or an index.
-//
-// Let f'(x0, ..., x{M-1})[x{M}, ..., x{M+P-1}]
-//   = f(x0, ..., x{M-1})[x{M}, ..., x{M+P-1}, offset0, size0, stride0, ...,
-//                        offset{M-1}, size{M-1}, stride{M-1}]
-//
-// Then, the following equations hold:
-//
-// (1) f'(0, ..., 0)[0, ..., 0]{i}
-//   = offset_expr{i} + stride_expr{i} * 0
-//   = offset_expr{i}
-//
-// (2) f'(1, ..., 1)[1, ..., 1]{i} - f'(0, ..., 0)[0, ..., 0]{i}
-//   = offset_expr{i} + stride_expr{i} * 1 - offset_expr{i}
-//   = stride_expr{i}
-//
-// (3) If stride_expr{i} = 0, we automatically set size_expr{i} = 1. This
-//     happens when the strided expression points to a single value that is the
-//     same for all elements in the tile.
-//
-//     If stride_expr{i} != 0, then the relevant size expression can be obtained
-//     by analyzing the index expression, which is known to be either a symbol
-//     with known bound, or an index parameter. In the former case, we set the
-//     size to be the upper bound of the symbol; in the latter case, we
-//     substitute the index parameter by its corresponding size parameter.
-//
-// Strictly solving (1) may yield negative strides (e.g. in the case of
-// reverse). Conceptually, negative strides denote of a decremental iteration
-// order over indices {0, ..., size_expr{i} - 1}. This is not normalized in
-// symbolic tiles, and must be handled by consumers.
-//
-// The resulting affine maps elide known symbols from the list of parameter
-// symbols, since they will have been replaced by constants.
-std::optional<RawSymbolicTile> RawSymbolicTileFromIndexingMap(
-    const IndexingMap& indexing_map) {
-  AffineMap affine_map = indexing_map.GetAffineMap();
-  if (!AffineMapDescribesTile(affine_map)) {
-    return std::nullopt;
-  }
-
-  MLIRContext* mlir_context = affine_map.getContext();
-  int64_t num_known_symbols =
-      affine_map.getNumSymbols() -
-      affine_map.getNumDims() * kNumTileParametersPerInputDim;
-  int64_t num_results = affine_map.getNumResults();
-
-  // offsets_expr = f'(0, ..., 0)[0, ..., 0]
-  AffineMap f_prime_0 = SubstituteAllIndicesAndKnownSymbolsWithSameValue(
-      affine_map, getAffineConstantExpr(0, mlir_context), num_known_symbols);
-  llvm::ArrayRef<AffineExpr> offset_expressions = f_prime_0.getResults();
-
-  // Compute f'(1, ..., 1)[1, ..., 1].
-  AffineMap f_prime_1 = SubstituteAllIndicesAndKnownSymbolsWithSameValue(
-      affine_map, getAffineConstantExpr(1, mlir_context), num_known_symbols);
-
-  // strides_expr = f'(1, ..., 1)[1, ..., 1] - f'(0, ..., 0)[0, ..., 0]
-  std::vector<AffineExpr> stride_expressions;
-  stride_expressions.reserve(num_results);
-  for (auto [sub_lhs, sub_rhs] :
-       llvm::zip(f_prime_1.getResults(), offset_expressions)) {
-    stride_expressions.push_back(
-        simplifyAffineExpr(sub_lhs - sub_rhs, affine_map.getNumDims(),
-                           affine_map.getNumSymbols()));
-  }
-
-  // Deduce size_expr. At each index, if the stride is non-zero, once rid of
-  // the offset expression, the remaining expression can be one of two things;
-  //   1. a single parameter---either an index parameter, or a symbol with
-  //      known bounds. This parameter is the size expression, and in the
-  //      case of a symbol with known bounds, we can directly make it a
-  //      constant;
-  //   2. the product of an index parameter (or symbol with known bound)
-  //      with an expression consisting only of constants, and offsets
-  //      and strides parameters. In that case, the index parameter/symbol
-  //      with known bound is the size expression, and we do like in the
-  //      first bullet.
-  // This structure is guaranteed by the `AffineMapDescribesTile` filter
-  // at the top of the function.
-  std::vector<AffineExpr> size_expressions;
-  size_expressions.reserve(num_results);
-  constexpr int kSizePositionWithinTileParameters = 1;
-  for (auto [offset_expr, stride_expr, input_expr] : llvm::zip(
-           offset_expressions, stride_expressions, affine_map.getResults())) {
-    AffineExpr size_expr;
-    if (stride_expr == getAffineConstantExpr(0, mlir_context)) {
-      size_expr = getAffineConstantExpr(1, mlir_context);
-    } else {
-      AffineExpr strided_size_expr =
-          simplifyAffineExpr(input_expr - offset_expr, affine_map.getNumDims(),
-                             affine_map.getNumSymbols());
-
-      strided_size_expr.walk([&](AffineExpr expr) {
-        auto symbol_expr = llvm::dyn_cast<AffineSymbolExpr>(expr);
-        if (symbol_expr && symbol_expr.getPosition() < num_known_symbols) {
-          CHECK(!size_expr);
-          const Interval& symbol_range =
-              indexing_map.GetSymbolRange(symbol_expr.getPosition());
-          size_expr = getAffineConstantExpr(
-              symbol_range.upper - symbol_range.lower + 1, mlir_context);
-        } else if (auto dim_expr = llvm::dyn_cast<AffineDimExpr>(expr)) {
-          CHECK(!size_expr);
-          size_expr = getAffineSymbolExpr(
-              num_known_symbols +
-                  dim_expr.getPosition() * kNumTileParametersPerInputDim +
-                  kSizePositionWithinTileParameters,
-              mlir_context);
-        }
-      });
-    }
-    size_expressions.push_back(size_expr);
-  }
-
-  int64_t num_symbols = affine_map.getNumSymbols();
-  return RawSymbolicTile(
-      {.offset_map =
-           AffineMap::get(0, num_symbols, offset_expressions, mlir_context)
-               .shiftSymbols(-num_known_symbols),
-       .size_map =
-           AffineMap::get(0, num_symbols, size_expressions, mlir_context)
-               .shiftSymbols(-num_known_symbols),
-       .stride_map =
-           AffineMap::get(0, num_symbols, stride_expressions, mlir_context)
-               .shiftSymbols(-num_known_symbols)});
-}
-
-}  // anonymous namespace
-
-/*static*/ std::optional<SymbolicTile> SymbolicTile::FromIndexingMap(
-    const IndexingMap& indexing_map) {
-  MLIRContext* mlir_context = indexing_map.GetAffineMap().getContext();
-  int64_t num_input_dims = indexing_map.GetDimensionCount();
-  std::vector<AffineExpr> exprs;
-  exprs.reserve(num_input_dims);
-
-  std::vector<Interval> tile_dimension_ranges;
-  tile_dimension_ranges.reserve(num_input_dims);
-  std::vector<Interval> tile_symbol_ranges;
-  tile_symbol_ranges.reserve(kNumTileParametersPerInputDim * num_input_dims +
-                             indexing_map.GetAffineMap().getNumSymbols());
-
-  // The symbols declared in 'indexing_map.affine_map' will precede those
-  // defined in the producer map we construct here.
-  absl::c_copy(indexing_map.GetSymbolRanges(),
-               std::back_inserter(tile_symbol_ranges));
-
-  // For each input dims we add kNumTileParametersPerInputDim = 3 symbols, as
-  // well as a single dim. Symbols are ordered in (offset, size, stride)
-  // triplets.
-  for (int64_t dim = 0; dim < num_input_dims; ++dim) {
-    AffineExpr index = getAffineDimExpr(dim, mlir_context);
-    AffineExpr offset =
-        getAffineSymbolExpr(kNumTileParametersPerInputDim * dim, mlir_context);
-    AffineExpr stride = getAffineSymbolExpr(
-        kNumTileParametersPerInputDim * dim + 2, mlir_context);
-
-    exprs.push_back(offset + stride * index);
-
-    Interval range = indexing_map.GetDimensionRange(dim);
-    tile_dimension_ranges.push_back(range);
-
-    for (int64_t symbol_index = 0; symbol_index < kNumTileParametersPerInputDim;
-         ++symbol_index) {
-      tile_symbol_ranges.push_back(range);
-    }
-  }
-
-  AffineMap producer_map = AffineMap::get(
-      num_input_dims, kNumTileParametersPerInputDim * num_input_dims, exprs,
-      mlir_context);
-
-  IndexingMap composed_indexing_map(
-      indexing_map.GetAffineMap().compose(producer_map), tile_dimension_ranges,
-      tile_symbol_ranges);
-
-  composed_indexing_map.Simplify();
-
-  std::optional<RawSymbolicTile> maybe_raw_symbolic_tile =
-      RawSymbolicTileFromIndexingMap(composed_indexing_map);
-
-  if (!maybe_raw_symbolic_tile.has_value()) {
-    return std::nullopt;
-  }
-
-  return SymbolicTile(maybe_raw_symbolic_tile->offset_map,
-                      maybe_raw_symbolic_tile->size_map,
-                      maybe_raw_symbolic_tile->stride_map);
-}
-
-std::string SymbolicTile::ToString(const AffineMapPrinter& printer) const {
-  std::string s;
-  std::stringstream ss(s);
-  Print(ss, printer);
-  return ss.str();
-}
-
-void SymbolicTile::Print(std::ostream& out,
-                         const AffineMapPrinter& printer) const {
-  out << "Symbolic tile with \n";
-  out << "\toffset_map: ";
-  printer.Print(out, offset_map_);
-  out << "\n\tsize_map: ";
-  printer.Print(out, size_map_);
-  out << "\n\tstride_map: ";
-  printer.Print(out, stride_map_);
-  out << "\n";
-}
-
-std::ostream& operator<<(std::ostream& out, const SymbolicTile& symbolic_tile) {
-  AffineMapPrinter printer;
-
-  // This utilizes the assumption that symbols are structured as triplets, i.e.
-  // [offset0, size0, stride0, ... offset{N-1}, size{N-1}, stride{N-1}]
-  // where N is the tensor rank.
-  for (int64_t triplet_start = 0;
-       triplet_start < symbolic_tile.offset_map().getNumSymbols();
-       triplet_start += kNumTileParametersPerInputDim) {
-    int64_t triplet_idx = triplet_start / kNumTileParametersPerInputDim;
-    printer.SetSymbolName(triplet_start, StrCat("offset", triplet_idx));
-    printer.SetSymbolName(triplet_start + 1, StrCat("size", triplet_idx));
-    printer.SetSymbolName(triplet_start + 2, StrCat("stride", triplet_idx));
-  }
-
-  symbolic_tile.Print(out, printer);
-  return out;
-}
-
-}  // namespace gpu
-}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/model/tile_analysis_test.cc b/third_party/xla/xla/service/gpu/model/tile_analysis_test.cc
deleted file mode 100644
index da941e0d6a4ebb..00000000000000
--- a/third_party/xla/xla/service/gpu/model/tile_analysis_test.cc
+++ /dev/null
@@ -1,326 +0,0 @@
-/* Copyright 2023 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/service/gpu/model/tile_analysis.h"
-
-#include <optional>
-#include <sstream>
-#include <string>
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "absl/strings/str_cat.h"
-#include "absl/strings/string_view.h"
-#include "mlir/IR/MLIRContext.h"  // from @llvm-project
-#include "xla/service/gpu/model/affine_map_printer.h"
-#include "xla/service/gpu/model/indexing_analysis.h"
-#include "xla/service/gpu/model/indexing_test_utils.h"
-#include "xla/tests/hlo_test_base.h"
-#include "tsl/platform/test.h"
-
-namespace xla {
-namespace gpu {
-namespace {
-
-using ::testing::ExplainMatchResult;
-using ::testing::HasSubstr;
-using ::testing::Optional;
-using ::testing::StrEq;
-
-MATCHER_P3(MatchSymbolicTile, offset_map_string, size_map_string,
-           stride_map_string,
-           absl::StrCat(negation
-                            ? "equals "
-                            : "doesn't equal symbolic tile with offset_map_ ",
-                        offset_map_string, " and size_map_ ", size_map_string,
-                        " and stride_map_ ", stride_map_string)) {
-  AffineMapPrinter printer;
-  return ExplainMatchResult(StrEq(offset_map_string),
-                            printer.ToString(arg.offset_map()),
-                            result_listener) &&
-         ExplainMatchResult(StrEq(size_map_string),
-                            printer.ToString(arg.size_map()),
-                            result_listener) &&
-         ExplainMatchResult(StrEq(stride_map_string),
-                            printer.ToString(arg.stride_map()),
-                            result_listener);
-}
-
-using SymbolicTileTest = IndexingTestBase;
-
-TEST_F(SymbolicTileTest, CanPropagateTileFromDotOutputToInputs) {
-  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
-    HloModule m
-    ENTRY e {
-      p0 = f32[11, 17, 19] parameter(0)
-      p1 = f32[11, 19, 23] parameter(1)
-      ROOT dot = f32[11, 17, 23] dot(p0, p1),
-        lhs_batch_dims={0}, rhs_batch_dims={0},
-        lhs_contracting_dims={2}, rhs_contracting_dims={1}
-    }
-  )"));
-
-  EXPECT_THAT(
-      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
-      Optional(MatchSymbolicTile(
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8] -> (s0, s3, 0)",
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8] -> (s1, s4, 19)",
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8] -> (s2, s5, 1)")));
-
-  EXPECT_THAT(
-      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[1].begin()),
-      Optional(MatchSymbolicTile(
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8] -> (s0, 0, s6)",
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8] -> (s1, 19, s7)",
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8] -> (s2, 1, s8)")));
-}
-
-TEST_F(SymbolicTileTest, CanPropagateTileThroughTrivialReshape) {
-  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
-    HloModule m
-    ENTRY e {
-      p0 = f32[11, 17, 19] parameter(0)
-      ROOT reshape = f32[1, 11, 17, 19] reshape(p0)
-    }
-  )"));
-
-  EXPECT_THAT(
-      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
-      Optional(MatchSymbolicTile(
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] "
-          "-> (s3, s6, s9)",
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] "
-          "-> (s4, s7, s10)",
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11] "
-          "-> (s5, s8, s11)")));
-}
-
-TEST_F(SymbolicTileTest, FailsToPropagateTileThroughReshape) {
-  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
-    HloModule m
-    ENTRY e {
-      p0 = f32[12, 4, 19] parameter(0)
-      ROOT reshape = f32[4, 12, 19] reshape(p0)
-    }
-  )"));
-
-  EXPECT_EQ(
-      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
-      std::nullopt);
-}
-
-TEST_F(SymbolicTileTest, CanPropagateTileThroughElementwiseOp) {
-  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
-    HloModule m
-    ENTRY e {
-      p0 = f32[150] parameter(0)
-      p1 = f32[150] parameter(1)
-      ROOT add = f32[150] add(p0, p1)
-    }
-  )"));
-
-  EXPECT_THAT(
-      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
-      Optional(MatchSymbolicTile("()[s0, s1, s2] -> (s0)",
-                                 "()[s0, s1, s2] -> (s1)",
-                                 "()[s0, s1, s2] -> (s2)")));
-}
-
-TEST_F(SymbolicTileTest, CanPropagateTileFromBroadcastOutputToInput) {
-  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
-    HloModule m
-    ENTRY e {
-      p0 = f32[150] parameter(0)
-      ROOT broadcast = f32[157,150] broadcast(p0), dimensions={1}
-    }
-  )"));
-
-  EXPECT_THAT(
-      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
-      Optional(MatchSymbolicTile("()[s0, s1, s2, s3, s4, s5] -> (s3)",
-                                 "()[s0, s1, s2, s3, s4, s5] -> (s4)",
-                                 "()[s0, s1, s2, s3, s4, s5] -> (s5)")));
-}
-
-TEST_F(SymbolicTileTest, CanPropagateTileFromReduceOutputToInput) {
-  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
-    HloModule m
-    max {
-      p0 = f32[] parameter(0)
-      p1 = f32[] parameter(1)
-      ROOT max = f32[] maximum(p0, p1)
-    }
-
-    ENTRY e {
-      p0 = f32[125,150] parameter(0)
-      c0 = f32[] constant(-inf)
-      ROOT reduce = f32[150] reduce(p0, c0), dimensions={0}, to_apply=max
-    }
-  )"));
-
-  EXPECT_THAT(
-      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
-      Optional(MatchSymbolicTile("()[s0, s1, s2] -> (0, s0)",
-                                 "()[s0, s1, s2] -> (125, s1)",
-                                 "()[s0, s1, s2] -> (1, s2)")));
-}
-
-TEST_F(SymbolicTileTest, CanPropagateTileThroughReverse) {
-  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
-    HloModule m
-    ENTRY e {
-      p0 = f32[179] parameter(0)
-      ROOT reverse = f32[179] reverse(p0), dimensions={0}
-    }
-  )"));
-
-  EXPECT_THAT(
-      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
-      Optional(MatchSymbolicTile("()[s0, s1, s2] -> (-s0 + 178)",
-                                 "()[s0, s1, s2] -> (s1)",
-                                 "()[s0, s1, s2] -> (-s2)")));
-}
-
-TEST_F(SymbolicTileTest, CanPropagateTileFromSliceOutputToInput) {
-  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
-    HloModule m
-    ENTRY e {
-      p0 = f32[120,142] parameter(0)
-      ROOT slice = f32[10,21] slice(p0), slice={[40:60:2], [20:104:4]}
-    }
-  )"));
-
-  EXPECT_THAT(
-      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
-      Optional(MatchSymbolicTile(
-          "()[s0, s1, s2, s3, s4, s5] -> (s0 * 2 + 40, s3 * 4 + 20)",
-          "()[s0, s1, s2, s3, s4, s5] -> (s1, s4)",
-          "()[s0, s1, s2, s3, s4, s5] -> (s2 * 2, s5 * 4)")));
-}
-
-TEST_F(SymbolicTileTest, CanPropagateTileThroughTranspose) {
-  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
-    HloModule m
-    ENTRY e {
-      p0 = f32[21,10] parameter(0)
-      ROOT transpose = f32[10,21] transpose(p0), dimensions={1,0}
-    }
-  )"));
-
-  EXPECT_THAT(
-      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
-      Optional(MatchSymbolicTile("()[s0, s1, s2, s3, s4, s5] -> (s3, s0)",
-                                 "()[s0, s1, s2, s3, s4, s5] -> (s4, s1)",
-                                 "()[s0, s1, s2, s3, s4, s5] -> (s5, s2)")));
-}
-
-TEST_F(SymbolicTileTest, CanPropagateTileThroughConcatenate) {
-  // TODO(325488844): Add additional concat test cases with constraints.
-  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
-    HloModule m
-    ENTRY e {
-      p0 = f32[2,5,7] parameter(0)
-      p1 = f32[2,11,7] parameter(1)
-      p2 = f32[2,17,7] parameter(2)
-      ROOT concat = f32[2,33,7] concatenate(p0, p1, p2), dimensions={1}
-    }
-  )"));
-
-  EXPECT_THAT(
-      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
-      Optional(MatchSymbolicTile(
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8] -> (s0, s3, s6)",
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8] -> (s1, s4, s7)",
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8] -> (s2, s5, s8)")));
-  EXPECT_THAT(
-      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[1].begin()),
-      Optional(MatchSymbolicTile(
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8] -> (s0, s3 - 5, s6)",
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8] -> (s1, s4, s7)",
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8] -> (s2, s5, s8)")));
-  EXPECT_THAT(
-      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[2].begin()),
-      Optional(MatchSymbolicTile(
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8] -> (s0, s3 - 16, s6)",
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8] -> (s1, s4, s7)",
-          "()[s0, s1, s2, s3, s4, s5, s6, s7, s8] -> (s2, s5, s8)")));
-}
-
-TEST_F(SymbolicTileTest, CanPropagateTileThroughPadOpWithoutInteriorPadding) {
-  // TODO(325488844): Add pad tests with defined constraints on tile input.
-  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
-    HloModule m
-    ENTRY e {
-      p0 = f32[4, 4] parameter(0)
-      p1 = f32[] parameter(1)
-      ROOT pad = f32[8,8] pad(p0, p1), padding=2_2_0x1_3_0
-    }
-  )"));
-
-  EXPECT_THAT(
-      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin()),
-      Optional(
-          MatchSymbolicTile("()[s0, s1, s2, s3, s4, s5] -> (s0 - 2, s3 - 1)",
-                            "()[s0, s1, s2, s3, s4, s5] -> (s1, s4)",
-                            "()[s0, s1, s2, s3, s4, s5] -> (s2, s5)")));
-}
-
-TEST_F(SymbolicTileTest, CanPrintSymbolicTileWithNamedTriplets) {
-  auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
-    HloModule m
-    ENTRY e {
-      p0 = f32[17, 19] parameter(0)
-      p1 = f32[19, 23] parameter(1)
-      ROOT dot = f32[17, 23] dot(p0, p1),
-        lhs_contracting_dims={1}, rhs_contracting_dims={0}
-    }
-  )"));
-
-  std::string s;
-  std::stringstream ss(s);
-
-  SymbolicTile first_operand_tile =
-      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[0].begin())
-          .value();
-  SymbolicTile second_operand_tile =
-      SymbolicTile::FromIndexingMap(*input_indexing.indexing_maps[1].begin())
-          .value();
-
-  ss << first_operand_tile;
-  EXPECT_THAT(
-      ss.str(),
-      AllOf(HasSubstr("()[offset0, size0, stride0, offset1, size1, stride1] "
-                      "-> (offset0, 0)"),
-            HasSubstr("()[offset0, size0, stride0, offset1, size1, stride1] "
-                      "-> (size0, 19)"),
-            HasSubstr("()[offset0, size0, stride0, offset1, size1, stride1] "
-                      "-> (stride0, 1)")));
-
-  // Clear the stream and load the second map.
-  ss.str("");
-  ss << second_operand_tile;
-  EXPECT_THAT(
-      ss.str(),
-      AllOf(HasSubstr("()[offset0, size0, stride0, offset1, size1, stride1] "
-                      "-> (0, offset1)"),
-            HasSubstr("()[offset0, size0, stride0, offset1, size1, stride1] "
-                      "-> (19, size1)"),
-            HasSubstr("()[offset0, size0, stride0, offset1, size1, stride1] "
-                      "-> (1, stride1)")));
-}
-
-}  // namespace
-}  // namespace gpu
-}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/move_copy_to_users.cc b/third_party/xla/xla/service/gpu/move_copy_to_users.cc
index b6996499f264b0..51ffbed0ec0138 100644
--- a/third_party/xla/xla/service/gpu/move_copy_to_users.cc
+++ b/third_party/xla/xla/service/gpu/move_copy_to_users.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "absl/status/status.h"
 #include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "xla/hlo/ir/dfs_hlo_visitor_with_default.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
@@ -70,6 +71,26 @@ class MoveCopyToUsersVisitor : public DfsHloRewriteVisitor {
     return absl::OkStatus();
   }
 
+  // Turn copy->dynamic-slice into dynamic-slice->copy, as dynamic-slice is
+  // layout-preserving.
+  absl::Status HandleDynamicSlice(HloInstruction* hlo) override {
+    HloInstruction* operand = hlo->mutable_operand(0);
+    if (operand->opcode() == HloOpcode::kCopy) {
+      HloInstruction* copied = operand->mutable_operand(0);
+      TF_ASSIGN_OR_RETURN(
+          HloInstruction * earlier_slice,
+          MakeDynamicSliceHlo(
+              copied,
+              absl::Span<HloInstruction* const>(hlo->operands()).subspan(1),
+              hlo->dynamic_slice_sizes(), &hlo->metadata()));
+      *earlier_slice->mutable_shape()->mutable_layout() =
+          copied->shape().layout();
+      HloInstruction* later_copy = MakeCopyHlo(earlier_slice, hlo->shape());
+      TF_RETURN_IF_ERROR(ReplaceInstruction(hlo, later_copy));
+    }
+    return absl::OkStatus();
+  }
+
   // Turn copy->reduce_window into reduce_window->copy, as reduce_window is
   // layout-preserving.
   absl::Status HandleReduceWindow(HloInstruction* hlo) override {
diff --git a/third_party/xla/xla/service/gpu/move_copy_to_users_test.cc b/third_party/xla/xla/service/gpu/move_copy_to_users_test.cc
index 72f45b806fc9d5..718847168f901f 100644
--- a/third_party/xla/xla/service/gpu/move_copy_to_users_test.cc
+++ b/third_party/xla/xla/service/gpu/move_copy_to_users_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <optional>
 
 #include "absl/strings/string_view.h"
+#include "xla/service/layout_assignment.h"
 #include "xla/tests/hlo_test_base.h"
 #include "tsl/platform/test.h"
 
@@ -26,6 +27,10 @@ namespace {
 
 class MoveCopyToUsersTest : public HloTestBase {
  public:
+  MoveCopyToUsersTest()
+      : HloTestBase(/*verifier_layout_sensitive=*/true,
+                    /*allow_mixed_precision_in_hlo_verifier=*/true,
+                    LayoutAssignment::InstructionCanChangeLayout) {}
   void CheckMoveCopyToUsers(absl::string_view hlo,
                             std::optional<absl::string_view> expected) {
     RunAndFilecheckHloRewrite(hlo, MoveCopyToUsers{}, expected);
@@ -112,13 +117,34 @@ HloModule module
 ENTRY main {
   input = f32[1,17,9,9]{3,2,1,0} parameter(0)
   copy = f32[1,17,9,9]{1,3,2,0} copy(input)
-  ROOT converted = f32[1,4,6,6] slice(copy), slice={[0:1],[0:4],[0:6],[0:6]}
+  ROOT slice = f32[1,4,6,6]{1,3,2,0} slice(copy), slice={[0:1],[0:4],[0:6],[0:6]}
 }
 )";
 
   CheckMoveCopyToUsers(hlo, R"(
 // CHECK: [[slice_0:%[^ ]+]] = f32[1,4,6,6]{3,2,1,0} slice([[input_1:%[^ ]+]]), slice={[0:1], [0:4], [0:6], [0:6]}
-// CHECK-NEXT: ROOT [[copy_1_2:%[^ ]+]] = f32[1,4,6,6]{3,2,1,0} copy([[slice_0]])
+// CHECK-NEXT: ROOT [[copy_1_2:%[^ ]+]] = f32[1,4,6,6]{1,3,2,0} copy([[slice_0]])
+)");
+}
+
+TEST_F(MoveCopyToUsersTest, DynamicSlice) {
+  const char* hlo = R"(
+HloModule module
+
+ENTRY main {
+  input = f32[1,17,9,9]{3,2,1,0} parameter(0)
+  copy = f32[1,17,9,9]{1,3,2,0} copy(input)
+  s0 = s32[] parameter(1)
+  s1 = s32[] parameter(2)
+  s2 = s32[] parameter(3)
+  s3 = s32[] parameter(4)
+  ROOT ds = f32[1,4,6,6]{1,3,2,0} dynamic-slice(copy, s0, s1, s2, s3), dynamic_slice_sizes={1,4,6,6}
+}
+)";
+
+  CheckMoveCopyToUsers(hlo, R"(
+// CHECK: [[ds:%[^ ]+]] = f32[1,4,6,6]{3,2,1,0} dynamic-slice({{.*}}), dynamic_slice_sizes={1,4,6,6}
+// CHECK-NEXT: ROOT {{.*}} = f32[1,4,6,6]{1,3,2,0} copy([[ds]])
 )");
 }
 
diff --git a/third_party/xla/xla/service/gpu/multi_output_fusion_test.cc b/third_party/xla/xla/service/gpu/multi_output_fusion_test.cc
index a36f38bafcc3fb..c28c953a58bdb8 100644
--- a/third_party/xla/xla/service/gpu/multi_output_fusion_test.cc
+++ b/third_party/xla/xla/service/gpu/multi_output_fusion_test.cc
@@ -19,8 +19,6 @@ limitations under the License.
 #include <optional>
 #include <vector>
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_computation.h"
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler.cc b/third_party/xla/xla/service/gpu/nvptx_compiler.cc
index 299e5ca02f4391..8620bbbb8c0dbb 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler.cc
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include <fstream>
 #include <iterator>
 #include <memory>
-#include <optional>
 #include <string>
 #include <tuple>
 #include <utility>
@@ -101,6 +100,7 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_driver.h"
 #include "xla/stream_executor/gpu/gpu_executor.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "xla/tsl/util/env_var.h"
 #include "xla/util.h"
 #include "xla/xla.pb.h"
 #include "tsl/platform/env.h"
@@ -110,7 +110,6 @@ limitations under the License.
 #include "tsl/platform/statusor.h"
 #include "tsl/platform/threadpool.h"
 #include "tsl/profiler/lib/traceme.h"
-#include "tsl/util/env_var.h"
 
 namespace xla {
 namespace gpu {
@@ -267,8 +266,10 @@ absl::Status NVPTXCompiler::OptimizeHloPostLayoutAssignment(
   }
 
   HloPassPipeline pre_pipeline("nvptx post-layout_assignment part 1");
-  // Rewrite normalization patterns into cuDNN Custom Calls.
-  pre_pipeline.AddPass<CudnnNormRewriter>(cuda_compute_capability);
+  if (hlo_module->config().debug_options().xla_gpu_enable_cudnn_layer_norm()) {
+    // Rewrite normalization patterns into cuDNN Custom Calls.
+    pre_pipeline.AddPass<CudnnNormRewriter>(cuda_compute_capability);
+  }
 
   pre_pipeline.AddPass<DotDimensionMerger>();
 
@@ -324,7 +325,6 @@ absl::Status NVPTXCompiler::AddConvAndGemmAutotuningPasses(
     pipeline->AddPass<GpuConvAlgorithmPicker>(autotune_config);
   }
   pipeline->AddPass<GemmAlgorithmPicker>(autotune_config);
-  pipeline->AddPass<CuDnnFusionCompiler>(autotune_config);
   return absl::OkStatus();
 }
 
@@ -342,6 +342,14 @@ absl::Status NVPTXCompiler::AddCustomKernelReplacementPasses(
   }
   return absl::OkStatus();
 }
+
+absl::Status NVPTXCompiler::RunCudnnFusionCompilerPass(
+    HloModule* module, se::StreamExecutor* stream_exec,
+    Thunk::BinaryMap* dnn_compiled_graphs) {
+  CuDnnFusionCompiler cudnn_compiler(*stream_exec, *dnn_compiled_graphs);
+  return cudnn_compiler.Run(module).status();
+}
+
 namespace {
 // Try to load ptx from files defined in the FLAGS. If successful, return true.
 bool MaybeLoadPtxFromFile(const HloModuleConfig module_config,
@@ -672,8 +680,7 @@ NVPTXCompiler::CompileGpuAsmOrGetCachedResult(
   return cache_value->maybe_cubin;
 }
 
-static std::optional<std::array<int64_t, 3>> GetNvLinkVersion(
-    const std::string& preferred_cuda_dir) {
+static bool IsNvlinkEnabled() {
   const bool use_nvlink_by_default =
 #ifdef TF_DISABLE_NVLINK_BY_DEFAULT
       false;
@@ -684,24 +691,7 @@ static std::optional<std::array<int64_t, 3>> GetNvLinkVersion(
   TF_CHECK_OK(tsl::ReadBoolFromEnvVar("TF_USE_NVLINK_FOR_PARALLEL_COMPILATION",
                                       /*default_val=*/
                                       use_nvlink_by_default, &use_nvlink));
-
-  if (!use_nvlink) {
-    return std::nullopt;
-  }
-
-  // Make sure nvlink exists and is executable.
-  absl::StatusOr<std::string> bin_path =
-      se::FindCudaExecutable("nvlink", preferred_cuda_dir);
-
-  if (!bin_path.ok()) {
-    return std::nullopt;
-  }
-
-  auto version = se::GetToolVersion(bin_path.value());
-  if (!version.ok()) {
-    return std::nullopt;
-  }
-  return *version;
+  return use_nvlink;
 }
 
 absl::StatusOr<NVPTXCompiler::LinkingMethod> ChooseLinkingMethodImpl(
@@ -710,16 +700,9 @@ absl::StatusOr<NVPTXCompiler::LinkingMethod> ChooseLinkingMethodImpl(
   TF_ASSIGN_OR_RETURN(auto ptxas_version_tuple,
                       se::GetAsmCompilerVersion(preferred_cuda_dir));
 
-  // ptxas versions prior to 11.8 are not supported anymore. We check this here,
-  // since we are fetching the ptxas version anyway. Catching the error
-  // elsewhere might introduce unnecessary overhead.
-  if (ptxas_version_tuple < std::array<int64_t, 3>{11, 8, 0}) {
-    return absl::InternalError("XLA requires ptxas version 11.8 or higher");
-  }
-
-  std::optional<std::array<int64_t, 3>> nvlink_version =
-      GetNvLinkVersion(preferred_cuda_dir);
-  if (nvlink_version && *nvlink_version >= ptxas_version_tuple) {
+  auto nvlink_version = stream_executor::GetNvLinkVersion(preferred_cuda_dir);
+  if (IsNvlinkEnabled() && nvlink_version.ok() &&
+      nvlink_version.value() >= ptxas_version_tuple) {
     return LinkingMethod::kNvLink;
   }
 
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler.h b/third_party/xla/xla/service/gpu/nvptx_compiler.h
index a0ca2cf7c99b2c..3d7a770282b134 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler.h
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler.h
@@ -79,6 +79,10 @@ class NVPTXCompiler : public GpuCompiler {
   absl::Status AddCustomKernelReplacementPasses(
       HloPassPipeline* pipeline, const DebugOptions& debug_options) override;
 
+  absl::Status RunCudnnFusionCompilerPass(
+      HloModule* module, se::StreamExecutor* stream_exec,
+      Thunk::BinaryMap* dnn_compiled_graphs) override;
+
   HloDataflowAnalysis::CanShareBuffer GetCanShareBuffer() const override;
 
   absl::StatusOr<BackendCompileResult> CompileTargetBinary(
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc b/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc
index e6ab31b84033cf..f247582d38a5a3 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "xla/hlo/utils/hlo_query.h"
 #include "xla/service/backend.h"
 #include "xla/service/buffer_assignment.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/util.h"
 #include "xla/xla.pb.h"
@@ -132,7 +133,7 @@ ENTRY entry {
 
 TEST_F(NVPTXCompilerTestTriton,
        DotDimensionAreSortedBeforePaddingForCublasEnablingTritonFusion) {
-  MatchOptimizedHlo(R"(
+  const absl::string_view hlo_string = R"(
 ENTRY e {
  p0 = f16[11,22,33,44] parameter(0)
  p1 = s8[11,22,33,44] parameter(1)
@@ -140,13 +141,25 @@ ENTRY e {
  ROOT d = f16[11,22,44,44] dot(p0, p1c),
   lhs_batch_dims={0,1}, lhs_contracting_dims={2},
   rhs_batch_dims={0,1}, rhs_contracting_dims={2}
-})",
-                    R"(
+})";
+
+  se::CudaComputeCapability cc = backend()
+                                     .default_stream_executor()
+                                     ->GetDeviceDescription()
+                                     .cuda_compute_capability();
+
+  if (cc.IsAtLeastAmpere()) {
+    MatchOptimizedHlo(hlo_string, R"(
 ; CHECK: ENTRY
 ; CHECK-NEXT: parameter
 ; CHECK-NEXT: parameter
 ; CHECK-NEXT: __triton_gemm
-  )");
+    )");
+  } else {
+    MatchOptimizedHlo(hlo_string, R"(
+; CHECK-NOT: triton
+    )");
+  }
 }
 
 TEST_F(NVPTXCompilerTest, RemovesUnnecessaryCopyInPostSchedulingPipelines) {
diff --git a/third_party/xla/xla/service/gpu/priority_fusion.cc b/third_party/xla/xla/service/gpu/priority_fusion.cc
index 21b353bf6850e5..c96c65319b259e 100644
--- a/third_party/xla/xla/service/gpu/priority_fusion.cc
+++ b/third_party/xla/xla/service/gpu/priority_fusion.cc
@@ -154,12 +154,36 @@ class GpuPriorityFusionQueue {
       }
       instructions.push_back(instruction);
     }
+
+    ComputeAndSetPriorities(instructions);
+  }
+
+  void ComputeAndSetPriorities(
+      const std::vector<HloInstruction*>& instructions) {
     std::vector<Priority> priorities = ComputePriorities(instructions);
 
     for (auto [instruction, priority] : llvm::zip(instructions, priorities)) {
-      auto emplace_result = producer_priority_queue_.emplace(
-          std::make_pair(priority, instruction->unique_id()), instruction);
-      CHECK(emplace_result.second);
+      auto key = std::make_pair(priority, instruction->unique_id());
+
+      // Remove instruction with the old priority from the queue.
+      auto reverse_it = reverse_map_.find(instruction);
+      if (reverse_it != reverse_map_.end()) {
+        const PriorityQueue::iterator& queue_it = reverse_it->second;
+        // Priority didn't change. Nothing to do.
+        if (key == queue_it->first) {
+          continue;
+        }
+        producer_priority_queue_.erase(queue_it);
+        reverse_map_.erase(reverse_it);
+      }
+
+      // If the priority is negative, it's not helpful to perform fusion on this
+      // instruction.
+      if (priority < 0) {
+        continue;
+      }
+
+      auto emplace_result = producer_priority_queue_.emplace(key, instruction);
       reverse_map_.emplace(instruction, emplace_result.first);
     }
   }
@@ -195,18 +219,11 @@ class GpuPriorityFusionQueue {
 
     while (!producer_priority_queue_.empty() && current_consumers_.empty()) {
       auto next_it = std::prev(producer_priority_queue_.end());
-      auto priority = next_it->first.first;
 
       current_producer_ = next_it->second;
       producer_priority_queue_.erase(next_it);
       reverse_map_.erase(current_producer_);
 
-      // If the priority is negative, it's not helpful to perform fusion on this
-      // instruction.
-      if (priority < 0) {
-        continue;
-      }
-
       current_consumers_ = current_producer_->users();
 
       if (current_producer_->opcode() == HloOpcode::kBitcast) {
@@ -229,30 +246,9 @@ class GpuPriorityFusionQueue {
       TF_CHECK_OK(cost_analysis_.RevisitInstruction(instruction));
     }
 
-    std::vector<HloInstruction*> to_update_vector{to_update_priority_.begin(),
-                                                  to_update_priority_.end()};
-    std::vector<Priority> new_priorities = ComputePriorities(to_update_vector);
+    ComputeAndSetPriorities(std::vector<HloInstruction*>{
+        to_update_priority_.begin(), to_update_priority_.end()});
 
-    for (auto [instruction, new_priority] :
-         llvm::zip(to_update_vector, new_priorities)) {
-      auto reverse_it = reverse_map_.find(instruction);
-      const auto new_key =
-          std::make_pair(new_priority, instruction->unique_id());
-      if (reverse_it != reverse_map_.end()) {
-        if (new_key == reverse_it->second->first) {
-          continue;
-        }
-        producer_priority_queue_.erase(reverse_it->second);
-      }
-      auto emplace_result =
-          producer_priority_queue_.emplace(new_key, instruction);
-      CHECK(emplace_result.second);
-      if (reverse_it != reverse_map_.end()) {
-        reverse_it->second = emplace_result.first;
-      } else {
-        reverse_map_.emplace(instruction, emplace_result.first);
-      }
-    }
     to_update_priority_.clear();
   }
 
@@ -669,7 +665,8 @@ absl::StatusOr<bool> GpuPriorityFusion::Run(
   // With this modification it will be easier to match instructions before and
   // after fusion passes, because they will have the same unique prefix. Names
   // are not used in the pipeline, but it makes debugging much easier.
-  for (auto* computation : GetFusionComputations(module, execution_threads)) {
+  for (auto* computation :
+       GetNonFusionComputations(module, execution_threads)) {
     for (auto* instruction : computation->instructions()) {
       module->SetAndUniquifyInstrName(instruction,
                                       absl::StrCat(instruction->name(), ".0"));
@@ -682,9 +679,8 @@ absl::StatusOr<bool> GpuPriorityFusion::Run(
   }
 
   int changed = false;
-  // Note: `GetFusionComputations` doesn't return the fusion computations, but
-  // the computations to be fused.
-  for (auto* computation : GetFusionComputations(module, execution_threads)) {
+  for (auto* computation :
+       GetNonFusionComputations(module, execution_threads)) {
     CHECK(!computation->IsFusionComputation());
 
     auto fusion_queue = std::make_unique<GpuPriorityFusionQueue>(
diff --git a/third_party/xla/xla/service/gpu/runtime/BUILD b/third_party/xla/xla/service/gpu/runtime/BUILD
index 2ab4122f1ab2b7..411262a925a85c 100644
--- a/third_party/xla/xla/service/gpu/runtime/BUILD
+++ b/third_party/xla/xla/service/gpu/runtime/BUILD
@@ -1,8 +1,9 @@
-load("//xla/tests:build_defs.bzl", "xla_test")
-load("//xla/service/gpu:build_defs.bzl", "get_cub_sort_kernel_types")
-load("//xla/stream_executor:build_defs.bzl", "if_gpu_is_configured")
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
+load("@local_tsl//tsl:tsl.bzl", "nvtx_headers")
 load("@local_tsl//tsl/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured")
+load("//xla/service/gpu:build_defs.bzl", "get_cub_sort_kernel_types")
+load("//xla/stream_executor:build_defs.bzl", "if_gpu_is_configured")
+load("//xla/tests:build_defs.bzl", "xla_test")
 
 load("@local_tsl//tsl:tsl.bzl", "if_cuda_or_rocm")
 
@@ -39,7 +40,7 @@ cc_library(
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/profiler/lib:nvtx_utils",
         "@local_tsl//tsl/profiler/lib:scoped_annotation",
-    ],
+    ] + if_cuda_is_configured(nvtx_headers()),
 )
 
 #===-------------------------------------------------------------------------------------------===//
@@ -72,11 +73,11 @@ cc_library(
     ]),
     deps = [
         ":annotation",
-        ":cudnn_thunk",
         ":custom_call_thunk",
         ":nccl_all_gather_thunk",
         ":nccl_all_reduce_thunk",
         ":nccl_collective_broadcast_thunk",
+        ":nccl_collective_thunk",
         "//xla:executable_run_options",
         "//xla:status",
         "//xla:types",
@@ -97,10 +98,9 @@ cc_library(
         "//xla/service/gpu:matmul_utils",
         "//xla/service/gpu:nccl_api",
         "//xla/service/gpu:nccl_clique_key",
-        "//xla/service/gpu:nccl_collective_thunks",
         "//xla/service/gpu:stream_executor_util",
-        "//xla/service/gpu:thunk",
         "//xla/service/gpu/kernels:custom_kernel",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "//xla/stream_executor/gpu:gpu_stream_header",
         "//xla/stream_executor/gpu:gpu_types_header",
@@ -146,8 +146,7 @@ cc_library(
         "//xla:status",
         "//xla:statusor",
         "//xla:util",
-        "//xla/service/gpu:nccl_collective_thunks",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
@@ -170,7 +169,7 @@ xla_test(
         "//xla/service:platform_util",
         "//xla/service/gpu:buffer_allocations",
         "//xla/service/gpu:launch_dimensions",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:platform_manager",
@@ -203,7 +202,7 @@ cc_library(
         "//xla/service:buffer_assignment",
         "//xla/service/gpu:buffer_allocations",
         "//xla/service/gpu:ir_emission_utils",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "//xla/stream_executor:memory_allocation",
         "@com_google_absl//absl/base:core_headers",
@@ -244,7 +243,7 @@ xla_test(
         "//xla/service/gpu:buffer_allocations",
         "//xla/service/gpu:launch_dimensions",
         "//xla/service/gpu:matmul_utils",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:platform_manager",
@@ -270,7 +269,7 @@ cc_library(
         "//xla/service/gpu:buffer_allocations",
         "//xla/service/gpu:cusolver_context",
         "//xla/service/gpu:make_batch_pointers",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla:types",
         "//xla:util",
         "//xla:xla_data_proto_cc",
@@ -300,7 +299,7 @@ cc_library(
         "//xla:statusor",
         "//xla/service:buffer_assignment",  # build_cleaner: keep
         "//xla/service/gpu:buffer_allocations",  # build_cleaner: keep
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -343,7 +342,7 @@ xla_test(
         "//xla/service/gpu:buffer_allocations",
         "//xla/service/gpu:launch_dimensions",
         "//xla/service/gpu:matmul_utils",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:platform_manager",
@@ -372,8 +371,8 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service:buffer_assignment",
         "//xla/service/gpu:buffer_allocations",
-        "//xla/service/gpu:thunk",
         "//xla/service/gpu:variant_visitor",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "//xla/stream_executor:memory_allocation",
         "@com_google_absl//absl/base:core_headers",
@@ -399,7 +398,7 @@ cc_library(
         "//xla/service:buffer_assignment",
         "//xla/service/gpu:gpu_conv_runner",
         "//xla/service/gpu:stream_executor_util",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -417,7 +416,7 @@ cc_library(
     deps = [
         "//xla:status",
         "//xla/service:buffer_assignment",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/status",
         "@llvm-project//mlir:IR",
@@ -439,7 +438,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "//xla/service:buffer_assignment",
         "//xla/service/gpu:buffer_allocations",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor:device_memory",
         "//xla:shape_util",
         "//xla:util",
@@ -468,14 +467,18 @@ cc_library(
         "//xla/service:custom_call_status",
         "//xla/service:custom_call_status_internal",
         "//xla/service:executable",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor/gpu:gpu_stream_header",
         "//xla/stream_executor/gpu:gpu_types_header",
-        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/strings:string_view",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Support",
+        "@local_tsl//tsl/platform:errors",
     ],
 )
 
@@ -491,7 +494,7 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service:buffer_assignment",
         "//xla/service/gpu:buffer_allocations",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -515,7 +518,7 @@ cc_library(
         "//xla:xla_data_proto_cc",
         "//xla/service:buffer_assignment",
         "//xla/service/gpu:gpu_fused_mha_runner",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -532,7 +535,7 @@ cc_library(
         "//xla:status",
         "//xla/service:buffer_assignment",
         "//xla/service/gpu:matmul_utils",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor:device_memory",
         "@com_google_absl//absl/status",
         "@local_tsl//tsl/platform:logging",
@@ -552,7 +555,7 @@ cc_library(
         "@com_google_absl//absl/synchronization",
         "//xla/service:buffer_assignment",
         "//xla/service/gpu:matmul_utils",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla:status",
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor",
@@ -571,7 +574,7 @@ cc_library(
         "//xla:util",
         "//xla/service/gpu:buffer_allocations",
         "//xla/service/gpu:io_feed_manager",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/status",
     ],
@@ -589,8 +592,8 @@ cc_library(
         "//xla/service/gpu:kernel_arguments",
         "//xla/service/gpu:launch_dimensions",
         "//xla/service/gpu:stream_executor_util",
-        "//xla/service/gpu:thunk",
         "//xla/service/gpu/kernels:custom_kernel",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -613,7 +616,7 @@ cc_library(
     deps = [
         "//xla:status",
         "//xla/service:buffer_assignment",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/status",
     ],
@@ -624,13 +627,13 @@ cc_library(
     srcs = ["nccl_all_gather_thunk.cc"],
     hdrs = ["nccl_all_gather_thunk.h"],
     deps = [
+        ":nccl_collective_thunk",
         "//xla:shape_util",
         "//xla/hlo/ir:hlo",
         "//xla/service:collective_ops_utils",
         "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu:nccl_api",
-        "//xla/service/gpu:nccl_collective_thunks",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings:str_format",
@@ -646,14 +649,14 @@ cc_library(
     srcs = ["nccl_all_reduce_thunk.cc"],
     hdrs = ["nccl_all_reduce_thunk.h"],
     deps = [
+        ":nccl_collective_thunk",
         "//xla:status_macros",
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/service:collective_ops_utils",
         "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu:nccl_api",
-        "//xla/service/gpu:nccl_collective_thunks",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
@@ -670,14 +673,14 @@ cc_library(
     srcs = ["nccl_all_to_all_thunk.cc"],
     hdrs = ["nccl_all_to_all_thunk.h"],
     deps = [
+        ":nccl_collective_thunk",
         "//xla:shape_util",
         "//xla:status_macros",
         "//xla/hlo/ir:hlo",
         "//xla/service:collective_ops_utils",
         "//xla/service/gpu:ir_emission_utils",
         "//xla/service/gpu:nccl_api",
-        "//xla/service/gpu:nccl_collective_thunks",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
@@ -693,13 +696,13 @@ cc_library(
     srcs = ["nccl_collective_broadcast_thunk.cc"],
     hdrs = ["nccl_collective_broadcast_thunk.h"],
     deps = [
+        ":nccl_collective_thunk",
         "//xla:status",
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/service:collective_ops_utils",
         "//xla/service/gpu:nccl_api",
-        "//xla/service/gpu:nccl_collective_thunks",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:errors",
@@ -712,6 +715,7 @@ cc_library(
     srcs = ["nccl_collective_permute_thunk.cc"],
     hdrs = ["nccl_collective_permute_thunk.h"],
     deps = [
+        ":nccl_collective_thunk",
         ":nccl_p2p_thunk_common",
         "//xla:status_macros",
         "//xla:xla_data_proto_cc",
@@ -721,8 +725,7 @@ cc_library(
         "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu:ir_emission_utils",
         "//xla/service/gpu:nccl_api",
-        "//xla/service/gpu:nccl_collective_thunks",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "//xla/translate/mhlo_to_hlo:attribute_exporter",
         "@com_google_absl//absl/algorithm:container",
@@ -735,11 +738,66 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "nccl_collective_thunk",
+    srcs = ["nccl_collective_thunk.cc"],
+    hdrs = ["nccl_collective_thunk.h"],
+    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
+        "TENSORFLOW=1",
+    ]),
+    deps = [
+        "//xla:debug_options_flags",
+        "//xla:shape_util",
+        "//xla:status",
+        "//xla:util",
+        "//xla:xla_data_proto_cc",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:buffer_assignment",
+        "//xla/service:collective_ops_utils",
+        "//xla/service:computation_placer",
+        "//xla/service:global_device_id",
+        "//xla/service:rendezvous",
+        "//xla/service/gpu:buffer_allocations",
+        "//xla/service/gpu:ir_emission_utils",
+        "//xla/service/gpu:nccl_api",
+        "//xla/service/gpu:nccl_clique",
+        "//xla/service/gpu:nccl_clique_key",
+        "//xla/service/gpu/runtime:thunk",
+        "//xla/service/llvm_ir:llvm_util",
+        "//xla/stream_executor",
+        "//xla/stream_executor:stream_executor_headers",
+        "//xla/stream_executor/gpu:gpu_activation_header",
+        "//xla/stream_executor/gpu:gpu_driver_header",
+        "//xla/stream_executor/gpu:gpu_stream",
+        "//xla/stream_executor/gpu:gpu_types_header",
+        "//xla/translate/mhlo_to_hlo:attribute_exporter",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/container:node_hash_map",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        "@llvm-project//mlir:IR",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:statusor",
+    ] + if_cuda_is_configured([
+        "@local_config_nccl//:nccl",
+    ]) + if_rocm_is_configured([
+        "@local_config_rocm//rocm:rccl",
+    ]),
+)
+
 cc_library(
     name = "nccl_p2p_thunk_common",
     srcs = ["nccl_p2p_thunk_common.cc"],
     hdrs = ["nccl_p2p_thunk_common.h"],
     deps = [
+        ":nccl_collective_thunk",
         "//xla:shape_util",
         "//xla:status_macros",
         "//xla:xla_data_proto_cc",
@@ -747,7 +805,6 @@ cc_library(
         "//xla/service:collective_ops_utils",
         "//xla/service:hlo_parser",
         "//xla/service/gpu:nccl_clique_key",
-        "//xla/service/gpu:nccl_collective_thunks",
         "//xla/stream_executor",
         "//xla/stream_executor:stream_executor_headers",
         "@com_google_absl//absl/base:core_headers",
@@ -766,6 +823,7 @@ cc_library(
     srcs = ["nccl_recv_thunk.cc"],
     hdrs = ["nccl_recv_thunk.h"],
     deps = [
+        ":nccl_collective_thunk",
         ":nccl_p2p_thunk_common",
         "//xla:status_macros",
         "//xla/hlo/ir:hlo",
@@ -774,8 +832,7 @@ cc_library(
         "//xla/service:global_device_id",
         "//xla/service/gpu:nccl_api",
         "//xla/service/gpu:nccl_clique_key",
-        "//xla/service/gpu:nccl_collective_thunks",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
@@ -791,6 +848,7 @@ cc_library(
     srcs = ["nccl_send_thunk.cc"],
     hdrs = ["nccl_send_thunk.h"],
     deps = [
+        ":nccl_collective_thunk",
         ":nccl_p2p_thunk_common",
         "//xla:status_macros",
         "//xla/hlo/ir:hlo",
@@ -799,8 +857,7 @@ cc_library(
         "//xla/service:global_device_id",
         "//xla/service/gpu:nccl_api",
         "//xla/service/gpu:nccl_clique_key",
-        "//xla/service/gpu:nccl_collective_thunks",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
@@ -820,7 +877,7 @@ cc_library(
         "//xla:xla_data_proto_cc",
         "//xla/service:buffer_assignment",
         "//xla/service/gpu:gpu_norm_runner",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -836,7 +893,7 @@ cc_library(
     deps = [
         "//xla:util",
         "//xla/service/gpu:io_feed_manager",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/status",
     ],
@@ -849,7 +906,7 @@ cc_library(
     deps = [
         "//xla/service:buffer_assignment",
         "//xla/service:global_device_id",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "@com_google_absl//absl/status",
         "@local_tsl//tsl/platform:statusor",
     ],
@@ -867,7 +924,7 @@ cc_library(
         "//xla:status",
         "//xla/hlo/ir:hlo",
         "//xla/service/gpu:buffer_allocations",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:errors",
@@ -886,7 +943,7 @@ cc_library(
         "//xla:xla_data_proto_cc",
         "//xla/service:buffer_assignment",
         "//xla/service:global_device_id",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -901,6 +958,39 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "thunk",
+    srcs = ["thunk.cc"],
+    hdrs = ["thunk.h"],
+    deps = [
+        "//xla:executable_run_options",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:buffer_assignment",
+        "//xla/service:executable",
+        "//xla/service:global_device_id",
+        "//xla/service/gpu:backend_configs_cc",
+        "//xla/service/gpu:buffer_allocations",
+        "//xla/service/gpu:gpu_executable_run_options",
+        "//xla/service/gpu:nccl_api",
+        "//xla/service/gpu:nccl_clique",
+        "//xla/service/gpu:nccl_clique_key",
+        "//xla/stream_executor",
+        "//xla/translate/mhlo_to_hlo:location_exporter",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_absl//absl/types:span",
+        "@llvm-project//mlir:IR",
+        "@local_tsl//tsl/lib/gtl:int_type",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
 cc_library(
     name = "triangular_solve_thunk",
     srcs = if_gpu_is_configured(["triangular_solve_thunk.cc"]),
@@ -915,7 +1005,7 @@ cc_library(
         "//xla/service:buffer_assignment",
         "//xla/service/gpu:buffer_allocations",
         "//xla/service/gpu:make_batch_pointers",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor/gpu:gpu_asm_opts",
@@ -935,7 +1025,7 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service:buffer_assignment",
         "//xla/service/gpu:buffer_allocations",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -954,7 +1044,7 @@ cc_library(
     hdrs = ["wait_for_streams_thunk.h"],
     deps = [
         "//xla/service:global_device_id",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:errors",
@@ -967,15 +1057,13 @@ cc_library(
     srcs = ["cudnn_thunk.cc"],
     hdrs = ["cudnn_thunk.h"],
     deps = [
-        "//xla:status",
         "//xla/service:buffer_assignment",
         "//xla/service/gpu:kernel_arguments",
-        "//xla/service/gpu:thunk",
+        "//xla/service/gpu/runtime:thunk",
         "//xla/stream_executor",
-        "//xla/stream_executor:device_memory",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/platform:logging",
+        "@local_tsl//tsl/platform:errors",
     ],
 )
diff --git a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.cc b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.cc
index 07ae9ac30f67c8..b24a4f2b7cc3b7 100644
--- a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "xla/service/gpu/runtime/address_computation_thunk.h"
 
+#include <algorithm>
 #include <cstdint>
 #include <memory>
 #include <optional>
@@ -29,7 +30,7 @@ limitations under the License.
 #include "xla/service/gpu/buffer_allocations.h"
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/runtime/sequential_thunk.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/status.h"
@@ -44,35 +45,48 @@ namespace gpu {
 
 AddressComputationThunk::AddressComputationThunk(
     ThunkInfo thunk_info, std::unique_ptr<ThunkSequence> embedded_thunk,
-    std::vector<std::optional<const BufferAllocation::Slice>> operands,
-    std::vector<std::optional<const BufferAllocation::Slice>> results,
-    std::vector<std::optional<const BufferAllocation::Slice>>
+    std::vector<std::optional<const BufferAllocation::Slice>> arguments,
+    std::vector<std::unique_ptr<BufferAllocation>> fake_allocations,
+    std::vector<std::optional<std::vector<BufferAllocation::Slice>>>
         offset_buffer_indices,
-    std::vector<std::optional<const Shape>> orig_shapes,
-    std::vector<std::optional<const Shape>> sliced_shapes)
+    std::vector<std::optional<Shape>> orig_shapes,
+    std::vector<std::optional<Shape>> sliced_shapes,
+    std::vector<std::optional<uint64_t>> offset_byte_sizes)
     : Thunk(Kind::kAddressComputation, thunk_info),
       embedded_thunk_(std::make_unique<SequentialThunk>(
           ThunkInfo(thunk_info.op), std::move(*embedded_thunk))),
-      embedded_thunk_operands_(std::move(operands)),
-      embedded_thunk_results_(std::move(results)),
+      embedded_thunk_arguments_(std::move(arguments)),
+      fake_allocations_(std::move(fake_allocations)),
       offset_buffer_indices_(std::move(offset_buffer_indices)),
       orig_shapes_(std::move(orig_shapes)),
-      sliced_shapes_(std::move(sliced_shapes)) {}
+      sliced_shapes_(std::move(sliced_shapes)),
+      offset_byte_sizes_(std::move(offset_byte_sizes)) {}
 
 absl::Status AddressComputationThunk::Prepare(
     const PrepareParams& params, ResourceRequests& resource_requests) {
-  auto num_operands = embedded_thunk_operands_.size();
-  TF_RET_CHECK(num_operands == offset_buffer_indices_.size());
-  TF_RET_CHECK(num_operands == orig_shapes_.size());
-  TF_RET_CHECK(num_operands == sliced_shapes_.size());
-  for (unsigned i = 0; i < num_operands; ++i) {
-    if (sliced_shapes_[i].has_value()) {
-      TF_RET_CHECK(embedded_thunk_operands_[i].has_value());
-      TF_RET_CHECK(offset_buffer_indices_[i].has_value());
-      TF_RET_CHECK(sliced_shapes_[i]->IsArray());
-      TF_RET_CHECK(orig_shapes_[i].has_value() && orig_shapes_[i]->IsArray());
+  auto num_arguments = embedded_thunk_arguments_.size();
+  TF_RET_CHECK(num_arguments == offset_buffer_indices_.size());
+  TF_RET_CHECK(num_arguments == orig_shapes_.size());
+  TF_RET_CHECK(num_arguments == sliced_shapes_.size());
+  TF_RET_CHECK(num_arguments == offset_byte_sizes_.size());
+  for (auto [argument, offset_slice, orig_shape, sliced_shape,
+             offset_byte_size] :
+       llvm::zip(embedded_thunk_arguments_, offset_buffer_indices_,
+                 orig_shapes_, sliced_shapes_, offset_byte_sizes_)) {
+    if (offset_slice.has_value()) {
+      TF_RET_CHECK(argument.has_value());
+      TF_RET_CHECK(orig_shape.has_value());
+      TF_RET_CHECK(sliced_shape.has_value());
+      TF_RET_CHECK(offset_byte_size.has_value());
+
+      TF_RET_CHECK(orig_shape->IsArray());
+      TF_RET_CHECK(sliced_shape->IsArray());
+
+      TF_RET_CHECK(offset_slice->size() == orig_shape->rank());
+      TF_RET_CHECK(sliced_shape->rank() == orig_shape->rank());
     }
   }
+
   TF_RETURN_IF_ERROR(embedded_thunk_->Prepare(params, resource_requests));
   return absl::OkStatus();
 }
@@ -81,15 +95,16 @@ absl::Status AddressComputationThunk::Initialize(
     const InitializeParams& params) {
   TF_RETURN_IF_ERROR(embedded_thunk_->Initialize(params));
 
-  unsigned num_offsets = 0;
+  unsigned offset_count = 0;
   for (auto maybe_shape : sliced_shapes_) {
-    num_offsets += (maybe_shape == std::nullopt) ? 1 : maybe_shape->rank();
+    offset_count += (maybe_shape == std::nullopt) ? 1 : maybe_shape->rank();
   }
+
   absl::MutexLock lock(&mutex_);
   if (auto it = offsets_.find(params.executor); it == offsets_.end()) {
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<se::MemoryAllocation> allocation,
-        params.executor->HostMemoryAllocate(num_offsets * sizeof(int64_t)));
+        params.executor->HostMemoryAllocate(offset_count * sizeof(int64_t)));
     offsets_.emplace(params.executor, std::move(allocation));
   }
 
@@ -99,6 +114,9 @@ absl::Status AddressComputationThunk::Initialize(
 absl::Status AddressComputationThunk::ExecuteOnStream(
     const ExecuteParams& params) {
   auto& stream = *params.stream;
+  const BufferAllocations& orig_allocations = *params.buffer_allocations;
+  std::vector<se::DeviceMemoryBase> new_buffers(
+      embedded_thunk_arguments_.size(), se::DeviceMemoryBase());
 
   // Get memory allocation for copying offsets from device.
   int64_t* offsets_base = [&] {
@@ -106,64 +124,70 @@ absl::Status AddressComputationThunk::ExecuteOnStream(
     return reinterpret_cast<int64_t*>(offsets_.at(stream.parent())->opaque());
   }();
 
-  std::vector<se::DeviceMemoryBase> new_buffers;
-  const BufferAllocations& orig_allocations = *params.buffer_allocations;
-  for (unsigned i = 0; i < offset_buffer_indices_.size(); ++i) {
-    if (embedded_thunk_operands_[i] == std::nullopt) {
-      new_buffers.push_back(se::DeviceMemoryBase());
+  for (auto [argument_idx, values] : llvm::enumerate(
+           llvm::zip(embedded_thunk_arguments_, offset_buffer_indices_,
+                     orig_shapes_, sliced_shapes_, offset_byte_sizes_))) {
+    auto [argument_slice, offset_slice, orig_shape, sliced_shape,
+          offset_byte_size] = values;
+
+    if (argument_slice == std::nullopt) {
       continue;
     }
 
-    se::DeviceMemoryBase orig_operand =
-        orig_allocations.GetDeviceAddress(*embedded_thunk_operands_[i]);
-    if (offset_buffer_indices_[i] == std::nullopt) {
-      new_buffers.push_back(orig_operand);
+    // `orig_argument` will contain the original offset for slice
+    // `argument_slice` within `orig_allocations`
+    se::DeviceMemoryBase orig_argument =
+        orig_allocations.GetDeviceAddress(*argument_slice);
+
+    if (offset_slice == std::nullopt) {
+      new_buffers[argument_idx] = orig_argument;
       continue;
     }
 
-    se::DeviceMemoryBase offset_src =
-        orig_allocations.GetDeviceAddress(*offset_buffer_indices_[i]);
-
-    // Copy the ith offset from device to host.
-    const Shape& src_shape = *orig_shapes_[i];
-    const Shape& dst_shape = *sliced_shapes_[i];
-    int64_t* offset_dst = &offsets_base[i];
-    TF_RETURN_IF_ERROR(stream.Memcpy(offset_dst, offset_src,
-                                     dst_shape.rank() * sizeof(int64_t)));
+    const Shape& src_shape = *orig_shape;
+    const Shape& dst_shape = *sliced_shape;
+    TF_RET_CHECK(IsContiguousSlice(src_shape, dst_shape));
 
-    if (absl::Status blocked = stream.BlockHostUntilDone(); !blocked.ok()) {
-      return absl::InternalError(absl::StrFormat(
-          "Failed to retrieve all slice offset values on stream %p: %s",
-          &stream, blocked.message()));
+    std::vector<int64_t> slice_starts;
+    slice_starts.reserve(dst_shape.rank());
+
+    // Get offset for `argument_idx`-th argument, which has `dst_shape.rank()`
+    // components.
+    for (auto [offset_idx, values] : llvm::enumerate(llvm::zip(
+             *offset_slice, src_shape.dimensions(), dst_shape.dimensions()))) {
+      auto [slice, src_dim, dst_dim] = values;
+      se::DeviceMemoryBase offset_src =
+          orig_allocations.GetDeviceAddress(slice);
+      int64_t* offset_dst = &offsets_base[argument_idx + offset_idx];
+      // Copy the `offset_idx`-th component of the offset for the
+      // `argument_idx`-th argument from device to host.
+      TF_RETURN_IF_ERROR(
+          stream.Memcpy(offset_dst, offset_src, offset_byte_size.value()));
+
+      if (absl::Status blocked = stream.BlockHostUntilDone(); !blocked.ok()) {
+        return absl::InternalError(absl::StrFormat(
+            "Failed to retrieve all slice offset values on stream %p: %s",
+            &stream, blocked.message()));
+      }
+      // Clamp start indices:
+      // start_indices[i] = min(max(start_indices[i], 0),
+      //                        operand.dimension_size[i] - size_indices[i])
+      auto start_index = std::min(std::max(*offset_dst, 0L), src_dim - dst_dim);
+      slice_starts.push_back(start_index);
     }
 
     // Compute new slice. No need to copy the content to new buffers as we can
     // reuse the original buffers since slices are contiguous.
-    TF_RET_CHECK(IsContiguousSlice(src_shape, dst_shape));
-
     int64_t new_size = ShapeUtil::ByteSizeOf(dst_shape);
-    BufferAllocation::Slice orig_slice = *embedded_thunk_operands_[i];
 
-    int64_t new_offset = orig_slice.offset();
-    std::vector<int64_t> slice_starts(offset_dst,
-                                      offset_dst + dst_shape.rank());
+    int64_t new_offset = 0;
     for (auto [start, stride] :
          llvm::zip(slice_starts, *ShapeUtil::ByteStrides(src_shape))) {
       new_offset += start * stride;
     }
 
-    new_buffers.push_back(orig_operand.GetByteSlice(new_offset, new_size));
-  }
-
-  // TODO(vuson): handle DUS too. For now just copy the results over.
-  for (auto result : embedded_thunk_results_) {
-    if (result == std::nullopt) {
-      new_buffers.push_back(se::DeviceMemoryBase());
-    } else {
-      se::DeviceMemoryBase orig_result =
-          orig_allocations.GetDeviceAddress(*result);
-      new_buffers.push_back(orig_result);
-    }
+    new_buffers[argument_idx] =
+        orig_argument.GetByteSlice(new_offset, new_size);
   }
 
   // Safe to create a local BufferAllocations here since buffers are only slices
diff --git a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.h b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.h
index abb6d89ed1f59c..8d36751b9d830d 100644
--- a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk.h
@@ -27,7 +27,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/runtime/sequential_thunk.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/status.h"
 #include "xla/stream_executor/memory_allocation.h"
 #include "xla/stream_executor/stream_executor.h"
@@ -44,12 +44,13 @@ class AddressComputationThunk : public Thunk {
  public:
   AddressComputationThunk(
       ThunkInfo thunk_info, std::unique_ptr<ThunkSequence> embedded_thunk,
-      std::vector<std::optional<const BufferAllocation::Slice>> operands,
-      std::vector<std::optional<const BufferAllocation::Slice>> results,
-      std::vector<std::optional<const BufferAllocation::Slice>>
+      std::vector<std::optional<const BufferAllocation::Slice>> arguments,
+      std::vector<std::unique_ptr<BufferAllocation>> fake_allocations_,
+      std::vector<std::optional<std::vector<BufferAllocation::Slice>>>
           offset_buffer_indices,
-      std::vector<std::optional<const Shape>> orig_shapes,
-      std::vector<std::optional<const Shape>> sliced_shapes);
+      std::vector<std::optional<Shape>> orig_shapes,
+      std::vector<std::optional<Shape>> sliced_shapes,
+      std::vector<std::optional<uint64_t>> offset_byte_sizes);
 
   AddressComputationThunk(const AddressComputationThunk&) = delete;
   AddressComputationThunk& operator=(const AddressComputationThunk&) = delete;
@@ -62,14 +63,13 @@ class AddressComputationThunk : public Thunk {
  private:
   std::unique_ptr<SequentialThunk> embedded_thunk_;
   std::vector<std::optional<const BufferAllocation::Slice>>
-      embedded_thunk_operands_;
-  std::vector<std::optional<const BufferAllocation::Slice>>
-      embedded_thunk_results_;
-  std::vector<std::optional<const BufferAllocation::Slice>>
+      embedded_thunk_arguments_;
+  std::vector<std::unique_ptr<BufferAllocation>> fake_allocations_;
+  std::vector<std::optional<std::vector<BufferAllocation::Slice>>>
       offset_buffer_indices_;
-
-  std::vector<std::optional<const Shape>> orig_shapes_;
-  std::vector<std::optional<const Shape>> sliced_shapes_;
+  std::vector<std::optional<Shape>> orig_shapes_;
+  std::vector<std::optional<Shape>> sliced_shapes_;
+  std::vector<std::optional<uint64_t>> offset_byte_sizes_;
 
   // Pinned host memory for transferring offset values from device to host.
   absl::Mutex mutex_;
diff --git a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc
index 60c2f808677324..ee6b4eee6b6164 100644
--- a/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc
+++ b/third_party/xla/xla/service/gpu/runtime/address_computation_thunk_test.cc
@@ -33,7 +33,7 @@ limitations under the License.
 #include "xla/service/gpu/matmul_utils.h"
 #include "xla/service/gpu/runtime/custom_call_thunk.h"
 #include "xla/service/gpu/runtime/gemm_thunk.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/platform_util.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/shape_util.h"
@@ -77,31 +77,46 @@ TEST(AddressComputationThunkTest, SlicedGemm) {
   int64_t lhs_length = sizeof(float) * 2 * 4;
   int64_t rhs_length = sizeof(float) * 3 * 1;
   int64_t out_length = sizeof(float) * 1 * 1;
-  int64_t lhs_offset_length = sizeof(int64_t) * 2;
+  int64_t offset_length = sizeof(int64_t);
 
   // Step 1:
   // Prepare embedded and address computation thunks.
 
   // Preparing buffer allocation slices for thunk creations.
+  std::vector<std::unique_ptr<BufferAllocation>> fake_allocations(4);
+
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/0, rhs_length, /*color=*/0));
+  BufferAllocation::Slice slice_lhs_fake(fake_allocations.back().get(), 0,
+                                         rhs_length);
+
   BufferAllocation alloc_lhs(/*index=*/0, lhs_length, /*color=*/0);
   BufferAllocation::Slice slice_lhs(&alloc_lhs, 0, lhs_length);
 
-  BufferAllocation alloc_rhs(/*index=*/1, rhs_length, /*color=*/0);
-  BufferAllocation::Slice slice_rhs(&alloc_rhs, 0, rhs_length);
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/1, rhs_length, /*color=*/0));
+  BufferAllocation::Slice slice_rhs(fake_allocations.back().get(), 0,
+                                    rhs_length);
 
-  BufferAllocation alloc_out(/*index=*/2, out_length, /*color=*/0);
-  BufferAllocation::Slice slice_out(&alloc_out, 0, out_length);
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/2, out_length, /*color=*/0));
+  BufferAllocation::Slice slice_out(fake_allocations.back().get(), 0,
+                                    out_length);
 
-  BufferAllocation alloc_workspace(/*index=*/3, 1024 * 1024, /*color=*/0);
-  BufferAllocation::Slice slice_workspace(&alloc_workspace, 0, 1024 * 1024);
+  fake_allocations.push_back(std::make_unique<BufferAllocation>(
+      /*index=*/3, 1024 * 1024, /*color=*/0));
+  BufferAllocation::Slice slice_workspace(fake_allocations.back().get(), 0,
+                                          1024 * 1024);
 
-  BufferAllocation alloc_lhs_offset(/*index=*/4, lhs_offset_length,
-                                    /*color=*/0);
-  BufferAllocation::Slice slice_lhs_offset(&alloc_lhs_offset, 0,
-                                           lhs_offset_length);
+  BufferAllocation alloc_lhs_offset_0(/*index=*/4, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_lhs_offset_0(&alloc_lhs_offset_0, 0,
+                                             offset_length);
 
-  BufferAllocation alloc_lhs_fake(/*index=*/0, rhs_length, /*color=*/0);
-  BufferAllocation::Slice slice_lhs_fake(&alloc_lhs_fake, 0, rhs_length);
+  BufferAllocation alloc_lhs_offset_1(/*index=*/5, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_lhs_offset_1(&alloc_lhs_offset_1, 0,
+                                             offset_length);
 
   // Preparing config for GEMM thunk.
   auto config =
@@ -119,12 +134,19 @@ TEST(AddressComputationThunkTest, SlicedGemm) {
       slice_out, slice_workspace, /*deterministic=*/true));
 
   // Wrapping address computation thunk around the GEMM thunk.
+  std::vector<BufferAllocation::Slice> lhs_offsets{slice_lhs_offset_0,
+                                                   slice_lhs_offset_1};
   AddressComputationThunk thunk(
       Thunk::ThunkInfo(nullptr),
-      std::make_unique<ThunkSequence>(std::move(seq)), {slice_lhs, slice_rhs},
-      {slice_out, slice_workspace}, {slice_lhs_offset, std::nullopt},
-      {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), std::nullopt},
-      {ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), std::nullopt});
+      std::make_unique<ThunkSequence>(std::move(seq)),
+      {slice_lhs, slice_rhs, slice_out, slice_workspace},
+      std::move(fake_allocations),
+      {lhs_offsets, std::nullopt, std::nullopt, std::nullopt},
+      {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), std::nullopt,
+       std::nullopt, std::nullopt},
+      {ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), std::nullopt,
+       std::nullopt, std::nullopt},
+      {sizeof(int64_t), std::nullopt, std::nullopt, std::nullopt});
 
   // Step 2:
   // Execute address computation thunk.
@@ -155,15 +177,17 @@ TEST(AddressComputationThunkTest, SlicedGemm) {
       executor->AllocateArray<float>(1024 * 1024);
   TF_ASSERT_OK(stream.MemZero(&workspace, 1024 * 1024));
 
-  se::DeviceMemory<int64_t> lhs_offset = executor->AllocateArray<int64_t>(2);
+  se::DeviceMemory<int64_t> lhs_offset_0 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> lhs_offset_1 = executor->AllocateArray<int64_t>(1);
   std::vector<int64_t> lhs_offset_arr{0, 1};
-  TF_ASSERT_OK(
-      stream.Memcpy(&lhs_offset, lhs_offset_arr.data(), lhs_offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&lhs_offset_0, &lhs_offset_arr[0], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&lhs_offset_1, &lhs_offset_arr[1], offset_length));
 
   // Preparing parameters for thunk execution.
   ServiceExecutableRunOptions run_options;
-  BufferAllocations allocations({lhs, rhs, out, workspace, lhs_offset}, 0,
-                                executor->GetAllocator());
+  BufferAllocations allocations(
+      {lhs, rhs, out, workspace, lhs_offset_0, lhs_offset_1}, 0,
+      executor->GetAllocator());
 
   Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
       run_options, allocations, &stream, &stream, {}, nullptr, nullptr);
@@ -192,36 +216,60 @@ TEST(AddressComputationThunkTest, SlicedNonContiguousGemm) {
   int64_t lhs_length = sizeof(float) * 2 * 4;
   int64_t rhs_length = sizeof(float) * 4 * 3;
   int64_t out_length = sizeof(float) * 2 * 2;
-  int64_t offset_length = sizeof(int64_t) * 2;
+  int64_t offset_length = sizeof(int64_t);
   int64_t slice_length = sizeof(float) * 2 * 2;
 
   // Step 1:
   // Prepare embedded and address computation thunks.
 
   // Preparing buffer allocation slices for thunk creations.
+  std::vector<std::unique_ptr<BufferAllocation>> fake_allocations(4);
+
+  fake_allocations.push_back(std::make_unique<BufferAllocation>(
+      /*index=*/0, slice_length, /*color=*/0));
+  BufferAllocation::Slice slice_lhs_fake(fake_allocations.back().get(), 0,
+                                         slice_length);
+
+  fake_allocations.push_back(std::make_unique<BufferAllocation>(
+      /*index=*/1, slice_length, /*color=*/0));
+  BufferAllocation::Slice slice_rhs_fake(fake_allocations.back().get(), 0,
+                                         slice_length);
+
   BufferAllocation alloc_lhs(/*index=*/0, lhs_length, /*color=*/0);
   BufferAllocation::Slice slice_lhs(&alloc_lhs, 0, lhs_length);
 
   BufferAllocation alloc_rhs(/*index=*/1, rhs_length, /*color=*/0);
   BufferAllocation::Slice slice_rhs(&alloc_rhs, 0, rhs_length);
 
-  BufferAllocation alloc_out(/*index=*/2, out_length, /*color=*/0);
-  BufferAllocation::Slice slice_out(&alloc_out, 0, out_length);
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/2, out_length, /*color=*/0));
+  BufferAllocation::Slice slice_out(fake_allocations.back().get(), 0,
+                                    out_length);
 
-  BufferAllocation alloc_workspace(/*index=*/3, 1024 * 1024, /*color=*/0);
-  BufferAllocation::Slice slice_workspace(&alloc_workspace, 0, 1024 * 1024);
+  fake_allocations.push_back(std::make_unique<BufferAllocation>(
+      /*index=*/3, 1024 * 1024, /*color=*/0));
+  BufferAllocation::Slice slice_workspace(fake_allocations.back().get(), 0,
+                                          1024 * 1024);
 
-  BufferAllocation alloc_lhs_offset(/*index=*/4, offset_length, /*color=*/0);
-  BufferAllocation::Slice slice_lhs_offset(&alloc_lhs_offset, 0, offset_length);
+  BufferAllocation alloc_lhs_offset_0(/*index=*/4, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_lhs_offset_0(&alloc_lhs_offset_0, 0,
+                                             offset_length);
 
-  BufferAllocation alloc_rhs_offset(/*index=*/5, offset_length, /*color=*/0);
-  BufferAllocation::Slice slice_rhs_offset(&alloc_rhs_offset, 0, offset_length);
+  BufferAllocation alloc_lhs_offset_1(/*index=*/5, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_lhs_offset_1(&alloc_lhs_offset_1, 0,
+                                             offset_length);
 
-  BufferAllocation alloc_lhs_fake(/*index=*/0, slice_length, /*color=*/0);
-  BufferAllocation::Slice slice_lhs_fake(&alloc_lhs_fake, 0, slice_length);
+  BufferAllocation alloc_rhs_offset_0(/*index=*/6, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_rhs_offset_0(&alloc_rhs_offset_0, 0,
+                                             offset_length);
 
-  BufferAllocation alloc_rhs_fake(/*index=*/1, slice_length, /*color=*/0);
-  BufferAllocation::Slice slice_rhs_fake(&alloc_rhs_fake, 0, slice_length);
+  BufferAllocation alloc_rhs_offset_1(/*index=*/7, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_rhs_offset_1(&alloc_rhs_offset_1, 0,
+                                             offset_length);
 
   // Preparing config for GEMM thunk.
   auto config =
@@ -239,14 +287,23 @@ TEST(AddressComputationThunkTest, SlicedNonContiguousGemm) {
       slice_out, slice_workspace, /*deterministic=*/true));
 
   // Wrapping address computation thunk around the GEMM thunk.
+  std::vector<BufferAllocation::Slice> lhs_offsets{slice_lhs_offset_0,
+                                                   slice_lhs_offset_1};
+  std::vector<BufferAllocation::Slice> rhs_offsets{slice_rhs_offset_0,
+                                                   slice_rhs_offset_1};
   AddressComputationThunk thunk(
       Thunk::ThunkInfo(nullptr),
-      std::make_unique<ThunkSequence>(std::move(seq)), {slice_lhs, slice_rhs},
-      {slice_out, slice_workspace}, {slice_lhs_offset, slice_rhs_offset},
+      std::make_unique<ThunkSequence>(std::move(seq)),
+      {slice_lhs, slice_rhs, slice_out, slice_workspace},
+      std::move(fake_allocations),
+      {lhs_offsets, rhs_offsets, std::nullopt, std::nullopt},
       {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}),
-       ShapeUtil::MakeShape(PrimitiveType::F32, {4, 3})},
+       ShapeUtil::MakeShape(PrimitiveType::F32, {4, 3}), std::nullopt,
+       std::nullopt},
       {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 2}),
-       ShapeUtil::MakeShape(PrimitiveType::F32, {2, 2})});
+       ShapeUtil::MakeShape(PrimitiveType::F32, {2, 2}), std::nullopt,
+       std::nullopt},
+      {sizeof(int64_t), sizeof(int64_t), std::nullopt, std::nullopt});
 
   // Step 2:
   // Execute address computation thunk.
@@ -282,21 +339,23 @@ TEST(AddressComputationThunkTest, SlicedNonContiguousGemm) {
       executor->AllocateArray<float>(1024 * 1024);
   TF_ASSERT_OK(stream.MemZero(&workspace, 1024 * 1024));
 
-  se::DeviceMemory<int64_t> lhs_offset = executor->AllocateArray<int64_t>(2);
+  se::DeviceMemory<int64_t> lhs_offset_0 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> lhs_offset_1 = executor->AllocateArray<int64_t>(1);
   std::vector<int64_t> lhs_offset_arr{0, 1};
-  TF_ASSERT_OK(
-      stream.Memcpy(&lhs_offset, lhs_offset_arr.data(), offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&lhs_offset_0, &lhs_offset_arr[0], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&lhs_offset_1, &lhs_offset_arr[1], offset_length));
 
-  se::DeviceMemory<int64_t> rhs_offset = executor->AllocateArray<int64_t>(2);
+  se::DeviceMemory<int64_t> rhs_offset_0 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> rhs_offset_1 = executor->AllocateArray<int64_t>(1);
   std::vector<int64_t> rhs_offset_arr{2, 1};
-  TF_ASSERT_OK(
-      stream.Memcpy(&rhs_offset, rhs_offset_arr.data(), offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&rhs_offset_0, &rhs_offset_arr[0], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&rhs_offset_1, &rhs_offset_arr[1], offset_length));
 
   // Preparing parameters for thunk execution.
   ServiceExecutableRunOptions run_options;
-  BufferAllocations allocations(
-      {lhs, rhs, out, workspace, lhs_offset, rhs_offset}, 0,
-      executor->GetAllocator());
+  BufferAllocations allocations({lhs, rhs, out, workspace, lhs_offset_0,
+                                 lhs_offset_1, rhs_offset_0, rhs_offset_1},
+                                0, executor->GetAllocator());
 
   Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
       run_options, allocations, &stream, &stream, {}, nullptr, nullptr);
@@ -310,6 +369,178 @@ TEST(AddressComputationThunkTest, SlicedNonContiguousGemm) {
   ASSERT_FALSE(thunk.ExecuteOnStream(params).ok());
 }
 
+TEST(AddressComputationThunkTest, MulipleSlicedOperandsGemm) {
+  se::StreamExecutor* executor = GpuExecutor();
+
+  se::Stream stream(executor);
+  TF_ASSERT_OK(stream.Initialize());
+
+  int64_t length = sizeof(float) * 2 * 4;
+  int64_t out_length = sizeof(float) * 1;
+  int64_t offset_length = sizeof(int64_t);
+  int64_t slice_length = sizeof(float) * 3;
+
+  // Step 1:
+  // Prepare embedded and address computation thunks.
+
+  // Preparing buffer allocation slices for thunk creations.
+  std::vector<std::unique_ptr<BufferAllocation>> fake_allocations(4);
+
+  fake_allocations.push_back(std::make_unique<BufferAllocation>(
+      /*index=*/0, slice_length, /*color=*/0));
+  BufferAllocation::Slice slice_lhs_fake(fake_allocations.back().get(), 0,
+                                         slice_length);
+
+  fake_allocations.push_back(std::make_unique<BufferAllocation>(
+      /*index=*/1, slice_length, /*color=*/0));
+  BufferAllocation::Slice slice_rhs_fake(fake_allocations.back().get(), 0,
+                                         slice_length);
+
+  BufferAllocation alloc_lhs(/*index=*/0, length, /*color=*/0);
+  BufferAllocation::Slice slice_lhs(&alloc_lhs, 0, length);
+
+  BufferAllocation alloc_rhs(/*index=*/1, length, /*color=*/0);
+  BufferAllocation::Slice slice_rhs(&alloc_rhs, 0, length);
+
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/2, out_length, /*color=*/0));
+  BufferAllocation::Slice slice_out(fake_allocations.back().get(), 0,
+                                    out_length);
+
+  fake_allocations.push_back(std::make_unique<BufferAllocation>(
+      /*index=*/3, 1024 * 1024, /*color=*/0));
+  BufferAllocation::Slice slice_workspace(fake_allocations.back().get(), 0,
+                                          1024 * 1024);
+
+  BufferAllocation alloc_lhs_offset_0(/*index=*/4, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_lhs_offset_0(&alloc_lhs_offset_0, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_lhs_offset_1(/*index=*/5, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_lhs_offset_1(&alloc_lhs_offset_1, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_rhs_offset_0(/*index=*/6, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_rhs_offset_0(&alloc_rhs_offset_0, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_rhs_offset_1(/*index=*/7, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_rhs_offset_1(&alloc_rhs_offset_1, 0,
+                                             offset_length);
+
+  // Preparing config for GEMM thunk.
+  auto config =
+      GemmConfig::For(ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), {}, {1},
+                      ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1}), {}, {0},
+                      ShapeUtil::MakeShape(PrimitiveType::F32, {1, 1}), 1.0,
+                      0.0, 0.0, PrecisionConfig::ALG_UNSET, std::nullopt,
+                      se::blas::kDefaultComputePrecision, false, false);
+  ASSERT_TRUE(config.ok());
+
+  // Creating embedded GEMM thunk.
+  ThunkSequence seq;
+  seq.emplace_back(std::make_unique<GemmThunk>(
+      Thunk::ThunkInfo(nullptr), config.value(), slice_lhs_fake, slice_rhs_fake,
+      slice_out, slice_workspace, /*deterministic=*/true));
+
+  // Wrapping address computation thunk around the GEMM thunk.
+  std::vector<BufferAllocation::Slice> lhs_offsets{slice_lhs_offset_0,
+                                                   slice_lhs_offset_1};
+  std::vector<BufferAllocation::Slice> rhs_offsets{slice_rhs_offset_0,
+                                                   slice_rhs_offset_1};
+  AddressComputationThunk thunk(
+      Thunk::ThunkInfo(nullptr),
+      std::make_unique<ThunkSequence>(std::move(seq)),
+      {slice_lhs, slice_rhs, slice_out, slice_workspace},
+      std::move(fake_allocations),
+      {lhs_offsets, rhs_offsets, std::nullopt, std::nullopt},
+      {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}),
+       ShapeUtil::MakeShape(PrimitiveType::F32, {8, 1}), std::nullopt,
+       std::nullopt},
+      {ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}),
+       ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1}), std::nullopt,
+       std::nullopt},
+      {sizeof(int64_t), sizeof(int64_t), std::nullopt, std::nullopt});
+
+  // Step 2:
+  // Execute address computation thunk.
+  //
+  // Given a `lhs` tensor of shape f32[2,4]{1,0}
+  // The `lhs` slice that we want to use will be equivalent to this static
+  // slice op:
+  // f32[1,3]{1,0} slice(lhs), slice={[0:1], [1:4]}
+
+  // Preparing memory for thunk arguments.
+  // lhs = [1.0, 2.0, 3.0, 4.0,
+  //        5.0, 6.0, 7.0, 8.0]
+  std::vector<float> arr{1, 2, 3, 4, 5, 6, 7, 8};
+  se::DeviceMemory<float> lhs = executor->AllocateArray<float>(2 * 4);
+  TF_ASSERT_OK(stream.Memcpy(&lhs, arr.data(), length));
+
+  // Given a `rhs` tensor of shape f32[8,1]{1,0}
+  // The `rhs` slice that we want to use will be equivalent to this static
+  // slice op:
+  // f32[3,1]{1,0} slice(rhs), slice={[2:5], [0:1]}
+  // rhs = [1.0,
+  //        2.0,
+  //        3.0,
+  //        4.0,
+  //        5.0,
+  //        6.0,
+  //        7.0,
+  //        8.0]
+  se::DeviceMemory<float> rhs = executor->AllocateArray<float>(8);
+  std::vector<float> rhs_arr(8, 1);
+  TF_ASSERT_OK(stream.Memcpy(&rhs, arr.data(), length));
+
+  se::DeviceMemory<float> out = executor->AllocateArray<float>(1);
+  TF_ASSERT_OK(stream.MemZero(&out, out_length));
+
+  se::DeviceMemory<float> workspace =
+      executor->AllocateArray<float>(1024 * 1024);
+  TF_ASSERT_OK(stream.MemZero(&workspace, 1024 * 1024));
+
+  se::DeviceMemory<int64_t> lhs_offset_0 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> lhs_offset_1 = executor->AllocateArray<int64_t>(1);
+  std::vector<int64_t> lhs_offset_arr{0, 1};
+  TF_ASSERT_OK(stream.Memcpy(&lhs_offset_0, &lhs_offset_arr[0], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&lhs_offset_1, &lhs_offset_arr[1], offset_length));
+
+  se::DeviceMemory<int64_t> rhs_offset_0 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> rhs_offset_1 = executor->AllocateArray<int64_t>(1);
+  std::vector<int64_t> rhs_offset_arr{2, 0};
+  TF_ASSERT_OK(stream.Memcpy(&rhs_offset_0, &rhs_offset_arr[0], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&rhs_offset_1, &rhs_offset_arr[1], offset_length));
+
+  // Preparing parameters for thunk execution.
+  ServiceExecutableRunOptions run_options;
+  BufferAllocations allocations({lhs, rhs, out, workspace, lhs_offset_0,
+                                 lhs_offset_1, rhs_offset_0, rhs_offset_1},
+                                0, executor->GetAllocator());
+
+  Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
+      run_options, allocations, &stream, &stream, {}, nullptr, nullptr);
+
+  Thunk::ExecutableSource source = {/*text=*/"", /*binary=*/{}};
+  TF_ASSERT_OK(
+      thunk.Initialize({executor, source, &allocations, &stream, &stream}));
+
+  // Execute address computation thunk and verify that it executed a GEMM on the
+  // right slices.
+  TF_ASSERT_OK(thunk.ExecuteOnStream(params));
+  TF_ASSERT_OK(stream.BlockHostUntilDone());
+
+  // Copy `out` data back to host for verification.
+  std::vector<float> dst(1, 0);
+  TF_ASSERT_OK(stream.Memcpy(dst.data(), out, out_length));
+
+  ASSERT_EQ(dst, std::vector<float>({2 * 3 + 3 * 4 + 4 * 5}));
+}
+
 static absl::Status Memcpy(se::Stream* stream, ffi::BufferBase src,
                            ffi::BufferBase dst) {
   return stream->MemcpyD2D(
@@ -337,30 +568,45 @@ TEST(AddressComputationThunkTest, SlicedMemcpy) {
   int64_t dst_count = 8 * 8;
   int64_t src_length = sizeof(int32_t) * src_count;
   int64_t dst_length = sizeof(int32_t) * dst_count;
-  int64_t offset_length = sizeof(int64_t) * 4;
+  int64_t offset_length = sizeof(int64_t);
   int64_t slice_length = sizeof(int32_t) * dst_count;
 
   // Step 1:
   // Prepare embedded and address computation thunks.
 
   // Preparing buffer allocation slices for thunk creations.
+  std::vector<std::unique_ptr<BufferAllocation>> fake_allocations(2);
+
+  // Fake slices for embedded thunk creation.
+  fake_allocations.push_back(std::make_unique<BufferAllocation>(
+      /*index=*/0, slice_length, /*color=*/0));
+  BufferAllocation::Slice slice_src_fake(fake_allocations.back().get(), 0,
+                                         slice_length);
+
   BufferAllocation alloc_src(/*index=*/0, src_length, /*color=*/0);
   BufferAllocation::Slice slice_src(&alloc_src, 0, src_length);
 
-  BufferAllocation alloc_dst(/*index=*/1, dst_length, /*color=*/0);
-  BufferAllocation::Slice slice_dst(&alloc_dst, 0, dst_length);
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/1, dst_length, /*color=*/0));
+  BufferAllocation::Slice slice_dst(fake_allocations.back().get(), 0,
+                                    dst_length);
 
-  BufferAllocation alloc_offset(/*index=*/2, offset_length, /*color=*/0);
-  BufferAllocation::Slice slice_offset(&alloc_offset, 0, offset_length);
+  BufferAllocation alloc_offset_0(/*index=*/2, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_offset_0(&alloc_offset_0, 0, offset_length);
 
-  // Fake slices for embedded thunk creation.
-  BufferAllocation alloc_src_fake(/*index=*/0, slice_length, /*color=*/0);
-  BufferAllocation::Slice slice_src_fake(&alloc_src_fake, 0, slice_length);
+  BufferAllocation alloc_offset_1(/*index=*/3, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_offset_1(&alloc_offset_1, 0, offset_length);
+
+  BufferAllocation alloc_offset_2(/*index=*/4, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_offset_2(&alloc_offset_2, 0, offset_length);
+
+  BufferAllocation alloc_offset_3(/*index=*/5, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_offset_3(&alloc_offset_3, 0, offset_length);
 
   // Preparing custom call thunk: setting up call target and operands + results
   // buffers.
-  auto handler = xla::ffi::FindHandler("__xla_test$$memcpy", PLATFORM);
-  ASSERT_TRUE(handler.ok());
+  auto registration = xla::ffi::FindHandler("__xla_test$$memcpy", PLATFORM);
+  ASSERT_TRUE(registration.ok());
 
   std::vector<std::optional<CustomCallThunk::Slice>> operands{
       CustomCallThunk::Slice{slice_src_fake,
@@ -372,18 +618,22 @@ TEST(AddressComputationThunkTest, SlicedMemcpy) {
   // Creating embedded custom call thunk.
   ThunkSequence seq;
   seq.emplace_back(std::make_unique<CustomCallThunk>(
-      Thunk::ThunkInfo(nullptr), *handler, operands, results,
+      Thunk::ThunkInfo(nullptr), registration->handler, operands, results,
       /*attributes=*/CustomCallThunk::AttributesMap(),
       /*called_computation=*/nullptr));
 
   // Wrapping address computation thunk around the custom call thunk.
+  std::vector<BufferAllocation::Slice> slice_offsets{
+      slice_offset_0, slice_offset_1, slice_offset_2, slice_offset_3};
   AddressComputationThunk thunk(
       Thunk::ThunkInfo(nullptr),
-      std::make_unique<ThunkSequence>(std::move(seq)), {slice_src}, {slice_dst},
-      {slice_offset}, {ShapeUtil::MakeShape(PrimitiveType::S32, {8, 8, 10, 8})},
+      std::make_unique<ThunkSequence>(std::move(seq)), {slice_src, slice_dst},
+      std::move(fake_allocations), {slice_offsets, std::nullopt},
+      {ShapeUtil::MakeShape(PrimitiveType::S32, {8, 8, 10, 8}), std::nullopt},
       // Make sure to pass a dst shape with the same rank as src shape (i.e.
       // original slice result and not bitcasted one)
-      {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 8, 8})});
+      {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 8, 8}), std::nullopt},
+      {sizeof(int64_t), std::nullopt});
 
   // Step 2:
   // Execute address computation thunk.
@@ -402,14 +652,21 @@ TEST(AddressComputationThunkTest, SlicedMemcpy) {
   se::DeviceMemory<int32_t> dst = executor->AllocateArray<int32_t>(dst_count);
   TF_ASSERT_OK(stream.MemZero(&dst, dst_length));
 
-  se::DeviceMemory<int64_t> offset = executor->AllocateArray<int64_t>(4);
+  se::DeviceMemory<int64_t> offset_0 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> offset_1 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> offset_2 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> offset_3 = executor->AllocateArray<int64_t>(1);
   std::vector<int64_t> offset_arr{3, 5, 2, 0};
-  TF_ASSERT_OK(stream.Memcpy(&offset, offset_arr.data(), offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&offset_0, &offset_arr[0], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&offset_1, &offset_arr[1], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&offset_2, &offset_arr[2], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&offset_3, &offset_arr[3], offset_length));
 
   // Preparing parameters for thunk execution.
   ServiceExecutableRunOptions run_options;
-  BufferAllocations allocations({src, dst, offset}, 0,
-                                executor->GetAllocator());
+  BufferAllocations allocations(
+      {src, dst, offset_0, offset_1, offset_2, offset_3}, 0,
+      executor->GetAllocator());
 
   Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
       run_options, allocations, &stream, &stream, {}, nullptr, nullptr);
@@ -436,4 +693,974 @@ TEST(AddressComputationThunkTest, SlicedMemcpy) {
   ASSERT_EQ(out, ref);
 }
 
+TEST(AddressComputationThunkTest, SlicedOutputMemcpy) {
+  se::StreamExecutor* executor = GpuExecutor();
+
+  se::Stream stream(executor);
+  TF_ASSERT_OK(stream.Initialize());
+
+  int64_t src_count = 8 * 8 * 10 * 2;
+  int64_t dst_count = 2 * 2 * 2 * 2;
+  int64_t slice_count = 2 * 2;
+  int64_t src_length = sizeof(int32_t) * src_count;
+  int64_t dst_length = sizeof(int32_t) * dst_count;
+  int64_t offset_length = sizeof(int64_t);
+  int64_t slice_length = sizeof(int32_t) * slice_count;
+
+  // Step 1:
+  // Prepare embedded and address computation thunks.
+
+  // Preparing buffer allocation slices for thunk creations.
+  std::vector<std::unique_ptr<BufferAllocation>> fake_allocations(2);
+
+  // Fake slices for embedded thunk creation.
+  fake_allocations.push_back(std::make_unique<BufferAllocation>(
+      /*index=*/0, slice_length, /*color=*/0));
+  BufferAllocation::Slice slice_src_fake(fake_allocations.back().get(), 0,
+                                         slice_length);
+
+  fake_allocations.push_back(std::make_unique<BufferAllocation>(
+      /*index=*/1, slice_length, /*color=*/0));
+  BufferAllocation::Slice slice_dst_fake(fake_allocations.back().get(), 0,
+                                         slice_length);
+
+  BufferAllocation alloc_src(/*index=*/0, src_length, /*color=*/0);
+  BufferAllocation::Slice slice_src(&alloc_src, 0, src_length);
+
+  BufferAllocation alloc_dst(/*index=*/1, dst_length, /*color=*/0);
+  BufferAllocation::Slice slice_dst(&alloc_dst, 0, dst_length);
+
+  BufferAllocation alloc_src_offset_0(/*index=*/2, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_src_offset_0(&alloc_src_offset_0, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_src_offset_1(/*index=*/3, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_src_offset_1(&alloc_src_offset_1, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_src_offset_2(/*index=*/4, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_src_offset_2(&alloc_src_offset_2, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_src_offset_3(/*index=*/5, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_src_offset_3(&alloc_src_offset_3, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_dst_offset_0(/*index=*/6, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_dst_offset_0(&alloc_dst_offset_0, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_dst_offset_1(/*index=*/7, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_dst_offset_1(&alloc_dst_offset_1, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_dst_offset_2(/*index=*/8, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_dst_offset_2(&alloc_dst_offset_2, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_dst_offset_3(/*index=*/9, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_dst_offset_3(&alloc_dst_offset_3, 0,
+                                             offset_length);
+
+  // Preparing custom call thunk: setting up call target and operands + results
+  // buffers.
+  auto registration = xla::ffi::FindHandler("__xla_test$$memcpy", PLATFORM);
+  ASSERT_TRUE(registration.ok());
+
+  std::vector<std::optional<CustomCallThunk::Slice>> operands{
+      CustomCallThunk::Slice{slice_src_fake,
+                             ShapeUtil::MakeShape(PrimitiveType::S32, {2, 2})}};
+  std::vector<std::optional<CustomCallThunk::Slice>> results{
+      CustomCallThunk::Slice{slice_dst_fake,
+                             ShapeUtil::MakeShape(PrimitiveType::S32, {2, 2})}};
+
+  // Creating embedded custom call thunk.
+  ThunkSequence seq;
+  seq.emplace_back(std::make_unique<CustomCallThunk>(
+      Thunk::ThunkInfo(nullptr), registration->handler, operands, results,
+      /*attributes=*/CustomCallThunk::AttributesMap(),
+      /*called_computation=*/nullptr));
+
+  // Wrapping address computation thunk around the custom call thunk.
+  std::vector<BufferAllocation::Slice> slice_src_offsets{
+      slice_src_offset_0, slice_src_offset_1, slice_src_offset_2,
+      slice_src_offset_3};
+  std::vector<BufferAllocation::Slice> slice_dst_offsets{
+      slice_dst_offset_0, slice_dst_offset_1, slice_dst_offset_2,
+      slice_dst_offset_3};
+  AddressComputationThunk thunk(
+      Thunk::ThunkInfo(nullptr),
+      std::make_unique<ThunkSequence>(std::move(seq)), {slice_src, slice_dst},
+      std::move(fake_allocations), {slice_src_offsets, slice_dst_offsets},
+      {ShapeUtil::MakeShape(PrimitiveType::S32, {8, 8, 10, 2}),
+       ShapeUtil::MakeShape(PrimitiveType::S32, {2, 2, 2, 2})},
+      // Make sure to pass a dst shape with the same rank as src shape (i.e.
+      // original slice result and not bitcasted one)
+      {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 2, 2}),
+       ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 2, 2})},
+      {sizeof(int64_t), sizeof(int64_t)});
+
+  // Step 2:
+  // Execute address computation thunk.
+  //
+  // Given a `src` tensor of shape s32[8,8,10,2]{3,2,1,0}
+  // The `src` slice that we want to copy from will be equivalent to this static
+  // slice op:
+  // s32[1,1,2,2]{3,2,1,0} slice(src), slice={[3:4], [5:6], [2:4], [0:2]}
+  //
+  // Given a `dst` tensor of shape s32[2,2,2,2]{3,2,1,0}
+  // The `dst` slice that we want to copy into will be equivalent to this static
+  // slice op:
+  // s32[1,1,2,2]{3,2,1,0} slice(dst), slice={[1:2], [1:2], [0:2], [0:2]}
+
+  // Preparing memory for thunk arguments.
+  se::DeviceMemory<int32_t> src = executor->AllocateArray<int32_t>(src_count);
+  std::vector<int32_t> src_arr(src_count, 0);
+  for (unsigned i = 0; i < src_count; ++i) src_arr[i] = i;
+  TF_ASSERT_OK(stream.Memcpy(&src, src_arr.data(), src_length));
+
+  se::DeviceMemory<int32_t> dst = executor->AllocateArray<int32_t>(dst_count);
+  TF_ASSERT_OK(stream.MemZero(&dst, dst_length));
+
+  se::DeviceMemory<int64_t> src_offset_0 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> src_offset_1 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> src_offset_2 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> src_offset_3 = executor->AllocateArray<int64_t>(1);
+  std::vector<int64_t> src_offset_arr{3, 5, 2, 0};
+  TF_ASSERT_OK(stream.Memcpy(&src_offset_0, &src_offset_arr[0], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&src_offset_1, &src_offset_arr[1], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&src_offset_2, &src_offset_arr[2], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&src_offset_3, &src_offset_arr[3], offset_length));
+
+  se::DeviceMemory<int64_t> dst_offset_0 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> dst_offset_1 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> dst_offset_2 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> dst_offset_3 = executor->AllocateArray<int64_t>(1);
+  std::vector<int64_t> dst_offset_arr{1, 1, 0, 0};
+  TF_ASSERT_OK(stream.Memcpy(&dst_offset_0, &dst_offset_arr[0], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&dst_offset_1, &dst_offset_arr[1], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&dst_offset_2, &dst_offset_arr[2], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&dst_offset_3, &dst_offset_arr[3], offset_length));
+
+  // Preparing parameters for thunk execution.
+  ServiceExecutableRunOptions run_options;
+  BufferAllocations allocations(
+      {src, dst, src_offset_0, src_offset_1, src_offset_2, src_offset_3,
+       dst_offset_0, dst_offset_1, dst_offset_2, dst_offset_3},
+      0, executor->GetAllocator());
+
+  Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
+      run_options, allocations, &stream, &stream, {}, nullptr, nullptr);
+
+  Thunk::ExecutableSource source = {/*text=*/"", /*binary=*/{}};
+  TF_ASSERT_OK(
+      thunk.Initialize({executor, source, &allocations, &stream, &stream}));
+
+  // Executing address computation thunk.
+  TF_ASSERT_OK(thunk.ExecuteOnStream(params));
+  TF_ASSERT_OK(stream.BlockHostUntilDone());
+
+  // Copying `dst` data back to host for verification.
+  std::vector<int32_t> out(dst_count, 0);
+  TF_ASSERT_OK(stream.Memcpy(out.data(), dst, dst_length));
+
+  // Verifying that the right slice of `src` was copied to `dst`.
+  std::vector<int32_t> ref(dst_count, 0);
+  int64_t src_offset_val =
+      src_offset_arr[3] +
+      2 * (src_offset_arr[2] +
+           10 * (src_offset_arr[1] + 8 * src_offset_arr[0]));
+  int64_t dst_offset_val =
+      dst_offset_arr[3] +
+      2 * (dst_offset_arr[2] + 2 * (dst_offset_arr[1] + 2 * dst_offset_arr[0]));
+  std::copy(src_arr.begin() + src_offset_val,
+            src_arr.begin() + src_offset_val + slice_count,
+            ref.begin() + dst_offset_val);
+  ASSERT_EQ(out, ref);
+}
+
+TEST(AddressComputationThunkTest, SlicedGemmArbitraryArgumentOrder) {
+  se::StreamExecutor* executor = GpuExecutor();
+
+  se::Stream stream(executor);
+  TF_ASSERT_OK(stream.Initialize());
+
+  int64_t lhs_length = sizeof(float) * 2 * 4;
+  int64_t rhs_length = sizeof(float) * 3 * 1;
+  int64_t out_length = sizeof(float) * 1 * 1;
+  int64_t offset_length = sizeof(int64_t);
+
+  // Step 1:
+  // Prepare embedded and address computation thunks.
+
+  // Preparing buffer allocation slices for thunk creations.
+  std::vector<std::unique_ptr<BufferAllocation>> fake_allocations(4);
+
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/0, rhs_length, /*color=*/0));
+  BufferAllocation::Slice slice_lhs_fake(fake_allocations.back().get(), 0,
+                                         rhs_length);
+
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/1, rhs_length, /*color=*/0));
+  BufferAllocation::Slice slice_rhs_fake(fake_allocations.back().get(), 0,
+                                         rhs_length);
+
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/2, out_length, /*color=*/0));
+  BufferAllocation::Slice slice_out_fake(fake_allocations.back().get(), 0,
+                                         out_length);
+
+  fake_allocations.push_back(std::make_unique<BufferAllocation>(
+      /*index=*/3, 1024 * 1024, /*color=*/0));
+  BufferAllocation::Slice slice_workspace_fake(fake_allocations.back().get(), 0,
+                                               1024 * 1024);
+
+  BufferAllocation alloc_lhs(/*index=*/1, lhs_length, /*color=*/0);
+  BufferAllocation::Slice slice_lhs(&alloc_lhs, 0, lhs_length);
+
+  BufferAllocation alloc_rhs(/*index=*/3, rhs_length, /*color=*/0);
+  BufferAllocation::Slice slice_rhs(&alloc_rhs, 0, rhs_length);
+
+  BufferAllocation alloc_out(/*index=*/2, out_length, /*color=*/0);
+  BufferAllocation::Slice slice_out(&alloc_out, 0, out_length);
+
+  BufferAllocation alloc_workspace(/*index=*/0, 1024 * 1024, /*color=*/0);
+  BufferAllocation::Slice slice_workspace(&alloc_workspace, 0, 1024 * 1024);
+
+  BufferAllocation alloc_lhs_offset_0(/*index=*/4, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_lhs_offset_0(&alloc_lhs_offset_0, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_lhs_offset_1(/*index=*/5, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_lhs_offset_1(&alloc_lhs_offset_1, 0,
+                                             offset_length);
+
+  // Preparing config for GEMM thunk.
+  auto config =
+      GemmConfig::For(ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), {}, {1},
+                      ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1}), {}, {0},
+                      ShapeUtil::MakeShape(PrimitiveType::F32, {1, 1}), 1.0,
+                      0.0, 0.0, PrecisionConfig::ALG_UNSET, std::nullopt,
+                      se::blas::kDefaultComputePrecision, false, false);
+  ASSERT_TRUE(config.ok());
+
+  // Creating embedded GEMM thunk.
+  ThunkSequence seq;
+  seq.emplace_back(std::make_unique<GemmThunk>(
+      Thunk::ThunkInfo(nullptr), config.value(), slice_lhs_fake, slice_rhs_fake,
+      slice_out_fake, slice_workspace_fake, /*deterministic=*/true));
+
+  // Wrapping address computation thunk around the GEMM thunk.
+  std::vector<BufferAllocation::Slice> lhs_offsets{slice_lhs_offset_0,
+                                                   slice_lhs_offset_1};
+  AddressComputationThunk thunk(
+      Thunk::ThunkInfo(nullptr),
+      std::make_unique<ThunkSequence>(std::move(seq)),
+      {slice_lhs, slice_rhs, slice_out, slice_workspace},
+      std::move(fake_allocations),
+      {lhs_offsets, std::nullopt, std::nullopt, std::nullopt},
+      {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), std::nullopt,
+       std::nullopt, std::nullopt},
+      {ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), std::nullopt,
+       std::nullopt, std::nullopt},
+      {sizeof(int64_t), std::nullopt, std::nullopt, std::nullopt});
+
+  // Step 2:
+  // Execute address computation thunk.
+  //
+  // Given a `lhs` tensor of shape f32[2,4]{1,0}
+  // The `lhs` slice that we want to use will be equivalent to this static
+  // slice op:
+  // f32[1,3]{1,0} slice(lhs), slice={[0:1], [1:4]}
+
+  // Preparing memory for thunk arguments.
+  // lhs = [1.0, 2.0, 3.0, 4.0,
+  //        5.0, 6.0, 7.0, 8.0]
+  se::DeviceMemory<float> lhs = executor->AllocateArray<float>(2 * 4);
+  std::vector<float> lhs_arr{1, 2, 3, 4, 5, 6, 7, 8};
+  TF_ASSERT_OK(stream.Memcpy(&lhs, lhs_arr.data(), lhs_length));
+
+  // rhs = [1.0,
+  //        1.0,
+  //        1.0]
+  se::DeviceMemory<float> rhs = executor->AllocateArray<float>(3 * 1);
+  std::vector<float> rhs_arr(3, 1);
+  TF_ASSERT_OK(stream.Memcpy(&rhs, rhs_arr.data(), rhs_length));
+
+  se::DeviceMemory<float> out = executor->AllocateArray<float>(1 * 1);
+  TF_ASSERT_OK(stream.MemZero(&out, out_length));
+
+  se::DeviceMemory<float> workspace =
+      executor->AllocateArray<float>(1024 * 1024);
+  TF_ASSERT_OK(stream.MemZero(&workspace, 1024 * 1024));
+
+  se::DeviceMemory<int64_t> lhs_offset_0 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> lhs_offset_1 = executor->AllocateArray<int64_t>(1);
+  std::vector<int64_t> lhs_offset_arr{0, 1};
+  TF_ASSERT_OK(stream.Memcpy(&lhs_offset_0, &lhs_offset_arr[0], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&lhs_offset_1, &lhs_offset_arr[1], offset_length));
+
+  // Preparing parameters for thunk execution.
+  ServiceExecutableRunOptions run_options;
+  BufferAllocations allocations(
+      {workspace, lhs, out, rhs, lhs_offset_0, lhs_offset_1}, 0,
+      executor->GetAllocator());
+
+  Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
+      run_options, allocations, &stream, &stream, {}, nullptr, nullptr);
+
+  Thunk::ExecutableSource source = {/*text=*/"", /*binary=*/{}};
+  TF_ASSERT_OK(
+      thunk.Initialize({executor, source, &allocations, &stream, &stream}));
+
+  // Executing address computation thunk.
+  TF_ASSERT_OK(thunk.ExecuteOnStream(params));
+  TF_ASSERT_OK(stream.BlockHostUntilDone());
+
+  // Copying `out` data back to host for verification.
+  std::vector<float> dst(1, 0);
+  TF_ASSERT_OK(stream.Memcpy(dst.data(), out, out_length));
+
+  ASSERT_EQ(dst, std::vector<float>({9}));
+}
+
+TEST(AddressComputationThunkTest, SlicedGemmArbitraryNumberOfArguments) {
+  se::StreamExecutor* executor = GpuExecutor();
+
+  se::Stream stream(executor);
+  TF_ASSERT_OK(stream.Initialize());
+
+  int64_t lhs_length = sizeof(float) * 2 * 4;
+  int64_t rhs_length = sizeof(float) * 3 * 1;
+  int64_t out_length = sizeof(float) * 1 * 1;
+  int64_t offset_length = sizeof(int64_t);
+
+  // Step 1:
+  // Prepare embedded and address computation thunks.
+
+  // Preparing buffer allocation slices for thunk creations.
+  std::vector<std::unique_ptr<BufferAllocation>> fake_allocations(4);
+
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/0, rhs_length, /*color=*/0));
+  BufferAllocation::Slice slice_lhs_fake(fake_allocations.back().get(), 0,
+                                         rhs_length);
+
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/1, rhs_length, /*color=*/0));
+  BufferAllocation::Slice slice_rhs_fake(fake_allocations.back().get(), 0,
+                                         rhs_length);
+
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/2, out_length, /*color=*/0));
+  BufferAllocation::Slice slice_out_fake(fake_allocations.back().get(), 0,
+                                         out_length);
+
+  fake_allocations.push_back(std::make_unique<BufferAllocation>(
+      /*index=*/3, 1024 * 1024, /*color=*/0));
+  BufferAllocation::Slice slice_workspace_fake(fake_allocations.back().get(), 0,
+                                               1024 * 1024);
+
+  BufferAllocation alloc_lhs(/*index=*/7, lhs_length, /*color=*/0);
+  BufferAllocation::Slice slice_lhs(&alloc_lhs, 0, lhs_length);
+
+  BufferAllocation alloc_rhs(/*index=*/3, rhs_length, /*color=*/0);
+  BufferAllocation::Slice slice_rhs(&alloc_rhs, 0, rhs_length);
+
+  BufferAllocation alloc_out(/*index=*/2, out_length, /*color=*/0);
+  BufferAllocation::Slice slice_out(&alloc_out, 0, out_length);
+
+  BufferAllocation alloc_workspace(/*index=*/0, 1024 * 1024, /*color=*/0);
+  BufferAllocation::Slice slice_workspace(&alloc_workspace, 0, 1024 * 1024);
+
+  BufferAllocation alloc_lhs_offset_0(/*index=*/4, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_lhs_offset_0(&alloc_lhs_offset_0, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_lhs_offset_1(/*index=*/5, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_lhs_offset_1(&alloc_lhs_offset_1, 0,
+                                             offset_length);
+
+  // Preparing config for GEMM thunk.
+  auto config =
+      GemmConfig::For(ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), {}, {1},
+                      ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1}), {}, {0},
+                      ShapeUtil::MakeShape(PrimitiveType::F32, {1, 1}), 1.0,
+                      0.0, 0.0, PrecisionConfig::ALG_UNSET, std::nullopt,
+                      se::blas::kDefaultComputePrecision, false, false);
+  ASSERT_TRUE(config.ok());
+
+  // Creating embedded GEMM thunk.
+  ThunkSequence seq;
+  seq.emplace_back(std::make_unique<GemmThunk>(
+      Thunk::ThunkInfo(nullptr), config.value(), slice_lhs_fake, slice_rhs_fake,
+      slice_out_fake, slice_workspace_fake, /*deterministic=*/true));
+
+  // Wrapping address computation thunk around the GEMM thunk.
+  std::vector<BufferAllocation::Slice> lhs_offsets{slice_lhs_offset_0,
+                                                   slice_lhs_offset_1};
+  AddressComputationThunk thunk(
+      Thunk::ThunkInfo(nullptr),
+      std::make_unique<ThunkSequence>(std::move(seq)),
+      {slice_lhs, slice_rhs, slice_out, slice_workspace},
+      std::move(fake_allocations),
+      {lhs_offsets, std::nullopt, std::nullopt, std::nullopt},
+      {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), std::nullopt,
+       std::nullopt, std::nullopt},
+      {ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), std::nullopt,
+       std::nullopt, std::nullopt},
+      {sizeof(int64_t), std::nullopt, std::nullopt, std::nullopt});
+
+  // Step 2:
+  // Execute address computation thunk.
+  //
+  // Given a `lhs` tensor of shape f32[2,4]{1,0}
+  // The `lhs` slice that we want to use will be equivalent to this static
+  // slice op:
+  // f32[1,3]{1,0} slice(lhs), slice={[0:1], [1:4]}
+
+  // Preparing memory for thunk arguments.
+  // lhs = [1.0, 2.0, 3.0, 4.0,
+  //        5.0, 6.0, 7.0, 8.0]
+  se::DeviceMemory<float> lhs = executor->AllocateArray<float>(2 * 4);
+  std::vector<float> lhs_arr{1, 2, 3, 4, 5, 6, 7, 8};
+  TF_ASSERT_OK(stream.Memcpy(&lhs, lhs_arr.data(), lhs_length));
+
+  // rhs = [1.0,
+  //        1.0,
+  //        1.0]
+  se::DeviceMemory<float> rhs = executor->AllocateArray<float>(3 * 1);
+  std::vector<float> rhs_arr(3, 1);
+  TF_ASSERT_OK(stream.Memcpy(&rhs, rhs_arr.data(), rhs_length));
+
+  se::DeviceMemory<float> out = executor->AllocateArray<float>(1 * 1);
+  TF_ASSERT_OK(stream.MemZero(&out, out_length));
+
+  se::DeviceMemory<float> workspace =
+      executor->AllocateArray<float>(1024 * 1024);
+  TF_ASSERT_OK(stream.MemZero(&workspace, 1024 * 1024));
+
+  se::DeviceMemory<int64_t> lhs_offset_0 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> lhs_offset_1 = executor->AllocateArray<int64_t>(1);
+  std::vector<int64_t> lhs_offset_arr{0, 1};
+  TF_ASSERT_OK(stream.Memcpy(&lhs_offset_0, &lhs_offset_arr[0], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&lhs_offset_1, &lhs_offset_arr[1], offset_length));
+
+  // Preparing parameters for thunk execution.
+  ServiceExecutableRunOptions run_options;
+  BufferAllocations allocations(
+      {workspace, /*garbage, to be ignored*/ se::DeviceMemoryBase(), out, rhs,
+       lhs_offset_0, lhs_offset_1, /*garbage, to be ignored*/ rhs, lhs},
+      0, executor->GetAllocator());
+
+  Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
+      run_options, allocations, &stream, &stream, {}, nullptr, nullptr);
+
+  Thunk::ExecutableSource source = {/*text=*/"", /*binary=*/{}};
+  TF_ASSERT_OK(
+      thunk.Initialize({executor, source, &allocations, &stream, &stream}));
+
+  // Executing address computation thunk.
+  TF_ASSERT_OK(thunk.ExecuteOnStream(params));
+  TF_ASSERT_OK(stream.BlockHostUntilDone());
+
+  // Copying `out` data back to host for verification.
+  std::vector<float> dst(1, 0);
+  TF_ASSERT_OK(stream.Memcpy(dst.data(), out, out_length));
+
+  ASSERT_EQ(dst, std::vector<float>({9}));
+}
+
+TEST(AddressComputationThunkTest, SlicedTupledOperandGemm) {
+  se::StreamExecutor* executor = GpuExecutor();
+
+  se::Stream stream(executor);
+  TF_ASSERT_OK(stream.Initialize());
+
+  int64_t lhs_length = sizeof(float) * 2 * 4;
+  int64_t rhs_length = sizeof(float) * 3 * 1;
+  int64_t out_length = sizeof(float) * 1 * 1;
+  int64_t offset_length = sizeof(int64_t);
+
+  // Step 1:
+  // Prepare embedded and address computation thunks.
+
+  // Preparing buffer allocation slices for thunk creations.
+  std::vector<std::unique_ptr<BufferAllocation>> fake_allocations(4);
+
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/0, rhs_length, /*color=*/0));
+  BufferAllocation::Slice slice_lhs_fake(fake_allocations.back().get(), 0,
+                                         rhs_length);
+
+  BufferAllocation alloc_lhs(/*index=*/0, 3 * lhs_length, /*color=*/0);
+  BufferAllocation::Slice slice_lhs(&alloc_lhs, lhs_length, lhs_length);
+
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/1, rhs_length, /*color=*/0));
+  BufferAllocation::Slice slice_rhs(fake_allocations.back().get(), 0,
+                                    rhs_length);
+
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/2, out_length, /*color=*/0));
+  BufferAllocation::Slice slice_out(fake_allocations.back().get(), 0,
+                                    out_length);
+
+  fake_allocations.push_back(std::make_unique<BufferAllocation>(
+      /*index=*/3, 1024 * 1024, /*color=*/0));
+  BufferAllocation::Slice slice_workspace(fake_allocations.back().get(), 0,
+                                          1024 * 1024);
+
+  BufferAllocation alloc_lhs_offset_0(/*index=*/4, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_lhs_offset_0(&alloc_lhs_offset_0, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_lhs_offset_1(/*index=*/5, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_lhs_offset_1(&alloc_lhs_offset_1, 0,
+                                             offset_length);
+
+  // Preparing config for GEMM thunk.
+  auto config =
+      GemmConfig::For(ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), {}, {1},
+                      ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1}), {}, {0},
+                      ShapeUtil::MakeShape(PrimitiveType::F32, {1, 1}), 1.0,
+                      0.0, 0.0, PrecisionConfig::ALG_UNSET, std::nullopt,
+                      se::blas::kDefaultComputePrecision, false, false);
+  ASSERT_TRUE(config.ok());
+
+  // Creating embedded GEMM thunk.
+  ThunkSequence seq;
+  seq.emplace_back(std::make_unique<GemmThunk>(
+      Thunk::ThunkInfo(nullptr), config.value(), slice_lhs_fake, slice_rhs,
+      slice_out, slice_workspace, /*deterministic=*/true));
+
+  // Wrapping address computation thunk around the GEMM thunk.
+  std::vector<BufferAllocation::Slice> lhs_offsets{slice_lhs_offset_0,
+                                                   slice_lhs_offset_1};
+  AddressComputationThunk thunk(
+      Thunk::ThunkInfo(nullptr),
+      std::make_unique<ThunkSequence>(std::move(seq)),
+      {slice_lhs, slice_rhs, slice_out, slice_workspace},
+      std::move(fake_allocations),
+      {lhs_offsets, std::nullopt, std::nullopt, std::nullopt},
+      {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), std::nullopt,
+       std::nullopt, std::nullopt},
+      {ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), std::nullopt,
+       std::nullopt, std::nullopt},
+      {sizeof(int64_t), std::nullopt, std::nullopt, std::nullopt});
+
+  // Step 2:
+  // Execute address computation thunk.
+  //
+
+  // Preparing memory for thunk arguments.
+  // lhs = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0,
+  //        5.0, 6.0, 7.0, 8.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+  //
+  // The real `lhs` tensor will look more like this:
+  // lhs = [1.0, 2.0, 3.0, 4.0,
+  //        5.0, 6.0, 7.0, 8.0]
+  // The `lhs` slice that we want to use will be equivalent to this static
+  // slice op:
+  // f32[1,3]{1,0} slice(lhs), slice={[0:1], [1:4]}
+  se::DeviceMemory<float> lhs_whole_buffer =
+      executor->AllocateArray<float>(2 * 4 * 3);
+  TF_ASSERT_OK(stream.MemZero(&lhs_whole_buffer, 2 * 4 * 3));
+  std::vector<float> lhs_arr{1, 2, 3, 4, 5, 6, 7, 8};
+  se::DeviceMemoryBase lhs =
+      lhs_whole_buffer.GetByteSlice(lhs_length, lhs_length);
+  TF_ASSERT_OK(stream.Memcpy(&lhs, lhs_arr.data(), lhs_length));
+
+  // rhs = [1.0,
+  //        1.0,
+  //        1.0]
+  se::DeviceMemory<float> rhs = executor->AllocateArray<float>(3 * 1);
+  std::vector<float> rhs_arr(3, 1);
+  TF_ASSERT_OK(stream.Memcpy(&rhs, rhs_arr.data(), rhs_length));
+
+  se::DeviceMemory<float> out = executor->AllocateArray<float>(1 * 1);
+  TF_ASSERT_OK(stream.MemZero(&out, out_length));
+
+  se::DeviceMemory<float> workspace =
+      executor->AllocateArray<float>(1024 * 1024);
+  TF_ASSERT_OK(stream.MemZero(&workspace, 1024 * 1024));
+
+  se::DeviceMemory<int64_t> lhs_offset_0 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> lhs_offset_1 = executor->AllocateArray<int64_t>(1);
+  std::vector<int64_t> lhs_offset_arr{0, 1};
+  TF_ASSERT_OK(stream.Memcpy(&lhs_offset_0, &lhs_offset_arr[0], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&lhs_offset_1, &lhs_offset_arr[1], offset_length));
+
+  // Preparing parameters for thunk execution.
+  ServiceExecutableRunOptions run_options;
+  BufferAllocations allocations(
+      {lhs_whole_buffer, rhs, out, workspace, lhs_offset_0, lhs_offset_1}, 0,
+      executor->GetAllocator());
+
+  Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
+      run_options, allocations, &stream, &stream, {}, nullptr, nullptr);
+
+  Thunk::ExecutableSource source = {/*text=*/"", /*binary=*/{}};
+  TF_ASSERT_OK(
+      thunk.Initialize({executor, source, &allocations, &stream, &stream}));
+
+  // Executing address computation thunk.
+  TF_ASSERT_OK(thunk.ExecuteOnStream(params));
+  TF_ASSERT_OK(stream.BlockHostUntilDone());
+
+  // Copying `out` data back to host for verification.
+  std::vector<float> dst(1, 0);
+  TF_ASSERT_OK(stream.Memcpy(dst.data(), out, out_length));
+
+  ASSERT_EQ(dst, std::vector<float>({9}));
+}
+
+TEST(AddressComputationThunkTest, SlicedMemcpyOOB) {
+  se::StreamExecutor* executor = GpuExecutor();
+
+  se::Stream stream(executor);
+  TF_ASSERT_OK(stream.Initialize());
+
+  int64_t src_count = 8 * 8 * 10 * 2;
+  int64_t dst_count = 2 * 2 * 2 * 2;
+  int64_t slice_count = 2 * 2;
+  int64_t src_length = sizeof(int32_t) * src_count;
+  int64_t dst_length = sizeof(int32_t) * dst_count;
+  int64_t offset_length = sizeof(int64_t);
+  int64_t slice_length = sizeof(int32_t) * slice_count;
+
+  // Step 1:
+  // Prepare embedded and address computation thunks.
+
+  // Preparing buffer allocation slices for thunk creations.
+  std::vector<std::unique_ptr<BufferAllocation>> fake_allocations(2);
+
+  // Fake slices for embedded thunk creation.
+  fake_allocations.push_back(std::make_unique<BufferAllocation>(
+      /*index=*/0, slice_length, /*color=*/0));
+  BufferAllocation::Slice slice_src_fake(fake_allocations.back().get(), 0,
+                                         slice_length);
+
+  fake_allocations.push_back(std::make_unique<BufferAllocation>(
+      /*index=*/1, slice_length, /*color=*/0));
+  BufferAllocation::Slice slice_dst_fake(fake_allocations.back().get(), 0,
+                                         slice_length);
+
+  BufferAllocation alloc_src(/*index=*/0, src_length, /*color=*/0);
+  BufferAllocation::Slice slice_src(&alloc_src, 0, src_length);
+
+  BufferAllocation alloc_dst(/*index=*/1, dst_length, /*color=*/0);
+  BufferAllocation::Slice slice_dst(&alloc_dst, 0, dst_length);
+
+  BufferAllocation alloc_src_offset_0(/*index=*/2, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_src_offset_0(&alloc_src_offset_0, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_src_offset_1(/*index=*/3, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_src_offset_1(&alloc_src_offset_1, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_src_offset_2(/*index=*/4, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_src_offset_2(&alloc_src_offset_2, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_src_offset_3(/*index=*/5, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_src_offset_3(&alloc_src_offset_3, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_dst_offset_0(/*index=*/6, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_dst_offset_0(&alloc_dst_offset_0, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_dst_offset_1(/*index=*/7, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_dst_offset_1(&alloc_dst_offset_1, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_dst_offset_2(/*index=*/8, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_dst_offset_2(&alloc_dst_offset_2, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_dst_offset_3(/*index=*/9, offset_length, /*color=*/0);
+  BufferAllocation::Slice slice_dst_offset_3(&alloc_dst_offset_3, 0,
+                                             offset_length);
+
+  // Preparing custom call thunk: setting up call target and operands + results
+  // buffers.
+  auto registration = xla::ffi::FindHandler("__xla_test$$memcpy", PLATFORM);
+  ASSERT_TRUE(registration.ok());
+
+  std::vector<std::optional<CustomCallThunk::Slice>> operands{
+      CustomCallThunk::Slice{slice_src_fake,
+                             ShapeUtil::MakeShape(PrimitiveType::S32, {2, 2})}};
+  std::vector<std::optional<CustomCallThunk::Slice>> results{
+      CustomCallThunk::Slice{slice_dst_fake,
+                             ShapeUtil::MakeShape(PrimitiveType::S32, {2, 2})}};
+
+  // Creating embedded custom call thunk.
+  ThunkSequence seq;
+  seq.emplace_back(std::make_unique<CustomCallThunk>(
+      Thunk::ThunkInfo(nullptr), registration->handler, operands, results,
+      /*attributes=*/CustomCallThunk::AttributesMap(),
+      /*called_computation=*/nullptr));
+
+  // Wrapping address computation thunk around the custom call thunk.
+  std::vector<BufferAllocation::Slice> slice_src_offsets{
+      slice_src_offset_0, slice_src_offset_1, slice_src_offset_2,
+      slice_src_offset_3};
+  std::vector<BufferAllocation::Slice> slice_dst_offsets{
+      slice_dst_offset_0, slice_dst_offset_1, slice_dst_offset_2,
+      slice_dst_offset_3};
+  AddressComputationThunk thunk(
+      Thunk::ThunkInfo(nullptr),
+      std::make_unique<ThunkSequence>(std::move(seq)), {slice_src, slice_dst},
+      std::move(fake_allocations), {slice_src_offsets, slice_dst_offsets},
+      {ShapeUtil::MakeShape(PrimitiveType::S32, {8, 8, 10, 2}),
+       ShapeUtil::MakeShape(PrimitiveType::S32, {2, 2, 2, 2})},
+      // Make sure to pass a dst shape with the same rank as src shape (i.e.
+      // original slice result and not bitcasted one)
+      {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 2, 2}),
+       ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 2, 2})},
+      {sizeof(int64_t), sizeof(int64_t)});
+
+  // Step 2:
+  // Execute address computation thunk.
+  //
+  // Given a `src` tensor of shape s32[8,8,10,2]{3,2,1,0}
+  // The `src` slice that we want to copy from will be equivalent to this static
+  // slice op:
+  // s32[1,1,2,2]{3,2,1,0} slice(src), slice={[3:4], [5:6], [2:4], [0:2]}
+  //
+  // Given a `dst` tensor of shape s32[2,2,2,2]{3,2,1,0}
+  // The `dst` slice that we want to copy into will be equivalent to this static
+  // slice op:
+  // s32[1,1,2,2]{3,2,1,0} slice(dst), slice={[1:2], [1:2], [0:2], [0:2]}
+
+  // Preparing memory for thunk arguments.
+  se::DeviceMemory<int32_t> src = executor->AllocateArray<int32_t>(src_count);
+  std::vector<int32_t> src_arr(src_count, 0);
+  for (unsigned i = 0; i < src_count; ++i) src_arr[i] = i;
+  TF_ASSERT_OK(stream.Memcpy(&src, src_arr.data(), src_length));
+
+  se::DeviceMemory<int32_t> dst = executor->AllocateArray<int32_t>(dst_count);
+  TF_ASSERT_OK(stream.MemZero(&dst, dst_length));
+
+  se::DeviceMemory<int64_t> src_offset_0 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> src_offset_1 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> src_offset_2 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> src_offset_3 = executor->AllocateArray<int64_t>(1);
+  std::vector<int64_t> src_ref_offset_arr{3, 5, 2, 0};
+  std::vector<int64_t> src_offset_arr{3, 5, 2, -3};
+  TF_ASSERT_OK(stream.Memcpy(&src_offset_0, &src_offset_arr[0], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&src_offset_1, &src_offset_arr[1], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&src_offset_2, &src_offset_arr[2], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&src_offset_3, &src_offset_arr[3], offset_length));
+
+  se::DeviceMemory<int64_t> dst_offset_0 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> dst_offset_1 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> dst_offset_2 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> dst_offset_3 = executor->AllocateArray<int64_t>(1);
+  std::vector<int64_t> dst_ref_offset_arr{1, 1, 0, 0};
+  std::vector<int64_t> dst_offset_arr{3, 2, 5, -4};
+  TF_ASSERT_OK(stream.Memcpy(&dst_offset_0, &dst_offset_arr[0], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&dst_offset_1, &dst_offset_arr[1], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&dst_offset_2, &dst_offset_arr[2], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&dst_offset_3, &dst_offset_arr[3], offset_length));
+
+  // Preparing parameters for thunk execution.
+  ServiceExecutableRunOptions run_options;
+  BufferAllocations allocations(
+      {src, dst, src_offset_0, src_offset_1, src_offset_2, src_offset_3,
+       dst_offset_0, dst_offset_1, dst_offset_2, dst_offset_3},
+      0, executor->GetAllocator());
+
+  Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
+      run_options, allocations, &stream, &stream, {}, nullptr, nullptr);
+
+  Thunk::ExecutableSource source = {/*text=*/"", /*binary=*/{}};
+  TF_ASSERT_OK(
+      thunk.Initialize({executor, source, &allocations, &stream, &stream}));
+
+  // Executing address computation thunk.
+  TF_ASSERT_OK(thunk.ExecuteOnStream(params));
+  TF_ASSERT_OK(stream.BlockHostUntilDone());
+
+  // Copying `dst` data back to host for verification.
+  std::vector<int32_t> out(dst_count, 0);
+  TF_ASSERT_OK(stream.Memcpy(out.data(), dst, dst_length));
+
+  // Verifying that the right slice of `src` was copied to `dst`.
+  std::vector<int32_t> ref(dst_count, 0);
+  int64_t src_offset_val =
+      src_ref_offset_arr[3] +
+      2 * (src_ref_offset_arr[2] +
+           10 * (src_ref_offset_arr[1] + 8 * src_ref_offset_arr[0]));
+  int64_t dst_offset_val =
+      dst_ref_offset_arr[3] +
+      2 * (dst_ref_offset_arr[2] +
+           2 * (dst_ref_offset_arr[1] + 2 * dst_ref_offset_arr[0]));
+  std::copy(src_arr.begin() + src_offset_val,
+            src_arr.begin() + src_offset_val + slice_count,
+            ref.begin() + dst_offset_val);
+  ASSERT_EQ(out, ref);
+}
+
+TEST(AddressComputationThunkTest, SlicedOperandsSameBufferGemm) {
+  se::StreamExecutor* executor = GpuExecutor();
+
+  se::Stream stream(executor);
+  TF_ASSERT_OK(stream.Initialize());
+
+  int64_t lhs_length = sizeof(float) * 2 * 4;
+  int64_t rhs_length = sizeof(float) * 3 * 1;
+  int64_t out_length = sizeof(float) * 1 * 1;
+  int64_t offset_length = sizeof(int64_t);
+
+  // Step 1:
+  // Prepare embedded and address computation thunks.
+
+  // Preparing buffer allocation slices for thunk creations.
+  std::vector<std::unique_ptr<BufferAllocation>> fake_allocations(4);
+
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/0, rhs_length, /*color=*/0));
+  BufferAllocation::Slice slice_lhs_fake(fake_allocations.back().get(), 0,
+                                         rhs_length);
+
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/1, rhs_length, /*color=*/0));
+  BufferAllocation::Slice slice_rhs_fake(fake_allocations.back().get(), 0,
+                                         rhs_length);
+
+  fake_allocations.push_back(
+      std::make_unique<BufferAllocation>(/*index=*/2, out_length, /*color=*/0));
+  BufferAllocation::Slice slice_out_fake(fake_allocations.back().get(), 0,
+                                         out_length);
+
+  fake_allocations.push_back(std::make_unique<BufferAllocation>(
+      /*index=*/3, 1024 * 1024, /*color=*/0));
+  BufferAllocation::Slice slice_workspace_fake(fake_allocations.back().get(), 0,
+                                               1024 * 1024);
+
+  BufferAllocation alloc(/*index=*/0, lhs_length + rhs_length + out_length,
+                         /*color=*/0);
+  BufferAllocation::Slice slice_lhs(&alloc, 0, lhs_length);
+  BufferAllocation::Slice slice_rhs(&alloc, lhs_length, rhs_length);
+  BufferAllocation::Slice slice_out(&alloc, lhs_length + rhs_length,
+                                    out_length);
+
+  BufferAllocation alloc_workspace(/*index=*/1, 1024 * 1024, /*color=*/0);
+  BufferAllocation::Slice slice_workspace(&alloc_workspace, 0, 1024 * 1024);
+
+  BufferAllocation alloc_lhs_offset_0(/*index=*/2, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_lhs_offset_0(&alloc_lhs_offset_0, 0,
+                                             offset_length);
+
+  BufferAllocation alloc_lhs_offset_1(/*index=*/3, offset_length,
+                                      /*color=*/0);
+  BufferAllocation::Slice slice_lhs_offset_1(&alloc_lhs_offset_1, 0,
+                                             offset_length);
+
+  // Preparing config for GEMM thunk.
+  auto config =
+      GemmConfig::For(ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), {}, {1},
+                      ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1}), {}, {0},
+                      ShapeUtil::MakeShape(PrimitiveType::F32, {1, 1}), 1.0,
+                      0.0, 0.0, PrecisionConfig::ALG_UNSET, std::nullopt,
+                      se::blas::kDefaultComputePrecision, false, false);
+  ASSERT_TRUE(config.ok());
+
+  // Creating embedded GEMM thunk.
+  ThunkSequence seq;
+  seq.emplace_back(std::make_unique<GemmThunk>(
+      Thunk::ThunkInfo(nullptr), config.value(), slice_lhs_fake, slice_rhs_fake,
+      slice_out_fake, slice_workspace_fake, /*deterministic=*/true));
+
+  // Wrapping address computation thunk around the GEMM thunk.
+  std::vector<BufferAllocation::Slice> lhs_offsets{slice_lhs_offset_0,
+                                                   slice_lhs_offset_1};
+  AddressComputationThunk thunk(
+      Thunk::ThunkInfo(nullptr),
+      std::make_unique<ThunkSequence>(std::move(seq)),
+      {slice_lhs, slice_rhs, slice_out, slice_workspace},
+      std::move(fake_allocations),
+      {lhs_offsets, std::nullopt, std::nullopt, std::nullopt},
+      {ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), std::nullopt,
+       std::nullopt, std::nullopt},
+      {ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), std::nullopt,
+       std::nullopt, std::nullopt},
+      {sizeof(int64_t), std::nullopt, std::nullopt, std::nullopt});
+
+  // Step 2:
+  // Execute address computation thunk.
+  //
+
+  // Preparing memory for thunk arguments.
+  // lhs = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0,
+  //        5.0, 6.0, 7.0, 8.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+  //
+  // The real `lhs` tensor will look more like this:
+  // lhs = [1.0, 2.0, 3.0, 4.0,
+  //        5.0, 6.0, 7.0, 8.0]
+  // The `lhs` slice that we want to use will be equivalent to this static
+  // slice op:
+  // f32[1,3]{1,0} slice(lhs), slice={[0:1], [1:4]}
+  se::DeviceMemory<float> buffer =
+      executor->AllocateArray<float>(lhs_length + rhs_length + out_length);
+  TF_ASSERT_OK(stream.MemZero(&buffer, lhs_length + rhs_length + out_length));
+
+  se::DeviceMemoryBase lhs = buffer.GetByteSlice(0, lhs_length);
+  std::vector<float> lhs_arr{1, 2, 3, 4, 5, 6, 7, 8};
+  TF_ASSERT_OK(stream.Memcpy(&lhs, lhs_arr.data(), lhs_length));
+
+  // rhs = [1.0,
+  //        1.0,
+  //        1.0]
+  se::DeviceMemoryBase rhs = buffer.GetByteSlice(lhs_length, rhs_length);
+  std::vector<float> rhs_arr(3, 1);
+  TF_ASSERT_OK(stream.Memcpy(&rhs, rhs_arr.data(), rhs_length));
+
+  se::DeviceMemoryBase out =
+      buffer.GetByteSlice(lhs_length + rhs_length, out_length);
+
+  se::DeviceMemory<float> workspace =
+      executor->AllocateArray<float>(1024 * 1024);
+  TF_ASSERT_OK(stream.MemZero(&workspace, 1024 * 1024));
+
+  se::DeviceMemory<int64_t> lhs_offset_0 = executor->AllocateArray<int64_t>(1);
+  se::DeviceMemory<int64_t> lhs_offset_1 = executor->AllocateArray<int64_t>(1);
+  std::vector<int64_t> lhs_offset_arr{0, 1};
+  TF_ASSERT_OK(stream.Memcpy(&lhs_offset_0, &lhs_offset_arr[0], offset_length));
+  TF_ASSERT_OK(stream.Memcpy(&lhs_offset_1, &lhs_offset_arr[1], offset_length));
+
+  // Preparing parameters for thunk execution.
+  ServiceExecutableRunOptions run_options;
+  BufferAllocations allocations({buffer, workspace, lhs_offset_0, lhs_offset_1},
+                                0, executor->GetAllocator());
+
+  Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
+      run_options, allocations, &stream, &stream, {}, nullptr, nullptr);
+
+  Thunk::ExecutableSource source = {/*text=*/"", /*binary=*/{}};
+  TF_ASSERT_OK(
+      thunk.Initialize({executor, source, &allocations, &stream, &stream}));
+
+  // Executing address computation thunk.
+  TF_ASSERT_OK(thunk.ExecuteOnStream(params));
+  TF_ASSERT_OK(stream.BlockHostUntilDone());
+
+  // Copying `out` data back to host for verification.
+  std::vector<float> dst(1, 0);
+  TF_ASSERT_OK(stream.Memcpy(dst.data(), out, out_length));
+
+  ASSERT_EQ(dst, std::vector<float>({9}));
+}
+
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/runtime/annotation.cc b/third_party/xla/xla/service/gpu/runtime/annotation.cc
index 39af816c1b29fe..809f02d30d10b3 100644
--- a/third_party/xla/xla/service/gpu/runtime/annotation.cc
+++ b/third_party/xla/xla/service/gpu/runtime/annotation.cc
@@ -42,29 +42,22 @@ limitations under the License.
 #include "tsl/profiler/lib/nvtx_utils.h"
 #include "tsl/profiler/lib/scoped_annotation.h"
 
+#if GOOGLE_CUDA
+#include "nvtx3/nvToolsExt.h"
+#include "nvtx3/nvToolsExtPayload.h"
+#endif
+
 namespace xla::gpu {
 
 using ::tsl::profiler::ScopedAnnotation;
+using ::tsl::profiler::StringHandle;
 namespace {
 
-nvtxStringHandle_t RegisterString(const std::string& str) {
-#if GOOGLE_CUDA
-  auto domain = tsl::profiler::GetNVTXDomain();
-  if (!domain) {
-    return {};  // NVTX not enabled, so don't registering strings.
+StringHandle RegisterString(const std::string& str) {
+  if (auto domain = tsl::profiler::DefaultProfilerDomain(); domain) {
+    return tsl::profiler::RegisterString(domain, str);
   }
-  constexpr auto max_length = 65330;
-  if (str.size() <= max_length) {
-    return nvtxDomainRegisterStringA(*domain, str.c_str());
-  }
-  // nvbugs 4340868
-  std::string_view suffix{"\n[truncated]\n"};
-  std::string buffer(str.data(), max_length - suffix.size());
-  buffer.append(suffix);
-  return nvtxDomainRegisterStringA(*domain, buffer.c_str());
-#else
   return {};
-#endif
 }
 
 // Nsight Systems supports some basic HTML markup in annotation strings. This
@@ -202,7 +195,7 @@ class SourceLocationVisitor : public ConstDfsHloVisitorWithDefault {
     return OkStatus();
   }
 
-  std::pair<nvtxStringHandle_t, int32_t> LongestSourceLocationPrefix() const {
+  std::pair<StringHandle, int32_t> LongestSourceLocationPrefix() const {
     // Find the longest common prefix along the members of location_set_ and
     // return a formatted version of that prefix, along with its length. As
     // location_set_ is sorted, that just means looking for the longest common
@@ -371,7 +364,7 @@ std::string CalledInstructionsAsString(HloInstruction const& inst) {
 
 // Get a string representing the longest common prefix of source locations in
 // this module, and the number of frames that that represents.
-std::pair<nvtxStringHandle_t, int32_t> GetLongestSourceLocationPrefix(
+std::pair<StringHandle, int32_t> GetLongestSourceLocationPrefix(
     const HloModule& mod) {
   // In the presence of (at least) debug callbacks, calling Accept on the root
   // instruction of the module may not reach all instructions in the module.
@@ -417,8 +410,8 @@ auto schema_entry(uint64_t type, const char* name, uint64_t offset) {
 uint64_t ModuleAnnotation::NvtxSchemaId() {
   static std::uint64_t schema_id = []() -> std::uint64_t {
 #if GOOGLE_CUDA
-    auto domain_opt = tsl::profiler::GetNVTXDomain();
-    if (!domain_opt.has_value()) {
+    auto domain = tsl::profiler::DefaultProfilerDomain();
+    if (!domain) {
       return 0;
     }
     const nvtxPayloadSchemaEntry_t schema[] = {
@@ -440,7 +433,7 @@ uint64_t ModuleAnnotation::NvtxSchemaId() {
         /* .entries = */ schema,
         /* .numEntries = */ sizeof(schema) / sizeof(schema[0]),
         /* .payloadStaticSize = */ sizeof(ModuleAnnotation)};
-    return nvtxPayloadSchemaRegister(*domain_opt, &schemaAttr);
+    return RegisterSchema(domain, &schemaAttr);
 #else
     return 0;
 #endif
@@ -491,8 +484,8 @@ ModuleAnnotations::ModuleAnnotations(std::string_view module_name)
 uint64_t KernelAnnotation::NvtxSchemaId() {
   static std::uint64_t schema_id = []() -> std::uint64_t {
 #if GOOGLE_CUDA
-    auto domain_opt = tsl::profiler::GetNVTXDomain();
-    if (!domain_opt.has_value()) {
+    auto domain = tsl::profiler::DefaultProfilerDomain();
+    if (!domain) {
       return 0;
     }
     const nvtxPayloadSchemaEntry_t schema[] = {
@@ -515,7 +508,7 @@ uint64_t KernelAnnotation::NvtxSchemaId() {
         /* .entries = */ schema,
         /* .numEntries = */ sizeof(schema) / sizeof(schema[0]),
         /* .payloadStaticSize = */ sizeof(KernelAnnotation)};
-    return nvtxPayloadSchemaRegister(*domain_opt, &schemaAttr);
+    return RegisterSchema(domain, &schemaAttr);
 #else
     return 0;
 #endif
diff --git a/third_party/xla/xla/service/gpu/runtime/annotation.h b/third_party/xla/xla/service/gpu/runtime/annotation.h
index fc0dbfb5ed2a60..70a4c8df1ddeb1 100644
--- a/third_party/xla/xla/service/gpu/runtime/annotation.h
+++ b/third_party/xla/xla/service/gpu/runtime/annotation.h
@@ -38,21 +38,21 @@ class ModuleAnnotation {
 
   std::string_view longest_op_name_prefix() const { return longest_prefix_; }
   explicit operator std::string_view() const { return title_str_; }
-  nvtxStringHandle_t title() const { return title_; }
+  tsl::profiler::StringHandle title() const { return title_; }
   static uint64_t NvtxSchemaId();
   int32_t common_stack_frames() const { return common_stack_frames_; }
 
  private:
-  friend void RangePush(nvtxDomainHandle_t domain,
+  friend void RangePush(tsl::profiler::ProfilerDomainHandle domain,
                         const ModuleAnnotation& annotation) {
     tsl::profiler::RangePush(domain, annotation.title(), annotation);
   }
 
   std::string longest_prefix_;
   std::string title_str_;
-  nvtxStringHandle_t title_;
-  nvtxStringHandle_t module_name_;
-  nvtxStringHandle_t common_src_locations_{};
+  tsl::profiler::StringHandle title_;
+  tsl::profiler::StringHandle module_name_;
+  tsl::profiler::StringHandle common_src_locations_{};
   int32_t module_id_{-1};
   int32_t common_stack_frames_{};
 };
@@ -66,16 +66,16 @@ struct KernelAnnotation {
   static uint64_t NvtxSchemaId();
 
  private:
-  friend void RangePush(nvtxDomainHandle_t domain,
+  friend void RangePush(tsl::profiler::ProfilerDomainHandle domain,
                         const KernelAnnotation& annotation) {
     tsl::profiler::RangePush(domain, annotation.title, annotation);
   }
 
   std::string title_str;
-  nvtxStringHandle_t title;
-  nvtxStringHandle_t hlo_dump;
-  nvtxStringHandle_t src_locations;
-  nvtxStringHandle_t called_hlo_dump;
+  tsl::profiler::StringHandle title;
+  tsl::profiler::StringHandle hlo_dump;
+  tsl::profiler::StringHandle src_locations;
+  tsl::profiler::StringHandle called_hlo_dump;
 };
 
 // Parsed/prepared information for an HloModule that gets propagated to NVTX
diff --git a/third_party/xla/xla/service/gpu/runtime/cholesky_thunk.h b/third_party/xla/xla/service/gpu/runtime/cholesky_thunk.h
index d2448223a82a09..3fdbf3ebc89f9d 100644
--- a/third_party/xla/xla/service/gpu/runtime/cholesky_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/cholesky_thunk.h
@@ -20,7 +20,7 @@ limitations under the License.
 
 #include "absl/status/status.h"
 #include "xla/service/buffer_assignment.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/stream_executor/blas.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/gpu/gpu_asm_opts.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd.cc b/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd.cc
index c1909dc98e437a..f8507cd4b205a0 100644
--- a/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd.cc
+++ b/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd.cc
@@ -51,16 +51,17 @@ limitations under the License.
 #include "xla/service/gpu/matmul_utils.h"
 #include "xla/service/gpu/nccl_api.h"
 #include "xla/service/gpu/nccl_clique_key.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
 #include "xla/service/gpu/runtime/annotation.h"
 #include "xla/service/gpu/runtime/nccl_all_gather_thunk.h"
 #include "xla/service/gpu/runtime/nccl_all_reduce_thunk.h"
 #include "xla/service/gpu/runtime/nccl_collective_broadcast_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/gpu/stream_executor_util.h"
-#include "xla/service/gpu/thunk.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/stream_executor/command_buffer.h"
 #include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/dnn.h"
 #include "xla/stream_executor/kernel.h"
 #include "xla/stream_executor/launch_dim.h"
 #include "xla/stream_executor/stream.h"
@@ -502,6 +503,7 @@ CommandBufferCmd::BufferUsageVector ComputationIdCmd::buffers() {
 
 absl::Status ComputationIdCmd::Initialize(const Thunk::InitializeParams& params,
                                           StateManager& state) {
+#if defined(GOOGLE_CUDA)
   {
     absl::MutexLock lock(&mutex_);
     if (memset_kernels_.contains(params.executor)) return absl::OkStatus();
@@ -514,6 +516,7 @@ absl::Status ComputationIdCmd::Initialize(const Thunk::InitializeParams& params,
 
   absl::MutexLock lock(&mutex_);
   memset_kernels_.emplace(params.executor, std::move(kernel));
+#endif  // GOOGLE_CUDA
   return absl::OkStatus();
 }
 
@@ -540,6 +543,7 @@ absl::Status ComputationIdCmd::Record(
           << "; execution_scope_id=" << execution_scope_id.value();
   VLOG(5) << "  Id: " << dest_ << " (" << dst.opaque() << ")";
 
+#if defined(GOOGLE_CUDA)
   se::Kernel* memset_kernel = [&] {
     absl::MutexLock lock(&mutex_);
     return memset_kernels_[execute_params.stream->parent()].get();
@@ -553,6 +557,10 @@ absl::Status ComputationIdCmd::Record(
   auto args = se::PackKernelArgs(/*shmem_bytes=*/0, int64_t{1}, value, dst);
   return command_buffer->Launch(execution_scope_id, se::ThreadDim(1),
                                 se::BlockDim(1), *memset_kernel, *args);
+#else
+  return command_buffer->Memset(execution_scope_id, &dst, value,
+                                /*num_elements=*/1);
+#endif  // GOOGLE_CUDA
 }
 
 //===----------------------------------------------------------------------===//
@@ -1150,7 +1158,7 @@ CommandBufferCmd::BufferUsageVector GemmCmd::buffers() {
 
 CuDnnCmd::CuDnnCmd(ExecutionStreamId execution_stream_id,
                    absl::Span<const BufferAllocation::Slice> args,
-                   const se::dnn::DnnGraph& graph)
+                   const std::shared_ptr<se::dnn::LazyDnnGraph> graph)
     : TracedCommandBufferCmd(execution_stream_id),
       args_(args.cbegin(), args.cend()),
       graph_(graph) {}
@@ -1166,6 +1174,7 @@ absl::Status CuDnnCmd::Initialize(const Thunk::InitializeParams& params,
 absl::Status CuDnnCmd::Record(const Thunk::ExecuteParams& execute_params,
                               const RecordParams& record_params,
                               se::CommandBuffer* command_buffer) {
+  CHECK(graph_ != nullptr);
   std::vector<se::DeviceMemoryBase> operands;
   operands.reserve(args_.size());
   for (const BufferAllocation::Slice& arg : args_) {
@@ -1177,8 +1186,8 @@ absl::Status CuDnnCmd::Record(const Thunk::ExecuteParams& execute_params,
 
   return AddTracedCommandBuffer(
       execute_params, record_params, command_buffer, [&](se::Stream* stream) {
-        return graph_.Execute(*stream,
-                              absl::Span<se::DeviceMemoryBase>(operands));
+        return graph_->get()->Execute(
+            *stream, absl::Span<se::DeviceMemoryBase>(operands));
       });
 }
 
diff --git a/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd.h b/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd.h
index 21682123245b40..ddbbff9d361694 100644
--- a/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd.h
+++ b/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd.h
@@ -43,10 +43,9 @@ limitations under the License.
 #include "xla/service/gpu/matmul_utils.h"
 #include "xla/service/gpu/nccl_api.h"
 #include "xla/service/gpu/nccl_clique_key.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
-#include "xla/service/gpu/runtime/cudnn_thunk.h"
 #include "xla/service/gpu/runtime/custom_call_thunk.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/status.h"
 #include "xla/stream_executor/command_buffer.h"
 #include "xla/stream_executor/device_memory.h"
@@ -756,7 +755,7 @@ class CuDnnCmd : public TracedCommandBufferCmd {
  public:
   CuDnnCmd(ExecutionStreamId execution_stream_id,
            absl::Span<const BufferAllocation::Slice> args,
-           const se::dnn::DnnGraph& graph);
+           std::shared_ptr<se::dnn::LazyDnnGraph> graph);
 
   absl::Status Initialize(const Thunk::InitializeParams& params,
                           StateManager& state) override;
@@ -771,7 +770,7 @@ class CuDnnCmd : public TracedCommandBufferCmd {
 
  private:
   std::vector<BufferAllocation::Slice> args_;
-  const se::dnn::DnnGraph& graph_;
+  const std::shared_ptr<se::dnn::LazyDnnGraph> graph_;
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd_emitter.cc b/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd_emitter.cc
index 17596acd6a176b..e1974c17cfefa7 100644
--- a/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd_emitter.cc
+++ b/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd_emitter.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "xla/service/gpu/runtime/command_buffer_cmd.h"
 #include "xla/service/gpu/runtime/conditional_thunk.h"
 #include "xla/service/gpu/runtime/copy_thunk.h"
+#include "xla/service/gpu/runtime/cudnn_thunk.h"
 #include "xla/service/gpu/runtime/custom_call_thunk.h"
 #include "xla/service/gpu/runtime/gemm_thunk.h"
 #include "xla/service/gpu/runtime/kernel_thunk.h"
@@ -34,8 +35,8 @@ limitations under the License.
 #include "xla/service/gpu/runtime/nccl_all_reduce_thunk.h"
 #include "xla/service/gpu/runtime/replica_id_thunk.h"
 #include "xla/service/gpu/runtime/sequential_thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/gpu/runtime/while_thunk.h"
-#include "xla/service/gpu/thunk.h"
 #include "xla/util.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/statusor.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd_emitter.h b/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd_emitter.h
index 5bb327a4e60271..a5608355a79752 100644
--- a/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd_emitter.h
+++ b/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd_emitter.h
@@ -18,7 +18,7 @@ limitations under the License.
 
 #include "absl/status/statusor.h"
 #include "xla/service/gpu/runtime/command_buffer_cmd.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 
 namespace xla::gpu {
 
diff --git a/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd_test.cc b/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd_test.cc
index 7d7bdcd9aee4f3..d174c4733d8867 100644
--- a/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd_test.cc
+++ b/third_party/xla/xla/service/gpu/runtime/command_buffer_cmd_test.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/buffer_allocations.h"
 #include "xla/service/gpu/launch_dimensions.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/platform_util.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/status.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/command_buffer_thunk.cc b/third_party/xla/xla/service/gpu/runtime/command_buffer_thunk.cc
index 3878153d18b1d4..b1de1a0f86f87b 100644
--- a/third_party/xla/xla/service/gpu/runtime/command_buffer_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/command_buffer_thunk.cc
@@ -29,7 +29,7 @@ limitations under the License.
 #include "xla/service/gpu/buffer_allocations.h"
 #include "xla/service/gpu/runtime/annotation.h"
 #include "xla/service/gpu/runtime/command_buffer_cmd.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/stream_executor/command_buffer.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/stream_executor.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/command_buffer_thunk.h b/third_party/xla/xla/service/gpu/runtime/command_buffer_thunk.h
index d81003c899661f..c1c5f1feba64c9 100644
--- a/third_party/xla/xla/service/gpu/runtime/command_buffer_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/command_buffer_thunk.h
@@ -28,7 +28,7 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "xla/service/gpu/runtime/command_buffer_allocations.h"
 #include "xla/service/gpu/runtime/command_buffer_cmd.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/stream_executor/command_buffer.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/stream_executor.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/command_buffer_thunk_test.cc b/third_party/xla/xla/service/gpu/runtime/command_buffer_thunk_test.cc
index 4472f5ef6b73f4..4a78b621b3dd4b 100644
--- a/third_party/xla/xla/service/gpu/runtime/command_buffer_thunk_test.cc
+++ b/third_party/xla/xla/service/gpu/runtime/command_buffer_thunk_test.cc
@@ -30,7 +30,7 @@ limitations under the License.
 #include "xla/service/gpu/matmul_utils.h"
 #include "xla/service/gpu/runtime/command_buffer_allocations.h"
 #include "xla/service/gpu/runtime/command_buffer_cmd.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/platform_util.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/shape_util.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/conditional_thunk.cc b/third_party/xla/xla/service/gpu/runtime/conditional_thunk.cc
index fb5a588fc21be0..8680126792fe07 100644
--- a/third_party/xla/xla/service/gpu/runtime/conditional_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/conditional_thunk.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/synchronization/mutex.h"
 #include "xla/service/buffer_assignment.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/gpu/variant_visitor.h"
 #include "xla/status.h"
 #include "xla/status_macros.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/conditional_thunk.h b/third_party/xla/xla/service/gpu/runtime/conditional_thunk.h
index a7a7d3be0e0e37..0d8109b70c09b6 100644
--- a/third_party/xla/xla/service/gpu/runtime/conditional_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/conditional_thunk.h
@@ -27,7 +27,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/runtime/sequential_thunk.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/status.h"
 #include "xla/stream_executor/memory_allocation.h"
 #include "xla/stream_executor/stream_executor.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/convolution_thunk.h b/third_party/xla/xla/service/gpu/runtime/convolution_thunk.h
index cc22a07c439dd6..3f9db4ea26660b 100644
--- a/third_party/xla/xla/service/gpu/runtime/convolution_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/convolution_thunk.h
@@ -28,7 +28,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/gpu_conv_runner.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/stream_executor/dnn.h"
 #include "xla/stream_executor/stream_executor.h"
 
diff --git a/third_party/xla/xla/service/gpu/runtime/copy_thunk.cc b/third_party/xla/xla/service/gpu/runtime/copy_thunk.cc
index 4bdc2acf5545cd..9a8698b23f8b5c 100644
--- a/third_party/xla/xla/service/gpu/runtime/copy_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/copy_thunk.cc
@@ -19,7 +19,7 @@ limitations under the License.
 
 #include "mlir/IR/Value.h"  // from @llvm-project
 #include "xla/service/buffer_assignment.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/status.h"
 #include "xla/stream_executor/stream_executor.h"
 
diff --git a/third_party/xla/xla/service/gpu/runtime/copy_thunk.h b/third_party/xla/xla/service/gpu/runtime/copy_thunk.h
index 9ad1a2943de683..521030ccee233d 100644
--- a/third_party/xla/xla/service/gpu/runtime/copy_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/copy_thunk.h
@@ -20,7 +20,7 @@ limitations under the License.
 
 #include "absl/status/status.h"
 #include "xla/service/buffer_assignment.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/service/gpu/runtime/cub_sort_thunk.cc b/third_party/xla/xla/service/gpu/runtime/cub_sort_thunk.cc
index 1e4fe32cd97db2..5b477dc5fb2a46 100644
--- a/third_party/xla/xla/service/gpu/runtime/cub_sort_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/cub_sort_thunk.cc
@@ -31,7 +31,7 @@ limitations under the License.
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/buffer_allocations.h"
 #include "xla/service/gpu/cub_sort_kernel.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/cub_sort_thunk.h b/third_party/xla/xla/service/gpu/runtime/cub_sort_thunk.h
index 69cf643789b67f..12ee7a6dd1f3eb 100644
--- a/third_party/xla/xla/service/gpu/runtime/cub_sort_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/cub_sort_thunk.h
@@ -24,7 +24,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "xla/service/buffer_assignment.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/xla_data.pb.h"
 
diff --git a/third_party/xla/xla/service/gpu/runtime/cudnn_thunk.cc b/third_party/xla/xla/service/gpu/runtime/cudnn_thunk.cc
index 2a33ed605a1dcd..b156aa3228c24b 100644
--- a/third_party/xla/xla/service/gpu/runtime/cudnn_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/cudnn_thunk.cc
@@ -15,29 +15,35 @@ limitations under the License.
 
 #include "xla/service/gpu/runtime/cudnn_thunk.h"
 
+#include <memory>
+#include <utility>
+
 #include "absl/status/status.h"
+#include "xla/stream_executor/dnn.h"
+#include "tsl/platform/errors.h"
 
 namespace xla {
 namespace gpu {
 
-CuDnnThunk::CuDnnThunk(std::string serialized_graph, ThunkInfo thunk_info,
+CuDnnThunk::CuDnnThunk(std::string fingerprint, ThunkInfo thunk_info,
                        absl::Span<const KernelArgument> kernel_arguments)
     : Thunk(Kind::kCuDnn, std::move(thunk_info)),
-      serialized_graph_(std::move(serialized_graph)) {
+      fingerprint_(std::move(fingerprint)),
+      graph_(std::make_shared<se::dnn::LazyDnnGraph>(nullptr)) {
   args_.reserve(kernel_arguments.size());
   for (const KernelArgument& kernel_argument : kernel_arguments) {
     args_.push_back(kernel_argument.slice());
-  }
+  };
 }
 
 absl::Status CuDnnThunk::Initialize(const InitializeParams& params) {
   absl::Status ret = absl::OkStatus();
   absl::call_once(once_flag_, [&] {
-    auto result =
-        params.stream->parent()->AsDnn()->DeserializeGraph(serialized_graph_);
-    std::string().swap(serialized_graph_);
+    auto result = params.stream->parent()->AsDnn()->DeserializeGraph(
+        params.src.dnn_compiled_graphs.at(fingerprint_));
+    std::string().swap(fingerprint_);
     if (result.ok()) {
-      graph_ = std::move(*result);
+      graph_->swap(*result);
     }
     ret = result.status();
   });
@@ -45,13 +51,16 @@ absl::Status CuDnnThunk::Initialize(const InitializeParams& params) {
 }
 
 absl::Status CuDnnThunk::ExecuteOnStream(const ExecuteParams& params) {
+  InitializeParams initialize_params;
+  initialize_params.stream = params.stream;
+  TF_RETURN_IF_ERROR(Initialize(initialize_params));
   std::vector<se::DeviceMemoryBase> buffer_args;
   buffer_args.reserve(args_.size());
   for (const BufferAllocation::Slice& arg : args_) {
     buffer_args.push_back(params.buffer_allocations->GetDeviceAddress(arg));
   }
-  return graph_->Execute(*params.stream,
-                         absl::Span<se::DeviceMemoryBase>(buffer_args));
+  return graph_->get()->Execute(*params.stream,
+                                absl::Span<se::DeviceMemoryBase>(buffer_args));
 }
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/service/gpu/runtime/cudnn_thunk.h b/third_party/xla/xla/service/gpu/runtime/cudnn_thunk.h
index be1204b9b65dc9..b1b99889522179 100644
--- a/third_party/xla/xla/service/gpu/runtime/cudnn_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/cudnn_thunk.h
@@ -25,7 +25,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/kernel_arguments.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/stream_executor/dnn.h"
 
 namespace xla {
@@ -34,7 +34,7 @@ namespace gpu {
 // Wraps executable cuDNN graph objects.
 class CuDnnThunk : public Thunk {
  public:
-  CuDnnThunk(std::string serialized_graph, ThunkInfo,
+  CuDnnThunk(std::string fingerprint, ThunkInfo,
              absl::Span<const KernelArgument>);
   CuDnnThunk(const CuDnnThunk&) = delete;
   CuDnnThunk& operator=(const CuDnnThunk&) = delete;
@@ -43,15 +43,15 @@ class CuDnnThunk : public Thunk {
   absl::Status Initialize(const InitializeParams&) override;
   absl::Status ExecuteOnStream(const ExecuteParams&) override;
 
-  const se::dnn::DnnGraph& graph() const { return *graph_; }
+  std::shared_ptr<se::dnn::LazyDnnGraph> graph() const { return graph_; }
   const std::vector<BufferAllocation::Slice>& arguments() const {
     return args_;
   }
 
  private:
   absl::once_flag once_flag_;
-  std::string serialized_graph_;
-  std::unique_ptr<se::dnn::DnnGraph> graph_;
+  std::string fingerprint_;
+  std::shared_ptr<se::dnn::LazyDnnGraph> graph_;
   std::vector<BufferAllocation::Slice> args_;
 };
 
diff --git a/third_party/xla/xla/service/gpu/runtime/custom_call_thunk.cc b/third_party/xla/xla/service/gpu/runtime/custom_call_thunk.cc
index 28a7dcebfc1dfa..0eaf0aaf4c39a4 100644
--- a/third_party/xla/xla/service/gpu/runtime/custom_call_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/custom_call_thunk.cc
@@ -15,13 +15,21 @@ limitations under the License.
 
 #include "xla/service/gpu/runtime/custom_call_thunk.h"
 
+#include <cstdint>
 #include <optional>
 #include <string>
+#include <string_view>
 #include <utility>
 #include <vector>
 
 #include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "xla/executable_run_options.h"
 #include "xla/ffi/api/c_api.h"
 #include "xla/ffi/call_frame.h"
@@ -30,11 +38,12 @@ limitations under the License.
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/custom_call_status.h"
 #include "xla/service/custom_call_status_internal.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/status.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/util.h"
+#include "tsl/platform/errors.h"
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "xla/stream_executor/gpu/gpu_stream.h"
@@ -139,6 +148,8 @@ absl::Status CustomCallThunk::ExecuteFfiHandler(const ExecuteParams& params) {
   // execution context, as apparently it's not easily accessible from Thunk.
   ExecutableRunOptions run_options;
   run_options.set_stream(params.stream);
+  run_options.set_allocator(params.buffer_allocations->memory_allocator());
+  run_options.set_device_ordinal(params.buffer_allocations->device_ordinal());
   ServiceExecutableRunOptions service_run_options(run_options);
 
   CallOptions options = {&service_run_options, called_computation_};
@@ -149,5 +160,68 @@ absl::Status CustomCallThunk::ExecuteOnStream(const ExecuteParams& params) {
   return handler_ ? ExecuteFfiHandler(params) : ExecuteCustomCall(params);
 }
 
+absl::StatusOr<CustomCallThunk::AttributesMap> BuildAttributesMap(
+    mlir::DictionaryAttr dict) {
+  CustomCallThunk::AttributesMap attributes;
+  for (auto& kv : dict) {
+    std::string_view name = kv.getName().strref();
+
+    auto integer = [&](mlir::IntegerAttr integer) {
+      switch (integer.getType().getIntOrFloatBitWidth()) {
+        case 32:
+          attributes[name] = static_cast<int32_t>(integer.getInt());
+          return absl::OkStatus();
+        case 64:
+          attributes[name] = static_cast<int64_t>(integer.getInt());
+          return absl::OkStatus();
+        default:
+          return absl::InvalidArgumentError(absl::StrCat(
+              "Unsupported integer attribute bit width for attribute: ", name));
+      }
+    };
+
+    auto fp = [&](mlir::FloatAttr fp) {
+      switch (fp.getType().getIntOrFloatBitWidth()) {
+        case 32:
+          attributes[name] = static_cast<float>(fp.getValue().convertToFloat());
+          return absl::OkStatus();
+        default:
+          return absl::InvalidArgumentError(absl::StrCat(
+              "Unsupported float attribute bit width for attribute: ", name));
+      }
+    };
+
+    auto arr = [&](mlir::DenseArrayAttr arr) {
+      if (auto dense = mlir::dyn_cast<mlir::DenseI32ArrayAttr>(arr)) {
+        attributes[name] = dense.asArrayRef().vec();
+        return absl::OkStatus();
+      } else if (auto dense = mlir::dyn_cast<mlir::DenseI64ArrayAttr>(arr)) {
+        attributes[name] = dense.asArrayRef().vec();
+        return absl::OkStatus();
+      }
+
+      return absl::InvalidArgumentError(
+          absl::StrCat("Unsupported array element type for attribute: ", name));
+    };
+
+    auto str = [&](mlir::StringAttr str) {
+      attributes[name] = str.getValue().str();
+      return absl::OkStatus();
+    };
+
+    TF_RETURN_IF_ERROR(
+        llvm::TypeSwitch<mlir::Attribute, Status>(kv.getValue())
+            .Case<mlir::IntegerAttr>(integer)
+            .Case<mlir::FloatAttr>(fp)
+            .Case<mlir::DenseArrayAttr>(arr)
+            .Case<mlir::StringAttr>(str)
+            .Default([&](mlir::Attribute) {
+              return absl::InvalidArgumentError(absl::StrCat(
+                  "Unsupported attribute type for attribute: ", name));
+            }));
+  }
+  return attributes;
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/runtime/custom_call_thunk.h b/third_party/xla/xla/service/gpu/runtime/custom_call_thunk.h
index dd445a248935e0..02679d2e0d21ff 100644
--- a/third_party/xla/xla/service/gpu/runtime/custom_call_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/custom_call_thunk.h
@@ -28,7 +28,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/custom_call_status.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/shape.h"
 #include "xla/status.h"
 
@@ -120,6 +120,11 @@ class CustomCallThunk : public Thunk {
   const HloComputation* called_computation_ = nullptr;
 };
 
+// Converts MLIR dictionary attribute attached to a custom call operation to a
+// custom call thunk attributes that are forwarded to the FFI handler.
+absl::StatusOr<CustomCallThunk::AttributesMap> BuildAttributesMap(
+    mlir::DictionaryAttr dict);
+
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/third_party/xla/xla/service/gpu/runtime/fft_thunk.h b/third_party/xla/xla/service/gpu/runtime/fft_thunk.h
index 278e946f8df0cc..ffd45ed804fda9 100644
--- a/third_party/xla/xla/service/gpu/runtime/fft_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/fft_thunk.h
@@ -26,7 +26,7 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
 #include "xla/service/buffer_assignment.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/shape.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/device_memory_allocator.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/fused_mha_thunk.h b/third_party/xla/xla/service/gpu/runtime/fused_mha_thunk.h
index 0415f3cc53e14d..6e93541f09c123 100644
--- a/third_party/xla/xla/service/gpu/runtime/fused_mha_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/fused_mha_thunk.h
@@ -24,7 +24,7 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/gpu_fused_mha_runner.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/xla_data.pb.h"
 
diff --git a/third_party/xla/xla/service/gpu/runtime/gemm_thunk.cc b/third_party/xla/xla/service/gpu/runtime/gemm_thunk.cc
index cc74707c84e01f..4d46a78c4af583 100644
--- a/third_party/xla/xla/service/gpu/runtime/gemm_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/gemm_thunk.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/matmul_utils.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/stream_executor/device_memory.h"
 #include "tsl/platform/logging.h"
 
diff --git a/third_party/xla/xla/service/gpu/runtime/gemm_thunk.h b/third_party/xla/xla/service/gpu/runtime/gemm_thunk.h
index a134ed2623b1ae..58f13d33172bb4 100644
--- a/third_party/xla/xla/service/gpu/runtime/gemm_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/gemm_thunk.h
@@ -20,7 +20,7 @@ limitations under the License.
 
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/matmul_utils.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/status.h"
 
 namespace xla {
diff --git a/third_party/xla/xla/service/gpu/runtime/gpublas_lt_matmul_thunk.cc b/third_party/xla/xla/service/gpu/runtime/gpublas_lt_matmul_thunk.cc
index 74316dcb511b21..a6aaabe5b31896 100644
--- a/third_party/xla/xla/service/gpu/runtime/gpublas_lt_matmul_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/gpublas_lt_matmul_thunk.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <utility>
 
 #include "xla/service/gpu/matmul_utils.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/status.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/scratch_allocator.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/gpublas_lt_matmul_thunk.h b/third_party/xla/xla/service/gpu/runtime/gpublas_lt_matmul_thunk.h
index 260feaee13671b..9c998393e6638e 100644
--- a/third_party/xla/xla/service/gpu/runtime/gpublas_lt_matmul_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/gpublas_lt_matmul_thunk.h
@@ -27,7 +27,7 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/matmul_utils.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/status.h"
 #include "xla/stream_executor/gpu/gpu_blas_lt.h"
 #include "xla/stream_executor/stream.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/infeed_thunk.h b/third_party/xla/xla/service/gpu/runtime/infeed_thunk.h
index 6bff3471d8ac43..7a3db689cd6a35 100644
--- a/third_party/xla/xla/service/gpu/runtime/infeed_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/infeed_thunk.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/status/status.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/service/gpu/runtime/kernel_thunk.cc b/third_party/xla/xla/service/gpu/runtime/kernel_thunk.cc
index b482c1cf97a228..063940d8ab7cd4 100644
--- a/third_party/xla/xla/service/gpu/runtime/kernel_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/kernel_thunk.cc
@@ -33,8 +33,8 @@ limitations under the License.
 #include "xla/service/gpu/kernel_arguments.h"
 #include "xla/service/gpu/kernels/custom_kernel.h"
 #include "xla/service/gpu/launch_dimensions.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/gpu/stream_executor_util.h"
-#include "xla/service/gpu/thunk.h"
 #include "xla/status.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/kernel.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/kernel_thunk.h b/third_party/xla/xla/service/gpu/runtime/kernel_thunk.h
index 1a4ad3b8f7d595..1b50ce6be28694 100644
--- a/third_party/xla/xla/service/gpu/runtime/kernel_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/kernel_thunk.h
@@ -32,7 +32,7 @@ limitations under the License.
 #include "xla/service/gpu/kernel_arguments.h"
 #include "xla/service/gpu/kernels/custom_kernel.h"
 #include "xla/service/gpu/launch_dimensions.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/status.h"
 #include "xla/stream_executor/kernel.h"
 #include "xla/stream_executor/launch_dim.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/memset_thunk.h b/third_party/xla/xla/service/gpu/runtime/memset_thunk.h
index e8adf0cbaca6b7..e1eef4c39c5d3a 100644
--- a/third_party/xla/xla/service/gpu/runtime/memset_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/memset_thunk.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <cstdint>
 
 #include "xla/service/buffer_assignment.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/status.h"
 
 // This file contains thunks that set a buffer's elements to a particular value.
diff --git a/third_party/xla/xla/service/gpu/runtime/nccl_all_gather_thunk.cc b/third_party/xla/xla/service/gpu/runtime/nccl_all_gather_thunk.cc
index ba6c86a4ac05ef..3a9f2c1d75401b 100644
--- a/third_party/xla/xla/service/gpu/runtime/nccl_all_gather_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/nccl_all_gather_thunk.cc
@@ -25,8 +25,8 @@ limitations under the License.
 #include "xla/service/collective_ops_utils.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/nccl_api.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/stream_executor/stream.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/nccl_all_gather_thunk.h b/third_party/xla/xla/service/gpu/runtime/nccl_all_gather_thunk.h
index 562ba7840143c8..95512b9099116e 100644
--- a/third_party/xla/xla/service/gpu/runtime/nccl_all_gather_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/nccl_all_gather_thunk.h
@@ -24,7 +24,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/collective_ops_utils.h"
 #include "xla/service/gpu/nccl_api.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/stream_executor/stream.h"
 
 namespace xla {
diff --git a/third_party/xla/xla/service/gpu/runtime/nccl_all_reduce_thunk.cc b/third_party/xla/xla/service/gpu/runtime/nccl_all_reduce_thunk.cc
index 1a2a8d17f0839c..47742afa87648f 100644
--- a/third_party/xla/xla/service/gpu/runtime/nccl_all_reduce_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/nccl_all_reduce_thunk.cc
@@ -30,8 +30,8 @@ limitations under the License.
 #include "xla/service/collective_ops_utils.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/nccl_api.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/status_macros.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/xla_data.pb.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/nccl_all_reduce_thunk.h b/third_party/xla/xla/service/gpu/runtime/nccl_all_reduce_thunk.h
index df24d557087d08..7fedfee6786f5a 100644
--- a/third_party/xla/xla/service/gpu/runtime/nccl_all_reduce_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/nccl_all_reduce_thunk.h
@@ -24,7 +24,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/collective_ops_utils.h"
 #include "xla/service/gpu/nccl_api.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/stream_executor/stream.h"
 
 namespace xla {
diff --git a/third_party/xla/xla/service/gpu/runtime/nccl_all_to_all_thunk.cc b/third_party/xla/xla/service/gpu/runtime/nccl_all_to_all_thunk.cc
index 86394e84d1f021..623bf8095e2ff6 100644
--- a/third_party/xla/xla/service/gpu/runtime/nccl_all_to_all_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/nccl_all_to_all_thunk.cc
@@ -29,7 +29,7 @@ limitations under the License.
 #include "xla/service/collective_ops_utils.h"
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/nccl_api.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/status_macros.h"
@@ -108,6 +108,8 @@ absl::Status RunAllToAll(NcclApi* nccl_api, bool has_split_dimension,
                          se::Stream& stream, NcclApi::NcclCommHandle comm) {
   int device_ordinal = stream.parent()->device_ordinal();
   VLOG(3) << "Performing all-to-all from device ordinal: " << device_ordinal;
+  TF_RETURN_IF_ERROR(
+      MaybeRegisterBuffers(nccl_api, device_ordinal, buffers, comm));
 
   TF_ASSIGN_OR_RETURN(int32_t num_participants, nccl_api->CommCount(comm));
 
diff --git a/third_party/xla/xla/service/gpu/runtime/nccl_all_to_all_thunk.h b/third_party/xla/xla/service/gpu/runtime/nccl_all_to_all_thunk.h
index 16860257da027b..083283388f03ad 100644
--- a/third_party/xla/xla/service/gpu/runtime/nccl_all_to_all_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/nccl_all_to_all_thunk.h
@@ -23,7 +23,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/collective_ops_utils.h"
 #include "xla/service/gpu/nccl_api.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/stream_executor/stream.h"
 
 namespace xla {
diff --git a/third_party/xla/xla/service/gpu/runtime/nccl_collective_broadcast_thunk.cc b/third_party/xla/xla/service/gpu/runtime/nccl_collective_broadcast_thunk.cc
index 78b940c3889e3e..79155dc95c687d 100644
--- a/third_party/xla/xla/service/gpu/runtime/nccl_collective_broadcast_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/nccl_collective_broadcast_thunk.cc
@@ -24,8 +24,8 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/collective_ops_utils.h"
 #include "xla/service/gpu/nccl_api.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/status.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/stream.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/nccl_collective_broadcast_thunk.h b/third_party/xla/xla/service/gpu/runtime/nccl_collective_broadcast_thunk.h
index ece85730a250d3..5f5f993085a91c 100644
--- a/third_party/xla/xla/service/gpu/runtime/nccl_collective_broadcast_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/nccl_collective_broadcast_thunk.h
@@ -24,7 +24,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/collective_ops_utils.h"
 #include "xla/service/gpu/nccl_api.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/status.h"
 #include "xla/stream_executor/stream.h"
 
diff --git a/third_party/xla/xla/service/gpu/runtime/nccl_collective_permute_thunk.cc b/third_party/xla/xla/service/gpu/runtime/nccl_collective_permute_thunk.cc
index b815a37785e5bf..8eddb792f91cf9 100644
--- a/third_party/xla/xla/service/gpu/runtime/nccl_collective_permute_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/nccl_collective_permute_thunk.cc
@@ -30,9 +30,9 @@ limitations under the License.
 #include "xla/service/global_device_id.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/nccl_api.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/service/gpu/runtime/nccl_p2p_thunk_common.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/errors.h"
@@ -173,6 +173,8 @@ absl::Status RunCollectivePermute(
   int device_ordinal = stream.parent()->device_ordinal();
   VLOG(3) << "Performing collective permute from device ordinal: "
           << device_ordinal << "current_id " << current_id;
+  TF_RETURN_IF_ERROR(
+      MaybeRegisterBuffers(nccl_api, device_ordinal, {buffer}, comm));
 
   const std::optional<int64_t> source_id = source_target.source;
   const std::optional<int64_t> target_id = source_target.target;
diff --git a/third_party/xla/xla/service/gpu/runtime/nccl_collective_permute_thunk.h b/third_party/xla/xla/service/gpu/runtime/nccl_collective_permute_thunk.h
index 6171ed4680b6dc..29498660195bef 100644
--- a/third_party/xla/xla/service/gpu/runtime/nccl_collective_permute_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/nccl_collective_permute_thunk.h
@@ -23,7 +23,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/collective_ops_utils.h"
 #include "xla/service/gpu/nccl_api.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/service/gpu/runtime/nccl_p2p_thunk_common.h"
 #include "xla/stream_executor/stream.h"
 
diff --git a/third_party/xla/xla/service/gpu/nccl_collective_thunk.cc b/third_party/xla/xla/service/gpu/runtime/nccl_collective_thunk.cc
similarity index 99%
rename from third_party/xla/xla/service/gpu/nccl_collective_thunk.cc
rename to third_party/xla/xla/service/gpu/runtime/nccl_collective_thunk.cc
index ea12aaecb56f89..70d7b4351cc1fc 100644
--- a/third_party/xla/xla/service/gpu/nccl_collective_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/nccl_collective_thunk.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/service/gpu/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 
 #include <cstdint>
 #include <cstdlib>
@@ -47,7 +47,7 @@ limitations under the License.
 #include "xla/service/gpu/nccl_api.h"
 #include "xla/service/gpu/nccl_clique.h"
 #include "xla/service/gpu/nccl_clique_key.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/rendezvous.h"
 #include "xla/shape.h"
 #include "xla/status.h"
@@ -527,12 +527,12 @@ absl::Status IsValidOperand(mlir::Value operand, Thunk::Kind reduction_op) {
 
 absl::Status IsValidOperand(Shape shape, Thunk::Kind reduction_op) {
   if (!LayoutUtil::IsDenseArray(shape)) {
-    return tsl::errors::Unimplemented(
+    return absl::AbortedError(
         absl::StrFormat("input is not a dense array: %s",
                         shape.ToString(/*print_layout=*/true)));
   }
   if (!IsTypeSupportedByNccl(shape.element_type(), reduction_op)) {
-    return tsl::errors::Unimplemented(absl::StrFormat(
+    return absl::AbortedError(absl::StrFormat(
         "element type %s not suppored by NCCL",
         primitive_util::LowercasePrimitiveTypeName(shape.element_type())));
   }
diff --git a/third_party/xla/xla/service/gpu/nccl_collective_thunk.h b/third_party/xla/xla/service/gpu/runtime/nccl_collective_thunk.h
similarity index 98%
rename from third_party/xla/xla/service/gpu/nccl_collective_thunk.h
rename to third_party/xla/xla/service/gpu/runtime/nccl_collective_thunk.h
index 035bc4aafb46a1..7eadf2ec270029 100644
--- a/third_party/xla/xla/service/gpu/nccl_collective_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/nccl_collective_thunk.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_SERVICE_GPU_NCCL_COLLECTIVE_THUNK_H_
-#define XLA_SERVICE_GPU_NCCL_COLLECTIVE_THUNK_H_
+#ifndef XLA_SERVICE_GPU_RUNTIME_NCCL_COLLECTIVE_THUNK_H_
+#define XLA_SERVICE_GPU_RUNTIME_NCCL_COLLECTIVE_THUNK_H_
 
 #include <cstddef>
 #include <cstdint>
@@ -41,7 +41,7 @@ limitations under the License.
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/nccl_api.h"
 #include "xla/service/gpu/nccl_clique_key.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/llvm_ir/llvm_util.h"
 #include "xla/service/rendezvous.h"
 #include "xla/shape.h"
@@ -298,4 +298,4 @@ Status MaybeRegisterBuffers(NcclApi* nccl_api, int device_ordinal,
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // XLA_SERVICE_GPU_NCCL_COLLECTIVE_THUNK_H_
+#endif  // XLA_SERVICE_GPU_RUNTIME_NCCL_COLLECTIVE_THUNK_H_
diff --git a/third_party/xla/xla/service/gpu/runtime/nccl_p2p_thunk_common.h b/third_party/xla/xla/service/gpu/runtime/nccl_p2p_thunk_common.h
index 819ef04d76e302..0a255fc3012ccc 100644
--- a/third_party/xla/xla/service/gpu/runtime/nccl_p2p_thunk_common.h
+++ b/third_party/xla/xla/service/gpu/runtime/nccl_p2p_thunk_common.h
@@ -29,7 +29,7 @@ limitations under the License.
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/gpu/nccl_clique_key.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/shape.h"
 #include "xla/stream_executor/stream_executor_pimpl.h"
 
diff --git a/third_party/xla/xla/service/gpu/runtime/nccl_recv_thunk.cc b/third_party/xla/xla/service/gpu/runtime/nccl_recv_thunk.cc
index d9705bf8d741e0..a20b33813d966e 100644
--- a/third_party/xla/xla/service/gpu/runtime/nccl_recv_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/nccl_recv_thunk.cc
@@ -29,9 +29,9 @@ limitations under the License.
 #include "xla/service/computation_placer.h"
 #include "xla/service/global_device_id.h"
 #include "xla/service/gpu/nccl_api.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/service/gpu/runtime/nccl_p2p_thunk_common.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/status_macros.h"
 #include "xla/stream_executor/stream.h"
 #include "tsl/platform/errors.h"
@@ -93,6 +93,8 @@ absl::Status NcclRecvThunk::RunNcclCollective(const ExecuteParams& params,
   int device_ordinal = stream.parent()->device_ordinal();
   VLOG(3) << "Performing Recv from device ordinal: " << device_ordinal
           << "current_id " << current_id;
+  TF_RETURN_IF_ERROR(
+      MaybeRegisterBuffers(nccl_api(), device_ordinal, {buffer}, comm));
 
   const std::optional<int64_t> source_id = source_target.source;
   se::DeviceMemoryBase dest_addr = buffer.destination_buffer;
diff --git a/third_party/xla/xla/service/gpu/runtime/nccl_recv_thunk.h b/third_party/xla/xla/service/gpu/runtime/nccl_recv_thunk.h
index ecec84a2abc8c7..587530c93128d1 100644
--- a/third_party/xla/xla/service/gpu/runtime/nccl_recv_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/nccl_recv_thunk.h
@@ -24,7 +24,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/gpu/nccl_api.h"
 #include "xla/service/gpu/nccl_clique_key.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/service/gpu/runtime/nccl_p2p_thunk_common.h"
 #include "xla/stream_executor/stream.h"
 
diff --git a/third_party/xla/xla/service/gpu/runtime/nccl_send_thunk.cc b/third_party/xla/xla/service/gpu/runtime/nccl_send_thunk.cc
index 0184da6f5a9824..757f98745dd09e 100644
--- a/third_party/xla/xla/service/gpu/runtime/nccl_send_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/nccl_send_thunk.cc
@@ -30,9 +30,9 @@ limitations under the License.
 #include "xla/service/computation_placer.h"
 #include "xla/service/global_device_id.h"
 #include "xla/service/gpu/nccl_api.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/service/gpu/runtime/nccl_p2p_thunk_common.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/status_macros.h"
 #include "xla/stream_executor/stream.h"
 #include "tsl/platform/errors.h"
@@ -93,6 +93,8 @@ absl::Status NcclSendThunk::RunNcclCollective(const ExecuteParams& params,
   int device_ordinal = stream.parent()->device_ordinal();
   VLOG(3) << "Performing collective permute from device ordinal: "
           << device_ordinal << "current_id " << current_id;
+  TF_RETURN_IF_ERROR(
+      MaybeRegisterBuffers(nccl_api(), device_ordinal, {buffer}, comm));
 
   const std::optional<int64_t> target_id = source_target.target;
   se::DeviceMemoryBase src_addr = buffer.source_buffer;
diff --git a/third_party/xla/xla/service/gpu/runtime/nccl_send_thunk.h b/third_party/xla/xla/service/gpu/runtime/nccl_send_thunk.h
index d384b6a435574b..30f63215a83a86 100644
--- a/third_party/xla/xla/service/gpu/runtime/nccl_send_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/nccl_send_thunk.h
@@ -25,7 +25,7 @@ limitations under the License.
 #include "xla/service/collective_ops_utils.h"
 #include "xla/service/gpu/nccl_api.h"
 #include "xla/service/gpu/nccl_clique_key.h"
-#include "xla/service/gpu/nccl_collective_thunk.h"
+#include "xla/service/gpu/runtime/nccl_collective_thunk.h"
 #include "xla/service/gpu/runtime/nccl_p2p_thunk_common.h"
 #include "xla/stream_executor/stream.h"
 
diff --git a/third_party/xla/xla/service/gpu/runtime/norm_thunk.h b/third_party/xla/xla/service/gpu/runtime/norm_thunk.h
index 2346811a36ed43..602d504175fb3d 100644
--- a/third_party/xla/xla/service/gpu/runtime/norm_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/norm_thunk.h
@@ -25,7 +25,7 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/gpu_norm_runner.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/xla_data.pb.h"
 
diff --git a/third_party/xla/xla/service/gpu/runtime/outfeed_thunk.h b/third_party/xla/xla/service/gpu/runtime/outfeed_thunk.h
index 5fbb3fe6a3e36d..a216431eb0a425 100644
--- a/third_party/xla/xla/service/gpu/runtime/outfeed_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/outfeed_thunk.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/status/status.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/service/gpu/runtime/replica_id_thunk.h b/third_party/xla/xla/service/gpu/runtime/replica_id_thunk.h
index acf09dca0a7493..7b9aa403de1bc9 100644
--- a/third_party/xla/xla/service/gpu/runtime/replica_id_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/replica_id_thunk.h
@@ -18,7 +18,7 @@ limitations under the License.
 
 #include "absl/status/status.h"
 #include "xla/service/buffer_assignment.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/service/gpu/runtime/send_recv_thunk.cc b/third_party/xla/xla/service/gpu/runtime/send_recv_thunk.cc
index 7d6a3183069e6e..ba23806bbbaf18 100644
--- a/third_party/xla/xla/service/gpu/runtime/send_recv_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/send_recv_thunk.cc
@@ -29,7 +29,7 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/global_device_id.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/shape.h"
 #include "xla/status.h"
 #include "xla/statusor.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/send_recv_thunk.h b/third_party/xla/xla/service/gpu/runtime/send_recv_thunk.h
index 07c92f72547f7e..2c235664e8188d 100644
--- a/third_party/xla/xla/service/gpu/runtime/send_recv_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/send_recv_thunk.h
@@ -27,7 +27,7 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/global_device_id.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/shape.h"
 #include "xla/status.h"
 #include "xla/stream_executor/event.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/sequential_thunk.cc b/third_party/xla/xla/service/gpu/runtime/sequential_thunk.cc
index e872792fc5c467..143ad94a29071d 100644
--- a/third_party/xla/xla/service/gpu/runtime/sequential_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/sequential_thunk.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
 #include "xla/service/gpu/runtime/annotation.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "tsl/platform/errors.h"
 #include "tsl/profiler/lib/scoped_annotation.h"
 
diff --git a/third_party/xla/xla/service/gpu/runtime/sequential_thunk.h b/third_party/xla/xla/service/gpu/runtime/sequential_thunk.h
index fe1abafe5df5d1..4642f08becadb5 100644
--- a/third_party/xla/xla/service/gpu/runtime/sequential_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/sequential_thunk.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <string>
 
 #include "absl/status/status.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/service/gpu/thunk.cc b/third_party/xla/xla/service/gpu/runtime/thunk.cc
similarity index 99%
rename from third_party/xla/xla/service/gpu/thunk.cc
rename to third_party/xla/xla/service/gpu/runtime/thunk.cc
index 1019f027a36ec6..17af8b837b00bb 100644
--- a/third_party/xla/xla/service/gpu/thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/thunk.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 
 #include <algorithm>
 #include <cstddef>
diff --git a/third_party/xla/xla/service/gpu/thunk.h b/third_party/xla/xla/service/gpu/runtime/thunk.h
similarity index 98%
rename from third_party/xla/xla/service/gpu/thunk.h
rename to third_party/xla/xla/service/gpu/runtime/thunk.h
index d7c06cbdb7284d..c9d037209062c9 100644
--- a/third_party/xla/xla/service/gpu/thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/thunk.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_SERVICE_GPU_THUNK_H_
-#define XLA_SERVICE_GPU_THUNK_H_
+#ifndef XLA_SERVICE_GPU_RUNTIME_THUNK_H_
+#define XLA_SERVICE_GPU_RUNTIME_THUNK_H_
 
 #include <cstddef>
 #include <cstdint>
@@ -142,6 +142,9 @@ class Thunk {
     kCuDnn
   };
 
+  // <HLO computation fingerprint, serialized compiled object>.
+  using BinaryMap = absl::flat_hash_map<std::string, std::string>;
+
   // TODO(ezhulenev): This should become a part of StreamExecutor library, but
   // for now we keep it here as a Thunk implementation detail. It's not yet
   // clear what else should become a part of "executable source", we likely
@@ -149,6 +152,7 @@ class Thunk {
   struct ExecutableSource {
     std::string_view text;             // PTX for NVIDIA backend
     absl::Span<const uint8_t> binary;  // CUBIN for NVIDIA backends
+    BinaryMap dnn_compiled_graphs;
   };
 
   struct ThunkInfo {
@@ -454,4 +458,4 @@ bool IsReductionCollective(Thunk::Kind kind);
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // XLA_SERVICE_GPU_THUNK_H_
+#endif  // XLA_SERVICE_GPU_RUNTIME_THUNK_H_
diff --git a/third_party/xla/xla/service/gpu/runtime/triangular_solve_thunk.h b/third_party/xla/xla/service/gpu/runtime/triangular_solve_thunk.h
index eb7111888190b6..83b97ae2438199 100644
--- a/third_party/xla/xla/service/gpu/runtime/triangular_solve_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/triangular_solve_thunk.h
@@ -20,7 +20,7 @@ limitations under the License.
 
 #include "absl/status/status.h"
 #include "xla/service/buffer_assignment.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/stream_executor/blas.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/gpu/gpu_asm_opts.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/wait_for_streams_thunk.cc b/third_party/xla/xla/service/gpu/runtime/wait_for_streams_thunk.cc
index a63cf493c1a461..2bd961264ee12a 100644
--- a/third_party/xla/xla/service/gpu/runtime/wait_for_streams_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/wait_for_streams_thunk.cc
@@ -20,7 +20,7 @@ limitations under the License.
 
 #include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "tsl/platform/errors.h"
 
 namespace xla::gpu {
diff --git a/third_party/xla/xla/service/gpu/runtime/wait_for_streams_thunk.h b/third_party/xla/xla/service/gpu/runtime/wait_for_streams_thunk.h
index c30216ba80c5b9..2a545e01738657 100644
--- a/third_party/xla/xla/service/gpu/runtime/wait_for_streams_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/wait_for_streams_thunk.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/status/status.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 
 namespace xla::gpu {
 
diff --git a/third_party/xla/xla/service/gpu/runtime/while_thunk.cc b/third_party/xla/xla/service/gpu/runtime/while_thunk.cc
index 0f933aad078f50..cbd9ce3134133e 100644
--- a/third_party/xla/xla/service/gpu/runtime/while_thunk.cc
+++ b/third_party/xla/xla/service/gpu/runtime/while_thunk.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/runtime/sequential_thunk.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/memory_allocation.h"
 #include "tsl/platform/errors.h"
diff --git a/third_party/xla/xla/service/gpu/runtime/while_thunk.h b/third_party/xla/xla/service/gpu/runtime/while_thunk.h
index ec48b0c5714b04..e1a06c96308590 100644
--- a/third_party/xla/xla/service/gpu/runtime/while_thunk.h
+++ b/third_party/xla/xla/service/gpu/runtime/while_thunk.h
@@ -26,7 +26,7 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/runtime/sequential_thunk.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/stream_executor/memory_allocation.h"
 #include "xla/stream_executor/stream_executor.h"
 
diff --git a/third_party/xla/xla/service/gpu/softmax_rewriter_triton.cc b/third_party/xla/xla/service/gpu/softmax_rewriter_triton.cc
index df771c4df85d32..2db88bc59379a1 100644
--- a/third_party/xla/xla/service/gpu/softmax_rewriter_triton.cc
+++ b/third_party/xla/xla/service/gpu/softmax_rewriter_triton.cc
@@ -59,38 +59,6 @@ bool HasDefaultLayout(const Shape& shape) {
          LayoutUtil::IsMonotonicWithDim0Major(shape.layout());
 }
 
-bool IsTritonSupportedInstruction(const HloInstruction* instr,
-                                  const se::GpuComputeCapability& gpu_version) {
-  if (!instr->shape().IsArray()) {
-    return false;
-  }
-
-  if (!IsTritonSupportedDataType(instr->shape().element_type(), gpu_version)) {
-    return false;
-  }
-
-  for (const HloInstruction* operand : instr->operands()) {
-    if (!IsTritonSupportedDataType(operand->shape().element_type(),
-                                   gpu_version)) {
-      return false;
-    }
-  }
-
-  // TODO(bchetioui): expand with non-trivial instructions.
-  if (instr->IsElementwise()) {
-    return IsTritonSupportedElementwise(instr->opcode(),
-                                        instr->shape().element_type());
-  }
-
-  switch (instr->opcode()) {
-    case HloOpcode::kBitcast:
-    case HloOpcode::kParameter:
-      return true;
-    default:
-      return false;
-  }
-}
-
 // Returns true if a trivially connected producer of 'consumer' with opcode
 // 'opcode' exists. If such an instruction is found, the value of 'producer' is
 // set to it. The definition of "trivial" operations is as given in
@@ -268,7 +236,7 @@ bool IsTriviallyFusible(HloInstruction* instr,
   }
 
   if (instr->IsElementwise() && instr->operand_count() == 1) {
-    return IsTritonSupportedInstruction(instr, gpu_version);
+    return static_cast<bool>(IsTritonSupportedInstruction(*instr, gpu_version));
   }
 
   // Elementwise binary ops are trivially fusible if the operands are the same,
@@ -280,7 +248,8 @@ bool IsTriviallyFusible(HloInstruction* instr,
     // Elementwise binary ops should be fused if both operands are the same and
     // if the operand is triton supported.
     if (operand_0 == operand_1) {
-      return IsTritonSupportedInstruction(instr, gpu_version);
+      return static_cast<bool>(
+          IsTritonSupportedInstruction(*instr, gpu_version));
     }
 
     // For simplicity we only fuse elementwise binary ops with splat operands
@@ -291,7 +260,8 @@ bool IsTriviallyFusible(HloInstruction* instr,
          IsSupportedBroadcastOfParameter(*operand_0)) ^
         (IsBroadcastOfScalarConstant(*operand_1) ||
          IsSupportedBroadcastOfParameter(*operand_1))) {
-      return IsTritonSupportedInstruction(instr, gpu_version);
+      return static_cast<bool>(
+          IsTritonSupportedInstruction(*instr, gpu_version));
     }
   }
 
@@ -337,14 +307,6 @@ bool IsTriviallyConnectedProducerOf(
   return false;
 }
 
-bool IsTritonSupportedComputation(const HloComputation* computation,
-                                  const se::GpuComputeCapability& gpu_version) {
-  return absl::c_all_of(
-      computation->instructions(), [&](const HloInstruction* instr) {
-        return IsTritonSupportedInstruction(instr, gpu_version);
-      });
-}
-
 // Finds the first non-fusible producer of a diamond. This instruction is either
 //   1. the direct producer of the diamond, if that producer is used more than
 //      twice and/or is not otherwise trivially fusible
@@ -447,7 +409,7 @@ SoftmaxRewriterTriton::MatchesTritonCompatibleClosedReductionDiamond(
     return "Root is not elementwise binary.";
   }
 
-  if (!IsTritonSupportedInstruction(instr, gpu_version_)) {
+  if (!IsTritonSupportedInstruction(*instr, gpu_version_)) {
     return "Root is not supported for Triton instruction.";
   }
 
@@ -471,13 +433,11 @@ SoftmaxRewriterTriton::MatchesTritonCompatibleClosedReductionDiamond(
     return "Broadcast or reduce have non-default layouts.";
   }
 
-  if (!(reduce->operand_count() == 2 &&
-        reduce->operand(1)->opcode() == HloOpcode::kConstant)) {
-    return "Reduce has a non-constant second operand and/or is variadic.";
-  }
-
-  if (!(IsTritonSupportedComputation(reduce->to_apply(), gpu_version_))) {
-    return "Unsupported reduction by Triton.";
+  if (CodegenDecision is_supported =
+          IsTritonSupportedInstruction(*reduce, gpu_version_);
+      !is_supported) {
+    VLOG(3) << is_supported.Explain();
+    return is_supported;
   }
 
   if (!HasOneUse(broadcast) || !HasOneUse(reduce)) {
@@ -486,11 +446,6 @@ SoftmaxRewriterTriton::MatchesTritonCompatibleClosedReductionDiamond(
 
   producer = reduce->mutable_operand(0);
 
-  if (reduce->dimensions().size() != 1 ||
-      reduce->dimensions(0) != producer->shape().rank() - 1) {
-    return "Reduction is not a row-reduction of a single operand.";
-  }
-
   if (absl::c_linear_search(broadcast->dimensions(),
                             broadcast->shape().rank() - 1)) {
     return "Broadcast is not along the reduction dimension.";
@@ -670,6 +625,13 @@ absl::Status SoftmaxRewriterTriton::FuseDiamondChain(
 absl::StatusOr<bool> SoftmaxRewriterTriton::Run(
     HloModule* module,
     const absl::flat_hash_set<absl::string_view>& execution_threads) {
+  auto cuda_compute_capability =
+      std::get_if<se::CudaComputeCapability>(&gpu_version_);
+  if (!cuda_compute_capability || !cuda_compute_capability->IsAtLeastAmpere()) {
+    return absl::FailedPreconditionError(
+        "Triton support is only enabled for Ampere GPUs and up.");
+  }
+
   std::vector<DiamondChainDescriptor> diamond_chains =
       FindAllFusibleDiamondChains(*module, execution_threads);
 
diff --git a/third_party/xla/xla/service/gpu/softmax_rewriter_triton_test.cc b/third_party/xla/xla/service/gpu/softmax_rewriter_triton_test.cc
index 0683f405e30f6e..74e800f9a815cc 100644
--- a/third_party/xla/xla/service/gpu/softmax_rewriter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/softmax_rewriter_triton_test.cc
@@ -1,8 +1,11 @@
 /* Copyright 2023 The OpenXLA Authors.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
+
 You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -1073,9 +1076,8 @@ ENTRY main {
               GmockMatch(m::Fusion(m::Parameter())));
 }
 
-TEST_P(
-    SoftmaxRewriterTritonTest,
-    CanOnlyFuseConvertInvolvingBF16InputIntoSoftmaxDiamondWithAtLeastAmpereComputeCapability) {  // NOLINT(whitespace/line_length)
+TEST_P(SoftmaxRewriterTritonTest,
+       CanFuseConvertInvolvingBF16InputIntoSoftmaxDiamond) {
   PrimitiveType data_type = GetParam();
   const std::string hlo_string_template = R"(
 HloModule softmax
@@ -1091,52 +1093,51 @@ ENTRY main {
   reduce = $0[127]{0} reduce(param_0_$0, constant_neg_inf), dimensions={1}, to_apply=max_computation
   broadcast = $0[127,125]{1,0} broadcast(reduce), dimensions={0}
   ROOT subtract = $0[127,125]{1,0} subtract(param_0_$0, broadcast)
-}
-)";
+})";
   const std::string hlo_string =
       absl::Substitute(hlo_string_template,
                        primitive_util::LowercasePrimitiveTypeName(data_type));
 
-  auto ampere_module = ParseAndReturnVerifiedModule(hlo_string).value();
-  auto volta_module = ampere_module->Clone();
+  auto module = ParseAndReturnVerifiedModule(hlo_string).value();
 
-  // Ampere
   EXPECT_TRUE(
       SoftmaxRewriterTritonMatchAndRewrite(
           se::CudaComputeCapability{se::CudaComputeCapability::AMPERE, 0},
-          ampere_module.get())
+          module.get())
           .value());
-  EXPECT_TRUE(verifier().Run(ampere_module.get()).status().ok());
-  VLOG(2) << ampere_module->ToString();
-  EXPECT_THAT(ampere_module->entry_computation()->root_instruction(),
+  EXPECT_TRUE(verifier().Run(module.get()).status().ok());
+  VLOG(2) << module->ToString();
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
               GmockMatch(m::Fusion(m::Parameter())));
+}
 
-  // Volta (pre-Ampere)
-  VLOG(2) << volta_module->ToString();
+TEST_F(SoftmaxRewriterTritonTest, RewriterBailsOutOnPreAmpereGpu) {
+  const std::string hlo_string = R"(
+HloModule softmax
+max_computation {
+  arg_0 = f32[] parameter(0)
+  arg_1 = f32[] parameter(1)
+  ROOT maximum = f32[] maximum(arg_0, arg_1)
+}
+ENTRY main {
+  param_0 = bf16[127,125]{1,0} parameter(0)
+  param_0_f32 = f32[127,125]{1,0} convert(param_0)
+  constant_neg_inf = f32[] constant(-inf)
+  reduce = f32[127]{0} reduce(param_0_f32, constant_neg_inf), dimensions={1}, to_apply=max_computation
+  broadcast = f32[127,125]{1,0} broadcast(reduce), dimensions={0}
+  ROOT subtract = f32[127,125]{1,0} subtract(param_0_f32, broadcast)
+})";
 
-  switch (data_type) {
-    case F32:
-    case F16:
-      EXPECT_TRUE(
-          SoftmaxRewriterTritonMatchAndRewrite(
-              se::CudaComputeCapability{se::CudaComputeCapability::VOLTA, 0},
-              volta_module.get())
-              .value());
-      EXPECT_TRUE(verifier().Run(volta_module.get()).status().ok());
-      EXPECT_THAT(volta_module->entry_computation()->root_instruction(),
-                  GmockMatch(m::Fusion(m::Convert(m::Parameter()))));
-      break;
-    case BF16:
-      // When bf16 is used, no fusion is possible on Volta.
-      EXPECT_FALSE(
-          SoftmaxRewriterTritonMatchAndRewrite(
-              se::CudaComputeCapability{se::CudaComputeCapability::VOLTA, 0},
-              volta_module.get())
-              .value());
-      break;
-    default:
-      ABSL_UNREACHABLE();
-  }
+  auto module = ParseAndReturnVerifiedModule(hlo_string).value();
+
+  EXPECT_THAT(
+      SoftmaxRewriterTriton(
+          se::CudaComputeCapability{se::CudaComputeCapability::VOLTA, 0})
+          .Run(module.get()),
+      tsl::testing::StatusIs(
+          tsl::error::FAILED_PRECONDITION,
+          ::testing::StrEq(
+              "Triton support is only enabled for Ampere GPUs and up.")));
 }
 
 TEST_P(SoftmaxRewriterTritonTest, DoesNotFuseConvertWithC64DataType) {
@@ -1769,10 +1770,11 @@ ENTRY main {
     if (std::holds_alternative<FusionDecision>(decision)) {
       std::string actual_decision =
           std::get<FusionDecision>(decision).Explain();
-      EXPECT_THAT(actual_decision,
-                  AnyOf(HasSubstr("Root is not elementwise binary"),
-                        HasSubstr("Reduce has a non-constant second operand "
-                                  "and/or is variadic")));
+      EXPECT_THAT(
+          actual_decision,
+          AnyOf(HasSubstr("Root is not elementwise binary"),
+                HasSubstr("Reduction init value should be a constant or a "
+                          "convert of a constant.")));
       unmatched++;
     } else {
       matched++;
diff --git a/third_party/xla/xla/service/gpu/split_k_gemm_rewriter.cc b/third_party/xla/xla/service/gpu/split_k_gemm_rewriter.cc
index 4e1bdd131324ca..2ff84c2c61ea91 100644
--- a/third_party/xla/xla/service/gpu/split_k_gemm_rewriter.cc
+++ b/third_party/xla/xla/service/gpu/split_k_gemm_rewriter.cc
@@ -244,7 +244,8 @@ absl::Status MakeDotComputationSplitKBatch(
   const DotDimensionNumbers& old_dim_numbers = dot->dot_dimension_numbers();
   DotDimensionNumbers new_dim_numbers;
 
-  const int64_t lhs_contracting_idx = ContractingDimensionIndex(*dot, 0);
+  TF_ASSIGN_OR_RETURN(const int64_t lhs_contracting_idx,
+                      ContractingDimensionIndex(*dot, 0));
   CopyIncrementingAboveThreshold(
       old_dim_numbers.lhs_contracting_dimensions(),
       *new_dim_numbers.mutable_lhs_contracting_dimensions(),
@@ -254,7 +255,8 @@ absl::Status MakeDotComputationSplitKBatch(
       old_dim_numbers.lhs_batch_dimensions(),
       *new_dim_numbers.mutable_lhs_batch_dimensions(), lhs_contracting_idx);
 
-  const int64_t rhs_contracting_idx = ContractingDimensionIndex(*dot, 1);
+  TF_ASSIGN_OR_RETURN(const int64_t rhs_contracting_idx,
+                      ContractingDimensionIndex(*dot, 1));
   CopyIncrementingAboveThreshold(
       old_dim_numbers.rhs_contracting_dimensions(),
       *new_dim_numbers.mutable_rhs_contracting_dimensions(),
diff --git a/third_party/xla/xla/service/gpu/stream_attribute_annotator.cc b/third_party/xla/xla/service/gpu/stream_attribute_annotator.cc
index 37e118678db035..0bfa2cef837e2d 100644
--- a/third_party/xla/xla/service/gpu/stream_attribute_annotator.cc
+++ b/third_party/xla/xla/service/gpu/stream_attribute_annotator.cc
@@ -25,8 +25,9 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/hlo/utils/hlo_query.h"
 #include "xla/service/gpu/backend_configs.pb.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/statusor.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
@@ -46,7 +47,6 @@ bool IsOnlyRootNonDefaultStream(HloComputation* computation) {
   int64_t root_stream_id = root_gpu_config->operation_queue_id();
   VLOG(2) << "Found fusion computation's root stream id to be "
           << root_stream_id;
-
   if (root_stream_id == Thunk::kDefaultExecutionStreamId.value()) {
     return false;
   }
@@ -88,6 +88,20 @@ absl::StatusOr<bool> AnnotateStreamAttributesForInstruction(
   return true;
 }
 
+absl::StatusOr<bool> AnnotateStreamAttributesForCopyStart(
+    HloInstruction* instr, int64_t channel_id,
+    GpuBackendConfig& instr_gpu_config) {
+  // Do nothing if copy-start has already been annotated
+  if (instr_gpu_config.operation_queue_id() !=
+      Thunk::kDefaultExecutionStreamId.value()) {
+    return false;
+  }
+  instr_gpu_config.set_operation_queue_id(channel_id);
+  TF_RETURN_IF_ERROR(instr->set_backend_config(instr_gpu_config));
+  VLOG(3) << "Add copy-start's backend config: " << channel_id;
+  return true;
+}
+
 absl::StatusOr<bool> AnnotateStreamAttributesForUsers(
     HloInstruction* instr, GpuBackendConfig& instr_gpu_config) {
   bool changed = false;
@@ -125,6 +139,7 @@ absl::StatusOr<bool> StreamAttributeAnnotator::Run(
   XLA_VLOG_LINES(
       5, "StreamAttributeAnnotator::Run(), before:\n" + module->ToString());
   bool changed = false;
+  int64_t channel_id = hlo_query::NextChannelId(*module);
   for (const HloComputation* comp : module->computations(execution_threads)) {
     for (HloInstruction* instr : comp->MakeInstructionPostOrder()) {
       auto instr_gpu_config = instr->backend_config<GpuBackendConfig>();
@@ -139,6 +154,12 @@ absl::StatusOr<bool> StreamAttributeAnnotator::Run(
                             AnnotateStreamAttributesForInstruction(
                                 instr, instr_gpu_config.value()));
         changed |= comp_result;
+      } else if (instr->opcode() == HloOpcode::kCopyStart) {
+        TF_ASSIGN_OR_RETURN(bool comp_result,
+                            AnnotateStreamAttributesForCopyStart(
+                                instr, channel_id, instr_gpu_config.value()));
+        changed |= comp_result;
+        continue;
       }
 
       TF_ASSIGN_OR_RETURN(
diff --git a/third_party/xla/xla/service/gpu/stream_attribute_annotator_test.cc b/third_party/xla/xla/service/gpu/stream_attribute_annotator_test.cc
index d5e2629712f7ab..2861f9a82a7ef1 100644
--- a/third_party/xla/xla/service/gpu/stream_attribute_annotator_test.cc
+++ b/third_party/xla/xla/service/gpu/stream_attribute_annotator_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <cstdint>
 #include <memory>
+#include <string>
 #include <vector>
 
 #include <gtest/gtest.h>
@@ -164,5 +165,48 @@ TEST_F(StreamAttributeAnnotatorTest, FusionIsAnnotated) {
   EXPECT_EQ(gpu_config.operation_queue_id(), 1);
 }
 
+TEST_F(StreamAttributeAnnotatorTest, CopyStartIsAnnotated) {
+  constexpr absl::string_view kHloString = R"(
+  HloModule offloading
+    ENTRY %main (param_0: f32[1024], param_1: f32[1024]) -> f32[1024] {
+    %param_1 = f32[1024]{0} parameter(1)
+    %param_0 = f32[1024]{0} parameter(0)
+    %res_3 = f32[1024]{0} add(f32[1024]{0} %param_0, f32[1024]{0} %param_1)
+    %copy-start = (f32[1024]{0:S(5)}, f32[1024]{0}, u32[]) copy-start(f32[1024]{0} %res_3)
+    %res_4 = f32[1024]{0} tanh(f32[1024]{0} %res_3)
+    %copy-start.2 = (f32[1024]{0:S(5)}, f32[1024]{0}, u32[]) copy-start(f32[1024]{0} %res_4)
+    %res_5 = f32[1024]{0} tanh(f32[1024]{0} %res_4)
+    %copy-done = f32[1024]{0:S(5)} copy-done((f32[1024]{0:S(5)}, f32[1024]{0}, u32[]) %copy-start)
+    %res_6 = f32[1024]{0} tanh(f32[1024]{0} %res_5)
+    %copy-done.2 = f32[1024]{0:S(5)} copy-done((f32[1024]{0:S(5)}, f32[1024]{0}, u32[]) %copy-start.2)
+    %copy-start.3 = (f32[1024]{0}, f32[1024]{0:S(5)}, u32[]) copy-start(f32[1024]{0:S(5)} %copy-done.2)
+    %res_7 = f32[1024]{0} add(f32[1024]{0} %res_6, f32[1024]{0} %res_6)
+    %copy-start.1 = (f32[1024]{0}, f32[1024]{0:S(5)}, u32[]) copy-start(f32[1024]{0:S(5)} %copy-done)
+    %res_8 = f32[1024]{0} add(f32[1024]{0} %res_7, f32[1024]{0} %res_5)
+    %copy-done.3 = f32[1024]{0} copy-done((f32[1024]{0}, f32[1024]{0:S(5)}, u32[]) %copy-start.3)
+    %res_9 = f32[1024]{0} add(f32[1024]{0} %res_8, f32[1024]{0} %copy-done.3)
+    %copy-done.1 = f32[1024]{0} copy-done((f32[1024]{0}, f32[1024]{0:S(5)}, u32[]) %copy-start.1)
+    %res_10 = f32[1024]{0} add(f32[1024]{0} %res_9, f32[1024]{0} %copy-done.1)
+    ROOT %res_11 = f32[1024]{0} tanh(f32[1024]{0} %res_10)
+  }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(kHloString));
+
+  StreamAttributeAnnotator attr_annotator;
+  bool changed;
+  TF_ASSERT_OK_AND_ASSIGN(changed, attr_annotator.Run(module.get()));
+  EXPECT_TRUE(changed);
+
+  for (std::string i : {"", ".1", ".2", ".3"}) {
+    const HloInstruction* cp_start =
+        FindInstruction(module.get(), "copy-start" + i);
+    EXPECT_TRUE(cp_start->has_backend_config());
+    TF_ASSERT_OK_AND_ASSIGN(GpuBackendConfig gpu_config,
+                            cp_start->backend_config<GpuBackendConfig>());
+    EXPECT_EQ(gpu_config.operation_queue_id(), 1);
+  }
+}
 }  // namespace
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/stream_attribute_async_wrapper.cc b/third_party/xla/xla/service/gpu/stream_attribute_async_wrapper.cc
index 78d59e8d2fa597..822c6473dba483 100644
--- a/third_party/xla/xla/service/gpu/stream_attribute_async_wrapper.cc
+++ b/third_party/xla/xla/service/gpu/stream_attribute_async_wrapper.cc
@@ -15,14 +15,13 @@ limitations under the License.
 
 #include "xla/service/gpu/stream_attribute_async_wrapper.h"
 
-
 #include "absl/container/flat_hash_set.h"
 #include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/gpu/backend_configs.pb.h"
-#include "xla/service/gpu/thunk.h"
+#include "xla/service/gpu/runtime/thunk.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/logging.h"
@@ -43,6 +42,12 @@ static absl::StatusOr<bool> AsynchronizeInstruction(HloInstruction* instr) {
       computation->CreateAsyncInstructions(
           instr, {}, StreamAttributeAsyncWrapper::kParallelExecutionThread,
           /*replace=*/true));
+  TF_ASSIGN_OR_RETURN(GpuBackendConfig gpu_config,
+                      done->backend_config<GpuBackendConfig>());
+  // Set the false delay of done op to be false so it can be scheduled
+  // far apart from start.
+  gpu_config.set_force_earliest_schedule(false);
+  TF_RETURN_IF_ERROR(done->set_backend_config(gpu_config));
   VLOG(5) << "Created async instruction: " << done->ToString();
   return true;
 }
diff --git a/third_party/xla/xla/service/gpu/stream_attribute_async_wrapper_test.cc b/third_party/xla/xla/service/gpu/stream_attribute_async_wrapper_test.cc
index d8da1fd47a4bc9..8b3dcb23eac7bc 100644
--- a/third_party/xla/xla/service/gpu/stream_attribute_async_wrapper_test.cc
+++ b/third_party/xla/xla/service/gpu/stream_attribute_async_wrapper_test.cc
@@ -40,7 +40,7 @@ TEST_F(StreamAttributeAsyncWrapperTest, NonDefaultOpIsWrapped) {
   ENTRY entry {
     p1_32 = f32[1] parameter(0)
     p2_32 = f32[1] parameter(1)
-    add_32 = f32[1] add(p1_32, p2_32), backend_config={"operation_queue_id":"1", "wait_on_operation_queues":[]}
+    add_32 = f32[1] add(p1_32, p2_32), backend_config={"operation_queue_id":"1", "wait_on_operation_queues":[], "force_earliest_schedule":true}
     ROOT exp_32 = f32[1] exponential(add_32), backend_config={"operation_queue_id":"0", "wait_on_operation_queues":[1]}
   }
   )";
@@ -55,6 +55,11 @@ TEST_F(StreamAttributeAsyncWrapperTest, NonDefaultOpIsWrapped) {
   const HloInstruction* producer =
       module->entry_computation()->root_instruction()->operand(0);
   EXPECT_EQ(producer->opcode(), HloOpcode::kAsyncDone);
+  // Verify that the force_earliest_schedule is set to false for the done op.
+  TF_ASSERT_OK_AND_ASSIGN(GpuBackendConfig done_gpu_config,
+                          producer->backend_config<GpuBackendConfig>());
+  EXPECT_EQ(done_gpu_config.force_earliest_schedule(), false);
+
   const HloInstruction* producer_start = producer->operand(0);
   EXPECT_EQ(producer_start->opcode(), HloOpcode::kAsyncStart);
 
@@ -65,6 +70,7 @@ TEST_F(StreamAttributeAsyncWrapperTest, NonDefaultOpIsWrapped) {
   TF_ASSERT_OK_AND_ASSIGN(GpuBackendConfig gpu_config,
                           async->backend_config<GpuBackendConfig>());
   EXPECT_EQ(gpu_config.operation_queue_id(), 1);
+  EXPECT_EQ(gpu_config.force_earliest_schedule(), true);
   EXPECT_EQ(async->async_execution_thread(), "parallel");
 }
 }  // namespace
diff --git a/third_party/xla/xla/service/gpu/stream_executor_util.cc b/third_party/xla/xla/service/gpu/stream_executor_util.cc
index b6ae3f29182859..4827fc251ce087 100644
--- a/third_party/xla/xla/service/gpu/stream_executor_util.cc
+++ b/third_party/xla/xla/service/gpu/stream_executor_util.cc
@@ -56,12 +56,12 @@ limitations under the License.
 #include "xla/stream_executor/launch_dim.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
+#include "xla/tsl/util/env_var.h"
+#include "xla/tsl/util/proto/proto_utils.h"
 #include "xla/util.h"
 #include "tsl/platform/ml_dtypes.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
-#include "tsl/util/env_var.h"
-#include "tsl/util/proto/proto_utils.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/service/gpu/stream_executor_util_test.cc b/third_party/xla/xla/service/gpu/stream_executor_util_test.cc
index 558de0ec3604ba..cb3be24a6ceaa6 100644
--- a/third_party/xla/xla/service/gpu/stream_executor_util_test.cc
+++ b/third_party/xla/xla/service/gpu/stream_executor_util_test.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "absl/time/time.h"
 #include "xla/autotuning.pb.h"
 #include "xla/service/hlo_module_config.h"
-#include "tsl/util/proto/proto_utils.h"
+#include "xla/tsl/util/proto/proto_utils.h"
 
 namespace xla::gpu {
 namespace {
diff --git a/third_party/xla/xla/service/gpu/tests/BUILD b/third_party/xla/xla/service/gpu/tests/BUILD
index e6058255e99a01..5f8db7fc1f2df5 100644
--- a/third_party/xla/xla/service/gpu/tests/BUILD
+++ b/third_party/xla/xla/service/gpu/tests/BUILD
@@ -1,12 +1,6 @@
 # Description: GPU-specific XLA tests. For example, codegen tests that
 # verify the IR emitted.
 
-load("//xla/tests:build_defs.bzl", "xla_test")
-load("//xla:lit.bzl", "enforce_glob", "lit_test_suite")
-load(
-    "//xla:xla.bzl",
-    "xla_cc_test",
-)
 load(
     "@local_config_rocm//rocm:build_defs.bzl",
     "if_rocm_is_configured",
@@ -21,6 +15,12 @@ load(
     "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
     "if_cuda_is_configured",
 )
+load("//xla:lit.bzl", "enforce_glob", "lit_test_suite")
+load(
+    "//xla:xla.bzl",
+    "xla_cc_test",
+)
+load("//xla/tests:build_defs.bzl", "xla_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -442,6 +442,31 @@ xla_cc_test(
     ],
 )
 
+xla_test(
+    name = "gpu_triton_custom_call_test",
+    srcs = ["gpu_triton_custom_call_test.cc"],
+    backends = [
+        "gpu_a100",
+        "gpu_v100",
+    ],
+    deps = [
+        ":gpu_codegen_test",
+        "//xla:shape_util",
+        "//xla/hlo/ir:hlo",
+        "//xla/stream_executor:device_description",
+        "//xla/tests:hlo_test_base",
+        "//xla/tests:verified_hlo_module",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_googletest//:gtest_main",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Support",
+        "@local_tsl//tsl/platform:status_matchers",
+        "@local_tsl//tsl/platform:test_main",
+    ],
+)
+
 xla_cc_test(
     name = "gpu_ftz_test",
     srcs = ["gpu_ftz_test.cc"],
diff --git a/third_party/xla/xla/service/gpu/tests/gemm_rewrite_test.cc b/third_party/xla/xla/service/gpu/tests/gemm_rewrite_test.cc
index 0ee2481cfed26a..874d602bb4914d 100644
--- a/third_party/xla/xla/service/gpu/tests/gemm_rewrite_test.cc
+++ b/third_party/xla/xla/service/gpu/tests/gemm_rewrite_test.cc
@@ -1479,6 +1479,67 @@ class LegacyCublasGemmRewriteTest : public GemmRewriteTest {
   }
 };
 
+TEST_F(LegacyCublasGemmRewriteTest, MatrixVectorMultiplication) {
+  const char* hlo_text = R"(
+HloModule m
+
+ENTRY e {
+  p0 = f32[2048] parameter(0)
+  p1 = f32[2048, 16384] parameter(1)
+  ROOT d = f32[16384] dot(p0, p1),
+    lhs_contracting_dims={0}, rhs_contracting_dims={0}
+})";
+
+  RunAndFilecheckHloRewrite(hlo_text,
+                            GemmRewriter(se::CudaComputeCapability{
+                                se::CudaComputeCapability::AMPERE, 0}),
+                            R"(
+; CHECK:  %[[P0:.+]] = f32[2048]{0} parameter(0)
+; CHECK:  %[[P1:.+]] = f32[2048,16384]{1,0} parameter(1)
+; CHECK:  %[[CUSTOM_CALL:.+]] = (f32[16384]{0}, s8[4194304]{0}) custom-call(%[[P0]], %[[P1]]), custom_call_target="__cublas$gemm"
+)");
+}
+
+TEST_F(LegacyCublasGemmRewriteTest, MatrixVectorMultiplicationWithBatch) {
+  const char* hlo_text = R"(
+HloModule m
+
+ENTRY e {
+  p0 = f32[10, 10, 2048] parameter(0)
+  p1 = f32[10, 10, 2048, 16384] parameter(1)
+  ROOT d = f32[10, 10, 16384] dot(p0, p1),
+   lhs_batch_dims={0, 1}, rhs_batch_dims={0, 1},
+   lhs_contracting_dims={2}, rhs_contracting_dims={2}
+})";
+
+  RunAndFilecheckHloRewrite(hlo_text,
+                            GemmRewriter(se::CudaComputeCapability{
+                                se::CudaComputeCapability::AMPERE, 0}),
+                            R"(
+; CHECK:  %[[P0:.+]] = f32[10,10,2048]{2,1,0} parameter(0)
+; CHECK:  %[[P1:.+]] = f32[10,10,2048,16384]{3,2,1,0} parameter(1)
+; CHECK:  %[[CUSTOM_CALL:.+]] = (f32[10,10,16384]{2,1,0}, s8[4194304]{0}) custom-call(%[[P0]], %[[P1]]), custom_call_target="__cublas$gemm"
+)");
+}
+
+TEST_F(LegacyCublasGemmRewriteTest, SparseDotNotSupported) {
+  const char* hlo_text = R"(
+HloModule test
+
+ENTRY main {
+  lhs = f16[5,16] parameter(0)
+  rhs = f16[32,10] parameter(1)
+  meta = u16[5,2] parameter(2)
+  ROOT dot = f32[5,10] dot(lhs, rhs, meta),
+      lhs_contracting_dims={1}, rhs_contracting_dims={0}, sparsity=L.1@2:4
+})";
+  auto hlo_pass = GemmRewriter(
+      se::CudaComputeCapability{se::CudaComputeCapability::AMPERE, 0});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloPass(&hlo_pass, module.get()));
+  EXPECT_FALSE(changed);
+}
+
 // Test that the alpha and beta fields of the GemmBackendConfig are updated.
 // A bias must be present for the beta value to be set.
 // In order to have a bias add fused, the bias term must be overwritable.
@@ -4627,9 +4688,10 @@ class ParameterizedFp8GemmRewriteTest : public ParameterizedGemmRewriteTest {
     }
   }
 
-  StatusOr<std::unique_ptr<VerifiedHloModule>> ParseAndReturnVerifiedModule(
-      absl::string_view hlo_text, int64_t replica_count = 1,
-      int64_t num_partitions = 1) {
+  absl::StatusOr<std::unique_ptr<VerifiedHloModule>>
+  ParseAndReturnVerifiedModule(absl::string_view hlo_text,
+                               int64_t replica_count = 1,
+                               int64_t num_partitions = 1) {
     return GemmRewriteTest::ParseAndReturnVerifiedModule(
         absl::StrReplaceAll(hlo_text, replacements_));
   }
@@ -4779,6 +4841,65 @@ TEST_P(ParameterizedFp8GemmRewriteTest, UnscaledABUnscaledDF8) {
       )");
 }
 
+// Do not fuse FP8 matrix bias.
+TEST_P(ParameterizedFp8GemmRewriteTest, UnscaledABUnscaledDMatrixBiasF8) {
+  if (CudaOrRocmCheck(Switch::False, Switch::True)) {
+    GTEST_SKIP() << "UnscaledABUnscaledDMatrixBiasF8 is currently not supported on ROCm";
+  }
+
+#if GOOGLE_CUDA && CUDA_VERSION < 12000
+  GTEST_SKIP() << "F8 gemm rewrite is only supported in CUDA 12 and above.";
+#endif  // CUDA_VERSION < 12000
+
+#if TENSORFLOW_USE_ROCM && TF_ROCM_VERSION < 60000
+  GTEST_SKIP() << "F8 gemm rewrite is only supported in ROCm 6.0 and above.";
+#endif  // TF_ROCM_VERSION < 60000
+
+  const char* hlo_text = R"(
+    HloModule test
+
+    ENTRY test {
+      x = <<F8E4M3>>[16,32] parameter(0)
+      y = <<F8E4M3>>[32,16] parameter(1)
+      dot_a = <<F8E4M3>>[16,16] dot(x, y), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+      b = <<F8E4M3>>[16,16] parameter(2)
+      ROOT out = <<F8E4M3>>[16,16] add(dot_a, b)
+          }
+
+)";
+
+  CheckFp8IfSupported(hlo_text);
+  RunAndFilecheckHloRewrite(
+      hlo_text, GemmRewriter(CudaHopperOrRocmMI300(), /*f8_rewrite=*/true),
+      R"(
+; CHECK-LABEL: ENTRY %test ({{.*}}: <<F8E4M3>>[16,32], {{.*}}: <<F8E4M3>>[32,16], {{.*}}: <<F8E4M3>>[16,16]) -> <<F8E4M3>>[16,16] {
+; CHECK-NEXT:    [[P0:%[^ ]+]] = <<F8E4M3>>[16,32]{1,0} parameter(0)
+; CHECK-NEXT:    [[P1:%[^ ]+]] = <<F8E4M3>>[32,16]{1,0} parameter(1)
+; CHECK-NEXT:    [[P1_TRANSPOSE:%[^ ]+]] = <<F8E4M3>>[16,32]{1,0} transpose([[P1]]), dimensions={1,0}
+; CHECK-NEXT:    [[C1:[^ ]+]] = f32[] constant(1)
+; CHECK-GCN-NEXT:    [[DOT:%[^ ]+]] = f32[16,16]{1,0} custom-call([[P0]], [[P1_TRANSPOSE]], [[C1]], [[C1]], [[C1]], /*index=5*/[[C1]]),
+; CHECK-PTX-NEXT:    [[DOT:%[^ ]+]] = <<F8E4M3>>[16,16]{1,0} custom-call([[P0]], [[P1_TRANSPOSE]], [[C1]], [[C1]], [[C1]], /*index=5*/[[C1]]),
+; CHECK:           custom_call_target="__cublas$lt$matmul$f8",
+; CHECK:           backend_config={
+; CHECK-DAG:         "alpha_real":1
+; CHECK-DAG:         "alpha_imag":0
+; CHECK-DAG:         "beta":0
+; CHECK-DAG:         "dot_dimension_numbers":{
+; CHECK-DAG:           "lhs_contracting_dimensions":["1"]
+; CHECK-DAG:           "rhs_contracting_dimensions":["1"]
+; CHECK-DAG:           "lhs_batch_dimensions":[]
+; CHECK-DAG:           "rhs_batch_dimensions":[]
+; CHECK-DAG:         }
+; CHECK-DAG:         "precision_config":{
+; CHECK-DAG:           "operand_precision":["DEFAULT","DEFAULT"]
+; CHECK-DAG:         }
+; CHECK-DAG:         "epilogue":"DEFAULT"
+; CHECK:           }
+; CHECK-NEXT:    [[P2:%[^ ]+]] = <<F8E4M3>>[16,16]{1,0} parameter(2)
+; CHECK-NEXT:    [[ROOT:%[^ ]+]] = <<F8E4M3>>[16,16]{1,0} add([[DOT]], [[P2]])
+      )");
+}
+
 TEST_P(ParameterizedFp8GemmRewriteTest, ScaledABUnscaledDF8) {
 #if GOOGLE_CUDA && CUDA_VERSION < 12000
   GTEST_SKIP() << "F8 gemm rewrite is only supported in CUDA 12 and above.";
@@ -5425,6 +5546,10 @@ TEST_P(ParameterizedFp8GemmRewriteTest,
 )";
 
   CheckFp8IfSupported(hlo_text);
+
+// Fusing gelu into FP8 cublas matmuls is disabled on CUDA versions less
+// than 12.4.
+#if (GOOGLE_CUDA && CUDA_VERSION >= 12040) || TENSORFLOW_USE_ROCM
   RunAndFilecheckHloRewrite(
       hlo_text, GemmRewriter(CudaHopperOrRocmMI300(), /*f8_rewrite=*/true),
       R"(
@@ -5458,6 +5583,7 @@ TEST_P(ParameterizedFp8GemmRewriteTest,
 ; CHECK-GCN-DAG:         "epilogue":"DEFAULT"
 ; CHECK:           }
       )");
+#endif  // (GOOGLE_CUDA && CUDA_VERSION >= 12040) || TENSORFLOW_USE_ROCM
 }
 
 TEST_P(ParameterizedFp8GemmRewriteTest,
@@ -5505,6 +5631,10 @@ TEST_P(ParameterizedFp8GemmRewriteTest,
 )";
 
   CheckFp8IfSupported(hlo_text);
+
+// Fusing gelu into FP8 cublas matmuls is disabled on CUDA versions less
+// than 12.4.
+#if (GOOGLE_CUDA && CUDA_VERSION >= 12040) || TENSORFLOW_USE_ROCM
   // Currently, hipBlasLt does not support output datatype bf16 for fp8 matmul.
   // And no fusion was done for such cases.
   RunAndFilecheckHloRewrite(
@@ -5539,6 +5669,7 @@ TEST_P(ParameterizedFp8GemmRewriteTest,
 ; CHECK-GCN-DAG:         "epilogue":"DEFAULT"
 ; CHECK:           }
       )");
+#endif  // (GOOGLE_CUDA && CUDA_VERSION >= 12040) || TENSORFLOW_USE_ROCM
 }
 
 TEST_P(ParameterizedFp8GemmRewriteTest, InvScaledABUnscaledDF8) {
diff --git a/third_party/xla/xla/service/gpu/tests/gpu_triton_custom_call_test.cc b/third_party/xla/xla/service/gpu/tests/gpu_triton_custom_call_test.cc
new file mode 100644
index 00000000000000..52351018c743bb
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/tests/gpu_triton_custom_call_test.cc
@@ -0,0 +1,251 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/status/status.h"
+#include "absl/strings/string_view.h"
+#include "llvm/Support/raw_ostream.h"
+#include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/Builders.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/Support/LLVM.h"  // from @llvm-project
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/service/gpu/tests/gpu_codegen_test.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
+#include "xla/stream_executor/device_description.h"
+#include "xla/tests/hlo_test_base.h"
+#include "xla/tests/verified_hlo_module.h"
+#include "tsl/platform/status_matchers.h"
+
+namespace xla {
+namespace gpu {
+
+using ::mlir::ArrayRef;
+using ::mlir::NamedAttribute;
+
+namespace {
+
+std::unique_ptr<HloInstruction> CreateAddTritonCustomCall(
+    Shape tuple_shape, HloInstruction* param_0, HloInstruction* param_1) {
+  mlir::MLIRContext context_;
+  mlir::Builder builder(&context_);
+
+  // Create the backend_config for the triton custom call.
+  const std::string kMLIRText = R"(
+  module {
+    tt.func public @add_one(%arg0: !tt.ptr<f32, 1> {tt.divisibility = 32 : i32}, %arg1: !tt.ptr<f32, 1> {tt.divisibility = 32 : i32}, %arg2: !tt.ptr<f32, 1> {tt.divisibility = 32 : i32}, %arg3: !tt.ptr<f32, 1> {tt.divisibility = 32 : i32}) {
+      %0 = tt.get_program_id x : i32
+      %1 = tt.load %arg0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
+      %2 = tt.load %arg1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
+      %cst = arith.constant 1.000000e+00 : f32
+      %3 = arith.addf %1, %cst : f32
+      %4 = tt.load %arg2 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
+      tt.store %arg2, %3 {cache = 1 : i32, evict = 1 : i32} : f32
+      %5 = tt.load %arg3 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : f32
+      tt.store %arg3, %2 {cache = 1 : i32, evict = 1 : i32} : f32
+      tt.return
+    }
+  }
+  )";
+
+  NamedAttribute name =
+      builder.getNamedAttr("name", builder.getStringAttr("add_one"));
+  NamedAttribute ir =
+      builder.getNamedAttr("ir", builder.getStringAttr(kMLIRText));
+  NamedAttribute num_stages =
+      builder.getNamedAttr("num_stages", builder.getI32IntegerAttr(3));
+  NamedAttribute num_warps =
+      builder.getNamedAttr("num_warps", builder.getI32IntegerAttr(4));
+  NamedAttribute grid_x =
+      builder.getNamedAttr("grid_x", builder.getI32IntegerAttr(1));
+  NamedAttribute grid_y =
+      builder.getNamedAttr("grid_y", builder.getI32IntegerAttr(1));
+  NamedAttribute grid_z =
+      builder.getNamedAttr("grid_z", builder.getI32IntegerAttr(1));
+  NamedAttribute debug =
+      builder.getNamedAttr("debug", builder.getBoolAttr(false));
+
+  std::vector<NamedAttribute> attributes = {
+      name, ir, num_stages, num_warps, grid_x, grid_y, grid_z, debug};
+  ArrayRef<NamedAttribute> attributesRef(attributes);
+  mlir::DictionaryAttr backend_config =
+      mlir::DictionaryAttr::get(&context_, attributesRef);
+
+  // Parse the backend_config into a string.
+  std::string backend_config_str;
+  llvm::raw_string_ostream(backend_config_str) << backend_config;
+
+  return HloInstruction::CreateCustomCall(tuple_shape, {param_0, param_1},
+                                          "__gpu$xla.gpu.triton",
+                                          backend_config_str);
+}
+
+}  // namespace
+
+class GpuIrEmitterUnnestedTest : public GpuCodegenTest {
+ public:
+  se::CudaComputeCapability GetCudaComputeCapability() {
+    return backend()
+        .default_stream_executor()
+        ->GetDeviceDescription()
+        .cuda_compute_capability();
+  }
+};
+
+TEST_F(GpuIrEmitterUnnestedTest,
+       EmitTritonCustomCallWithCorrectLoweringAndWithoutNoaliasOrAlignment) {
+  if (!GetCudaComputeCapability().IsAtLeastAmpere()) {
+    GTEST_SKIP() << "Triton support is only enabled for Ampere GPUs and up.";
+  }
+
+  // Tests that the lowering of a Triton custom call produces the correct LLVM
+  // IR, and that the arguments do not specify noalias or alignment attributes.
+
+  HloComputation::Builder computation_builder(TestName());
+
+  // Create parameters and custom call in the computation builder.
+  Shape scalar_shape = xla::ShapeUtil::MakeShape(xla::F32, {});
+  Shape tuple_shape = ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape});
+
+  HloInstruction* param_0 = computation_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "arg_0"));
+
+  HloInstruction* param_1 = computation_builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "arg_1"));
+
+  computation_builder.AddInstruction(
+      CreateAddTritonCustomCall(tuple_shape, param_0, param_1));
+
+  auto module = CreateNewVerifiedModule();
+  module->AddEntryComputation(computation_builder.Build());
+
+  // Check that the compiled llvm ir matches the expected lowering of our tt ir.
+  // We check that the arguments do not specify noalias or alignment attributes,
+  // as this prevents recompilation based on the alignment of the input buffers.
+  CompileAndVerifyIr(std::move(module),
+                     R"(
+; CHECK: @add_one
+; CHECK-NOT: noalias align
+; CHECK-SAME: dereferenceable(4) %arg0
+; CHECK-NOT: noalias align
+; CHECK-SAME: dereferenceable(4) %arg1
+; CHECK-NOT: noalias align
+; CHECK-SAME: dereferenceable(4) %arg2
+; CHECK-NOT: noalias align
+; CHECK-SAME: dereferenceable(4) %arg3
+; CHECK-DAG:  addrspacecast ptr %arg0 to ptr addrspace(1)
+; CHECK-DAG:  addrspacecast ptr %arg1 to ptr addrspace(1)
+; CHECK-DAG:  addrspacecast ptr %arg2 to ptr addrspace(1)
+; CHECK-DAG:  addrspacecast ptr %arg3 to ptr addrspace(1)
+; CHECK: tail call i32 asm sideeffect
+; CHECK: tail call i32 asm sideeffect
+; CHECK: fadd float
+; CHECK-SAME: 1.000000e+00
+; CHECK-DAG: tail call void asm sideeffect
+; CHECK-DAG: tail call void asm sideeffect
+; CHECK:    ret void
+      )",
+                     /*match_optimized_ir=*/false);
+}
+
+TEST_F(GpuIrEmitterUnnestedTest, CanNotEmitTritonCustomCallOnPreAmpereGpu) {
+  if (GetCudaComputeCapability().IsAtLeastAmpere()) {
+    GTEST_SKIP() << "Running on Ampere or more recent GPU, skipping.";
+  }
+
+  HloComputation::Builder computation_builder(TestName());
+
+  // Create parameters and custom call in the computation builder.
+  Shape scalar_shape = xla::ShapeUtil::MakeShape(xla::F32, {});
+  Shape tuple_shape = ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape});
+
+  HloInstruction* param_0 = computation_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "arg_0"));
+
+  HloInstruction* param_1 = computation_builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "arg_1"));
+
+  computation_builder.AddInstruction(
+      CreateAddTritonCustomCall(tuple_shape, param_0, param_1));
+
+  auto module = CreateNewVerifiedModule();
+  module->AddEntryComputation(computation_builder.Build());
+
+  EXPECT_THAT(
+      CompileToExecutable(std::move(module), /*run_optimization_passes=*/false),
+      tsl::testing::StatusIs(
+          absl::StatusCode::kFailedPrecondition,
+          ::testing::StrEq(
+              "Triton support is only enabled for Ampere GPUs and up.")));
+}
+
+class TritonCustomCallTest : public HloTestBase {};
+
+TEST_F(TritonCustomCallTest, NoArgumentDeduplication) {
+  if (auto cc = backend()
+                    .default_stream_executor()
+                    ->GetDeviceDescription()
+                    .cuda_compute_capability();
+      !cc.IsAtLeastAmpere()) {
+    GTEST_SKIP() << "Triton support is only enabled for Ampere GPUs and up.";
+  }
+
+  // Tests that no argument deduplication is done for Triton kernels.
+  //
+  // Triton kernels are compiled on the first call and re-used for all the
+  // following calls. So, if we are unlucky, we could end up calling the
+  // compiled kernel with fewer arguments than it expects in the presence
+  // of argument deduplication.
+  //
+  // For example,
+  //
+  //  * The first call is f(x, y). The arguments are distinct, no deduplication
+  //    is done at compilation time and the compiled kernel expects two
+  //    arguments.
+  //  * The second call is f(x, x). The arguments are deduplicated and we
+  //    call the previously compiled kernel with just x, causing a crash.
+
+  HloComputation::Builder computation_builder(TestName());
+
+  Shape scalar_shape = xla::ShapeUtil::MakeShape(xla::F32, {});
+  Shape tuple_shape = ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape});
+
+  HloInstruction* param_0 = computation_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "arg_0"));
+
+  HloInstruction* param_1 = computation_builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "arg_1"));
+
+  auto* instr_0 = computation_builder.AddInstruction(
+      CreateAddTritonCustomCall(tuple_shape, param_0, param_1));
+  computation_builder.AddInstruction(
+      CreateAddTritonCustomCall(tuple_shape, instr_0, instr_0));
+
+  auto module = CreateNewVerifiedModule();
+  module->AddEntryComputation(computation_builder.Build());
+  EXPECT_TRUE(Run(std::move(module), /*run_hlo_passes=*/false));
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/triton_fusion_analysis.cc b/third_party/xla/xla/service/gpu/triton_fusion_analysis.cc
index 279c96b02a512e..7af7b2a484188c 100644
--- a/third_party/xla/xla/service/gpu/triton_fusion_analysis.cc
+++ b/third_party/xla/xla/service/gpu/triton_fusion_analysis.cc
@@ -42,6 +42,7 @@ limitations under the License.
 #include "xla/tools/hlo_decomposer.h"
 #include "xla/util.h"
 #include "tsl/platform/errors.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla {
 namespace gpu {
@@ -60,14 +61,17 @@ using triton_fusion::TransformDirection;
 
 namespace triton_fusion {
 
-/*static*/ FusionContext FusionContext::FromDotOperand(
+/*static*/ absl::StatusOr<FusionContext> FusionContext::FromDotOperand(
     const HloInstruction& dot, const int operand_number, const int split_k) {
   // There can be either none or one split-K batch dimension.
   const int num_split_k_batch_dims = split_k > 1;
   int split_k_dimension_index = kNoDimensionIndex;
+  TF_ASSIGN_OR_RETURN(int contracting_dimension_index,
+                      ContractingDimensionIndex(dot, operand_number));
+  TF_ASSIGN_OR_RETURN(int non_contracting_dimension_index,
+                      NonContractingDimensionIndex(dot, operand_number));
   if (split_k > 1) {
-    split_k_dimension_index =
-        ContractingDimensionIndex(dot, operand_number) - 1;
+    split_k_dimension_index = contracting_dimension_index - 1;
   }
   int splittable_dimension_index = kNoDimensionIndex;
   // LHS non-contracting dimension can be split if non-splitK batch is absent.
@@ -75,14 +79,11 @@ namespace triton_fusion {
       dot.dot_dimension_numbers().lhs_batch_dimensions_size() -
               num_split_k_batch_dims ==
           0) {
-    splittable_dimension_index =
-        NonContractingDimensionIndex(dot, operand_number);
+    splittable_dimension_index = non_contracting_dimension_index;
   }
-  FusionContext context(
-      DotProperties{
-          static_cast<int>(NonContractingDimensionIndex(dot, operand_number)),
-          splittable_dimension_index},
-      DotRequirements(kNoSplitRequirement));
+  FusionContext context(DotProperties{non_contracting_dimension_index,
+                                      splittable_dimension_index},
+                        DotRequirements(kNoSplitRequirement));
   context.dim_orders_[dot.operand(operand_number)] =
       DimensionOrder::FromDotOperandOrOutput(*dot.operand(operand_number),
                                              split_k_dimension_index);
@@ -279,7 +280,8 @@ absl::Status TritonFusionAnalysis::ExecuteForDotFusion(
     if (dot.operand_count() < operand_number + 1) {
       continue;  // Meta scope is optional.
     }
-    auto context = FusionContext::FromDotOperand(dot, operand_number, split_k);
+    TF_ASSIGN_OR_RETURN(auto context, FusionContext::FromDotOperand(
+                                          dot, operand_number, split_k));
     TF_RETURN_IF_ERROR(context.PropagateDimensionOrdersToParameters(
         *dot.operand(operand_number), parameters_[scope], iter_specs_[scope]));
     if (scope == Scope::LHS) {
diff --git a/third_party/xla/xla/service/gpu/triton_fusion_analysis.h b/third_party/xla/xla/service/gpu/triton_fusion_analysis.h
index 295611388fae37..8459f86c8ffcea 100644
--- a/third_party/xla/xla/service/gpu/triton_fusion_analysis.h
+++ b/third_party/xla/xla/service/gpu/triton_fusion_analysis.h
@@ -97,8 +97,9 @@ class FusionContext {
  public:
   // Create fusion context from a dot operand according to
   // the currently supported configurations.
-  static FusionContext FromDotOperand(const HloInstruction& dot,
-                                      int operand_number, int split_k = 1);
+  static absl::StatusOr<FusionContext> FromDotOperand(const HloInstruction& dot,
+                                                      int operand_number,
+                                                      int split_k = 1);
 
   // Create fusion context from dot's output.
   static FusionContext FromDotOutput(const HloInstruction& dot, int split_k,
diff --git a/third_party/xla/xla/service/gpu/triton_fusion_analysis_test.cc b/third_party/xla/xla/service/gpu/triton_fusion_analysis_test.cc
index 9c96ee62d5a35f..4b0e2a481cd0d2 100644
--- a/third_party/xla/xla/service/gpu/triton_fusion_analysis_test.cc
+++ b/third_party/xla/xla/service/gpu/triton_fusion_analysis_test.cc
@@ -115,6 +115,57 @@ ENTRY e {
                             /*slice_limit=*/3, ElementsAre(3))));
 }
 
+TEST_F(TritonDotAnalysisTest, DoNotRemoveTrivialDimensionForDot) {
+  const std::string hlo_text = R"(
+HloModule t, is_scheduled=true
+
+triton_dot {
+  param_0.1 = f32[137,115]{1,0} parameter(0)
+  param_1.1 = f32[1,115]{1,0} parameter(1)
+  ROOT dot = f32[137,1]{1,0} dot(param_0.1, param_1.1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+}
+
+ENTRY e {
+  p0 = f32[137,115]{1,0} parameter(0)
+  p1 = f32[1,115]{1,0} parameter(1)
+  ROOT custom-call = f32[137,1]{1,0} fusion(p0, p1), kind=kCustom,
+    calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":16,"block_n":64,"block_k":32,
+                         "split_k":1,"num_stages":1,"num_warps":2,
+                         "num_ctas":1}}}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(hlo_text));
+  const HloComputation* dot_computation =
+      module->entry_computation()->root_instruction()->called_computations()[0];
+  const HloInstruction* p0 = dot_computation->parameter_instruction(0);
+  const HloInstruction* p1 = dot_computation->parameter_instruction(1);
+  TF_ASSERT_OK_AND_ASSIGN(const auto analysis,
+                          TritonFusionAnalysis::Execute(*dot_computation));
+  EXPECT_EQ(*analysis.ScopeParameters(TritonFusionAnalysis::Scope::LHS).begin(),
+            p0);
+  EXPECT_EQ(*analysis.ScopeParameters(TritonFusionAnalysis::Scope::RHS).begin(),
+            p1);
+  EXPECT_THAT(
+      *analysis.IterSpec(TritonFusionAnalysis::Scope::LHS, p0, 0),
+      ElementsAre(FieldsAre(/*stride=*/115, /*count=*/137, /*slice_start=*/0,
+                            /*slice_limit=*/137, ElementsAre(137))));
+  EXPECT_THAT(
+      *analysis.IterSpec(TritonFusionAnalysis::Scope::LHS, p0, 1),
+      ElementsAre(FieldsAre(/*stride=*/1, /*count=*/115, /*slice_start=*/0,
+                            /*slice_limit=*/115, ElementsAre(115))));
+  EXPECT_THAT(
+      *analysis.IterSpec(TritonFusionAnalysis::Scope::RHS, p1, 0),
+      ElementsAre(FieldsAre(/*stride=*/115, /*count=*/1, /*slice_start=*/0,
+                            /*slice_limit=*/1, ElementsAre(1))));
+  EXPECT_THAT(
+      *analysis.IterSpec(TritonFusionAnalysis::Scope::RHS, p1, 1),
+      ElementsAre(FieldsAre(/*stride=*/1, /*count=*/115, /*slice_start=*/0,
+                            /*slice_limit=*/115, ElementsAre(115))));
+}
+
 TEST_F(TritonDotAnalysisTest, Merge) {
   const std::string hlo_text = R"(
 HloModule t
@@ -687,9 +738,11 @@ ENTRY e {
               ElementsAre(FieldsAre(/*stride=*/1, /*count=*/97,
                                     /*slice_start=*/0, /*slice_limit=*/97,
                                     /*subfragments=*/ElementsAre(97))));
-  EXPECT_EQ(analysis.IterSpec(TritonFusionAnalysis::Scope::OUTPUT,
-                              computation->root_instruction(), 1),
-            nullptr);
+  EXPECT_THAT(*analysis.IterSpec(TritonFusionAnalysis::Scope::OUTPUT,
+                                 computation->root_instruction(), 1),
+              ElementsAre(FieldsAre(/*stride=*/97, /*count=*/1,
+                                    /*slice_start=*/0, /*slice_limit=*/1,
+                                    /*subfragments=*/ElementsAre(1))));
 }
 
 TEST_F(TritonSoftmaxAnalysisTest, BroadcastIntoBatchDimensionIsSupported) {
diff --git a/third_party/xla/xla/service/gpu/triton_support.cc b/third_party/xla/xla/service/gpu/triton_support.cc
index a4c837c182f13b..66631dbd19ad3d 100644
--- a/third_party/xla/xla/service/gpu/triton_support.cc
+++ b/third_party/xla/xla/service/gpu/triton_support.cc
@@ -20,11 +20,15 @@ limitations under the License.
 #include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/log/check.h"
+#include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/service/gpu/variant_visitor.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/xla_data.pb.h"
+#include "tsl/platform/tensor_float_32_utils.h"
 
 namespace xla {
 namespace gpu {
@@ -50,6 +54,7 @@ bool IsDistributiveOverAddition(const HloInstruction& hlo) {
 // BF16 is supported in a sense that all operations on it are implemented
 // through F32 and converts have to be inserted into the HLO graph, but
 // they can be missing during fusion.
+// TODO(b/266862493): Support more data types (F8, F64, etc.).
 bool IsTritonSupportedDataType(PrimitiveType type,
                                const se::GpuComputeCapability& gpu_version) {
   switch (type) {
@@ -61,15 +66,13 @@ bool IsTritonSupportedDataType(PrimitiveType type,
     case F32:
       return true;
     case BF16:
-      return std::visit(
-          VariantVisitor{[](const se::CudaComputeCapability& cc) {
-                           return cc.IsAtLeast(
-                               stream_executor::CudaComputeCapability::AMPERE);
-                         },
-                         [](const se::RocmComputeCapability& cc) {
-                           return cc.has_bf16_dtype_support();
-                         }},
-          gpu_version);
+      return std::visit(VariantVisitor{[](const se::CudaComputeCapability& cc) {
+                                         return true;
+                                       },
+                                       [](const se::RocmComputeCapability& cc) {
+                                         return cc.has_bf16_dtype_support();
+                                       }},
+                        gpu_version);
     default:
       return false;
   }
@@ -132,5 +135,218 @@ bool IsTritonSupportedElementwise(HloOpcode opcode,
                                opcode);
 }
 
+CodegenDecision CanTritonHandleElementwise(
+    const HloInstruction& instr, const se::GpuComputeCapability& gpu_version) {
+  if (!IsTritonSupportedDataType(instr.shape().element_type(), gpu_version)) {
+    return "Unsupported output data type.";
+  }
+
+  for (const HloInstruction* operand : instr.operands()) {
+    if (!IsTritonSupportedDataType(operand->shape().element_type(),
+                                   gpu_version)) {
+      return "Unsupported input data type.";
+    }
+  }
+
+  if (instr.opcode() == HloOpcode::kConstant) {
+    return CodegenDecision{};
+  } else if (!IsTritonSupportedElementwise(
+                 instr.opcode(), instr.operand(0)->shape().element_type())) {
+    return "Unsupported elementwise operation.";
+  }
+  return CodegenDecision{};
+}
+
+bool IsDotAlgorithmSupportedByTriton(
+    PrecisionConfig::Algorithm algorithm,
+    const se::GpuComputeCapability& gpu_version) {
+  auto cuda_compute_capability =
+      std::get_if<se::CudaComputeCapability>(&gpu_version);
+  auto rocm_compute_capability =
+      std::get_if<se::RocmComputeCapability>(&gpu_version);
+  switch (algorithm) {
+    case PrecisionConfig::ALG_DOT_TF32_TF32_F32:
+      if (cuda_compute_capability) {
+        return true;
+      }
+      return false;
+    case PrecisionConfig::ALG_DOT_BF16_BF16_F32:
+    case PrecisionConfig::ALG_DOT_BF16_BF16_F32_X3:
+    case PrecisionConfig::ALG_DOT_BF16_BF16_F32_X6:
+      if (cuda_compute_capability) {
+        return true;
+      }
+      if (rocm_compute_capability) {
+        return rocm_compute_capability->has_bf16_dtype_support();
+      }
+      return false;
+
+    // TODO(b/326579472): Fix the support of this algorithm and maybe allow it
+    // here.
+    case PrecisionConfig::ALG_DOT_F16_F16_F32:
+    // TODO(b/311331155): Triton F32 is about 3x slower than Triton TF32 and is
+    // slow to compile. Disable it for now.
+    case PrecisionConfig::ALG_DOT_F32_F32_F32:
+    default:
+      return false;
+  }
+}
+
+// Filters GEMMs which can be handled using Triton.
+CodegenDecision CanTritonHandleGEMM(
+    const HloDotInstruction& dot, const se::GpuComputeCapability& gpu_version) {
+  auto cuda_compute_capability =
+      std::get_if<se::CudaComputeCapability>(&gpu_version);
+  auto rocm_compute_capability =
+      std::get_if<se::RocmComputeCapability>(&gpu_version);
+
+  CHECK(cuda_compute_capability || rocm_compute_capability);
+
+  if (dot.precision_config().algorithm() == PrecisionConfig::ALG_UNSET) {
+    if (!tsl::tensor_float_32_execution_enabled() ||
+        absl::c_any_of(dot.precision_config().operand_precision(),
+                       [](int x) { return x != PrecisionConfig::DEFAULT; })) {
+      return "Having non-default operand precisions or TensorFloat-32 disabled "
+             "for Dot op with unset algorithm.";
+    }
+  } else {
+    if (!IsDotAlgorithmSupportedByTriton(dot.precision_config().algorithm(),
+                                         gpu_version)) {
+      return "Unsupported algorithm on the current device(s).";
+    }
+  }
+
+  auto supported_output_type = [&](const PrimitiveType t) {
+    switch (t) {
+      case F16:
+      case F32:
+        return true;
+      case BF16:
+        if (cuda_compute_capability) {
+          return true;
+        }
+        if (rocm_compute_capability) {
+          return rocm_compute_capability->has_bf16_dtype_support();
+        }
+        return false;
+      default:
+        return false;
+    }
+  };
+
+  // TODO(b/266862493): Support more output types.
+  if (!supported_output_type(dot.shape().element_type())) {
+    return "Unsupported output data type for Dot op.";
+  }
+
+  if (!IsTritonSupportedDataType(dot.operand(0)->shape().element_type(),
+                                 gpu_version) ||
+      !IsTritonSupportedDataType(dot.operand(1)->shape().element_type(),
+                                 gpu_version)) {
+    return "Unsupported input data type for Dot op.";
+  }
+
+  const DotDimensionNumbers& dim_numbers = dot.dot_dimension_numbers();
+
+  // TODO(b/269580541): support multiple batch dimensions.
+  if (dim_numbers.lhs_batch_dimensions().size() > 1) {
+    return "Multiple batch dimensions.";
+  }
+
+  // Cases where lhs or rhs have no non-contracting dims are not handled.
+  if (dim_numbers.lhs_batch_dimensions().size() +
+              dim_numbers.lhs_contracting_dimensions().size() ==
+          dot.operand(0)->shape().rank() ||
+      dim_numbers.rhs_batch_dimensions().size() +
+              dim_numbers.rhs_contracting_dimensions().size() ==
+          dot.operand(1)->shape().rank()) {
+    return "No non-contracting dimensions.";
+  }
+
+  return CodegenDecision{};
+}
+
+// Filters Reduces which can be handled using Triton.
+CodegenDecision CanTritonHandleReduce(
+    const HloReduceInstruction& reduce,
+    const se::GpuComputeCapability& gpu_version) {
+  if (!IsTritonSupportedDataType(reduce.shape().element_type(), gpu_version)) {
+    return "Unsupported output data type for Reduce op.";
+  }
+
+  for (const HloInstruction* operand : reduce.operands()) {
+    if (!IsTritonSupportedDataType(operand->shape().element_type(),
+                                   gpu_version)) {
+      return "Unsupported input data type for Reduce op.";
+    }
+  }
+
+  bool is_triton_supported_reduction_computation = [&]() {
+    return absl::c_all_of(
+        reduce.to_apply()->instructions(), [&](const HloInstruction* instr) {
+          return IsTritonSupportedInstruction(*instr, gpu_version);
+        });
+  }();
+  if (!is_triton_supported_reduction_computation) {
+    return "Unsupported reduction computation by Triton.";
+  }
+
+  if (reduce.dimensions().size() == 1 &&
+      reduce.dimensions().front() == reduce.operand(0)->shape().rank() - 1 &&
+      reduce.operand_count() == 2) {
+    const HloInstruction* operand = reduce.operand(1);
+    // We assume that the reduction init value was input as a constant, or in
+    // the case of a data type affected by float normalization, a convert of a
+    // constant.
+    if (operand->opcode() == HloOpcode::kConvert) {
+      if (operand->operand(0)->opcode() == HloOpcode::kConstant &&
+          operand->operand(0)->shape().element_type() == BF16 &&
+          operand->shape().element_type() == F32) {
+        return CodegenDecision{};
+      }
+    } else if (operand->opcode() == HloOpcode::kConstant) {
+      return CodegenDecision{};
+    }
+    return "Reduction init value should be a constant or a convert of a "
+           "constant.";
+  }
+  return "Reduction is not a row-reduction of a single operand.";
+}
+
+CodegenDecision IsTritonSupportedInstruction(
+    const HloInstruction& instr, const se::GpuComputeCapability& gpu_version) {
+  if (instr.IsElementwise()) {
+    return CanTritonHandleElementwise(instr, gpu_version);
+  }
+
+  switch (instr.opcode()) {
+    case HloOpcode::kDot: {
+      return CanTritonHandleGEMM(*Cast<HloDotInstruction>(&instr), gpu_version);
+    }
+    case HloOpcode::kReduce: {
+      return CanTritonHandleReduce(*Cast<HloReduceInstruction>(&instr),
+                                   gpu_version);
+    }
+    case HloOpcode::kTuple: {
+      if (instr.IsRoot()) {
+        return CodegenDecision{};
+      }
+      return "Only supports root tuples.";
+    }
+    case HloOpcode::kBitcast:
+    case HloOpcode::kTranspose:
+    case HloOpcode::kSlice:
+    case HloOpcode::kReshape:
+    case HloOpcode::kPad:
+    case HloOpcode::kConcatenate:
+    case HloOpcode::kParameter:
+    case HloOpcode::kBroadcast:
+      return CodegenDecision{};
+    default:
+      break;
+  }
+  return "Unsupported opcode.";
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/triton_support.h b/third_party/xla/xla/service/gpu/triton_support.h
index 02f6da34089f89..072c9ab948ec00 100644
--- a/third_party/xla/xla/service/gpu/triton_support.h
+++ b/third_party/xla/xla/service/gpu/triton_support.h
@@ -22,11 +22,13 @@ limitations under the License.
 
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/service/instruction_fusion.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/xla_data.pb.h"
 
 namespace xla {
 namespace gpu {
+using CodegenDecision = FusionDecision;
 
 // Tells if f(a+b) == f(a) + f(b).
 bool IsDistributiveOverAddition(const HloInstruction& hlo);
@@ -46,6 +48,10 @@ bool IsTritonSupportedDataType(PrimitiveType, const se::GpuComputeCapability&);
 // Checks elementwise operation against all supported by Triton GEMM codegen.
 bool IsTritonSupportedElementwise(HloOpcode, PrimitiveType);
 
+// Checks instruction against requirements of triton emitter.
+CodegenDecision IsTritonSupportedInstruction(
+    const HloInstruction& instr, const se::GpuComputeCapability& gpu_version);
+
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/third_party/xla/xla/service/gpu/triton_support_test.cc b/third_party/xla/xla/service/gpu/triton_support_test.cc
new file mode 100644
index 00000000000000..e3ad43b2f0f783
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/triton_support_test.cc
@@ -0,0 +1,940 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/triton_support.h"
+
+#include <memory>
+#include <string>
+#include <tuple>
+#include <utility>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/base/optimization.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_replace.h"
+#include "absl/strings/string_view.h"
+#include "absl/strings/substitute.h"
+#include "llvm/IR/Module.h"
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "xla/error_spec.h"
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/hlo/utils/hlo_query.h"
+#include "xla/primitive_util.h"
+#include "xla/service/float_normalization.h"
+#include "xla/service/gpu/gpu_device_info_for_tests.h"
+#include "xla/service/gpu/gpu_float_support.h"
+#include "xla/service/gpu/ir_emitter_triton.h"
+#include "xla/service/gpu/matmul_utils.h"
+#include "xla/service/gpu/tests/gpu_codegen_test.h"
+#include "xla/service/gpu/triton_fusion_analysis.h"
+#include "xla/service/hlo_pass_pipeline.h"
+#include "xla/stream_executor/device_description.h"
+#include "xla/xla.pb.h"
+#include "xla/xla_data.pb.h"
+#include "tsl/platform/status_matchers.h"
+#include "tsl/platform/statusor.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class TritonSupportTest : public GpuCodegenTest {
+ public:
+  se::CudaComputeCapability GetCudaComputeCapability() {
+    return backend()
+        .default_stream_executor()
+        ->GetDeviceDescription()
+        .cuda_compute_capability();
+  }
+  absl::StatusOr<bool> ApplyFloatNormalization(HloModule* module) {
+    const GpuFloatSupport bf16_support(GetCudaComputeCapability(), BF16);
+    HloPassPipeline pipeline("hlo float normalization");
+    pipeline.AddPass<FloatNormalization>(&bf16_support);
+    return pipeline.Run(module);
+  }
+
+  float getTolerance(PrimitiveType data_type) {
+    float tolerance;
+    switch (data_type) {
+      case F64:
+      case F32:
+        tolerance = 1e-6;
+        break;
+      case F16:
+        tolerance = 2e-4;
+        break;
+      case BF16:
+        tolerance = 2e-2;
+        break;
+      case PRED:
+      case S8:
+        tolerance = 3e-2;
+        break;
+      case S16:
+        tolerance = 3e-3;
+        break;
+      case S32:
+        tolerance = 3e-3;
+        break;
+      default:
+        ABSL_UNREACHABLE();
+    }
+    return tolerance;
+  }
+
+ protected:
+  llvm::LLVMContext llvm_ctx_;
+  llvm::Module llvm_module_{"module", llvm_ctx_};
+  mlir::MLIRContext mlir_context_;
+  TritonGemmConfig config_{16, 32, 512, 1, 4, 8};
+};
+
+class TritonSupportTestWithParam : public TritonSupportTest,
+                                   public ::testing::WithParamInterface<
+                                       std::tuple<PrimitiveType, HloOpcode>> {};
+
+std::string TestParamsToString(
+    const ::testing::TestParamInfo<std::tuple<PrimitiveType, HloOpcode>>&
+        data) {
+  PrimitiveType data_type;
+  HloOpcode opcode;
+  std::tie(data_type, opcode) = data.param;
+  return absl::StrCat(
+      primitive_util::LowercasePrimitiveTypeName(data_type), "_",
+      absl::StrReplaceAll(HloOpcodeString(opcode), {{"-", "_"}}));
+}
+
+using UnaryElementwiseTest = TritonSupportTestWithParam;
+
+// TODO(b/331636835): updates elementwise op tests to directly emit single op
+// instead of relying on triton gemm kernel.
+TEST_P(UnaryElementwiseTest, IsTritonSupportedExecutesCorrectlyForUnary) {
+  PrimitiveType data_type;
+  HloOpcode opcode;
+  std::tie(data_type, opcode) = GetParam();
+  if (!GetCudaComputeCapability().IsAtLeast(
+          se::CudaComputeCapability::AMPERE) &&
+      data_type == BF16) {
+    GTEST_SKIP() << "No BF16 before Ampere.";
+  }
+
+  const std::string kHloTestTemplate = R"(
+triton_gemm___computation {
+  parameter_0 = f32[15,33]{1,0} parameter(0)
+  parameter_1 = $0[33,68]{1,0} parameter(1)
+  unary = $0[33,68]{1,0} $1(parameter_1)
+  convert = f32[33,68]{1,0} convert(unary)
+  ROOT dot = f32[15,68]{1,0} dot(parameter_0, convert),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0},
+    operand_precision={HIGH, HIGH}
+}
+
+ENTRY e {
+  parameter_0 = f32[15,33]{1,0} parameter(0)
+  parameter_1 = $0[33,68]{1,0} parameter(1)
+  ROOT triton_gemm = f32[15,68]{1,0} fusion(parameter_0, parameter_1),
+    kind=kCustom, calls=triton_gemm___computation,
+    backend_config={"fusion_backend_config":{"kind":"__triton_gemm"}}
+})";
+  const std::string hlo_test = absl::Substitute(
+      kHloTestTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
+      HloOpcodeString(opcode));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(hlo_test));
+  const HloComputation* computation =
+      module->GetComputationWithName("triton_gemm___computation");
+  ASSERT_TRUE(computation != nullptr);
+  const HloInstruction* instr =
+      hlo_query::GetFirstInstructionWithOpcode(*computation, opcode);
+  if (IsTritonSupportedInstruction(*instr, GetCudaComputeCapability())) {
+    float tolerance = getTolerance(data_type);
+    EXPECT_OK(ApplyFloatNormalization(module.get()));
+    EXPECT_TRUE(RunAndCompareNoHloPasses(
+        std::move(module), ErrorSpec{/*aabs=*/tolerance, /*arel=*/tolerance}));
+  } else {
+    // TODO(b/331632717): update the check to use SymbolicTileAnalysis to avoid
+    // tiling failures and check triton emitter fails gracefully.
+    EXPECT_THAT(TritonFusionAnalysis::Execute(*computation),
+                tsl::testing::StatusIs(
+                    absl::StatusCode::kFailedPrecondition,
+                    ::testing::HasSubstr(
+                        "Can not propagate dim orders and requirements")));
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    UnaryElementwiseTestSuite, UnaryElementwiseTest,
+    ::testing::Combine(::testing::Values(S8, S16, S32, F16, F32, BF16),
+                       ::testing::Values(HloOpcode::kConvert, HloOpcode::kAbs,
+                                         HloOpcode::kNegate)),
+    TestParamsToString);
+INSTANTIATE_TEST_SUITE_P(
+    UnaryPREDTestSuite, UnaryElementwiseTest,
+    ::testing::Combine(::testing::Values(PRED),
+                       ::testing::Values(HloOpcode::kConvert, HloOpcode::kNot)),
+    TestParamsToString);
+INSTANTIATE_TEST_SUITE_P(
+    UnaryMathTestSuite, UnaryElementwiseTest,
+    ::testing::Combine(::testing::Values(F16, F32, BF16),
+                       ::testing::Values(HloOpcode::kCos, HloOpcode::kExp,
+                                         HloOpcode::kExpm1, HloOpcode::kLog,
+                                         HloOpcode::kLog1p, HloOpcode::kRsqrt,
+                                         HloOpcode::kSin, HloOpcode::kSqrt,
+                                         HloOpcode::kCbrt, HloOpcode::kTan,
+                                         HloOpcode::kTanh, HloOpcode::kErf)),
+    TestParamsToString);
+
+using BinaryElementwiseTest = TritonSupportTestWithParam;
+
+TEST_P(BinaryElementwiseTest, IsTritonSupportedExecutesCorrectlyForBinaryE) {
+  PrimitiveType data_type;
+  HloOpcode opcode;
+  std::tie(data_type, opcode) = GetParam();
+  if (!GetCudaComputeCapability().IsAtLeast(
+          se::CudaComputeCapability::AMPERE) &&
+      data_type == BF16) {
+    GTEST_SKIP() << "No BF16 before Ampere.";
+  }
+
+  const std::string kHloTestTemplate = R"(
+triton_gemm___computation {
+  parameter_0 = f32[92,11]{1,0} parameter(0)
+  parameter_1 = $0[11,63]{1,0} parameter(1)
+  parameter_2 = $0[11,63]{1,0} parameter(2)
+  binary = $0[11,63]{1,0} $1(parameter_1, parameter_2)
+  convert = f32[11,63]{1,0} convert(binary)
+  ROOT dot = f32[92,63]{1,0} dot(parameter_0, convert),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0},
+    operand_precision={HIGH, HIGH}
+}
+
+ENTRY e {
+  parameter_0 = f32[92,11]{1,0} parameter(0)
+  parameter_1 = $0[11,63]{1,0} parameter(1)
+  parameter_2 = $0[11,63]{1,0} parameter(2)
+  ROOT triton_gemm = f32[92,63]{1,0} fusion(parameter_0, parameter_1, parameter_2),
+    kind=kCustom, calls=triton_gemm___computation,
+    backend_config={"fusion_backend_config":{"kind":"__triton_gemm"}}
+})";
+  const std::string hlo_test = absl::Substitute(
+      kHloTestTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
+      HloOpcodeString(opcode));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(hlo_test));
+  const HloComputation* computation =
+      module->GetComputationWithName("triton_gemm___computation");
+  ASSERT_TRUE(computation != nullptr);
+  const HloInstruction* instr =
+      hlo_query::GetFirstInstructionWithOpcode(*computation, opcode);
+  if (IsTritonSupportedInstruction(*instr, GetCudaComputeCapability())) {
+    float tolerance = getTolerance(data_type);
+    EXPECT_OK(ApplyFloatNormalization(module.get()));
+    EXPECT_TRUE(RunAndCompareNoHloPasses(
+        std::move(module), ErrorSpec{/*aabs=*/tolerance, /*arel=*/tolerance}));
+  } else {
+    EXPECT_THAT(TritonFusionAnalysis::Execute(*computation),
+                ::testing::AnyOf(
+                    tsl::testing::StatusIs(
+                        absl::StatusCode::kInternal,
+                        ::testing::HasSubstr(
+                            "std::holds_alternative<DimOrdersAndReqs>")),
+                    tsl::testing::StatusIs(
+                        absl::StatusCode::kFailedPrecondition,
+                        ::testing::HasSubstr(
+                            "Can not propagate dim orders and requirements"))));
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    BinaryElementwiseTestSuite, BinaryElementwiseTest,
+    ::testing::Combine(::testing::Values(S8, S16, S32, F16, F32, BF16),
+                       ::testing::Values(HloOpcode::kAdd, HloOpcode::kMultiply,
+                                         HloOpcode::kMaximum,
+                                         HloOpcode::kMinimum,
+                                         HloOpcode::kSubtract)),
+    TestParamsToString);
+
+INSTANTIATE_TEST_SUITE_P(BinaryPREDTestSuite, BinaryElementwiseTest,
+                         ::testing::Combine(::testing::Values(PRED),
+                                            ::testing::Values(HloOpcode::kAnd,
+                                                              HloOpcode::kOr,
+                                                              HloOpcode::kXor)),
+                         TestParamsToString);
+INSTANTIATE_TEST_SUITE_P(
+    BinaryMathTestSuite, BinaryElementwiseTest,
+    ::testing::Combine(::testing::Values(F16, F32, BF16),
+                       ::testing::Values(HloOpcode::kAtan2, HloOpcode::kDivide,
+                                         HloOpcode::kPower)),
+    TestParamsToString);
+
+using CompareTest = TritonSupportTestWithParam;
+
+TEST_P(CompareTest, IsTritonSupportedExecutesCorrectlyForCompare) {
+  PrimitiveType data_type;
+  HloOpcode opcode;
+  std::tie(data_type, opcode) = GetParam();
+  if (!GetCudaComputeCapability().IsAtLeast(
+          se::CudaComputeCapability::AMPERE) &&
+      data_type == BF16) {
+    GTEST_SKIP() << "No BF16 before Ampere.";
+  }
+
+  const std::string kHloTestTemplate = R"(
+triton_gemm___computation {
+  parameter_0 = f32[92,11]{1,0} parameter(0)
+  parameter_1 = $0[11,63]{1,0} parameter(1)
+  parameter_2 = $0[11,63]{1,0} parameter(2)
+  compare = pred[11,63]{1,0} $1(parameter_1, parameter_2), direction=GE
+  convert = f32[11,63]{1,0} convert(compare)
+  ROOT dot = f32[92,63]{1,0} dot(parameter_0, convert),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0},
+    operand_precision={HIGH, HIGH}
+}
+
+ENTRY e {
+  parameter_0 = f32[92,11]{1,0} parameter(0)
+  parameter_1 = $0[11,63]{1,0} parameter(1)
+  parameter_2 = $0[11,63]{1,0} parameter(2)
+  ROOT triton_gemm = f32[92,63]{1,0} fusion(parameter_0, parameter_1, parameter_2),
+    kind=kCustom, calls=triton_gemm___computation,
+    backend_config={"fusion_backend_config":{"kind":"__triton_gemm"}}
+})";
+  const std::string hlo_test = absl::Substitute(
+      kHloTestTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
+      HloOpcodeString(opcode));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(hlo_test));
+  const HloComputation* computation =
+      module->GetComputationWithName("triton_gemm___computation");
+  ASSERT_TRUE(computation != nullptr);
+  const HloInstruction* instr =
+      hlo_query::GetFirstInstructionWithOpcode(*computation, opcode);
+  if (IsTritonSupportedInstruction(*instr, GetCudaComputeCapability())) {
+    float tolerance = getTolerance(data_type);
+    EXPECT_OK(ApplyFloatNormalization(module.get()));
+    EXPECT_TRUE(RunAndCompareNoHloPasses(
+        std::move(module), ErrorSpec{/*aabs=*/tolerance, /*arel=*/tolerance}));
+  } else {
+    EXPECT_THAT(
+        TritonFusionAnalysis::Execute(*computation),
+        tsl::testing::StatusIs(
+            absl::StatusCode::kInternal,
+            ::testing::HasSubstr("std::holds_alternative<DimOrdersAndReqs>")));
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    CompareTestSuite, CompareTest,
+    ::testing::Combine(::testing::Values(PRED, S8, S16, S32, F16, F32, BF16),
+                       ::testing::Values(HloOpcode::kCompare)),
+    TestParamsToString);
+
+using TernaryElementwiseTest = TritonSupportTestWithParam;
+
+TEST_P(TernaryElementwiseTest, IsTritonSupportedExecutesCorrectlyForTernary) {
+  PrimitiveType data_type;
+  HloOpcode opcode;
+  std::tie(data_type, opcode) = GetParam();
+  if (!GetCudaComputeCapability().IsAtLeast(
+          se::CudaComputeCapability::AMPERE) &&
+      data_type == BF16) {
+    GTEST_SKIP() << "No BF16 before Ampere.";
+  }
+
+  const std::string kHloTestTemplate = R"(
+triton_gemm___computation {
+  parameter_0 = f32[92,13]{1,0} parameter(0)
+  parameter_1 = $0[13,63]{1,0} parameter(1)
+  parameter_2 = $0[13,63]{1,0} parameter(2)
+  parameter_3 = pred[13,63]{1,0} parameter(3)
+  ternary = $0[13,63]{1,0} $1(parameter_3, parameter_1, parameter_2)
+  convert = f32[13,63]{1,0} convert(ternary)
+  ROOT dot = f32[92,63]{1,0} dot(parameter_0, convert),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0},
+    operand_precision={HIGH, HIGH}
+}
+
+ENTRY e {
+  parameter_0 = f32[92,13]{1,0} parameter(0)
+  parameter_1 = $0[13,63]{1,0} parameter(1)
+  parameter_2 = $0[13,63]{1,0} parameter(2)
+  parameter_3 = pred[13,63]{1,0} parameter(3)
+  ROOT triton_gemm = f32[92,63]{1,0} fusion(parameter_0, parameter_1, parameter_2, parameter_3),
+    kind=kCustom, calls=triton_gemm___computation,
+    backend_config={"fusion_backend_config":{"kind":"__triton_gemm"}}
+})";
+  const std::string hlo_test = absl::Substitute(
+      kHloTestTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
+      HloOpcodeString(opcode));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(hlo_test));
+  const HloComputation* computation =
+      module->GetComputationWithName("triton_gemm___computation");
+  ASSERT_TRUE(computation != nullptr);
+  const HloInstruction* instr =
+      hlo_query::GetFirstInstructionWithOpcode(*computation, opcode);
+  if (IsTritonSupportedInstruction(*instr, GetCudaComputeCapability())) {
+    float tolerance = getTolerance(data_type);
+    EXPECT_OK(ApplyFloatNormalization(module.get()));
+    EXPECT_TRUE(RunAndCompareNoHloPasses(
+        std::move(module), ErrorSpec{/*aabs=*/tolerance, /*arel=*/tolerance}));
+  } else {
+    EXPECT_THAT(
+        TritonFusionAnalysis::Execute(*computation),
+        tsl::testing::StatusIs(
+            absl::StatusCode::kInternal,
+            ::testing::HasSubstr("std::holds_alternative<DimOrdersAndReqs>")));
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    TernaryElementwiseTestSuite, TernaryElementwiseTest,
+    ::testing::Combine(::testing::Values(PRED, S8, S16, S32, F16, F32, BF16),
+                       ::testing::Values(HloOpcode::kSelect)),
+    TestParamsToString);
+
+using DotTest = TritonSupportTestWithParam;
+
+TEST_P(DotTest, IsTritonSupportedExecutesCorrectlyForDot) {
+  PrimitiveType data_type;
+  HloOpcode opcode;
+  std::tie(data_type, opcode) = GetParam();
+  if (!GetCudaComputeCapability().IsAtLeast(
+          se::CudaComputeCapability::AMPERE) &&
+      data_type == BF16) {
+    GTEST_SKIP() << "No BF16 before Ampere.";
+  }
+
+  const std::string kHloTestTemplate = R"(
+triton_gemm___computation {
+  parameter_0 = $0[92,11]{1,0} parameter(0)
+  parameter_1 = $0[11,63]{1,0} parameter(1)
+  ROOT dot = $0[92,63]{1,0} $1(parameter_0, parameter_1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY e {
+  parameter_0 = $0[92,11]{1,0} parameter(0)
+  parameter_1 = $0[11,63]{1,0} parameter(1)
+  ROOT triton_gemm = $0[92,63]{1,0} fusion(parameter_0, parameter_1), kind=kCustom,
+    calls=triton_gemm___computation,
+    backend_config={"fusion_backend_config":{"kind":"__triton_gemm"}}
+})";
+  const std::string hlo_test = absl::Substitute(
+      kHloTestTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
+      HloOpcodeString(opcode));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(hlo_test));
+  const HloComputation* computation =
+      module->GetComputationWithName("triton_gemm___computation");
+  ASSERT_TRUE(computation != nullptr);
+  const HloInstruction* instr =
+      hlo_query::GetFirstInstructionWithOpcode(*computation, opcode);
+  if (IsTritonSupportedInstruction(*instr, GetCudaComputeCapability())) {
+    EXPECT_OK(ApplyFloatNormalization(module.get()));
+    EXPECT_TRUE(RunAndCompareNoHloPasses(
+        std::move(module), ErrorSpec{/*aabs=*/2e-4, /*arel=*/2e-4}));
+  } else {
+    const se::DeviceDescription dev_info =
+        TestGpuDeviceInfo::RTXA6000DeviceInfo(GetCudaComputeCapability());
+    EXPECT_THAT(
+        TritonWrapper(*TritonFusionAnalysis::Execute(*computation), "test_fn",
+                      computation, GetCudaComputeCapability(), dev_info,
+                      config_, &llvm_module_, &EmitMatMul, mlir_context_),
+        tsl::testing::StatusIs(
+            absl::StatusCode::kInternal,
+            ::testing::HasSubstr("Failed to compile Triton kernel")));
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(DotTestTestSuite, DotTest,
+                         ::testing::Combine(::testing::Values(F16, F32, BF16),
+                                            ::testing::Values(HloOpcode::kDot)),
+                         TestParamsToString);
+
+TEST_F(TritonSupportTest, UnsupportedDotOutputTypeFailsGracefullyWithTriton) {
+  const std::string kHloTest = R"(
+triton_gemm___computation {
+  parameter_0 = f32[92,11]{1,0} parameter(0)
+  parameter_1 = f32[11,63]{1,0} parameter(1)
+  ROOT dot = pred[92,63]{1,0} dot(parameter_0, parameter_1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY e {
+  parameter_0 = f32[92,11]{1,0} parameter(0)
+  parameter_1 = f32[11,63]{1,0} parameter(1)
+  ROOT triton_gemm = pred[92,63]{1,0} fusion(parameter_0, parameter_1), kind=kCustom,
+    calls=triton_gemm___computation,
+    backend_config={"fusion_backend_config":{"kind":"__triton_gemm"}}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(kHloTest));
+
+  const HloComputation* computation =
+      hlo_module->GetComputationWithName("triton_gemm___computation");
+  ASSERT_TRUE(computation != nullptr);
+  const HloInstruction* instr =
+      hlo_query::GetFirstInstructionWithOpcode(*computation, HloOpcode::kDot);
+  const se::DeviceDescription dev_info =
+      TestGpuDeviceInfo::RTXA6000DeviceInfo(GetCudaComputeCapability());
+  EXPECT_THAT(IsTritonSupportedInstruction(*instr, GetCudaComputeCapability())
+                  .Explain(),
+              ::testing::HasSubstr("Unsupported output data type for Dot op."));
+  EXPECT_THAT(
+      TritonWrapper(*TritonFusionAnalysis::Execute(*computation), "test_fn",
+                    computation, GetCudaComputeCapability(), dev_info, config_,
+                    &llvm_module_, &EmitMatMul, mlir_context_),
+      tsl::testing::StatusIs(
+          absl::StatusCode::kInternal,
+          ::testing::HasSubstr("pm.run(triton_module.get()).succeeded()")));
+}
+
+TEST_F(TritonSupportTest,
+       UnsupportedDotWithMultipleBatchDimensionsFailsGracefullyWithTriton) {
+  const std::string kHloTest = R"(
+triton_gemm___computation {
+  parameter_0 = f32[2,2,2,2]{3,2,1,0} parameter(0)
+  parameter_1 = f32[2,2,2,2]{3,2,1,0} parameter(1)
+  ROOT dot = f32[2,2,2,2]{3,2,1,0} dot(parameter_0, parameter_1),
+    lhs_contracting_dims={3}, lhs_batch_dims={1,0}, rhs_contracting_dims={2},
+    rhs_batch_dims={1,0}
+}
+
+ENTRY e {
+  parameter_0 = f32[2,2,2,2]{3,2,1,0} parameter(0)
+  parameter_1 = f32[2,2,2,2]{3,2,1,0} parameter(1)
+  ROOT triton_gemm = f32[2,2,2,2]{3,2,1,0} fusion(parameter_0, parameter_1),
+    kind=kCustom, calls=triton_gemm___computation,
+    backend_config={"fusion_backend_config":{"kind":"__triton_gemm"}}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(kHloTest));
+
+  const HloComputation* computation =
+      hlo_module->GetComputationWithName("triton_gemm___computation");
+  ASSERT_TRUE(computation != nullptr);
+  const HloInstruction* instr =
+      hlo_query::GetFirstInstructionWithOpcode(*computation, HloOpcode::kDot);
+  const se::DeviceDescription dev_info =
+      TestGpuDeviceInfo::RTXA6000DeviceInfo(GetCudaComputeCapability());
+  EXPECT_THAT(IsTritonSupportedInstruction(*instr, GetCudaComputeCapability())
+                  .Explain(),
+              ::testing::HasSubstr("Multiple batch dimensions"));
+  EXPECT_THAT(
+      TritonWrapper(*TritonFusionAnalysis::Execute(*computation), "test_fn",
+                    computation, GetCudaComputeCapability(), dev_info, config_,
+                    &llvm_module_, &EmitMatMul, mlir_context_),
+      tsl::testing::StatusIs(absl::StatusCode::kInternal,
+                             ::testing::HasSubstr("num_batch_dims <= 1")));
+}
+
+TEST_F(TritonSupportTest,
+       UnsupportedDotWithNoNonContractingDimensionsFailsGracefullyWithTriton) {
+  const std::string kHloTest = R"(
+triton_gemm___computation {
+  parameter_0 = f32[2]{0} parameter(0)
+  parameter_1 = f32[2]{0} parameter(1)
+  ROOT dot = f32[] dot(parameter_0, parameter_1),
+    lhs_contracting_dims={0}, rhs_contracting_dims={0}
+}
+
+ENTRY e {
+  parameter_0 = f32[2]{0} parameter(0)
+  parameter_1 = f32[2]{0} parameter(1)
+  ROOT triton_gemm = f32[] fusion(parameter_0, parameter_1), kind=kCustom,
+    calls=triton_gemm___computation,
+    backend_config={"fusion_backend_config":{"kind":"__triton_gemm"}}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(kHloTest));
+
+  const HloComputation* computation =
+      hlo_module->GetComputationWithName("triton_gemm___computation");
+  ASSERT_TRUE(computation != nullptr);
+  const HloInstruction* instr =
+      hlo_query::GetFirstInstructionWithOpcode(*computation, HloOpcode::kDot);
+  EXPECT_THAT(IsTritonSupportedInstruction(*instr, GetCudaComputeCapability())
+                  .Explain(),
+              ::testing::HasSubstr("No non-contracting dimensions."));
+  EXPECT_THAT(TritonFusionAnalysis::Execute(*computation),
+              tsl::testing::StatusIs(
+                  absl::StatusCode::kInternal,
+                  ::testing::HasSubstr("non_contracting_dims.size() == 1")));
+}
+
+using ReduceConstTest = TritonSupportTestWithParam;
+TEST_P(ReduceConstTest,
+       IsTritonSupportedExecutesCorrectlyForReduceWithConstInit) {
+  PrimitiveType data_type;
+  HloOpcode opcode;
+  std::tie(data_type, opcode) = GetParam();
+  if (!GetCudaComputeCapability().IsAtLeast(
+          se::CudaComputeCapability::AMPERE) &&
+      data_type == BF16) {
+    GTEST_SKIP() << "No BF16 before Ampere.";
+  }
+
+  const std::string kHloTestTemplate = R"(
+HloModule t
+add {
+  Arg_0 = $0[] parameter(0)
+  Arg_1 = $0[] parameter(1)
+  ROOT add = $0[] add(Arg_0, Arg_1)
+}
+
+triton_softmax_computation {
+  parameter_0 = $0[125,127]{1,0} parameter(0)
+  multiply_0 = $0[125,127]{1,0} multiply(parameter_0, parameter_0)
+  constant_0 = $0[] constant(0)
+  reduce = $0[125]{0} $1(multiply_0, constant_0), dimensions={1}, to_apply=add
+  broadcast = $0[125,127]{1,0} broadcast(reduce), dimensions={0}
+  ROOT multiply = $0[125,127]{1,0} multiply(multiply_0, broadcast)
+}
+
+ENTRY main {
+  parameter_0 = $0[125,127]{1,0} parameter(0)
+  ROOT triton_softmax = $0[125,127]{1,0} fusion(parameter_0),
+                          kind=kCustom, calls=triton_softmax_computation,
+                          backend_config={"fusion_backend_config":
+                                           {"kind":"__triton_softmax"}}
+})";
+  const std::string hlo_test = absl::Substitute(
+      kHloTestTemplate, primitive_util::LowercasePrimitiveTypeName(data_type),
+      HloOpcodeString(opcode));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(hlo_test));
+
+  const HloComputation* computation =
+      module->GetComputationWithName("triton_softmax_computation");
+  ASSERT_TRUE(computation != nullptr);
+  const HloInstruction* instr =
+      hlo_query::GetFirstInstructionWithOpcode(*computation, opcode);
+  if (IsTritonSupportedInstruction(*instr, GetCudaComputeCapability())) {
+    float tolerance = getTolerance(data_type);
+    EXPECT_OK(ApplyFloatNormalization(module.get()));
+    EXPECT_TRUE(RunAndCompareNoHloPasses(
+        std::move(module), ErrorSpec{/*aabs=*/tolerance, /*arel=*/tolerance}));
+  } else {
+    const se::DeviceDescription dev_info =
+        TestGpuDeviceInfo::RTXA6000DeviceInfo(GetCudaComputeCapability());
+    EXPECT_THAT(
+        TritonWrapper(*TritonFusionAnalysis::Execute(*computation), "test_fn",
+                      computation, GetCudaComputeCapability(), dev_info,
+                      config_, &llvm_module_, &EmitSoftMax, mlir_context_),
+        tsl::testing::StatusIs(
+            absl::StatusCode::kInternal,
+            ::testing::HasSubstr("Failed to compile Triton kernel")));
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    ReduceConstTestSuite, ReduceConstTest,
+    ::testing::Combine(::testing::Values(F16, F32, BF16),
+                       ::testing::Values(HloOpcode::kReduce)),
+    TestParamsToString);
+
+TEST_F(TritonSupportTest,
+       SupportedReduceWithConvertConstantIsCodegenedSuccessfullyWithTriton) {
+  if (!GetCudaComputeCapability().IsAtLeast(
+          se::CudaComputeCapability::AMPERE)) {
+    GTEST_SKIP() << "No BF16 before Ampere.";
+  }
+  const std::string kHloTest = R"(
+HloModule t
+add {
+  Arg_0 = f32[] parameter(0)
+  Arg_1 = f32[] parameter(1)
+  ROOT add = f32[] add(Arg_0, Arg_1)
+}
+
+triton_softmax_computation {
+  parameter_0 = f32[125,127]{1,0} parameter(0)
+  multiply_0 = f32[125,127]{1,0} multiply(parameter_0, parameter_0)
+  constant_0 = bf16[] constant(0)
+  convert_0 = f32[] convert(constant_0)
+  reduce = f32[125]{0} reduce(multiply_0, convert_0), dimensions={1}, to_apply=add
+  broadcast = f32[125,127]{1,0} broadcast(reduce), dimensions={0}
+  ROOT multiply = f32[125,127]{1,0} multiply(multiply_0, broadcast)
+}
+
+ENTRY main {
+  parameter_0 = f32[125,127]{1,0} parameter(0)
+  ROOT triton_softmax = f32[125,127]{1,0} fusion(parameter_0), kind=kCustom,
+  calls=triton_softmax_computation,
+                        backend_config={"fusion_backend_config":
+                        {"kind":"__triton_softmax"}}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(kHloTest));
+
+  const HloComputation* computation =
+      hlo_module->GetComputationWithName("triton_softmax_computation");
+  ASSERT_TRUE(computation != nullptr);
+  const HloInstruction* instr = hlo_query::GetFirstInstructionWithOpcode(
+      *computation, HloOpcode::kReduce);
+  EXPECT_TRUE(IsTritonSupportedInstruction(*instr, GetCudaComputeCapability())
+                  .CanFuse());
+  EXPECT_OK(ApplyFloatNormalization(hlo_module.get()));
+  EXPECT_TRUE(RunAndCompareNoHloPasses(
+      std::move(hlo_module), ErrorSpec{/*aabs=*/2e-4, /*arel=*/2e-4}));
+}
+
+TEST_F(
+    TritonSupportTest,
+    UnsupportedReduceWithMoreThanOneReduceDimensionsFailsGracefullyWithTriton) {
+  const std::string kHloTest = R"(
+HloModule t
+add {
+  Arg_0 = f32[] parameter(0)
+  Arg_1 = f32[] parameter(1)
+  ROOT add = f32[] add(Arg_0, Arg_1)
+}
+
+triton_softmax_computation {
+  parameter_0 = f32[2,125,127]{2,1,0} parameter(0)
+  multiply_0 = f32[2,125,127]{2,1,0} multiply(parameter_0, parameter_0)
+  constant_0 = f32[] constant(0)
+  reduce = f32[2]{0} reduce(multiply_0, constant_0), dimensions={1,2}, to_apply=add
+  broadcast = f32[2,125,127]{2,1,0} broadcast(reduce), dimensions={0}
+  ROOT multiply = f32[2,125,127]{2,1,0} multiply(multiply_0, broadcast)
+}
+
+ENTRY main {
+  parameter_0 = f32[2,125,127]{2,1,0} parameter(0)
+  ROOT triton_softmax = f32[2,125,127]{2,1,0} fusion(parameter_0),
+                          kind=kCustom, calls=triton_softmax_computation,
+                          backend_config={"fusion_backend_config":
+                                            {"kind":"__triton_softmax"}}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(kHloTest));
+
+  const HloComputation* computation =
+      hlo_module->GetComputationWithName("triton_softmax_computation");
+  ASSERT_TRUE(computation != nullptr);
+  const HloInstruction* instr = hlo_query::GetFirstInstructionWithOpcode(
+      *computation, HloOpcode::kReduce);
+  EXPECT_THAT(IsTritonSupportedInstruction(*instr, GetCudaComputeCapability())
+                  .Explain(),
+              ::testing::HasSubstr(
+                  "Reduction is not a row-reduction of a single operand."));
+  EXPECT_THAT(TritonFusionAnalysis::Execute(*computation),
+              tsl::testing::StatusIs(
+                  absl::StatusCode::kFailedPrecondition,
+                  ::testing::HasSubstr(
+                      "Can not propagate dim orders and requirements")));
+}
+
+TEST_F(TritonSupportTest,
+       UnsupportedReduceWithNoneLastReduceDimensionFailsGracefullyWithTriton) {
+  const std::string kHloTest = R"(
+HloModule t
+add {
+  Arg_0 = f32[] parameter(0)
+  Arg_1 = f32[] parameter(1)
+  ROOT add = f32[] add(Arg_0, Arg_1)
+}
+
+triton_softmax_computation {
+  parameter_0 = f32[2,125,127]{2,1,0} parameter(0)
+  multiply_0 = f32[2,125,127]{2,1,0} multiply(parameter_0, parameter_0)
+  constant_0 = f32[] constant(0)
+  reduce = f32[2,127]{1,0} reduce(multiply_0, constant_0), dimensions={1}, to_apply=add
+  broadcast = f32[2,125,127]{2,1,0} broadcast(reduce), dimensions={0,2}
+  ROOT multiply = f32[2,125,127]{2,1,0} multiply(multiply_0, broadcast)
+}
+
+ENTRY main {
+  parameter_0 = f32[2,125,127]{2,1,0} parameter(0)
+  ROOT triton_softmax = f32[2,125,127]{2,1,0} fusion(parameter_0),
+                          kind=kCustom, calls=triton_softmax_computation,
+                          backend_config={"fusion_backend_config":
+                                            {"kind":"__triton_softmax"}}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(kHloTest));
+
+  const HloComputation* computation =
+      hlo_module->GetComputationWithName("triton_softmax_computation");
+  ASSERT_TRUE(computation != nullptr);
+  const HloInstruction* instr = hlo_query::GetFirstInstructionWithOpcode(
+      *computation, HloOpcode::kReduce);
+  EXPECT_THAT(IsTritonSupportedInstruction(*instr, GetCudaComputeCapability())
+                  .Explain(),
+              ::testing::HasSubstr(
+                  "Reduction is not a row-reduction of a single operand."));
+  EXPECT_THAT(TritonFusionAnalysis::Execute(*computation),
+              tsl::testing::StatusIs(
+                  absl::StatusCode::kFailedPrecondition,
+                  ::testing::HasSubstr(
+                      "Can not propagate dim orders and requirements")));
+}
+
+TEST_F(TritonSupportTest,
+       UnsupportedReduceWithMoreThanOneOperandsFailsGracefullyWithTriton) {
+  const std::string kHloTest = R"(
+HloModule t
+add {
+  Arg_0 = f32[] parameter(0)
+  Arg_2 = f32[] parameter(1)
+  Arg_1 = f32[] parameter(2)
+  Arg_3 = f32[] parameter(3)
+  add_0 = f32[] add(Arg_0, Arg_2)
+  add_1 = f32[] add(Arg_1, Arg_3)
+  ROOT pair = (f32[], f32[]) tuple(add_0, add_1)
+}
+
+triton_softmax_computation {
+  parameter_0 = f32[125,127] parameter(0)
+  multiply_0 = f32[125,127]{1,0} multiply(parameter_0, parameter_0)
+  constant_0 = f32[] constant(0)
+  tuple_0 = (f32[125]{0}, f32[125]{0}) reduce(multiply_0, multiply_0, constant_0, constant_0), dimensions={1}, to_apply=add
+  reduce = f32[125]{0} get-tuple-element(tuple_0), index=0
+  broadcast = f32[125,127]{1,0} broadcast(reduce), dimensions={0}
+  ROOT multiply = f32[125,127]{1,0} multiply(multiply_0, broadcast)
+}
+
+ENTRY main {
+  parameter_0 = f32[125,127]{1,0} parameter(0)
+  ROOT triton_softmax = f32[125,127]{1,0} fusion(parameter_0),
+                          kind=kCustom, calls=triton_softmax_computation,
+                          backend_config={"fusion_backend_config":
+                                           {"kind":"__triton_softmax"}}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(kHloTest));
+
+  const HloComputation* computation =
+      hlo_module->GetComputationWithName("triton_softmax_computation");
+  ASSERT_TRUE(computation != nullptr);
+  const HloInstruction* instr = hlo_query::GetFirstInstructionWithOpcode(
+      *computation, HloOpcode::kReduce);
+  EXPECT_THAT(
+      IsTritonSupportedInstruction(*instr, GetCudaComputeCapability())
+          .Explain(),
+      ::testing::HasSubstr("Unsupported output data type for Reduce op."));
+  EXPECT_THAT(TritonFusionAnalysis::Execute(*computation),
+              tsl::testing::StatusIs(
+                  absl::StatusCode::kFailedPrecondition,
+                  ::testing::HasSubstr(
+                      "Can not propagate dim orders and requirements")));
+}
+
+TEST_F(TritonSupportTest,
+       UnsupportedReduceWithNonConstReduceValueFailsGracefullyWithTriton) {
+  const std::string kHloTest = R"(
+HloModule t
+add {
+  Arg_0 = f32[] parameter(0)
+  Arg_1 = f32[] parameter(1)
+  ROOT add = f32[] add(Arg_0, Arg_1)
+}
+
+triton_softmax_computation {
+  parameter_0 = f32[125,127]{1,0} parameter(0)
+  multiply_0 = f32[125,127]{1,0} multiply(parameter_0, parameter_0)
+  init = f32[] parameter(1)
+  reduce = f32[125]{0} reduce(multiply_0, init), dimensions={1}, to_apply=add
+  broadcast = f32[125,127]{1,0} broadcast(reduce), dimensions={0}
+  ROOT multiply = f32[125,127]{1,0} multiply(multiply_0, broadcast)
+}
+
+ENTRY main {
+  parameter_0 = f32[125,127]{1,0} parameter(0)
+  parameter_1 = f32[] parameter(1)
+  ROOT triton_softmax = f32[125,127]{1,0} fusion(parameter_0, parameter_1),
+                          kind=kCustom, calls=triton_softmax_computation,
+                        backend_config={"fusion_backend_config":
+                                         {"kind":"__triton_softmax"}}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(kHloTest));
+
+  const HloComputation* computation =
+      hlo_module->GetComputationWithName("triton_softmax_computation");
+  ASSERT_TRUE(computation != nullptr);
+  const HloInstruction* instr = hlo_query::GetFirstInstructionWithOpcode(
+      *computation, HloOpcode::kReduce);
+  const se::DeviceDescription dev_info =
+      TestGpuDeviceInfo::RTXA6000DeviceInfo(GetCudaComputeCapability());
+  EXPECT_THAT(IsTritonSupportedInstruction(*instr, GetCudaComputeCapability())
+                  .Explain(),
+              ::testing::HasSubstr("Reduction init value should be a constant "
+                                   "or a convert of a constant."));
+  EXPECT_THAT(
+      TritonWrapper(*TritonFusionAnalysis::Execute(*computation), "test_fn",
+                    computation, GetCudaComputeCapability(), dev_info, config_,
+                    &llvm_module_, &EmitSoftMax, mlir_context_),
+      tsl::testing::StatusIs(
+          absl::StatusCode::kInternal,
+          ::testing::HasSubstr("operand->opcode() == HloOpcode::kConstant")));
+}
+
+TEST_F(TritonSupportTest,
+       UnsupportedReductionComputationFailsGracefullyWithTriton) {
+  const std::string kHloTest = R"(
+HloModule t
+custom_call {
+  Arg_0 = f32[] parameter(0)
+  Arg_1 = f32[] parameter(1)
+  ROOT custom_call = f32[] custom-call(Arg_0, Arg_1), custom_call_target="foo"
+}
+
+triton_softmax_computation {
+  parameter_0 = f32[125,127]{1,0} parameter(0)
+  multiply_0 = f32[125,127]{1,0} multiply(parameter_0, parameter_0)
+  constant_0 = f32[] constant(0)
+  reduce = f32[125]{0} reduce(multiply_0, constant_0), dimensions={1}, to_apply=custom_call
+  broadcast = f32[125,127]{1,0} broadcast(reduce), dimensions={0}
+  ROOT multiply = f32[125,127]{1,0} multiply(multiply_0, broadcast)
+}
+
+ENTRY main {
+  parameter_0 = f32[125,127]{1,0} parameter(0)
+  ROOT triton_softmax = f32[125,127]{1,0} fusion(parameter_0),
+                          kind=kCustom, calls=triton_softmax_computation,
+                          backend_config={"fusion_backend_config":
+                                         {"kind":"__triton_softmax"}}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(kHloTest));
+
+  const HloComputation* computation =
+      hlo_module->GetComputationWithName("triton_softmax_computation");
+  ASSERT_TRUE(computation != nullptr);
+  const HloInstruction* instr = hlo_query::GetFirstInstructionWithOpcode(
+      *computation, HloOpcode::kReduce);
+  const se::DeviceDescription dev_info =
+      TestGpuDeviceInfo::RTXA6000DeviceInfo(GetCudaComputeCapability());
+  EXPECT_THAT(
+      IsTritonSupportedInstruction(*instr, GetCudaComputeCapability())
+          .Explain(),
+      ::testing::HasSubstr("Unsupported reduction computation by Triton."));
+  EXPECT_THAT(
+      TritonWrapper(*TritonFusionAnalysis::Execute(*computation), "test_fn",
+                    computation, GetCudaComputeCapability(), dev_info, config_,
+                    &llvm_module_, &EmitSoftMax, mlir_context_),
+      tsl::testing::StatusIs(absl::StatusCode::kInvalidArgument,
+                             ::testing::HasSubstr("Unsupported operation")));
+}
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc b/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc
index 123032dd955cfe..89b54ed4b4a4fa 100644
--- a/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc
+++ b/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc
@@ -49,6 +49,27 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+namespace {
+
+// The input is a map from dimension index to DimIterationSpec. The function
+// removes dimensions that have a trivial DimIterationSpec.
+absl::flat_hash_map<int, TensorIterationSpec::DimIterationSpec>
+FilterTrivialDims(
+    const absl::flat_hash_map<int, TensorIterationSpec::DimIterationSpec>&
+        dim_iter_specs) {
+  absl::flat_hash_map<int, TensorIterationSpec::DimIterationSpec>
+      non_trivial_dim_iteration_specs;
+  for (const auto& [dim, dim_spec] : dim_iter_specs) {
+    if (dim_spec.size() == 1 && dim_spec[0].count == 1) {
+      continue;
+    }
+    non_trivial_dim_iteration_specs[dim] = dim_spec;
+  }
+  return non_trivial_dim_iteration_specs;
+}
+
+}  // namespace
+
 const TensorIterationSpec::DimIterationSpec* TensorIterationSpec::Find(
     const int dimension) const {
   if (auto it = dim_iteration_specs_.find(dimension);
@@ -58,16 +79,34 @@ const TensorIterationSpec::DimIterationSpec* TensorIterationSpec::Find(
   return nullptr;
 }
 
+std::vector<int> TensorIterationSpec::GetDimensions() const {
+  std::vector<int> result;
+  result.reserve(dim_iteration_specs_.size());
+  for (const auto& [dim, _] : dim_iteration_specs_) {
+    result.push_back(dim);
+  }
+  return result;
+}
+
 bool TensorIterationSpec::IsPhysicallyEquivalent(
     const TensorIterationSpec& other) const {
-  if (dim_iteration_specs_.size() != other.dim_iteration_specs_.size()) {
+  // Filter out trivial dims since they don't affect physical representation.
+  const absl::flat_hash_map<int, DimIterationSpec>
+      non_trivial_dim_iteration_specs = FilterTrivialDims(dim_iteration_specs_);
+  const absl::flat_hash_map<int, DimIterationSpec>
+      other_non_trivial_dim_iteration_specs =
+          FilterTrivialDims(other.dim_iteration_specs_);
+
+  if (non_trivial_dim_iteration_specs.size() !=
+      other_non_trivial_dim_iteration_specs.size()) {
     return false;
   }
-  for (const auto& pair : dim_iteration_specs_) {
+
+  for (const auto& pair : non_trivial_dim_iteration_specs) {
     int dimension = pair.first;
     const DimIterationSpec& dim_iter_spec = pair.second;
-    auto other_it = other.dim_iteration_specs_.find(dimension);
-    if (other_it == other.dim_iteration_specs_.end()) {
+    auto other_it = other_non_trivial_dim_iteration_specs.find(dimension);
+    if (other_it == other_non_trivial_dim_iteration_specs.end()) {
       return false;
     }
     const DimIterationSpec& other_dim_iter_spec = other_it->second;
@@ -172,12 +211,6 @@ TensorIterationSpec DimensionOrder::ToTensorIterationSpec() const {
   TensorIterationSpec tensor_spec;
   int64_t accumulated_stride = 1;
   int last_dim = -1;
-  auto remove_last_fragment_if_degenerate = [&tensor_spec](const int dim_idx) {
-    if (dim_idx >= 0 && !tensor_spec[dim_idx].empty() &&
-        tensor_spec[dim_idx].back().count == 1) {
-      tensor_spec[dim_idx].pop_back();
-    }
-  };
   for (int dim_order_index = 0; dim_order_index < dim_fragments.size();
        ++dim_order_index) {
     const DimensionOrder::Fragment& fragment = dim_fragments[dim_order_index];
@@ -204,7 +237,6 @@ TensorIterationSpec DimensionOrder::ToTensorIterationSpec() const {
         dim_spec.back().subfragments.push_back(fragment.sliced_count());
       }
     } else {
-      remove_last_fragment_if_degenerate(last_dim);
       // Add part of the dimension.
       dim_spec.push_back(TensorIterationSpec::IterationSpecFragment{
           accumulated_stride,
@@ -217,7 +249,23 @@ TensorIterationSpec DimensionOrder::ToTensorIterationSpec() const {
     accumulated_stride *= fragment.full_count();
     last_dim = fragment.dst_dim_number();
   }
-  remove_last_fragment_if_degenerate(last_dim);
+
+  // Remove degenerate fragments.
+  for (int dim_idx : tensor_spec.GetDimensions()) {
+    TensorIterationSpec::DimIterationSpec& dim_spec = tensor_spec[dim_idx];
+
+    // We should not remove the only fragment in a dimension, because if it is
+    // removed, the dimension will be removed from the TensorIterationSpec.
+    if (dim_spec.size() <= 1) continue;
+
+    TensorIterationSpec::DimIterationSpec filtered_dim_spec;
+    absl::c_copy_if(dim_spec, std::back_inserter(filtered_dim_spec),
+                    [](const TensorIterationSpec::IterationSpecFragment& f) {
+                      return f.count != 1;
+                    });
+    tensor_spec[dim_idx] = filtered_dim_spec;
+  }
+
   tensor_spec.RemoveEmptyDimensions();
   return tensor_spec;
 }
@@ -1037,10 +1085,7 @@ GetPropagatedDimOrdersAndRequirementsIfProfitablyFusible(
       std::move(std::get<DimOrdersAndReqs>(result_or_error));
   int fusion_level =
       hlo.GetModule()->config().debug_options().xla_gpu_triton_fusion_level();
-  if (!std::get<se::CudaComputeCapability>(gpu_version)
-           .IsAtLeast(se::CudaComputeCapability::AMPERE)) {
-    fusion_level = std::min(fusion_level, 1);
-  }
+  // TODO(ROCm): Check fusion level for ROCm.
   if (transform_direction == TransformDirection::kOutputToInput) {
     if (fusion_level < 2) {
       if (hlo.opcode() == HloOpcode::kConvert) {
diff --git a/third_party/xla/xla/service/gpu/triton_tiling_propagation.h b/third_party/xla/xla/service/gpu/triton_tiling_propagation.h
index 5fc2d952d2453b..87ff11ae7c7415 100644
--- a/third_party/xla/xla/service/gpu/triton_tiling_propagation.h
+++ b/third_party/xla/xla/service/gpu/triton_tiling_propagation.h
@@ -114,6 +114,8 @@ class TensorIterationSpec {
   // Returns nullptr if not found.
   const DimIterationSpec* Find(int dimension) const;
 
+  std::vector<int> GetDimensions() const;
+
   void RemoveEmptyDimensions() {
     absl::erase_if(dim_iteration_specs_,
                    [](const auto& it) { return it.second.empty(); });
diff --git a/third_party/xla/xla/service/gpu/triton_tiling_propagation_test.cc b/third_party/xla/xla/service/gpu/triton_tiling_propagation_test.cc
new file mode 100644
index 00000000000000..515bffbe0eb644
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/triton_tiling_propagation_test.cc
@@ -0,0 +1,101 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/triton_tiling_propagation.h"
+
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "xla/tests/hlo_test_base.h"
+
+namespace xla::gpu {
+namespace {
+
+using TritonTilingPropagationTest = HloTestBase;
+using triton_fusion::DimensionOrder;
+
+DimensionOrder FromFragments(DimensionOrder::Fragments fragments) {
+  DimensionOrder dim_order;
+  DimensionOrder::Fragments& tensor_fragments_order =
+      dim_order.TensorFragmentsOrder();
+  DimensionOrder::FragmentOrders& dim_fragments_orders =
+      dim_order.DimFragmentsOrders();
+  for (const DimensionOrder::Fragment& fragment : fragments) {
+    tensor_fragments_order.push_back(fragment);
+    dim_fragments_orders[fragment.dst_dim_number()].push_back(
+        tensor_fragments_order.size());
+  }
+  return dim_order;
+}
+
+TEST_F(
+    TritonTilingPropagationTest,
+    DimensionOrdersRemainPhysicallyEquivalentAfterInsertingTrivialDimensions) {
+  DimensionOrder::Fragment fragment_1(/*dst_dim_number=*/0, /*count=*/97);
+  DimensionOrder::Fragment fragment_2(/*dst_dim_number=*/0, /*count=*/1);
+  DimensionOrder dimension_order_1 = FromFragments({fragment_1, fragment_2});
+
+  DimensionOrder::Fragment fragment_3(/*dst_dim_number=*/0, /*count=*/97);
+  DimensionOrder::Fragment fragment_4(/*dst_dim_number=*/1, /*count=*/1);
+  DimensionOrder dimension_order_2 = FromFragments({fragment_3, fragment_4});
+
+  // They should be equivalent because fragment_2 and fragment_4 both have count
+  // 1, so they don't affect the physical representation.
+  EXPECT_TRUE(dimension_order_1.IsPhysicallyEquivalent(dimension_order_2));
+}
+
+TEST_F(
+    TritonTilingPropagationTest,
+    IterationSpecsRemainPhysicallyEquivalentAfterInsertingTrivialDimensions) {
+  TensorIterationSpec::IterationSpecFragment fragment_1 = {
+      /*stride=*/1, /*count=*/97, /*slice_start=*/0, /*sliced_count=*/97,
+      /*subfragments=*/{97}};
+  TensorIterationSpec spec_1;
+  spec_1[0].push_back(fragment_1);
+
+  TensorIterationSpec::IterationSpecFragment fragment_2 = {
+      /*stride=*/1, /*count=*/97, /*slice_start=*/0, /*sliced_count=*/97,
+      /*subfragments=*/{97}};
+  TensorIterationSpec::IterationSpecFragment fragment_3 = {
+      /*stride=*/97, /*count=*/1, /*slice_start=*/0, /*sliced_count=*/1,
+      /*subfragments=*/{1}};
+  TensorIterationSpec spec_2;
+  spec_2[0].push_back(fragment_2);
+  spec_2[1].push_back(fragment_3);
+
+  // spec_2's extra dimension is degenerate, so it should have the same physical
+  // representation as spec_1.
+  EXPECT_TRUE(spec_1.IsPhysicallyEquivalent(spec_2));
+}
+
+TEST_F(TritonTilingPropagationTest,
+       DimensionsShouldNotBeRemovedByToTensorIterationSpec) {
+  DimensionOrder::Fragment fragment_0(/*dst_dim_number=*/0, /*count=*/97);
+  DimensionOrder::Fragment fragment_1(/*dst_dim_number=*/1, /*count=*/1);
+  DimensionOrder dimension_order = FromFragments({fragment_0, fragment_1});
+  TensorIterationSpec spec = dimension_order.ToTensorIterationSpec();
+  const TensorIterationSpec::DimIterationSpec* dim_spec_0 = spec.Find(0);
+  EXPECT_NE(dim_spec_0, nullptr);
+  EXPECT_EQ(dim_spec_0->size(), 1);
+  EXPECT_EQ(dim_spec_0->at(0).count, 97);
+
+  const TensorIterationSpec::DimIterationSpec* dim_spec_1 = spec.Find(1);
+  EXPECT_NE(dim_spec_1, nullptr);
+  EXPECT_EQ(dim_spec_1->size(), 1);
+  EXPECT_EQ(dim_spec_1->at(0).count, 1);
+}
+
+}  // namespace
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu_compilation_environment.cc b/third_party/xla/xla/service/gpu_compilation_environment.cc
index d6551239db2a99..d598c02df3d5f6 100644
--- a/third_party/xla/xla/service/gpu_compilation_environment.cc
+++ b/third_party/xla/xla/service/gpu_compilation_environment.cc
@@ -25,11 +25,11 @@ limitations under the License.
 #include "xla/service/compilation_environments.h"
 #include "xla/status.h"
 #include "xla/statusor.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "xla/util.h"
 #include "xla/xla.pb.h"
 #include "tsl/platform/protobuf.h"
 #include "tsl/platform/statusor.h"
-#include "tsl/util/command_line_flags.h"
 
 namespace xla {
 
diff --git a/third_party/xla/xla/service/graphcycles/BUILD b/third_party/xla/xla/service/graphcycles/BUILD
index 47b8e312df5cd6..78e5a7b878df21 100644
--- a/third_party/xla/xla/service/graphcycles/BUILD
+++ b/third_party/xla/xla/service/graphcycles/BUILD
@@ -1,6 +1,6 @@
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -41,6 +41,7 @@ xla_cc_test(
     deps = [
         ":graphcycles",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/random",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:test",
         "@local_tsl//tsl/platform:test_benchmark",
diff --git a/third_party/xla/xla/service/graphcycles/graphcycles.cc b/third_party/xla/xla/service/graphcycles/graphcycles.cc
index d1399062ca52eb..c8308d4d2e13f1 100644
--- a/third_party/xla/xla/service/graphcycles/graphcycles.cc
+++ b/third_party/xla/xla/service/graphcycles/graphcycles.cc
@@ -32,6 +32,9 @@ limitations under the License.
 #include "xla/service/graphcycles/graphcycles.h"
 
 #include <algorithm>
+#include <cstddef>
+#include <utility>
+#include <vector>
 
 #include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_set.h"
@@ -48,17 +51,19 @@ namespace {
 using NodeSet = absl::flat_hash_set<int32_t>;
 using OrderedNodeSet = OrderedSet<int32_t>;
 
-template <typename T>
-struct VecStruct {
-  typedef absl::InlinedVector<T, 4> type;
-};
-template <typename T>
-using Vec = typename VecStruct<T>::type;
-
 struct Node {
   int32_t rank;        // rank number assigned by Pearce-Kelly algorithm
+  // Note (ecg@): the padding between these two fields bothered me, so I tried
+  // the following alternatives:
+  // - Separate bitmap to track visited[].
+  // - Separate std::vector<bool> visited.
+  // - Tagged top or bottom bit of "rank" to keep track of "visited".
+  // However, keeping the bool here (despite the padding) achieves the best
+  // performance for the IsReachableNonConst microbenchmark.
   bool visited;        // Temporary marker used by depth-first-search
-  void* data;          // User-supplied data
+};
+
+struct NodeIO {
   OrderedNodeSet in;   // List of immediate predecessor nodes in graph
   OrderedNodeSet out;  // List of immediate successor nodes in graph
 };
@@ -66,40 +71,43 @@ struct Node {
 }  // namespace
 
 struct GraphCycles::Rep {
-  Vec<Node*> nodes_;
-  Vec<int32_t> free_nodes_;  // Indices for unused entries in nodes_
+  std::vector<Node> nodes_;
+  std::vector<NodeIO> node_io_;
+  std::vector<int32_t> free_nodes_;  // Indices for unused entries in nodes_
 
   // Temporary state.
-  Vec<int32_t> deltaf_;  // Results of forward DFS
-  Vec<int32_t> deltab_;  // Results of backward DFS
-  Vec<int32_t> list_;    // All nodes to reprocess
-  Vec<int32_t> merged_;  // Rank values to assign to list_ entries
-  Vec<int32_t>
+  std::vector<int32_t> deltaf_;  // Results of forward DFS
+  std::vector<int32_t> deltab_;  // Results of backward DFS
+  std::vector<int32_t> list_;    // All nodes to reprocess
+  std::vector<int32_t> merged_;  // Rank values to assign to list_ entries
+  std::vector<int32_t>
       stack_;  // Emulates recursion stack when doing depth first search
+
+  // User-supplied data. Stored outside of Node since it is rarely accessed.
+  std::vector<void*> node_data_;
 };
 
 GraphCycles::GraphCycles() : rep_(new Rep) {}
 
+// Define the destructor here because Rep is also defined in this file.
 GraphCycles::~GraphCycles() {
-  for (Vec<Node*>::size_type i = 0; i < rep_->nodes_.size(); i++) {
-    delete rep_->nodes_[i];
-  }
   delete rep_;
 }
 
 bool GraphCycles::CheckInvariants() const {
   Rep* r = rep_;
   NodeSet ranks;  // Set of ranks seen so far.
-  for (Vec<Node*>::size_type x = 0; x < r->nodes_.size(); x++) {
-    Node* nx = r->nodes_[x];
+  for (size_t x = 0; x < r->nodes_.size(); x++) {
+    Node* nx = &r->nodes_[x];
     if (nx->visited) {
       LOG(FATAL) << "Did not clear visited marker on node " << x;
     }
     if (!ranks.insert(nx->rank).second) {
       LOG(FATAL) << "Duplicate occurrence of rank " << nx->rank;
     }
-    for (int32_t y : nx->out.GetSequence()) {
-      Node* ny = r->nodes_[y];
+    NodeIO* nx_io = &r->node_io_[x];
+    for (int32_t y : nx_io->out.GetSequence()) {
+      Node* ny = &r->nodes_[y];
       if (nx->rank >= ny->rank) {
         LOG(FATAL) << "Edge " << x << "->" << y << " has bad rank assignment "
                    << nx->rank << "->" << ny->rank;
@@ -111,29 +119,30 @@ bool GraphCycles::CheckInvariants() const {
 
 int32_t GraphCycles::NewNode() {
   if (rep_->free_nodes_.empty()) {
-    Node* n = new Node;
-    n->visited = false;
-    n->data = nullptr;
-    n->rank = rep_->nodes_.size();
-    rep_->nodes_.push_back(n);
-    return n->rank;
+    Node n;
+    n.visited = false;
+    n.rank = rep_->nodes_.size();
+    rep_->nodes_.emplace_back(n);
+    rep_->node_io_.emplace_back();
+    rep_->node_data_.push_back(nullptr);
+    return n.rank;
   } else {
     // Preserve preceding rank since the set of ranks in use must be
     // a permutation of [0,rep_->nodes_.size()-1].
     int32_t r = rep_->free_nodes_.back();
-    rep_->nodes_[r]->data = nullptr;
     rep_->free_nodes_.pop_back();
+    rep_->node_data_[r] = nullptr;
     return r;
   }
 }
 
 void GraphCycles::RemoveNode(int32_t node) {
-  Node* x = rep_->nodes_[node];
+  NodeIO* x = &rep_->node_io_[node];
   for (int32_t y : x->out.GetSequence()) {
-    rep_->nodes_[y]->in.Erase(node);
+    rep_->node_io_[y].in.Erase(node);
   }
   for (int32_t y : x->in.GetSequence()) {
-    rep_->nodes_[y]->out.Erase(node);
+    rep_->node_io_[y].out.Erase(node);
   }
   x->in.Clear();
   x->out.Clear();
@@ -141,20 +150,20 @@ void GraphCycles::RemoveNode(int32_t node) {
 }
 
 void* GraphCycles::GetNodeData(int32_t node) const {
-  return rep_->nodes_[node]->data;
+  return rep_->node_data_[node];
 }
 
 void GraphCycles::SetNodeData(int32_t node, void* data) {
-  rep_->nodes_[node]->data = data;
+  rep_->node_data_[node] = data;
 }
 
 bool GraphCycles::HasEdge(int32_t x, int32_t y) const {
-  return rep_->nodes_[x]->out.Contains(y);
+  return rep_->node_io_[x].out.Contains(y);
 }
 
 void GraphCycles::RemoveEdge(int32_t x, int32_t y) {
-  rep_->nodes_[x]->out.Erase(y);
-  rep_->nodes_[y]->in.Erase(x);
+  rep_->node_io_[x].out.Erase(y);
+  rep_->node_io_[y].in.Erase(x);
   // No need to update the rank assignment since a previous valid
   // rank assignment remains valid after an edge deletion.
 }
@@ -162,23 +171,26 @@ void GraphCycles::RemoveEdge(int32_t x, int32_t y) {
 static bool ForwardDFS(GraphCycles::Rep* r, int32_t n, int32_t upper_bound);
 static void BackwardDFS(GraphCycles::Rep* r, int32_t n, int32_t lower_bound);
 static void Reorder(GraphCycles::Rep* r);
-static void Sort(const Vec<Node*>&, Vec<int32_t>* delta);
-static void MoveToList(GraphCycles::Rep* r, Vec<int32_t>* src,
-                       Vec<int32_t>* dst);
-static void ClearVisitedBits(GraphCycles::Rep* r, const Vec<int32_t>& nodes);
+static void Sort(absl::Span<const Node>, std::vector<int32_t>* delta);
+static void MoveToList(GraphCycles::Rep* r, std::vector<int32_t>* src,
+                       std::vector<int32_t>* dst);
+static void ClearVisitedBits(GraphCycles::Rep* r,
+                             absl::Span<const int32_t> visited_indices);
 
 bool GraphCycles::InsertEdge(int32_t x, int32_t y) {
   if (x == y) return false;
   Rep* r = rep_;
-  Node* nx = r->nodes_[x];
-  if (!nx->out.Insert(y)) {
+  NodeIO* nx_io = &r->node_io_[x];
+  if (!nx_io->out.Insert(y)) {
     // Edge already exists.
     return true;
   }
 
-  Node* ny = r->nodes_[y];
-  ny->in.Insert(x);
+  NodeIO* ny_io = &r->node_io_[y];
+  ny_io->in.Insert(x);
 
+  Node* nx = &r->nodes_[x];
+  Node* ny = &r->nodes_[y];
   if (nx->rank <= ny->rank) {
     // New edge is consistent with existing rank assignment.
     return true;
@@ -188,8 +200,8 @@ bool GraphCycles::InsertEdge(int32_t x, int32_t y) {
   // We only need to consider nodes that fall in the range [ny->rank,nx->rank].
   if (!ForwardDFS(r, y, nx->rank)) {
     // Found a cycle.  Undo the insertion and tell caller.
-    nx->out.Erase(y);
-    ny->in.Erase(x);
+    nx_io->out.Erase(y);
+    ny_io->in.Erase(x);
     // Since we do not call Reorder() on this path, clear any visited
     // markers left by ForwardDFS.
     ClearVisitedBits(r, r->deltaf_);
@@ -209,14 +221,15 @@ static bool ForwardDFS(GraphCycles::Rep* r, int32_t n, int32_t upper_bound) {
   while (!r->stack_.empty()) {
     n = r->stack_.back();
     r->stack_.pop_back();
-    Node* nn = r->nodes_[n];
+    Node* nn = &r->nodes_[n];
     if (nn->visited) continue;
 
     nn->visited = true;
     r->deltaf_.push_back(n);
 
-    for (auto w : nn->out.GetSequence()) {
-      Node* nw = r->nodes_[w];
+    NodeIO* nn_io = &r->node_io_[n];
+    for (auto w : nn_io->out.GetSequence()) {
+      Node* nw = &r->nodes_[w];
       if (nw->rank == upper_bound) {
         return false;  // Cycle
       }
@@ -235,14 +248,15 @@ static void BackwardDFS(GraphCycles::Rep* r, int32_t n, int32_t lower_bound) {
   while (!r->stack_.empty()) {
     n = r->stack_.back();
     r->stack_.pop_back();
-    Node* nn = r->nodes_[n];
+    Node* nn = &r->nodes_[n];
     if (nn->visited) continue;
 
     nn->visited = true;
     r->deltab_.push_back(n);
 
-    for (auto w : nn->in.GetSequence()) {
-      Node* nw = r->nodes_[w];
+    NodeIO* nn_io = &r->node_io_[n];
+    for (auto w : nn_io->in.GetSequence()) {
+      Node* nw = &r->nodes_[w];
       if (!nw->visited && lower_bound < nw->rank) {
         r->stack_.push_back(w);
       }
@@ -265,36 +279,31 @@ static void Reorder(GraphCycles::Rep* r) {
              r->deltaf_.end(), r->merged_.begin());
 
   // Assign the ranks in order to the collected list.
-  for (Vec<int32_t>::size_type i = 0; i < r->list_.size(); i++) {
-    r->nodes_[r->list_[i]]->rank = r->merged_[i];
+  for (size_t i = 0; i < r->list_.size(); i++) {
+    r->nodes_[r->list_[i]].rank = r->merged_[i];
   }
 }
 
-static void Sort(const Vec<Node*>& nodes, Vec<int32_t>* delta) {
-  struct ByRank {
-    const Vec<Node*>* nodes;
-    bool operator()(int32_t a, int32_t b) const {
-      return (*nodes)[a]->rank < (*nodes)[b]->rank;
-    }
-  };
-  ByRank cmp;
-  cmp.nodes = &nodes;
-  std::sort(delta->begin(), delta->end(), cmp);
+static void Sort(absl::Span<const Node> nodes, std::vector<int32_t>* delta) {
+  std::sort(delta->begin(), delta->end(), [&](int32_t a, int32_t b) {
+    return nodes[a].rank < nodes[b].rank;
+  });
 }
 
-static void MoveToList(GraphCycles::Rep* r, Vec<int32_t>* src,
-                       Vec<int32_t>* dst) {
-  for (Vec<int32_t>::size_type i = 0; i < src->size(); i++) {
+static void MoveToList(GraphCycles::Rep* r, std::vector<int32_t>* src,
+                       std::vector<int32_t>* dst) {
+  for (size_t i = 0; i < src->size(); i++) {
     int32_t w = (*src)[i];
-    (*src)[i] = r->nodes_[w]->rank;  // Replace src entry with its rank
-    r->nodes_[w]->visited = false;   // Prepare for future DFS calls
+    (*src)[i] = r->nodes_[w].rank;  // Replace src entry with its rank
+    r->nodes_[w].visited = false;   // Prepare for future DFS calls
     dst->push_back(w);
   }
 }
 
-static void ClearVisitedBits(GraphCycles::Rep* r, const Vec<int32_t>& nodes) {
-  for (Vec<int32_t>::size_type i = 0; i < nodes.size(); i++) {
-    r->nodes_[nodes[i]]->visited = false;
+static void ClearVisitedBits(GraphCycles::Rep* r,
+                             absl::Span<const int32_t> visited_indices) {
+  for (auto index : visited_indices) {
+    r->nodes_[index].visited = false;
   }
 }
 
@@ -328,7 +337,7 @@ int GraphCycles::FindPath(int32_t x, int32_t y, int max_path_len,
       return path_len;
     }
 
-    for (auto w : r->nodes_[n]->out.GetSequence()) {
+    for (auto w : r->node_io_[n].out.GetSequence()) {
       if (seen.insert(w).second) {
         r->stack_.push_back(w);
       }
@@ -345,8 +354,8 @@ bool GraphCycles::IsReachable(int32_t x, int32_t y) const {
 bool GraphCycles::IsReachableNonConst(int32_t x, int32_t y) {
   if (x == y) return true;
   Rep* r = rep_;
-  Node* nx = r->nodes_[x];
-  Node* ny = r->nodes_[y];
+  Node* nx = &r->nodes_[x];
+  Node* ny = &r->nodes_[y];
 
   if (nx->rank >= ny->rank) {
     // x cannot reach y since it is after it in the topological ordering
@@ -381,29 +390,29 @@ std::optional<int32_t> GraphCycles::ContractEdge(int32_t a, int32_t b) {
     return std::nullopt;
   }
 
-  if (rep_->nodes_[b]->in.Size() + rep_->nodes_[b]->out.Size() >
-      rep_->nodes_[a]->in.Size() + rep_->nodes_[a]->out.Size()) {
+  if (rep_->node_io_[b].in.Size() + rep_->node_io_[b].out.Size() >
+      rep_->node_io_[a].in.Size() + rep_->node_io_[a].out.Size()) {
     // Swap "a" and "b" to minimize copying.
     std::swap(a, b);
   }
 
-  Node* nb = rep_->nodes_[b];
-  OrderedNodeSet out = std::move(nb->out);
-  OrderedNodeSet in = std::move(nb->in);
+  NodeIO* nb_io = &rep_->node_io_[b];
+  OrderedNodeSet out = std::move(nb_io->out);
+  OrderedNodeSet in = std::move(nb_io->in);
   for (int32_t y : out.GetSequence()) {
-    rep_->nodes_[y]->in.Erase(b);
+    rep_->node_io_[y].in.Erase(b);
   }
   for (int32_t y : in.GetSequence()) {
-    rep_->nodes_[y]->out.Erase(b);
+    rep_->node_io_[y].out.Erase(b);
   }
   rep_->free_nodes_.push_back(b);
 
-  rep_->nodes_[a]->out.Reserve(rep_->nodes_[a]->out.Size() + out.Size());
+  rep_->node_io_[a].out.Reserve(rep_->node_io_[a].out.Size() + out.Size());
   for (int32_t y : out.GetSequence()) {
     InsertEdge(a, y);
   }
 
-  rep_->nodes_[a]->in.Reserve(rep_->nodes_[a]->in.Size() + in.Size());
+  rep_->node_io_[a].in.Reserve(rep_->node_io_[a].in.Size() + in.Size());
   for (int32_t y : in.GetSequence()) {
     InsertEdge(y, a);
   }
@@ -413,11 +422,11 @@ std::optional<int32_t> GraphCycles::ContractEdge(int32_t a, int32_t b) {
 }
 
 absl::Span<const int32_t> GraphCycles::Successors(int32_t node) const {
-  return rep_->nodes_[node]->out.GetSequence();
+  return rep_->node_io_[node].out.GetSequence();
 }
 
 absl::Span<const int32_t> GraphCycles::Predecessors(int32_t node) const {
-  return rep_->nodes_[node]->in.GetSequence();
+  return rep_->node_io_[node].in.GetSequence();
 }
 
 std::vector<int32_t> GraphCycles::SuccessorsCopy(int32_t node) const {
@@ -431,11 +440,11 @@ std::vector<int32_t> GraphCycles::PredecessorsCopy(int32_t node) const {
 }
 
 namespace {
-void SortInPostOrder(absl::Span<Node* const> nodes,
+void SortInPostOrder(absl::Span<const Node> nodes,
                      std::vector<int32_t>* to_sort) {
   absl::c_sort(*to_sort, [&](int32_t a, int32_t b) {
-    DCHECK(a == b || nodes[a]->rank != nodes[b]->rank);
-    return nodes[a]->rank > nodes[b]->rank;
+    DCHECK(a == b || nodes[a].rank != nodes[b].rank);
+    return nodes[a].rank > nodes[b].rank;
   });
 }
 }  // namespace
@@ -458,10 +467,8 @@ std::vector<int32_t> GraphCycles::AllNodesInPostOrder() const {
 }
 
 std::string GraphCycles::DebugString() const {
-  absl::flat_hash_set<int32_t> free_nodes_set;
-  for (int32_t free_node : rep_->free_nodes_) {
-    free_nodes_set.insert(free_node);
-  }
+  absl::flat_hash_set<int32_t> free_nodes_set(rep_->free_nodes_.begin(),
+                                              rep_->free_nodes_.end());
 
   std::string result = "digraph {\n";
   for (int i = 0, end = rep_->nodes_.size(); i < end; i++) {
@@ -469,7 +476,7 @@ std::string GraphCycles::DebugString() const {
       continue;
     }
 
-    for (int32_t succ : rep_->nodes_[i]->out.GetSequence()) {
+    for (int32_t succ : rep_->node_io_[i].out.GetSequence()) {
       absl::StrAppend(&result, "  \"", i, "\" -> \"", succ, "\"\n");
     }
   }
diff --git a/third_party/xla/xla/service/graphcycles/graphcycles_test.cc b/third_party/xla/xla/service/graphcycles/graphcycles_test.cc
index 841b383736dcd7..0c6fa481b16c37 100644
--- a/third_party/xla/xla/service/graphcycles/graphcycles_test.cc
+++ b/third_party/xla/xla/service/graphcycles/graphcycles_test.cc
@@ -17,11 +17,13 @@ limitations under the License.
 
 #include "xla/service/graphcycles/graphcycles.h"
 
+#include <cstdint>
 #include <optional>
 #include <random>
 #include <vector>
 
 #include "absl/container/flat_hash_set.h"
+#include "absl/random/random.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/test.h"
 #include "tsl/platform/test_benchmark.h"
@@ -510,7 +512,7 @@ TEST_F(GraphCyclesTest, CanContractEdge) {
 static void BM_StressTest(::testing::benchmark::State &state) {
   const int num_nodes = state.range(0);
 
-  for (auto s : state) {
+  while (state.KeepRunningBatch(num_nodes)) {
     tensorflow::GraphCycles g;
     int32_t *nodes = new int32_t[num_nodes];
     for (int i = 0; i < num_nodes; i++) {
@@ -532,7 +534,7 @@ BENCHMARK(BM_StressTest)->Range(2048, 1048576);
 static void BM_ContractEdge(::testing::benchmark::State &state) {
   const int num_nodes = state.range(0);
 
-  for (auto s : state) {
+  while (state.KeepRunningBatch(num_nodes)) {
     state.PauseTiming();
     tensorflow::GraphCycles g;
     std::vector<int32_t> nodes;
@@ -553,3 +555,50 @@ static void BM_ContractEdge(::testing::benchmark::State &state) {
   }
 }
 BENCHMARK(BM_ContractEdge)->Arg(1000)->Arg(10000);
+
+static void BM_IsReachableNonConst(testing::benchmark::State &state) {
+  const int num_nodes = state.range(0);
+
+  tensorflow::GraphCycles g;
+  std::vector<uint32_t> nodes;
+  nodes.reserve(num_nodes);
+  for (int i = 0; i < num_nodes; i++) {
+    nodes.push_back(g.NewNode());
+  }
+
+  // Add forward edges.
+  absl::BitGen bitgen;
+  for (int i = 0; i < num_nodes; i++) {
+    int max = num_nodes - 1 - i;
+    if (max == 0) break;
+    constexpr int branch_factor = 2;
+    for (int b = 0; b < branch_factor; b++) {
+      int j = i + 1 + absl::Uniform(bitgen, 0, max);
+      CHECK_LT(j, num_nodes);
+      CHECK(g.InsertEdge(nodes[i], nodes[j]));
+    }
+  }
+
+  auto get_random_node = [&]() {
+    return nodes[absl::Uniform(bitgen, 0, num_nodes)];
+  };
+
+  uint32_t src, dst;
+  int i = 0;
+  for (auto s : state) {
+    if (i % 256 == 0) {
+      src = get_random_node();
+      dst = get_random_node();
+    }
+    bool reachable = g.IsReachableNonConst(src, dst);
+    benchmark::DoNotOptimize(reachable);
+    i++;
+  }
+}
+BENCHMARK(BM_IsReachableNonConst)
+    ->Arg(10)
+    ->Arg(50)
+    ->Arg(100)
+    ->Arg(200)
+    ->Arg(1000)
+    ->Arg(30000);
diff --git a/third_party/xla/xla/service/heap_simulator/BUILD b/third_party/xla/xla/service/heap_simulator/BUILD
index 161c8f069028d3..41ebb4275290b5 100644
--- a/third_party/xla/xla/service/heap_simulator/BUILD
+++ b/third_party/xla/xla/service/heap_simulator/BUILD
@@ -1,11 +1,11 @@
 # Description:
 #   XLA Heap simulator.
 
+load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 load(
     "//xla:xla.bzl",
     "xla_cc_test",
 )
-load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/service/hlo_constant_folding.cc b/third_party/xla/xla/service/hlo_constant_folding.cc
index 71f58d9a241232..7afdb75649edc3 100644
--- a/third_party/xla/xla/service/hlo_constant_folding.cc
+++ b/third_party/xla/xla/service/hlo_constant_folding.cc
@@ -233,6 +233,16 @@ StatusOr<bool> HloConstantFolding::Run(
       dead_instructions.push_back(instruction);
       HloInstruction* new_constant = computation->AddInstruction(
           HloInstruction::CreateConstant(std::move(result)));
+      if (new_constant->shape().has_layout()) {
+        // Update element_size_in_bits on the new instruction's layout. Literals
+        // always have element_size_in_bits set to 0, and CreateConstant copies
+        // the shape/layout from the Literal, so we need to set
+        // element_size_in_bits here.
+        new_constant->mutable_shape()
+            ->mutable_layout()
+            ->set_element_size_in_bits(
+                instruction->shape().layout().element_size_in_bits());
+      }
       TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(new_constant));
     }
   }
diff --git a/third_party/xla/xla/service/hlo_constant_folding_test.cc b/third_party/xla/xla/service/hlo_constant_folding_test.cc
index 4150b24ead5ee1..4958bee65f54d1 100644
--- a/third_party/xla/xla/service/hlo_constant_folding_test.cc
+++ b/third_party/xla/xla/service/hlo_constant_folding_test.cc
@@ -346,6 +346,32 @@ TEST_F(HloConstantFoldingTest, FoldOpsWhereOneOperandIsBroadcast) {
                                   )));
 }
 
+TEST_F(HloConstantFoldingTest, FoldInt4Ops) {
+  const char* const kModuleStr = R"(
+  HloModule test
+
+  ENTRY entry {
+    c0 = s4[2]{0:E(4)} constant({1, 2})
+    c1 = s4[2]{0:E(4)} constant({3, 4})
+    add1 = s4[2]{0:E(4)} add(c0, c1)
+    c2 = s4[]{:E(4)} constant(5)
+    add2 = s4[2]{0:E(4)} add(c0, s4[2]{0:E(4)} broadcast(c2))
+    ROOT root = tuple(add1, add2)
+  })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  HloConstantFolding constant_folding;
+  TF_ASSERT_OK_AND_ASSIGN(bool result,
+                          RunHloPass(&constant_folding, module.get()));
+  EXPECT_TRUE(result);
+  auto is_4_bit = [](const HloInstruction* instr) {
+    return instr->shape().layout().element_size_in_bits() == 4;
+  };
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              GmockMatch(m::Tuple(m::Constant().WithPredicate(is_4_bit),
+                                  m::Constant().WithPredicate(is_4_bit))));
+}
+
 TEST_F(HloConstantFoldingTest, BigReduceWindow) {
   constexpr absl::string_view kModuleStr = R"(
     HloModule test
diff --git a/third_party/xla/xla/service/hlo_cse.cc b/third_party/xla/xla/service/hlo_cse.cc
index 44f25c83b63164..46a4b52c3b9698 100644
--- a/third_party/xla/xla/service/hlo_cse.cc
+++ b/third_party/xla/xla/service/hlo_cse.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <utility>
 
 #include "absl/container/flat_hash_set.h"
+#include "absl/status/statusor.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
@@ -60,7 +61,8 @@ struct ConstantKey {
 // While we're here, also combine identical iota instructions, since they need
 // similar treatment.
 template <bool kIsLayoutSensitive>
-StatusOr<bool> CombineConstants(HloComputation* computation) {
+absl::StatusOr<bool> CombineConstants(HloComputation* computation,
+                                      bool only_scalars) {
   // Populating the domain map is somewhat expensive -- only do it if there are
   // kDomain ops in the computation.  If there are no kDomain ops, the domain
   // map is trivial, every op gets mapped to the same domain.
@@ -85,6 +87,10 @@ StatusOr<bool> CombineConstants(HloComputation* computation) {
     // invalidated due to deletion.
     ++inst_it;
 
+    if (only_scalars && !ShapeUtil::IsScalar(instruction->shape())) {
+      continue;
+    }
+
     HloInstruction* match = nullptr;
     if (auto* constant_inst = DynCast<HloConstantInstruction>(instruction)) {
       auto insert_result = constants.insert(ConstantKey<kIsLayoutSensitive>{
@@ -249,10 +255,11 @@ StatusOr<bool> HloCSE::Run(
       continue;
     }
 
-    TF_ASSIGN_OR_RETURN(bool combined,
-                        is_layout_sensitive_
-                            ? CombineConstants<true>(computation)
-                            : CombineConstants<false>(computation));
+    TF_ASSIGN_OR_RETURN(
+        bool combined,
+        is_layout_sensitive_
+            ? CombineConstants<true>(computation, only_scalars_)
+            : CombineConstants<false>(computation, only_scalars_));
     changed |= combined;
 
     // HLO instructions are grouped into equivalency classes by using the
@@ -274,6 +281,10 @@ StatusOr<bool> HloCSE::Run(
         continue;
       }
 
+      if (only_scalars_ && !ShapeUtil::IsScalar(instruction->shape())) {
+        continue;
+      }
+
       auto pair = representatives.insert(CseKey{instruction});
       if (!pair.second) {
         HloInstruction* equivalent_instruction = pair.first->hlo;
@@ -282,6 +293,8 @@ StatusOr<bool> HloCSE::Run(
         TF_RETURN_IF_ERROR(computation->RemoveInstructionAndUnusedOperands(
             instruction, /*cleanup=*/std::nullopt,
             ignore_control_dependencies_));
+        VLOG(4) << "Replaced " << instruction->name() << " with "
+                << equivalent_instruction->name();
         changed = true;
         continue;
       }
diff --git a/third_party/xla/xla/service/hlo_cse.h b/third_party/xla/xla/service/hlo_cse.h
index cdf1d702cd4bf1..8e87e4f8ab9100 100644
--- a/third_party/xla/xla/service/hlo_cse.h
+++ b/third_party/xla/xla/service/hlo_cse.h
@@ -33,10 +33,12 @@ class HloCSE : public HloModulePass {
   // when replacing instructions with their equivalents.
   explicit HloCSE(bool is_layout_sensitive,
                   bool only_fusion_computations = false,
-                  bool ignore_control_dependencies = false)
+                  bool ignore_control_dependencies = false,
+                  bool only_scalars = false)
       : is_layout_sensitive_(is_layout_sensitive),
         only_fusion_computations_(only_fusion_computations),
-        ignore_control_dependencies_(ignore_control_dependencies) {}
+        ignore_control_dependencies_(ignore_control_dependencies),
+        only_scalars_(only_scalars) {}
   ~HloCSE() override = default;
   absl::string_view name() const override { return "cse"; }
 
@@ -51,6 +53,7 @@ class HloCSE : public HloModulePass {
   const bool is_layout_sensitive_;
   const bool only_fusion_computations_;
   const bool ignore_control_dependencies_;
+  const bool only_scalars_;
 };
 
 }  // namespace xla
diff --git a/third_party/xla/xla/service/hlo_cse_test.cc b/third_party/xla/xla/service/hlo_cse_test.cc
index 5364d21321e661..bd93f7cd88b8cc 100644
--- a/third_party/xla/xla/service/hlo_cse_test.cc
+++ b/third_party/xla/xla/service/hlo_cse_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/strings/substitute.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
@@ -709,6 +710,32 @@ TEST_F(HloCseTest, OptimizationBarrier) {
   EXPECT_FALSE(changed);
 }
 
+TEST_F(HloCseTest, OnlyScalar) {
+  const char* const hlo_string = R"(
+    HloModule m
+
+    ENTRY entry {
+      %const1 = f32[] constant(1)
+      %const2 = f32[] constant(1)
+      %const3 = f32[2] constant({1,2})
+      %const4 = f32[2] constant({1,2})
+      %add.0 = f32[] add(%const1, %const2)
+      %add.1 = f32[2] add(%const3, %const4)
+      ROOT out = (f32[], f32[2]) tuple(%add.0, %add.1)
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  HloCSE cse(/*is_layout_sensitive=*/false, /*only_fusion_computations=*/false,
+             /*ignore_control_dependencies=*/false, /*only_scalars=*/true);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloPass(&cse, m.get()));
+  EXPECT_TRUE(changed);
+  EXPECT_EQ(absl::c_count_if(m->entry_computation()->instructions(),
+                             [](const HloInstruction* instruction) {
+                               return instruction->IsConstant();
+                             }),
+            3);
+}
+
 class HloCseCustomCallTest
     : public HloCseTest,
       public ::testing::WithParamInterface<std::tuple<
diff --git a/third_party/xla/xla/service/hlo_dfs_reachability_test.cc b/third_party/xla/xla/service/hlo_dfs_reachability_test.cc
index 27a3c402c606c7..9bc77c75f42ea1 100644
--- a/third_party/xla/xla/service/hlo_dfs_reachability_test.cc
+++ b/third_party/xla/xla/service/hlo_dfs_reachability_test.cc
@@ -168,8 +168,9 @@ class HloDfsReachabilityBenchmark {
 };
 
 void BM_HloDfsReachabilityBuild(benchmark::State& state) {
-  HloDfsReachabilityBenchmark bm(state.range(0), state.name());
-  for (auto s : state) {
+  int num_nodes = state.range(0);
+  HloDfsReachabilityBenchmark bm(num_nodes, state.name());
+  while (state.KeepRunningBatch(num_nodes)) {
     benchmark::DoNotOptimize(bm.Build());
   }
 }
diff --git a/third_party/xla/xla/service/hlo_module_config.cc b/third_party/xla/xla/service/hlo_module_config.cc
index bef8a0417c2b3b..4280bf3efd2d74 100644
--- a/third_party/xla/xla/service/hlo_module_config.cc
+++ b/third_party/xla/xla/service/hlo_module_config.cc
@@ -150,7 +150,7 @@ static void AssignProtoDotConfig(
     for (int64_t val : list_vector) {
       list.add_vals(val);
     }
-    proto.mutable_dot_config()->insert({key, std::move(list)});
+    proto.mutable_dot_config()->try_emplace(key, std::move(list));
   }
 }
 
@@ -198,7 +198,7 @@ static void AssignProtoPhaseOrderingConfig(
     pair.output_shape_index.assign(output_idx.begin(), output_idx.end());
     cfg_pairs.push_back(pair);
   }
-  config.set_shardable_value_update_pairs(cfg_pairs);
+  config.set_shardable_value_update_pairs(std::move(cfg_pairs));
 }
 
 static void AssignStructFusionConfig(HloModuleConfig& config,
diff --git a/third_party/xla/xla/service/hlo_module_config.h b/third_party/xla/xla/service/hlo_module_config.h
index 9a5e415bfa099c..4a137c59c426fb 100644
--- a/third_party/xla/xla/service/hlo_module_config.h
+++ b/third_party/xla/xla/service/hlo_module_config.h
@@ -169,7 +169,7 @@ class HloModuleConfig {
     return param_requires_broadcast_via_collectives_;
   }
   void set_param_requires_broadcast_via_collectives(
-      const std::vector<bool> require_broadcast) {
+      std::vector<bool> require_broadcast) {
     param_requires_broadcast_via_collectives_ = std::move(require_broadcast);
   }
 
@@ -195,16 +195,16 @@ class HloModuleConfig {
   }
 
   void set_auto_spmd_partitioning_mesh_shape(std::vector<int64_t> mesh_shape) {
-    auto_spmd_partitioning_mesh_shape_ = mesh_shape;
+    auto_spmd_partitioning_mesh_shape_ = std::move(mesh_shape);
   }
-  std::vector<int64_t> auto_spmd_partitioning_mesh_shape() const {
+  const std::vector<int64_t>& auto_spmd_partitioning_mesh_shape() const {
     return auto_spmd_partitioning_mesh_shape_;
   }
 
   void set_auto_spmd_partitioning_mesh_ids(std::vector<int64_t> mesh_ids) {
-    auto_spmd_partitioning_mesh_ids_ = mesh_ids;
+    auto_spmd_partitioning_mesh_ids_ = std::move(mesh_ids);
   }
-  std::vector<int64_t> auto_spmd_partitioning_mesh_ids() const {
+  const std::vector<int64_t>& auto_spmd_partitioning_mesh_ids() const {
     return auto_spmd_partitioning_mesh_ids_;
   }
 
diff --git a/third_party/xla/xla/service/hlo_module_util.cc b/third_party/xla/xla/service/hlo_module_util.cc
index fab668e4d4e6db..44fd94a1033786 100644
--- a/third_party/xla/xla/service/hlo_module_util.cc
+++ b/third_party/xla/xla/service/hlo_module_util.cc
@@ -112,16 +112,12 @@ StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
     }
     config->set_use_auto_spmd_partitioning(
         execution_options->use_auto_spmd_partitioning());
-    std::vector<int64_t> mesh_shape;
-    for (auto t : execution_options->auto_spmd_partitioning_mesh_shape()) {
-      mesh_shape.push_back(t);
-    }
-    config->set_auto_spmd_partitioning_mesh_shape(mesh_shape);
-    std::vector<int64_t> mesh_ids;
-    for (auto t : execution_options->auto_spmd_partitioning_mesh_ids()) {
-      mesh_ids.push_back(t);
-    }
-    config->set_auto_spmd_partitioning_mesh_ids(mesh_ids);
+    config->set_auto_spmd_partitioning_mesh_shape(std::vector<int64_t>(
+        execution_options->auto_spmd_partitioning_mesh_shape().begin(),
+        execution_options->auto_spmd_partitioning_mesh_shape().end()));
+    config->set_auto_spmd_partitioning_mesh_ids(std::vector<int64_t>(
+        execution_options->auto_spmd_partitioning_mesh_ids().begin(),
+        execution_options->auto_spmd_partitioning_mesh_ids().end()));
     config->set_deduplicate_hlo(execution_options->deduplicate_hlo());
     config->set_seed(execution_options->seed());
     config->set_launch_id(execution_options->launch_id());
diff --git a/third_party/xla/xla/service/hlo_parser_test.cc b/third_party/xla/xla/service/hlo_parser_test.cc
index da0ca7362b5d2a..5a097d2d80565d 100644
--- a/third_party/xla/xla/service/hlo_parser_test.cc
+++ b/third_party/xla/xla/service/hlo_parser_test.cc
@@ -5214,19 +5214,26 @@ TEST_F(HloParserTest, PipelinedSendRecv) {
     count = get-tuple-element(%param), index=0
 
     recv.0 = (u32[2], u32[], token[]) get-tuple-element(param), index=1
-    recv-done.0 = (u32[2], token[]) recv-done(recv.0), channel_id=1
+    recv-done.0 = (u32[2], token[]) recv-done(recv.0), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
     recv-data.0 = u32[2] get-tuple-element(recv-done.0), index=0
 
     c1 = u32[] constant(1)
     new_count = u32[] add(count, c1)
 
     send.0 = (u32[2], u32[], token[]) get-tuple-element(param), index=2
-    send-done.0 = (u32[2], token[]) recv-done(send.0), channel_id=1
+    send-done.0 = (u32[2], token[]) recv-done(send.0), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
 
     after-all.0.n = token[] after-all()
     recv.0.n = (u32[2], u32[], token[]) recv(after-all.0.n), channel_id=1,
       frontend_attributes={
-        _xla_send_recv_source_target_pairs="{{1,0}}"
+        _xla_send_recv_source_target_pairs="{{1,0}}",
+        _xla_send_recv_pipeline="0"
       }
 
 
@@ -5234,7 +5241,8 @@ TEST_F(HloParserTest, PipelinedSendRecv) {
     send.0.n = (u32[2], u32[], token[]) send(recv-data.0, after-all.1.n),
       channel_id=1,
       frontend_attributes={
-        _xla_send_recv_source_target_pairs="{{1,0}}"
+        _xla_send_recv_source_target_pairs="{{1,0}}",
+        _xla_send_recv_pipeline="0"
       }
 
     ROOT result = (u32[], (u32[2], u32[], token[]), (u32[2], u32[], token[])) tuple(new_count, recv.0.n, send.0.n)
@@ -5246,23 +5254,31 @@ TEST_F(HloParserTest, PipelinedSendRecv) {
     after-all.0.p = token[] after-all()
     recv.0.p = (u32[2], u32[], token[]) recv(after-all.0.p), channel_id=1,
       frontend_attributes={
-        _xla_send_recv_source_target_pairs="{{1,0}}"
+        _xla_send_recv_source_target_pairs="{{1,0}}",
+        _xla_send_recv_pipeline="0"
       }
 
     after-all.1.p = token[] after-all()
     send.0.p = (u32[2], u32[], token[]) send(init, after-all.1.p),
       channel_id=1,
       frontend_attributes={
-        _xla_send_recv_source_target_pairs="{{1,0}}"
+        _xla_send_recv_source_target_pairs="{{1,0}}",
+        _xla_send_recv_pipeline="0"
       }
 
     while_init = (u32[], (u32[2], u32[], token[]), (u32[2], u32[], token[])) tuple(c0, recv.0.p, send.0.p)
     while_result = (u32[], (u32[2], u32[], token[]), (u32[2], u32[], token[])) while(while_init), body=body, condition=cond
 
     recv.0.q = (u32[2], u32[], token[]) get-tuple-element(while_result), index=1
-    recv-done.0.q = (u32[2], token[]) recv-done(recv.0.q), channel_id=1
+    recv-done.0.q = (u32[2], token[]) recv-done(recv.0.q), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
     send.0.q = (u32[2], u32[], token[]) get-tuple-element(while_result), index=2
-    send-done.0.q = token[] send-done(send.0.q), channel_id=1
+    send-done.0.q = token[] send-done(send.0.q), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
 
     ROOT recv-data.0.q = u32[2] get-tuple-element(recv-done.0.q), index=0
       })";
diff --git a/third_party/xla/xla/service/hlo_rematerialization.cc b/third_party/xla/xla/service/hlo_rematerialization.cc
index 3197bb094b5f76..a524e3cca559dc 100644
--- a/third_party/xla/xla/service/hlo_rematerialization.cc
+++ b/third_party/xla/xla/service/hlo_rematerialization.cc
@@ -2162,6 +2162,10 @@ absl::StatusOr<int64_t> RematerializeInstructions(
       VLOG(2) << "The old instruction " << best->name()
               << " is an async op. Removing to maintain one start to one done "
                  "invariant to keep the HLO valid.";
+      // We need to remove all control dependencies from best before removing it
+      // from the computation.  Its control dependencies were previously copied
+      // to the remat instruction.
+      TF_RETURN_IF_ERROR(best->DropAllControlDeps());
       TF_RETURN_IF_ERROR(computation->RemoveInstruction(best));
     }
   }
diff --git a/third_party/xla/xla/service/hlo_replication_analysis.cc b/third_party/xla/xla/service/hlo_replication_analysis.cc
index 7b3f966728e3f1..b92b6fe816e3df 100644
--- a/third_party/xla/xla/service/hlo_replication_analysis.cc
+++ b/third_party/xla/xla/service/hlo_replication_analysis.cc
@@ -98,9 +98,13 @@ HloReplicationAnalysis::DetermineHloInstructionIsReplicated(
         bool replicated_across_replicas = true;
         const int64_t num_partitions =
             hlo->GetModule()->config().num_partitions();
+        absl::flat_hash_set<int64_t> visited_partitions;
+        absl::flat_hash_set<int64_t> visited_replicas;
         for (const auto& group : hlo->replica_groups()) {
-          absl::flat_hash_set<int64_t> visited_partitions;
-          absl::flat_hash_set<int64_t> visited_replicas;
+          visited_partitions.clear();
+          visited_replicas.clear();
+          visited_replicas.reserve(group.replica_ids().size());
+          visited_partitions.reserve(group.replica_ids().size());
           for (int64_t id : group.replica_ids()) {
             int64_t rid = id / num_partitions;
             int64_t pid = id % num_partitions;
diff --git a/third_party/xla/xla/service/hlo_value.cc b/third_party/xla/xla/service/hlo_value.cc
index 43ae714ac77ff5..a74e43b5996ec6 100644
--- a/third_party/xla/xla/service/hlo_value.cc
+++ b/third_party/xla/xla/service/hlo_value.cc
@@ -114,15 +114,13 @@ namespace {
 // ShapeIndex in the given operand. Generally, instruction which pass through
 // values transparently without reading the value are not considered to use the
 // value.
-bool MayUseOperandValue(int64_t operand_number, const ShapeIndex& index,
-                        const HloInstruction* user) {
+bool MayUseOperandValue(const ShapeIndex& index, const HloInstruction* user) {
   switch (user->opcode()) {
     case HloOpcode::kGetTupleElement:
     case HloOpcode::kCopy:
       // These instructions only access the top-level values of their
       // operand. Non-top-level (nested) values are passed through
       // transparently.
-      CHECK_EQ(operand_number, 0);
       return index.empty();
     case HloOpcode::kDomain:
     case HloOpcode::kTuple:
@@ -172,6 +170,19 @@ HloValue::Uses HloValue::ComputeUses() const {
   // Build vector of HloUses for the value.
   for (const HloPosition& position : positions_) {
     for (HloInstruction* const user : position.instruction->users()) {
+#ifndef NDEBUG
+      // If user is in the root positions of this value, it must be a root.
+      if (root_positions.contains(user)) {
+        CHECK(user->IsRoot());
+      }
+#endif  // NDEBUG
+      // Root instructions of computations are considered to be uses whether
+      // or not the root instruction itself actually uses the value.
+      if (!MayUseOperandValue(position.index, user) &&
+          !(user->IsRoot() && root_positions.contains(user))) {
+        continue;
+      }
+
       int i = -1;
       for (const auto& operand : user->operands()) {
         ++i;
@@ -180,28 +191,19 @@ HloValue::Uses HloValue::ComputeUses() const {
           continue;
         }
 
+        uses.emplace_back(user, i, position.index);
 #ifndef NDEBUG
-        // If user is in the root positions of this value, it must be a root.
-        if (root_positions.contains(user)) {
-          CHECK(user->IsRoot());
+        // The new use must not already exist in uses.
+        for (int index = 0; index + 1 < uses.size(); ++index) {
+          DCHECK_NE(uses[index], uses.back());
         }
 #endif  // NDEBUG
-
-        // Root instructions of computations are considered to be uses whether
-        // or not the root instruction itself actually uses the value.
-        if (MayUseOperandValue(i, position.index, user) ||
-            (user->IsRoot() && root_positions.contains(user))) {
-          HloUse new_use{user, i, position.index};
-
-#ifndef NDEBUG
-          // The new use must not already exist in uses.
-          for (const HloUse& use : uses) {
-            DCHECK_NE(use, new_use);
-          }
-#endif  // NDEBUG
-
-          uses.push_back(std::move(new_use));
-        }
+      }
+      // In case of HloOpcode::kGetTupleElement or HloOpcode::kCopy instruction,
+      // ensure that user has at most one operand.
+      if (user->opcode() == HloOpcode::kGetTupleElement ||
+          user->opcode() == HloOpcode::kCopy) {
+        CHECK_LE(i, 0);
       }
     }
   }
diff --git a/third_party/xla/xla/service/hlo_value.h b/third_party/xla/xla/service/hlo_value.h
index 2152b27825d413..3f4f4699cde544 100644
--- a/third_party/xla/xla/service/hlo_value.h
+++ b/third_party/xla/xla/service/hlo_value.h
@@ -81,6 +81,15 @@ struct HloUse {
   // The shape index within the operand in which the value appears.
   ShapeIndex operand_index;
 
+  HloUse() = default;
+  HloUse(HloInstruction* instruction, int64_t operand_number)
+      : instruction(instruction), operand_number(operand_number) {}
+  HloUse(HloInstruction* instruction, int64_t operand_number,
+         ShapeIndex operand_index)
+      : instruction(instruction),
+        operand_number(operand_number),
+        operand_index(std::move(operand_index)) {}
+
   std::string ToString() const;
 
   bool operator==(const HloUse& other) const {
diff --git a/third_party/xla/xla/service/hlo_verifier.cc b/third_party/xla/xla/service/hlo_verifier.cc
index 5682defc527c9f..751756ccdab359 100644
--- a/third_party/xla/xla/service/hlo_verifier.cc
+++ b/third_party/xla/xla/service/hlo_verifier.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "xla/service/hlo_verifier.h"
 
 #include <algorithm>
+#include <cstdint>
 #include <iterator>
 #include <map>
 #include <memory>
@@ -27,6 +28,7 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
+#include "absl/status/status.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
@@ -39,6 +41,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/ir/hlo_schedule.h"
+#include "xla/layout.h"
 #include "xla/permutation_util.h"
 #include "xla/primitive_util.h"
 #include "xla/service/collective_ops_utils.h"
@@ -1157,11 +1160,12 @@ Status ShapeVerifier::HandleBitcast(HloInstruction* bitcast) {
           (ShapeUtil::ArrayDataSize(output_shape) ==
            ShapeUtil::ArrayDataSize(operand_shape)))) {
       return Internal(
-          "Bitcast cannot have different shape sizes of output (%d) and "
+          "%s: Bitcast cannot have different shape sizes of output (%d) and "
           "operand "
           "(%d) (%s) (%s)",
-          opts_.shape_size(output_shape), opts_.shape_size(operand_shape),
-          output_shape.ToString(true), operand_shape.ToString(true));
+          bitcast->ToString(), opts_.shape_size(output_shape),
+          opts_.shape_size(operand_shape), output_shape.ToString(true),
+          operand_shape.ToString(true));
     }
   }
   return OkStatus();
@@ -1948,7 +1952,8 @@ Status ShapeVerifier::VerifyEntryComputationLayout(const HloModule& module) {
                   result_layout.shape(),
                   Shape::Equal()
                       .IgnoreTilesInLayout()
-                      .IgnoreTailPaddingAlignmentInElements())) {
+                      .IgnoreTailPaddingAlignmentInElements()
+                      .IgnoreMemorySpaceInLayout())) {
     return Internal(
         "Shape of the root instruction of entry computation (%s) should be "
         "compatible to one specified in module's entry computation layout (%s)",
@@ -1972,7 +1977,8 @@ Status ShapeVerifier::VerifyEntryComputationLayout(const HloModule& module) {
     if (!ShapesSame(parameter->shape(), layout.parameter_shape(i),
                     Shape::Equal()
                         .IgnoreTilesInLayout()
-                        .IgnoreTailPaddingAlignmentInElements())) {
+                        .IgnoreTailPaddingAlignmentInElements()
+                        .IgnoreMemorySpaceInLayout())) {
       return Internal(
           "Shape of the entry computation parameter %d is %s should be "
           "compatible to the one specified in module's entry computation "
@@ -2256,8 +2262,40 @@ Status VerifyChannels(const HloModule& module) {
   absl::flat_hash_map<int64_t, std::vector<const HloInstruction*>>
       channel_instructions;
 
-  // Send/Recv instruction must have a single user: the corresponding
-  // SendDone/RecvDone. with matching channel.
+  // For Async operations, we need to make sure:
+  // (1) AsyncStart and AsyncDone are used in pairs
+  // (2) AsynStart and Asyndone are connected, that is, an AsynDone has an
+  //     AsyncStart as its only operand, and an AsynStart has an AsyncDone as
+  //     its only user
+  // (3) the channel ID used by a pair of Async operations is unique
+  //
+  // Send and SendDone, Recv and RecvDone are such pairs of Async operations.
+  // Different from other Async operations, a channel ID can be used by one
+  // Send-SendDone pair and one Recv-RecvDone pair. As such, we verify the
+  // above three invariants for Send/Recv related instructions with adjustment
+  // to (3):
+  // (3*) the channel ID used by a pair of Send-SendDone can be shared by at
+  //       most one pair of Recv-RecvDone.
+  //
+  // Currently, the GPU compiler can decomposed collective-permute into a group
+  // of instructions with a pair of Send-SendDone and a pair of Recv-RecvDone
+  // that use the same channel ID. When a while-body contains such instructions,
+  // the GPU compiler can also peel off Send and Recv, and statically order
+  // SendDone/RecvDone inside the while-body before Send/Recv. This breaks
+  // invariants (2) and (3*) for the pipelined Send/Recv case. We verify the
+  // following for a group of instructions using the same channel ID but don't
+  // satisfy invariants (1)(2)(3*):
+  // (4) All instructions in the group are annotated with frontend attributes.
+  //     We avoid verifying the content of such a frontend attribute to avoid
+  //     making the general HLO instruction verifier depend on the compiler pass
+  //     that performs the transformation.
+  // (5) the group should contain equal number uses of each Send/Recv related
+  //     instructions.
+  //
+  // Comparing the verification of unpiplined Send/Recv with the verification
+  // of pipelined, what we missing verifying is that the direct connection
+  // between Send/Recv and SendDone/RecvDone through operands.
+  //
   for (const HloComputation* computation : module.computations()) {
     for (const HloInstruction* instruction : computation->instructions()) {
       auto channel_instr = DynCast<HloChannelInstruction>(instruction);
@@ -2268,67 +2306,53 @@ Status VerifyChannels(const HloModule& module) {
 
       switch (instruction->opcode()) {
         case HloOpcode::kSend: {
-          TF_RET_CHECK(instruction->users().size() == 1);
-          const HloInstruction* send_user = instruction->users().front();
-          if (send_user->opcode() == HloOpcode::kSendDone) {
-            TF_RETURN_IF_ERROR(CheckSameChannel(instruction, send_user));
-            TF_RETURN_IF_ERROR(CheckSameIsHostTransfer(instruction, send_user));
-          } else {
-            // If a Send user is not a SendDone, it has to be a tuple that is
-            // either the root of a while-body or the init of a while-loop.
-            TF_RET_CHECK(send_user->opcode() == HloOpcode::kTuple);
-            if (send_user != send_user->parent()->root_instruction()) {
-              TF_RET_CHECK(send_user->users().size() == 1);
-              const HloInstruction* user = send_user->users().front();
-              TF_RET_CHECK(user->opcode() == HloOpcode::kWhile);
+          bool pipelined = true;
+          if (instruction->users().size() == 1) {
+            const HloInstruction* send_user = instruction->users().front();
+            if (send_user->opcode() == HloOpcode::kSendDone) {
+              TF_RETURN_IF_ERROR(CheckSameChannel(instruction, send_user));
+              TF_RETURN_IF_ERROR(
+                  CheckSameIsHostTransfer(instruction, send_user));
+              pipelined = false;
             }
           }
+          // Pipelined Send should be annotated with frontend attributes.
+          TF_RET_CHECK(pipelined == false ||
+                       !instruction->frontend_attributes().map().empty());
           break;
         }
         case HloOpcode::kRecv: {
-          TF_RET_CHECK(instruction->users().size() == 1);
-          const HloInstruction* recv_user = instruction->users().front();
-          if (recv_user->opcode() == HloOpcode::kRecvDone) {
-            TF_RETURN_IF_ERROR(CheckSameChannel(instruction, recv_user));
-            TF_RETURN_IF_ERROR(CheckSameIsHostTransfer(instruction, recv_user));
-          } else {
-            // If a Recv user is not a RecvDone, it has to be a tuple that is
-            // either the root of a while-body or the init of a while-loop.
-            TF_RET_CHECK(recv_user->opcode() == HloOpcode::kTuple);
-            if (recv_user != recv_user->parent()->root_instruction()) {
-              TF_RET_CHECK(recv_user->users().size() == 1);
-              const HloInstruction* user = recv_user->users().front();
-              TF_RET_CHECK(user->opcode() == HloOpcode::kWhile);
+          bool pipelined = true;
+          if (instruction->users().size() == 1) {
+            const HloInstruction* recv_user = instruction->users().front();
+            if (recv_user->opcode() == HloOpcode::kRecvDone) {
+              TF_RETURN_IF_ERROR(CheckSameChannel(instruction, recv_user));
+              TF_RETURN_IF_ERROR(
+                  CheckSameIsHostTransfer(instruction, recv_user));
+              pipelined = false;
             }
           }
+          // Pipelined Recv should be annotated with frontend attributes.
+          TF_RET_CHECK(pipelined == false ||
+                       !instruction->frontend_attributes().map().empty());
           break;
         }
         case HloOpcode::kSendDone: {
           TF_RET_CHECK(instruction->operands().size() == 1);
           const HloInstruction* send_done_operand = instruction->operand(0);
-          if (send_done_operand->opcode() != HloOpcode::kSend) {
-            // If the SendDone operand is not a Send, it has to be either part
-            // of a while-loop result or a parameter of a while-body.
-            TF_RET_CHECK(send_done_operand->opcode() ==
-                         HloOpcode::kGetTupleElement);
-            HloOpcode opcode = send_done_operand->operand(0)->opcode();
-            TF_RET_CHECK(opcode == HloOpcode::kWhile ||
-                         opcode == HloOpcode::kParameter);
-          }
+          // If the operand is not a Send, the Send-done is pipelined and should
+          // have frontend attributes.
+          TF_RET_CHECK(send_done_operand->opcode() == HloOpcode::kSend ||
+                       !instruction->frontend_attributes().map().empty());
           break;
         }
         case HloOpcode::kRecvDone: {
           TF_RET_CHECK(instruction->operands().size() == 1);
           const HloInstruction* recv_done_operand = instruction->operand(0);
-          if (recv_done_operand->opcode() != HloOpcode::kRecv) {
-            // If the RecvDone operand is not a Recv, it has to be either part
-            // of a while-loop result or a parameter of a while-body.
-            TF_RET_CHECK(recv_done_operand->opcode() ==
-                         HloOpcode::kGetTupleElement);
-            HloOpcode opcode = recv_done_operand->operand(0)->opcode();
-            TF_RET_CHECK(opcode == HloOpcode::kWhile ||
-                         opcode == HloOpcode::kParameter);
-          }
+          // If the operand is not a Recv, the Recv-done is pipelined and should
+          // have frontend attributes.
+          TF_RET_CHECK(recv_done_operand->opcode() == HloOpcode::kRecv ||
+                       !instruction->frontend_attributes().map().empty());
           break;
         }
         default:
@@ -2343,33 +2367,50 @@ Status VerifyChannels(const HloModule& module) {
     const HloInstruction* first = instructions[0];
     auto sendrecv = DynCast<HloSendRecvInstruction>(first);
     if (sendrecv) {
-      absl::flat_hash_set<HloOpcode> opcodes;
-      bool maybe_send_recv_pipeline = false;
+      // Check that all instructions are Send/Recv related and count the
+      // appearance of each opcode in the group.
+      absl::flat_hash_map<HloOpcode, int> opcode_to_count;
       for (const HloInstruction* instr : instructions) {
-        if (opcodes.insert(instr->opcode()).second == false) {
-          // A channel is used by multiple instructions with the same opcode.
-          // This is only allows for pipelining Send and Recv, assuming such
-          // instructions have non-empty frontend attributes.
-          if (DynCast<HloSendInstruction>(instr) ||
-              DynCast<HloRecvInstruction>(instr)) {
-            maybe_send_recv_pipeline =
-                (!instr->frontend_attributes().map().empty());
-          }
+        auto it = opcode_to_count.find(instr->opcode());
+        if (it != opcode_to_count.end()) {
+          it->second++;
+        } else {
+          opcode_to_count[instr->opcode()] = 1;
         }
-        auto cast = DynCast<HloSendRecvInstruction>(instr);
-        TF_RET_CHECK(cast != nullptr)
+        TF_RET_CHECK(DynCast<HloSendRecvInstruction>(instr) != nullptr)
             << "channel " << pair.first
             << " is used for different types of channel instructions";
       }
+
+      int count = opcode_to_count.begin()->second;
+      bool consistent_count =
+          absl::c_all_of(opcode_to_count, [count](const auto& opcode_count) {
+            return opcode_count.second == count;
+          });
+      // A pipelined group of Send/Recv should all have frontend attributes.
+      bool maybe_pipelined =
+          absl::c_all_of(instructions, [](const HloInstruction* inst) {
+            return !inst->frontend_attributes().map().empty();
+          });
+
       if (sendrecv->is_host_transfer()) {
-        TF_RET_CHECK(instructions.size() == 2)
+        TF_RET_CHECK(consistent_count && count == 1 && instructions.size() == 2)
             << "channel " << pair.first
             << " is used for multiple host send/recv instructions";
       } else {
-        if (!maybe_send_recv_pipeline) {
-          TF_RET_CHECK(instructions.size() == opcodes.size())
+        if (consistent_count && count == 1) {
+          TF_RET_CHECK(instructions.size() == opcode_to_count.size())
               << "channel " << pair.first
               << " is used for multiple send/recv instructions";
+        } else {
+          TF_RET_CHECK(maybe_pipelined) << "channel " << pair.first
+                                        << " is used for multiple send/recv "
+                                           "instructions but not pipelined";
+          TF_RET_CHECK(consistent_count && opcode_to_count.size() % 2 == 0)
+              << "channel " << pair.first
+              << " is pipelined. Not all Send/Recv related instructions are"
+                 " used the same number of times or channel is used for other "
+                 "instructions";
         }
       }
     } else {
@@ -2772,10 +2813,18 @@ class InstructionVerifier : public DfsHloVisitorWithDefault {
             operand_shape.rank() == result_shape.rank() &&
             operand_shape.has_layout()) {
           const Layout& operand_layout = operand_shape.layout();
-          Layout::Equal equal_predicate = Layout::Equal();
+          Layout::Equal equal_predicate =
+              Layout::Equal().IgnoreTiles().IgnoreMemorySpace();
           if (instruction->opcode() == HloOpcode::kConvert) {
             // Convert instructions can change element_size_in_bits
             equal_predicate.IgnoreElementSize();
+          } else if (instruction->opcode() == HloOpcode::kDynamicSlice ||
+                     instruction->opcode() == HloOpcode::kDynamicUpdateSlice ||
+                     instruction->opcode() == HloOpcode::kCopy) {
+            TF_RETURN_IF_ERROR(HostOffloadInstructionCanChangeMemorySpace(
+                instruction, operand_layout.memory_space(),
+                result_layout.memory_space()));
+            equal_predicate.IgnoreMemorySpace();
           }
           TF_RET_CHECK(equal_predicate(result_layout, operand_layout))
               << "Instruction shouldn't change layouts "
@@ -2809,6 +2858,39 @@ class InstructionVerifier : public DfsHloVisitorWithDefault {
     return OkStatus();
   }
 
+  // Verifies whether a given `instruction` is permitted to change the layout
+  // memory space from `operand_memory_space` to `result_memory_space`.
+  // Returns OkStatus() if the instruction's layout changes are valid;
+  // otherwise, returns an appropriate error status.
+  static Status HostOffloadInstructionCanChangeMemorySpace(
+      const HloInstruction* instruction, const int64_t operand_memory_space,
+      const int64_t result_memory_space) {
+    TF_RET_CHECK(!(operand_memory_space == Layout::kGenericFastMemorySpace &&
+                   result_memory_space != Layout::kGenericFastMemorySpace) ||
+                 (operand_memory_space != Layout::kGenericFastMemorySpace &&
+                  result_memory_space == Layout::kGenericFastMemorySpace))
+        << "Instruction shouldn't change layout memory space between generic "
+           "fast memory space and others for instruction: "
+        << instruction->ToString();
+
+    if (instruction->opcode() == HloOpcode::kDynamicSlice) {
+      TF_RET_CHECK(!(operand_memory_space == Layout::kDefaultMemorySpace &&
+                     result_memory_space == Layout::kHostMemorySpace))
+          << "DynamicSlice instruction shouldn't change layout memory "
+          << "space from device to host: " << instruction->ToString();
+    } else if (instruction->opcode() == HloOpcode::kDynamicUpdateSlice) {
+      TF_RET_CHECK(!(operand_memory_space == Layout::kHostMemorySpace &&
+                     result_memory_space == Layout::kDefaultMemorySpace))
+          << "DynamicUpdateSlice instruction shouldn't change layout "
+          << "memory space from host to device: " << instruction->ToString();
+    } else if (instruction->opcode() != HloOpcode::kCopy) {
+      return absl::InvalidArgumentError(
+          absl::StrCat("Instruction shouldn't change layout memory space: ",
+                       instruction->ToString()));
+    }
+    return OkStatus();
+  }
+
   absl::flat_hash_map<std::string, const HloInstruction*> instructions_by_name_;
   const HloVerifierOpts& opts_;
   std::optional<int64_t> num_devices_;
diff --git a/third_party/xla/xla/service/hlo_verifier_test.cc b/third_party/xla/xla/service/hlo_verifier_test.cc
index b03a88a6c98bba..15dd3208efb25a 100644
--- a/third_party/xla/xla/service/hlo_verifier_test.cc
+++ b/third_party/xla/xla/service/hlo_verifier_test.cc
@@ -2043,6 +2043,95 @@ TEST_F(HloVerifierTest, ChannelVerifier) {
               HasSubstr("used for different types of channel instructions"));
 }
 
+TEST_F(HloVerifierTest, ChannelVerifierPipelinedMissingDones) {
+  const char* const kModuleStr = R"(
+  HloModule test
+  cond {
+    param = (u32[], (u32[2], u32[], token[]), (u32[2], u32[], token[])) parameter(0)
+    count = get-tuple-element(%param), index=0
+    ub = u32[] constant(1)
+    ROOT result = pred[] compare(count, ub), direction=LT
+  }
+
+  body {
+    param = (u32[], (u32[2], u32[], token[]), (u32[2], u32[], token[])) parameter(0)
+    count = get-tuple-element(%param), index=0
+
+    recv.0 = (u32[2], u32[], token[]) get-tuple-element(param), index=1
+    recv-done.0 = (u32[2], token[]) recv-done(recv.0), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
+    recv-data.0 = u32[2] get-tuple-element(recv-done.0), index=0
+
+    c1 = u32[] constant(1)
+    new_count = u32[] add(count, c1)
+
+    send.0 = (u32[2], u32[], token[]) get-tuple-element(param), index=2
+    send-done.0 = (u32[2], token[]) recv-done(send.0), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
+
+    after-all.0.n = token[] after-all()
+    recv.0.n = (u32[2], u32[], token[]) recv(after-all.0.n), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_source_target_pairs="{{1,0}}",
+        _xla_send_recv_pipeline="0"
+      }
+
+
+    after-all.1.n = token[] after-all()
+    send.0.n = (u32[2], u32[], token[]) send(recv-data.0, after-all.1.n),
+      channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_source_target_pairs="{{1,0}}",
+        _xla_send_recv_pipeline="0"
+      }
+
+    ROOT result = (u32[], (u32[2], u32[], token[]), (u32[2], u32[], token[]))
+      tuple(new_count, recv.0.n, send.0.n)
+  }
+
+  ENTRY test_computation {
+    c0 = u32[] constant(0)
+    init = u32[2] broadcast(c0), dimensions={}
+    after-all.0.p = token[] after-all()
+    recv.0.p = (u32[2], u32[], token[]) recv(after-all.0.p), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_source_target_pairs="{{1,0}}",
+        _xla_send_recv_pipeline="0"
+      }
+
+    after-all.1.p = token[] after-all()
+    send.0.p = (u32[2], u32[], token[]) send(init, after-all.1.p),
+      channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_source_target_pairs="{{1,0}}",
+        _xla_send_recv_pipeline="0"
+      }
+
+    while_init = (u32[], (u32[2], u32[], token[]), (u32[2], u32[], token[]))
+      tuple(c0, recv.0.p, send.0.p)
+    while_result = (u32[], (u32[2], u32[], token[]), (u32[2], u32[], token[]))
+      while(while_init), body=body, condition=cond
+
+    recv.0.q = (u32[2], u32[], token[]) get-tuple-element(while_result), index=1
+    recv-done.0.q = (u32[2], token[]) recv-done(recv.0.q), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
+
+    ROOT recv-data.0.q = u32[2] get-tuple-element(recv-done.0.q), index=0
+      })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnUnverifiedModule(kModuleStr));
+  EXPECT_THAT(
+      verifier().Run(module.get()).status().message(),
+      HasSubstr("is pipelined. Not all Send/Recv related instructions are used"
+                " the same number of times"));
+}
+
 TEST_F(HloVerifierTest, CollectiveChannelVerifier) {
   const char* const kModuleStr = R"(
   HloModule test
@@ -2874,5 +2963,66 @@ TEST_F(HloVerifierTest, SparseDotMetadataShape) {
   EXPECT_THAT(status.message(), HasSubstr("Expected sparse dot metadata"));
 }
 
+TEST_F(HloVerifierTestLayoutSensitive,
+       HostOffloadingDUSAndDSAreVerifiedWhenChangingLayout) {
+  const char* const hlo_string = R"(
+  HloModule m
+
+  ENTRY main {
+    constant_f32_0 = f32[] constant(0)
+    custom-call = f32[2,2048,2048]{2,1,0:S(5)} custom-call(), custom_call_target="AllocateBuffer"
+    data_param = f32[1,2048,2048]{2,1,0} parameter(0)
+    index_param = s32[] parameter(1)
+    constant_s32_0 = s32[] constant(0)
+    dynamic_update_slice = f32[2,2048,2048]{2,1,0:S(5)} dynamic-update-slice(custom-call, data_param, index_param, constant_s32_0, constant_s32_0)
+    ROOT dynamic_slice = f32[1,2048,2048]{2,1,0} dynamic-slice(f32[2,2048,2048]{2,1,0:S(5)} dynamic_update_slice, index_param, constant_s32_0, constant_s32_0), dynamic_slice_sizes={1,2048,2048}
+  })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnUnverifiedModule(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
+TEST_F(HloVerifierTestLayoutSensitive,
+       HostOffloadingCopyIsVerifiedWhenChangingLayout) {
+  const char* const hlo_string = R"(
+  HloModule m
+
+  ENTRY main {
+    data_param = f32[2048]{0} parameter(0)
+    copy_0 = f32[2048]{0:S(5)} copy(f32[2048]{0} data_param)
+    ROOT copy_1 = f32[2048]{0} copy(f32[2048]{0:S(5)} copy_0)
+  })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnUnverifiedModule(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
+TEST_F(HloVerifierTestLayoutSensitive,
+       HostOffloadingDSCannotChangeLayoutFromDeviceToHost) {
+  const char* const hlo_string = R"(
+  HloModule m
+
+  ENTRY main {
+    constant_f32_0 = f32[] constant(0)
+    custom-call = f32[2,2048,2048]{2,1,0} custom-call(), custom_call_target="AllocateBuffer"
+    data_param = f32[1,2048,2048]{2,1,0} parameter(0)
+    index_param = s32[] parameter(1)
+    constant_s32_0 = s32[] constant(0)
+    dynamic_update_slice = f32[2,2048,2048]{2,1,0} dynamic-update-slice(custom-call, data_param, index_param, constant_s32_0, constant_s32_0)
+    ROOT dynamic_slice = f32[1,2048,2048]{2,1,0:S(5)} dynamic-slice(f32[2,2048,2048]{2,1,0} dynamic_update_slice, index_param, constant_s32_0, constant_s32_0), dynamic_slice_sizes={1,2048,2048}
+  })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnUnverifiedModule(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.message(),
+              HasSubstr("DynamicSlice instruction shouldn't change layout "
+                        "memory space from device to host"));
+}
 }  // namespace
 }  // namespace xla
diff --git a/third_party/xla/xla/service/host_offload_legalize.cc b/third_party/xla/xla/service/host_offload_legalize.cc
index a56a5aad1ddc4c..c6349f7680a544 100644
--- a/third_party/xla/xla/service/host_offload_legalize.cc
+++ b/third_party/xla/xla/service/host_offload_legalize.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <array>
 #include <cstdint>
-#include <string>
+#include <memory>
 #include <utility>
 #include <vector>
 
@@ -26,15 +26,18 @@ limitations under the License.
 #include "absl/container/inlined_vector.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/service/call_graph.h"
 #include "xla/service/hlo_value.h"
 #include "xla/service/host_memory_offload_annotations.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/status.h"
-#include "xla/statusor.h"
 #include "xla/util.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/statusor.h"
@@ -47,7 +50,7 @@ constexpr std::array<HloOpcode, 2> kUsersOpcodes = {HloOpcode::kSlice,
                                                     HloOpcode::kDynamicSlice};
 
 // Find an annotation moving up. Meant to find an annotation from a DUS operand.
-HloInstruction* FindAnnotationToUpdate(HloInstruction* instr) {
+HloInstruction* FindToHostAnnotationToUpdate(HloInstruction* instr) {
   while (!instr->IsCustomCall(
       host_memory_offload_annotations::kMoveToHostCustomCallTarget)) {
     if ((instr->opcode() != HloOpcode::kBitcast &&
@@ -61,7 +64,8 @@ HloInstruction* FindAnnotationToUpdate(HloInstruction* instr) {
   return instr;
 }
 
-// Find an annotation moving up. Meant to find an annotation from a DUS operand.
+// Find an annotation moving up. Meant to find an annotation from a DUS
+// instruction.
 HloInstruction* FindToDeviceAnnotationToUpdate(HloInstruction* instr) {
   while (!instr->IsCustomCall(
       host_memory_offload_annotations::kMoveToDeviceCustomCallTarget)) {
@@ -330,8 +334,16 @@ absl::StatusOr<bool> ProcessAnnotationForCopyMovement(
     HloInstruction* instruction, const CallGraph* call_graph,
     absl::flat_hash_set<HloInstruction*>& processed_annotations,
     std::vector<HloInstruction*>& to_remove) {
+  auto is_entry_computation_parameter = [](HloInstruction* instruction) {
+    return instruction->opcode() == HloOpcode::kParameter &&
+           instruction->parent()->IsEntryComputation();
+  };
+
+  if (instruction->IsRoot()) {
+    return false;
+  }
   HloInstruction* starting_instr =
-      FindDUSFromAnnotation(instruction->users()[0]);
+      FindDUSFromAnnotation(instruction->users().at(0));
   // If it's the pure copy case reset instruction.
   if (starting_instr->opcode() != HloOpcode::kDynamicUpdateSlice) {
     starting_instr = instruction;
@@ -343,7 +355,8 @@ absl::StatusOr<bool> ProcessAnnotationForCopyMovement(
   // to update (required in case there are multiple insertions in the buffer).
   processed_annotations.insert(current_value.first);
   if (!current_value.first->IsCustomCall(
-          host_memory_offload_annotations::kMoveToHostCustomCallTarget)) {
+          host_memory_offload_annotations::kMoveToHostCustomCallTarget) &&
+      !is_entry_computation_parameter(current_value.first)) {
     CHECK_EQ(current_value.first->opcode(), HloOpcode::kDynamicUpdateSlice);
     while (true) {
       VLOG(10) << "Current value before: " << current_value.first->ToString();
@@ -361,7 +374,7 @@ absl::StatusOr<bool> ProcessAnnotationForCopyMovement(
       HloInstruction* annotation = current_value.first;
       if (annotation->opcode() == HloOpcode::kDynamicUpdateSlice) {
         HloInstruction* real_annotation =
-            FindAnnotationToUpdate(annotation->mutable_operand(1));
+            FindToHostAnnotationToUpdate(annotation->mutable_operand(1));
         // Check if this dynamic-update-slice doesn't have an annotation
         // attached.
         if (!real_annotation->IsCustomCall(
@@ -406,9 +419,12 @@ absl::StatusOr<bool> ProcessAnnotationForCopyMovement(
       }
     }
   }
+
   auto update_shape_layout =
       [&](const std::pair<HloInstruction*, int>& instruction,
           HloInstruction* copy_to_move) {
+        VLOG(5) << "Update shape layout: " << instruction.first->ToString()
+                << " " << instruction.second;
         // Update shape. Tuple shape vs array shape.
         if (instruction.second != -1) {
           *instruction.first->mutable_shape()
@@ -418,7 +434,24 @@ absl::StatusOr<bool> ProcessAnnotationForCopyMovement(
           *instruction.first->mutable_shape()->mutable_layout() =
               copy_to_move->operand(0)->shape().layout();
         }
+
+        if (instruction.first->opcode() == HloOpcode::kWhile) {
+          // Fix up while body's root instruction shape and condition's
+          // parameter shape for while loops.
+          Shape new_shape = copy_to_move->operand(0)->shape();
+          *instruction.first->while_body()
+               ->root_instruction()
+               ->mutable_shape()
+               ->mutable_tuple_shapes(instruction.second)
+               ->mutable_layout() = new_shape.layout();
+          *instruction.first->while_condition()
+               ->parameter_instruction(0)
+               ->mutable_shape()
+               ->mutable_tuple_shapes(instruction.second)
+               ->mutable_layout() = new_shape.layout();
+        }
       };
+
   // Process all copies one at a time from the last to the first and push it to
   // its specific user.
   while (!copies_to_move.empty()) {
@@ -427,8 +460,8 @@ absl::StatusOr<bool> ProcessAnnotationForCopyMovement(
     stack.clear();
     stack.push_back(copy_to_move);
     while (!stack.empty()) {
-      VLOG(5) << "Current value before down: "
-              << stack.back().first->ToString();
+      VLOG(5) << "Current value before down: " << stack.back().first->ToString()
+              << " " << stack.back().second;
       auto current_value_down =
           WalkDownMemoryOffload(stack.back(), *call_graph);
       if (!current_value_down.ok()) {
@@ -445,8 +478,25 @@ absl::StatusOr<bool> ProcessAnnotationForCopyMovement(
                 "Expected to be called only by one caller");
           }
           auto* caller = callers[0];
-          update_shape_layout(std::make_pair(caller, instruction.second),
-                              copy_to_move.first);
+          if (caller->opcode() == HloOpcode::kWhile) {
+            update_shape_layout(std::make_pair(caller, instruction.second),
+                                copy_to_move.first);
+
+            HloInstruction* root_instruction =
+                caller->while_body()->root_instruction();
+            // Fix while loop's result tuple to not use move-to-device since
+            // at loop entry it's still on host.
+            if (root_instruction->operand(instruction.second)
+                    ->IsCustomCall(host_memory_offload_annotations::
+                                       kMoveToDeviceCustomCallTarget)) {
+              root_instruction
+                  ->ReplaceOperandWith(
+                      instruction.second,
+                      root_instruction->mutable_operand(instruction.second)
+                          ->mutable_operand(0))
+                  .IgnoreError();
+            }
+          }
         }
       }
       stack.pop_back();
@@ -473,8 +523,11 @@ absl::StatusOr<bool> ProcessAnnotationForCopyMovement(
           }
           update_shape_layout(std::make_pair(new_annotation, -1),
                               copy_to_move.first);
+          Shape new_copy_shape = new_annotation->shape();
+          *new_copy_shape.mutable_layout() =
+              copy_to_move.first->shape().layout();
           HloInstruction* new_copy = instruction.first->AddInstruction(
-              copy_to_move.first->CloneWithNewOperands(new_annotation->shape(),
+              copy_to_move.first->CloneWithNewOperands(new_copy_shape,
                                                        {new_annotation}));
           std::vector<HloInstruction*> users = instruction.first->users();
           for (auto* use : users) {
@@ -495,8 +548,8 @@ absl::StatusOr<bool> ProcessAnnotationForCopyMovement(
         // Move the annotation first just before dynamic-update-slice to avoid
         // shape changes.
         if (instruction.first->opcode() == HloOpcode::kDynamicUpdateSlice) {
-          HloInstruction* annotation =
-              FindAnnotationToUpdate(instruction.first->mutable_operand(1));
+          HloInstruction* annotation = FindToHostAnnotationToUpdate(
+              instruction.first->mutable_operand(1));
           if (annotation == nullptr) {
             CHECK(false);
             return false;
@@ -535,7 +588,7 @@ absl::StatusOr<bool> FixupInterveningCopies(
   std::vector<HloInstruction*> annotations_to_remove;
   bool changed = false;
   for (HloInstruction* instruction : copy_to_host_annotations) {
-    if (processed_annotations.count(instruction)) {
+    if (processed_annotations.contains(instruction)) {
       continue;
     }
     TF_ASSIGN_OR_RETURN(bool changed_annotation_for_copy_movement,
@@ -576,11 +629,22 @@ absl::StatusOr<bool> HostOffloadLegalize::Run(
   for (HloComputation* computation :
        module->MakeNonfusionComputations(execution_threads)) {
     for (HloInstruction* instruction : computation->instructions()) {
-      if (instruction->opcode() != HloOpcode::kCustomCall) {
-        continue;
+      if (instruction->opcode() == HloOpcode::kParameter &&
+          instruction->parent()->IsEntryComputation()) {
+        Shape param_shape =
+            module->entry_computation_layout()
+                .parameter_layout(instruction->parameter_number())
+                .shape();
+        // TODO(mingyao): Add support for tuple parameter.
+        if (param_shape.has_layout() &&
+            param_shape.layout().memory_space() == kHostMemorySpaceColor) {
+          copy_to_host_annotations.push_back(instruction);
+          continue;
+        }
       }
-      if (instruction->custom_call_target() ==
-          host_memory_offload_annotations::kMoveToHostCustomCallTarget) {
+
+      if (instruction->IsCustomCall(
+              host_memory_offload_annotations::kMoveToHostCustomCallTarget)) {
         copy_to_host_annotations.push_back(instruction);
       }
     }
diff --git a/third_party/xla/xla/service/host_offload_legalize_test.cc b/third_party/xla/xla/service/host_offload_legalize_test.cc
index a1abd7e0a188b9..f9929648a3f12f 100644
--- a/third_party/xla/xla/service/host_offload_legalize_test.cc
+++ b/third_party/xla/xla/service/host_offload_legalize_test.cc
@@ -78,7 +78,7 @@ class HostOffloadLegalizeTest : public HloTestBase {
 
 TEST_F(HostOffloadLegalizeTest, NoCopyWithOptBarrierMoreElaborate) {
   const std::string& hlo_string = R"(
-HloModule jit_f, entry_computation_layout={(f32[16,256]{0,1})->f32[16,256]{0,1}}
+HloModule jit_f, entry_computation_layout={(f32[16,256]{0,1})->f32[16,256]{1,0}}
 
 ENTRY main.24 {
   Arg_0.1 = f32[16,256]{0,1} parameter(0)
@@ -120,6 +120,62 @@ ENTRY main.24 {
   HloInstruction* custom_call = FindInstruction(module.get(), "custom-call.18");
   EXPECT_EQ(custom_call->users()[0]->opcode(), HloOpcode::kCopy);
   EXPECT_EQ(custom_call->shape().layout(), LayoutUtil::MakeLayout({0, 1}));
+  EXPECT_EQ(custom_call->users()[0]->shape().layout(),
+            LayoutUtil::MakeLayout({1, 0}));
+}
+
+TEST_F(HostOffloadLegalizeTest, XposeCopyOnParameterStreaming) {
+  const std::string& hlo_string = R"(
+HloModule jit_f, entry_computation_layout={(f32[16,256]{0,1},f32[16,256]{0,1:T(8,128)S(5)})->f32[16,256]{1,0}}
+
+ENTRY main.24 {
+  Arg_0.1 = f32[16,256]{0,1} parameter(0)
+  Arg_0.2 = f32[16,256]{0,1:T(8,128)} parameter(1)
+  cp0 = f32[16,256]{1,0} copy(Arg_0.2)
+  cosine.4 = f32[16,256]{0,1} cosine(Arg_0.1)
+  custom-call.5 = f32[16,256]{0,1} custom-call(cosine.4), custom_call_target="MoveToHost"
+  sine.3 = f32[16,256]{0,1} sine(Arg_0.1)
+  cosine.7 = f32[16,256]{0,1} cosine(sine.3)
+  custom-call.8 = f32[16,256]{0,1} custom-call(cosine.7), custom_call_target="MoveToHost"
+  constant.2 = f32[] constant(1)
+  cp1 = f32[16,256]{1,0} copy(custom-call.8)
+  tuple.11 = (f32[16,256]{0,1}, f32[16,256]{1,0}, f32[16,256]{1,0}, f32[]) tuple(custom-call.5, cp1, cp0, constant.2)
+  opt-barrier.12 = (f32[16,256]{0,1}, f32[16,256]{1,0}, f32[16,256]{1,0}, f32[]) opt-barrier(tuple.11)
+  get-tuple-element.16 = f32[] get-tuple-element(opt-barrier.12), index=3
+  broadcast.20 = f32[16,256]{0,1} broadcast(get-tuple-element.16), dimensions={}
+  get-tuple-element.15 = f32[16,256]{1,0} get-tuple-element(opt-barrier.12), index=2
+  custom-call.19 = f32[16,256]{1,0} custom-call(get-tuple-element.15), custom_call_target="MoveToDevice"
+  multiply.21 = f32[16,256]{0,1} multiply(broadcast.20, custom-call.19)
+  cp2 = f32[16,256]{1,0} copy(multiply.21)
+  get-tuple-element.14 = f32[16,256]{1,0} get-tuple-element(opt-barrier.12), index=1
+  custom-call.18 = f32[16,256]{1,0} custom-call(get-tuple-element.14), custom_call_target="MoveToDevice"
+  multiply.22 = f32[16,256]{1,0} multiply(cp2, custom-call.18)
+  get-tuple-element.13 = f32[16,256]{0,1} get-tuple-element(opt-barrier.12), index=0
+  custom-call.17 = f32[16,256]{0,1} custom-call(get-tuple-element.13), custom_call_target="MoveToDevice"
+  cp3 = f32[16,256]{1,0} copy(custom-call.17)
+  ROOT multiply.23 = f32[16,256]{1,0} multiply(multiply.22, cp3)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHostOffloadLegalize(module.get()));
+
+  EXPECT_TRUE(changed);
+  XLA_VLOG_LINES(1, module->ToString());
+  HloInstruction* custom_call = FindInstruction(module.get(), "custom-call.18");
+  EXPECT_EQ(custom_call->users()[0]->opcode(), HloOpcode::kCopy);
+  EXPECT_EQ(custom_call->shape().layout(), LayoutUtil::MakeLayout({0, 1}));
+  EXPECT_EQ(custom_call->users()[0]->shape().layout(),
+            LayoutUtil::MakeLayout({1, 0}));
+
+  custom_call = FindInstruction(module.get(), "custom-call.19");
+  EXPECT_EQ(custom_call->users()[0]->opcode(), HloOpcode::kCopy);
+  EXPECT_EQ(custom_call->shape().layout(),
+            LayoutUtil::MakeLayout({0, 1}, {}, {}, {}, {Tile{{8, 128}}}));
+  EXPECT_EQ(custom_call->users()[0]->shape().layout(),
+            LayoutUtil::MakeLayout({1, 0}));
 }
 
 TEST_F(HostOffloadLegalizeTest, LlmActivationHostMemoryMultipleConsumers) {
diff --git a/third_party/xla/xla/service/host_offloader.cc b/third_party/xla/xla/service/host_offloader.cc
index b484d7fec3418b..747b877617b16c 100644
--- a/third_party/xla/xla/service/host_offloader.cc
+++ b/third_party/xla/xla/service/host_offloader.cc
@@ -22,9 +22,9 @@ limitations under the License.
 
 #include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_set.h"
-#include "absl/container/inlined_vector.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
+#include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
@@ -40,7 +40,6 @@ limitations under the License.
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/status.h"
-#include "xla/statusor.h"
 #include "xla/util.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/statusor.h"
@@ -166,36 +165,96 @@ HloInstruction* FindDSAnnotation(HloInstruction* hlo) {
 
 }  // namespace
 
+absl::StatusOr<bool> HostOffloader::TryOutputStreaming(
+    HloInstruction* custom_call) {
+  const HloBuffer& unique_buffer =
+      alias_analysis_->GetUniqueBufferAt(custom_call);
+  bool is_used_as_output_with_host_memory_space = false;
+  const HloComputation* const entry_computation =
+      custom_call->GetModule()->entry_computation();
+  for (const HloValue* value : unique_buffer.values()) {
+    // Check if this is memory-only.
+    if (!AllPositionsAreAllowed(value)) {
+      // Found a position which is not allowed.
+      return false;
+    }
+
+    // Look for a value used as a output.
+    for (const auto& position : value->positions()) {
+      const HloInstruction* instruction = position.instruction;
+      const ShapeIndex& index = position.index;
+      if (instruction->parent() == entry_computation && instruction->IsRoot()) {
+        const Shape& output_shape =
+            ShapeUtil::GetSubshape(entry_computation->parent()
+                                       ->entry_computation_layout()
+                                       .result_shape(),
+                                   index);
+        CHECK(output_shape.has_layout());
+
+        if (output_shape.layout().memory_space() != kHostMemorySpaceColor) {
+          return FailedPrecondition(
+              "Output buffer is annotated with %s but is not marked with host "
+              "memory space in the entry computation.",
+              custom_call->name());
+        }
+        is_used_as_output_with_host_memory_space = true;
+      }
+    }
+  }
+  if (!is_used_as_output_with_host_memory_space) {
+    VLOG(1) << "Buffer annotated by " << custom_call->name()
+            << " is not used as an output with host memory space.";
+    return false;
+  }
+
+  VLOG(3) << "Found an output buffer annotated with " << custom_call->name()
+          << ". Expecting that we'll need to insert copies.";
+
+  annotations_for_copy_to_host_to_insert_.emplace(custom_call);
+  AddAllPositionsToBeMovedToHostMemory(unique_buffer);
+  return true;
+}
+
 Status HostOffloader::HandleMoveToHostCustomCall(HloInstruction* custom_call) {
   VLOG(2) << "Found a custom call annotating start-of-host-offload: "
           << custom_call->ToString();
   // Save a pointer to this custom call for when we want to remove it later.
   custom_calls_to_remove_.emplace(custom_call);
 
-  // We expect that the DUS is the only user of this custom call.
-  if (custom_call->user_count() != 1) {
+  // We expect that either the custom call is the root or the DUS is the only
+  // user of this custom call.
+  if (!custom_call->IsRoot() && custom_call->user_count() != 1) {
     return FailedPrecondition(
-        "Expecting custom call %s to only have 1 user; it has %d users: [%s]",
+        "Expecting custom call %s to either be the root or only have 1 user; "
+        "it is not the root and has %d users: [%s]",
         custom_call->name(), custom_call->user_count(),
         absl::StrJoin(custom_call->users(), ", ",
                       [](std::string* out, const HloInstruction* user) {
                         out->append(user->name());
                       }));
   }
-  HloInstruction* op_being_annotated = custom_call->users()[0];
 
-  // Skip past any bitcasts.
-  while (op_being_annotated->opcode() == HloOpcode::kBitcast) {
-    VLOG(1) << "Skipping bitcast " << op_being_annotated->ToString();
-    op_being_annotated = op_being_annotated->users()[0];
+  HloInstruction* consumer = nullptr;
+  if (!custom_call->IsRoot()) {
+    consumer = custom_call->users().at(0);
+    // Skip past any bitcasts.
+    while (consumer != nullptr && consumer->opcode() == HloOpcode::kBitcast) {
+      VLOG(1) << "Skipping bitcast " << consumer->ToString();
+      consumer = consumer->users().at(0);
+    }
   }
 
-  if (op_being_annotated->opcode() == HloOpcode::kDynamicUpdateSlice) {
-    TF_RETURN_IF_ERROR(MemoryOnlyOffloadStartingWithDus(op_being_annotated));
-  } else if (op_being_annotated->opcode() == HloOpcode::kCopy) {
-    TF_RETURN_IF_ERROR(MemoryOnlyOffloadStartingWithCopy(op_being_annotated));
+  if (consumer != nullptr &&
+      consumer->opcode() == HloOpcode::kDynamicUpdateSlice) {
+    TF_RETURN_IF_ERROR(MemoryOnlyOffloadStartingWithDus(consumer));
+  } else if (consumer != nullptr && consumer->opcode() == HloOpcode::kCopy) {
+    TF_RETURN_IF_ERROR(MemoryOnlyOffloadStartingWithCopy(consumer));
   } else {
-    TF_RETURN_IF_ERROR(MemoryOnlyOffloadInsertCopies(custom_call));
+    TF_ASSIGN_OR_RETURN(bool did_output_streaming,
+                        TryOutputStreaming(custom_call));
+    if (!did_output_streaming) {
+      TF_RETURN_IF_ERROR(MemoryOnlyOffloadInsertCopies(custom_call));
+    }
   }
   return OkStatus();
 }
@@ -524,8 +583,31 @@ absl::StatusOr<bool> HostOffloader::TryParameterStreaming(
   HloInstruction* copy_to_device =
       custom_call->parent()->AddInstruction(HloInstruction::CreateUnary(
           copy_shape, HloOpcode::kCopy, operand_of_load_annotation));
-  TF_RETURN_IF_ERROR(
-      operand_of_load_annotation->ReplaceAllUsesWith(copy_to_device));
+
+  auto users = operand_of_load_annotation->users();
+  for (HloInstruction* use : users) {
+    if (use == copy_to_device) {
+      continue;
+    }
+    auto callers = call_graph_->GetComputationCallers(copy_to_device->parent());
+    if (callers.size() > 1) {
+      return absl::InvalidArgumentError(
+          "Expected to be called only by one caller");
+    } else if (callers.size() == 1) {
+      auto* caller = callers[0];
+      if (caller->opcode() == HloOpcode::kWhile &&
+          use->opcode() == HloOpcode::kTuple && use->IsRoot()) {
+        // Do not replace the while loop parameter with the moved data. Because
+        // of the nature of while loops, since the data started on the host, it
+        // must end on the host. Only the while loop body's root should not use
+        // copy_to_device since it's on host at the loop entry.
+        continue;
+      }
+    }
+
+    TF_RETURN_IF_ERROR(
+        operand_of_load_annotation->ReplaceUseWith(use, copy_to_device));
+  }
 
   AddAllPositionsToBeMovedToHostMemory(unique_buffer);
   return true;
@@ -573,10 +655,12 @@ absl::StatusOr<bool> HostOffloader::Run(
     const absl::flat_hash_set<absl::string_view>& execution_threads) {
   bool changed = false;
 
+  call_graph_ = CallGraph::Build(module);
+
   // Run HloAliasAnalysis on module.
   TF_ASSIGN_OR_RETURN(alias_analysis_, HloAliasAnalysis::Run(module));
 
-  // Iterate over all instructions and look for XLA host offload annoations.
+  // Iterate over all instructions and look for XLA host offload annotations.
   for (HloComputation* computation :
        module->MakeNonfusionComputations(execution_threads)) {
     for (HloInstruction* instruction :
diff --git a/third_party/xla/xla/service/host_offloader.h b/third_party/xla/xla/service/host_offloader.h
index 85966a312dc790..cd6e319a6fd50a 100644
--- a/third_party/xla/xla/service/host_offloader.h
+++ b/third_party/xla/xla/service/host_offloader.h
@@ -21,6 +21,7 @@
 #include "absl/container/flat_hash_set.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/service/hlo_alias_analysis.h"
 #include "xla/service/hlo_pass_interface.h"
 
@@ -61,12 +62,14 @@ class HostOffloader : public HloModulePass {
   absl::flat_hash_set<HloInstruction*> annotations_for_copy_to_host_to_insert_;
   absl::flat_hash_set<HloInstruction*>
       annotations_for_copy_to_device_to_insert_;
+  std::unique_ptr<CallGraph> call_graph_;
 
   // Positions of all HloValues of the given HloBuffer will be added to
   // positions_to_move_to_host_memory_.
   void AddAllPositionsToBeMovedToHostMemory(const HloBuffer& unique_buffer);
 
   absl::StatusOr<bool> TryParameterStreaming(HloInstruction* custom_call);
+  absl::StatusOr<bool> TryOutputStreaming(HloInstruction* custom_call);
   Status HandleMoveToHostCustomCall(HloInstruction* custom_call);
   Status HandleMoveToDeviceCustomCall(HloInstruction* custom_call);
 
diff --git a/third_party/xla/xla/service/host_offloader_test.cc b/third_party/xla/xla/service/host_offloader_test.cc
index 4eb459c2e60222..73579a11d1c3d4 100644
--- a/third_party/xla/xla/service/host_offloader_test.cc
+++ b/third_party/xla/xla/service/host_offloader_test.cc
@@ -22,7 +22,9 @@ limitations under the License.
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/container/flat_hash_set.h"
+#include "absl/log/log.h"
 #include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
@@ -47,14 +49,15 @@ class HostOffloaderTest : public HloTestBase {
  protected:
   static constexpr int64_t kHostMemorySpaceColor{5};
 
-  absl::StatusOr<bool> RunHostOffloader(HloModule* module) {
+  absl::StatusOr<bool> RunHostOffloader(HloModule* module,
+                                        bool after_layout = false) {
     TF_EXPECT_OK(verifier().Run(module).status());
     if (module->has_schedule()) {
       return absl::InternalError("Expected a non-scheduled module");
     }
     bool changed = false;
     HostOffloadLegalize host_offload_legalize(kHostMemorySpaceColor,
-                                              /*after_layout=*/false);
+                                              after_layout);
     TF_ASSIGN_OR_RETURN(bool legal_changed, host_offload_legalize.Run(module));
     changed |= legal_changed;
     HostOffloader host_offloader(kHostMemorySpaceColor);
@@ -172,6 +175,55 @@ ENTRY main {
   EXPECT_FALSE(HaveRemainingOffloadAnnotations(module.get()));
 }
 
+TEST_F(HostOffloaderTest, ParameterStreamingWithXposeCopyFeedingIntoWhile) {
+  const std::string& hlo_string = R"(
+HloModule jit__prefill_impl, entry_computation_layout={(bf16[2,16,16]{2,1,0:T(8,128)(2,1)S(5)})->bf16[2,16,16]{1,2,0:T(8,128)(2,1)}}
+
+while_condition {
+  condition_param = (s32[], bf16[2,16,16]{1,2,0:T(8,128)(2,1)}, bf16[2,16,16]{1,2,0:T(8,128)(2,1)}) parameter(0)
+  condition_current_iteration_index = s32[] get-tuple-element(condition_param), index=0
+  condition_iteration_count = s32[] constant(16)
+  ROOT condition_result = pred[] compare(condition_current_iteration_index, condition_iteration_count), direction=LT
+}
+
+while_body {
+  input_tuple.0 = (s32[], bf16[2,16,16]{1,2,0:T(8,128)(2,1)}, bf16[2,16,16]{1,2,0:T(8,128)(2,1)}) parameter(0)
+  current_iteration_index.0 = s32[] get-tuple-element(input_tuple.0), index=0
+  orig_data = bf16[2,16,16]{1,2,0:T(8,128)(2,1)} get-tuple-element(input_tuple.0), index=1
+  custom-call.0 = bf16[2,16,16]{1,2,0:T(8,128)(2,1)} custom-call(orig_data), custom_call_target="MoveToDevice"
+  sum = bf16[2,16,16]{1,2,0:T(8,128)(2,1)} get-tuple-element(input_tuple.0), index=2
+  sum.1 = bf16[2,16,16]{1,2,0:T(8,128)(2,1)} add(custom-call.0, sum)
+
+  constant_1 = s32[] constant(1)
+  /* Increment iteration index */
+  incremented_index.0 = s32[] add(current_iteration_index.0, constant_1)
+  ROOT tuple_result.0 = (s32[], bf16[2,16,16]{1,2,0:T(8,128)(2,1)}, bf16[2,16,16]{1,2,0:T(8,128)(2,1)}) tuple(incremented_index.0, custom-call.0, sum.1)
+}
+
+ENTRY main {
+  param.0 = bf16[2,16,16]{2,1,0:T(8,128)(2,1)} parameter(0)
+  copy = bf16[2,16,16]{1,2,0:T(8,128)(2,1)} copy(param.0)
+  constant_0 = s32[] constant(0)
+  constant_0.0 = bf16[] constant(0.0)
+  broadcast = bf16[2,16,16]{1,2,0:T(8,128)(2,1)} broadcast(constant_0.0), dimensions={}
+  tuple_for_while = (s32[], bf16[2,16,16]{1,2,0:T(8,128)(2,1)}, bf16[2,16,16]{1,2,0:T(8,128)(2,1)}) tuple(constant_0, copy, broadcast)
+  while = (s32[], bf16[2,16,16]{1,2,0:T(8,128)(2,1)}, bf16[2,16,16]{1,2,0:T(8,128)(2,1)}) while(tuple_for_while), condition=while_condition, body=while_body
+  ROOT gte = bf16[2,16,16]{1,2,0:T(8,128)(2,1)} get-tuple-element(while), index=2
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed, RunHostOffloader(module.get(), /*after_layout=*/true));
+  EXPECT_TRUE(changed);
+  EXPECT_FALSE(HaveRemainingOffloadAnnotations(module.get()));
+  HloVerifier verifier(/*layout_sensitive=*/true,
+                       /*allow_mixed_precision=*/true);
+  TF_EXPECT_OK(verifier.Run(module.get()).status());
+  VLOG(1) << "module after: " << module->ToString();
+}
+
 TEST_F(HostOffloaderTest, BasicNoCopy) {
   const std::string& hlo_string = R"(
 HloModule my_module
@@ -1779,7 +1831,7 @@ ENTRY main {
 
 TEST_F(HostOffloaderTest, ParameterStreaming) {
   const std::string& hlo_string = R"(
-HloModule ParameterStreaming, entry_computation_layout={(s32[2,1]{1,0:T(2,128)S(5)}, s32[2,1]{1,0:T(2,128)})->(s32[2,1]{1,0:T(2,128)S(5)}, s32[2,1]{1,0:T(2,128)S(5)})}
+HloModule ParameterStreaming, entry_computation_layout={(s32[2,1]{1,0:T(2,128)S(5)}, s32[2,1]{1,0:T(2,128)})->(s32[2,1]{1,0:T(2,128)}, s32[2,1]{1,0:T(2,128)})}
 
 ENTRY main {
   param_0 = s32[2,1]{1,0} parameter(0)
@@ -1854,6 +1906,157 @@ ENTRY main {
   EXPECT_FALSE(HaveRemainingOffloadAnnotations(module.get()));
 }
 
+TEST_F(HostOffloaderTest, OutputStreaming) {
+  const std::string& hlo_string = R"(
+    HloModule ParameterStreaming, entry_computation_layout={(s32[2,1]{1,0:T(2,128)}, s32[2,1]{1,0:T(2,128)})->(s32[2,1]{1,0:T(2,128)S(5)}, s32[2,1]{1,0:T(2,128)})}
+
+    ENTRY main {
+      param_0 = s32[2,1]{1,0} parameter(0)
+      param_1 = s32[2,1]{1,0} parameter(1)
+      constant_2 = s32[] constant(2)
+      constant_4 = s32[] constant(4)
+      broadcast_0 = s32[2,1]{1,0} broadcast(constant_2), dimensions={}
+      multiply_0 = s32[2,1]{1,0} multiply(param_1, broadcast_0)
+      multiply_1 = s32[2,1]{1,0} multiply(multiply_0, param_0)
+      broadcast_1 = s32[2,1]{1,0} broadcast(constant_4), dimensions={}
+      multiply_2 = s32[2,1]{1,0} multiply(multiply_1, broadcast_1)
+      custom_call = s32[2,1]{1,0} custom-call(multiply_2), custom_call_target="MoveToHost"
+      ROOT tuple = (s32[2,1]{1,0}, s32[2,1]{1,0}) tuple(custom_call, multiply_1)
+    }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHostOffloader(module.get()));
+
+  EXPECT_TRUE(changed);
+
+  // Look for the following pattern:
+  //         constant
+  //            |
+  // param1 broadcast  param0
+  //     \  /          /
+  //   multiply       /
+  //       \         /
+  //        \       /
+  //         multiply   constant
+  //         |     |       |
+  //         |  ---+---broadcast
+  //         | /   |
+  //      multiply |
+  //          |    |
+  //         copy  |
+  //           \   |
+  //           tuple
+  HloInstruction* param_1;
+  HloInstruction* broadcast_0;
+  HloInstruction* multiply_0;
+  HloInstruction* param_0;
+  HloInstruction* multiply_1;
+  HloInstruction* broadcast_1;
+  HloInstruction* multiply_2;
+  HloInstruction* copy;
+  HloInstruction* tuple;
+  auto multiplyPattern =
+      m::Multiply(&multiply_1,
+                  m::Multiply(&multiply_0, m::Parameter(&param_1),
+                              m::Broadcast(&broadcast_0, m::ConstantScalar(2))),
+                  m::Parameter(&param_0));
+  ASSERT_THAT(
+      module->entry_computation()->root_instruction(),
+      GmockMatch(m::Tuple(
+          &tuple,
+          m::Copy(&copy, m::Multiply(
+                             &multiply_2, multiplyPattern,
+                             m::Broadcast(&broadcast_1, m::ConstantScalar(4)))),
+          multiplyPattern)));
+  TestShapeHasMemorySpace(param_1->shape(), Layout::kDefaultMemorySpace);
+  TestShapeHasMemorySpace(broadcast_0->shape(), Layout::kDefaultMemorySpace);
+  TestShapeHasMemorySpace(multiply_0->shape(), Layout::kDefaultMemorySpace);
+  TestShapeHasMemorySpace(param_0->shape(), Layout::kDefaultMemorySpace);
+  TestShapeHasMemorySpace(multiply_1->shape(), Layout::kDefaultMemorySpace);
+  TestShapeHasMemorySpace(broadcast_1->shape(), Layout::kDefaultMemorySpace);
+  TestShapeHasMemorySpace(multiply_2->shape(), Layout::kDefaultMemorySpace);
+  TestShapeHasMemorySpace(copy->shape(), kHostMemorySpaceColor);
+  TestShapeHasMemorySpace(ShapeUtil::GetSubshape(tuple->shape(), {0}),
+                          Layout::kHostMemorySpace);
+  TestShapeHasMemorySpace(ShapeUtil::GetSubshape(tuple->shape(), {1}),
+                          Layout::kDefaultMemorySpace);
+
+  EXPECT_FALSE(HaveRemainingOffloadAnnotations(module.get()));
+}
+
+TEST_F(HostOffloaderTest, OutputStreamingCustomCallRoot) {
+  const std::string& hlo_string = R"(
+    HloModule ParameterStreaming, entry_computation_layout={(s32[2,1]{1,0:T(2,128)}, s32[2,1]{1,0:T(2,128)})->s32[2,1]{1,0:T(2,128)S(5)}}
+
+    ENTRY main {
+      param_0 = s32[2,1]{1,0} parameter(0)
+      param_1 = s32[2,1]{1,0} parameter(1)
+      constant_2 = s32[] constant(2)
+      constant_4 = s32[] constant(4)
+      broadcast_0 = s32[2,1]{1,0} broadcast(constant_2), dimensions={}
+      multiply_0 = s32[2,1]{1,0} multiply(param_1, broadcast_0)
+      multiply_1 = s32[2,1]{1,0} multiply(multiply_0, param_0)
+      broadcast_1 = s32[2,1]{1,0} broadcast(constant_4), dimensions={}
+      multiply_2 = s32[2,1]{1,0} multiply(multiply_1, broadcast_1)
+      ROOT custom_call = s32[2,1]{1,0} custom-call(multiply_2), custom_call_target="MoveToHost"
+    }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHostOffloader(module.get()));
+
+  EXPECT_TRUE(changed);
+
+  // Look for the following pattern:
+  //         constant
+  //            |
+  // param1 broadcast  param0
+  //     \  /          /
+  //   multiply       /
+  //       \         /
+  //        \       /
+  //         multiply   constant
+  //         |             |
+  //         |  ---+---broadcast
+  //         | /
+  //      multiply
+  //          |
+  //         copy
+  HloInstruction* param_1;
+  HloInstruction* broadcast_0;
+  HloInstruction* multiply_0;
+  HloInstruction* param_0;
+  HloInstruction* multiply_1;
+  HloInstruction* broadcast_1;
+  HloInstruction* multiply_2;
+  HloInstruction* copy;
+  auto multiplyPattern =
+      m::Multiply(&multiply_1,
+                  m::Multiply(&multiply_0, m::Parameter(&param_1),
+                              m::Broadcast(&broadcast_0, m::ConstantScalar(2))),
+                  m::Parameter(&param_0));
+  ASSERT_THAT(module->entry_computation()->root_instruction(),
+              GmockMatch(m::Copy(
+                  &copy, m::Multiply(&multiply_2, multiplyPattern,
+                                     m::Broadcast(&broadcast_1,
+                                                  m::ConstantScalar(4))))));
+  TestShapeHasMemorySpace(param_1->shape(), Layout::kDefaultMemorySpace);
+  TestShapeHasMemorySpace(broadcast_0->shape(), Layout::kDefaultMemorySpace);
+  TestShapeHasMemorySpace(multiply_0->shape(), Layout::kDefaultMemorySpace);
+  TestShapeHasMemorySpace(param_0->shape(), Layout::kDefaultMemorySpace);
+  TestShapeHasMemorySpace(multiply_1->shape(), Layout::kDefaultMemorySpace);
+  TestShapeHasMemorySpace(broadcast_1->shape(), Layout::kDefaultMemorySpace);
+  TestShapeHasMemorySpace(multiply_2->shape(), Layout::kDefaultMemorySpace);
+  TestShapeHasMemorySpace(copy->shape(), kHostMemorySpaceColor);
+
+  EXPECT_FALSE(HaveRemainingOffloadAnnotations(module.get()));
+}
+
 }  // namespace
 
 }  // namespace xla
diff --git a/third_party/xla/xla/service/instruction_fusion.cc b/third_party/xla/xla/service/instruction_fusion.cc
index 44b6a064aff32b..bcc9d2e4549f4a 100644
--- a/third_party/xla/xla/service/instruction_fusion.cc
+++ b/third_party/xla/xla/service/instruction_fusion.cc
@@ -500,7 +500,7 @@ class ReversePostOrderFusionQueue : public FusionQueue {
 
 }  // namespace
 
-std::vector<HloComputation*> InstructionFusion::GetFusionComputations(
+std::vector<HloComputation*> InstructionFusion::GetNonFusionComputations(
     HloModule* module,
     const absl::flat_hash_set<absl::string_view>& execution_threads) {
   // Use sorted computations because fusion configuration is order-sensitive.
@@ -522,7 +522,8 @@ absl::StatusOr<bool> InstructionFusion::Run(
   bool dump_fusion =
       module->config().debug_options().xla_dump_fusion_visualization();
 
-  for (auto* computation : GetFusionComputations(module, execution_threads)) {
+  for (auto* computation :
+       GetNonFusionComputations(module, execution_threads)) {
     CHECK(!computation->IsFusionComputation());
     std::unique_ptr<HloReachabilityMap> reachability =
         HloReachabilityMap::Build(computation);
diff --git a/third_party/xla/xla/service/instruction_fusion.h b/third_party/xla/xla/service/instruction_fusion.h
index b446a603f79e1e..8ace349f141db4 100644
--- a/third_party/xla/xla/service/instruction_fusion.h
+++ b/third_party/xla/xla/service/instruction_fusion.h
@@ -166,8 +166,9 @@ class InstructionFusion : public HloModulePass {
                                             const HloInstruction* consumer);
 
  protected:
-  // Returns a list of computations on which Fusion is performed.
-  virtual std::vector<HloComputation*> GetFusionComputations(
+  // Returns a list of computations that are not fusion computations. These
+  // computations contain instructions which are candidates for fusions.
+  virtual std::vector<HloComputation*> GetNonFusionComputations(
       HloModule* module,
       const absl::flat_hash_set<absl::string_view>& execution_threads);
 
diff --git a/third_party/xla/xla/service/layout_assignment.cc b/third_party/xla/xla/service/layout_assignment.cc
index 67874e38d67a17..a6a3b9ebf94480 100644
--- a/third_party/xla/xla/service/layout_assignment.cc
+++ b/third_party/xla/xla/service/layout_assignment.cc
@@ -15,24 +15,23 @@ limitations under the License.
 
 #include "xla/service/layout_assignment.h"
 
-#include <algorithm>
+#include <cstdint>
 #include <deque>
-#include <functional>
 #include <map>
 #include <memory>
-#include <numeric>
 #include <ostream>
 #include <set>
 #include <string>
-#include <tuple>
 #include <utility>
 #include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/log/log.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
+#include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
@@ -40,12 +39,13 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/hlo/ir/hlo_sharding.h"
+#include "xla/layout.h"
 #include "xla/layout_util.h"
 #include "xla/map_util.h"
 #include "xla/permutation_util.h"
 #include "xla/service/call_graph.h"
 #include "xla/service/computation_layout.h"
-#include "xla/service/hlo_alias_analysis.h"
 #include "xla/service/hlo_dce.h"
 #include "xla/service/logical_buffer.h"
 #include "xla/service/tuple_points_to_analysis.h"
@@ -53,15 +53,15 @@ limitations under the License.
 #include "xla/shape.h"
 #include "xla/shape_layout.h"
 #include "xla/shape_util.h"
+#include "xla/status.h"
 #include "xla/status_macros.h"
 #include "xla/statusor.h"
-#include "xla/types.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/logging.h"
-#include "tsl/platform/protobuf.h"
 #include "tsl/platform/status.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla {
 
@@ -721,11 +721,23 @@ Status LayoutAssignment::AddMandatoryConstraints(
         if (parameter_layout.LayoutIsSet()) {
           // Parameter layouts must match the respective layout in
           // ComputationLayout, if there is one.
-          TF_RETURN_IF_ERROR(
-              SetInstructionLayout(parameter_layout.shape(), instruction));
+          Shape param_shape = parameter_layout.shape();
+          // Clear out memory space in layout. Host offloader will do the
+          // analysis later.
+          TF_RETURN_IF_ERROR(ShapeUtil::ForEachMutableSubshapeWithStatus(
+              &param_shape, [](Shape* subshape, const ShapeIndex& index) {
+                if (!subshape->has_layout() || !subshape->IsArray()) {
+                  return OkStatus();
+                }
+                subshape->mutable_layout()->set_memory_space(
+                    Layout::kDefaultMemorySpace);
+                return OkStatus();
+              }));
+
+          TF_RETURN_IF_ERROR(SetInstructionLayout(param_shape, instruction));
           if (reverse_computation_order_) {
             TF_RETURN_IF_ERROR(PropagateParameterLayoutToUsers(
-                instruction, parameter_layout.shape(), this));
+                instruction, param_shape, this));
           }
         }
       }
@@ -794,14 +806,17 @@ Status LayoutAssignment::AddMandatoryConstraints(
       const ComputationLayout& called_computation_layout =
           FindOrDie(computation_layouts_, instruction->to_apply())
               ->computation_layout();
-      TF_RETURN_IF_ERROR(SetInstructionLayout(
-          called_computation_layout.result_layout().shape(), instruction));
+      auto result_shape = UnShardedShape(
+          instruction, called_computation_layout.result_layout().shape(), -1);
+      TF_RETURN_IF_ERROR(SetInstructionLayout(result_shape, instruction));
       TF_RET_CHECK(instruction->operand_count() ==
                    called_computation_layout.parameter_count());
       for (int64_t i = 0; i < instruction->operand_count(); ++i) {
-        TF_RETURN_IF_ERROR(SetOperandLayout(
-            called_computation_layout.parameter_layout(i).shape(), instruction,
-            i, /*mandatory=*/true, /*dfs=*/true));
+        auto operand_shape = UnShardedShape(
+            instruction, called_computation_layout.parameter_layout(i).shape(),
+            i);
+        TF_RETURN_IF_ERROR(SetOperandLayout(operand_shape, instruction, i,
+                                            /*mandatory=*/true, /*dfs=*/true));
       }
     } else if (instruction->opcode() == HloOpcode::kWhile &&
                computation_layouts_.find(instruction->while_body()) !=
@@ -951,22 +966,6 @@ bool LayoutsInShapesEqual(const Shape& lhs, const Shape& rhs) {
   return Layout::Equal().MinorToMajorOnly()(lhs.layout(), rhs.layout());
 }
 
-// The operands of a call must match the layouts of parameters in the
-// ComputationLayout, and the call instruction itself must match the result
-// layout in the ComputationLayout.
-Status CheckCallLayout(HloInstruction* call,
-                       const ComputationLayout& computation_layout) {
-  HloComputation* computation = call->to_apply();
-  TF_RET_CHECK(computation->num_parameters() == call->operand_count());
-  for (int64_t i = 0; i < computation->num_parameters(); ++i) {
-    TF_RET_CHECK(computation_layout.parameter_layout(i).MatchesLayoutInShape(
-        call->operand(i)->shape(), /*minor_to_major_only=*/true));
-  }
-  TF_RET_CHECK(computation_layout.result_layout().MatchesLayoutInShape(
-      call->shape(), /*minor_to_major_only=*/true));
-  return OkStatus();
-}
-
 // Operands of layout-constrained custom calls must match the expected
 // constrained layouts.
 Status CheckCustomCallLayout(HloInstruction* instruction) {
@@ -1114,6 +1113,20 @@ Status CheckBroadcastLayout(HloInstruction* broadcast) {
 
 }  // namespace
 
+Status LayoutAssignment::CheckCallLayout(
+    HloInstruction* call, const ComputationLayout& computation_layout) {
+  HloComputation* computation = call->to_apply();
+  TF_RET_CHECK(computation->num_parameters() == call->operand_count());
+  for (int64_t i = 0; i < computation->num_parameters(); ++i) {
+    TF_RET_CHECK(computation_layout.parameter_layout(i).MatchesLayoutInShape(
+        ShardedShape(call, call->operand(i)->shape(), i),
+        /*minor_to_major_only=*/true));
+  }
+  TF_RET_CHECK(computation_layout.result_layout().MatchesLayoutInShape(
+      ShardedShape(call, call->shape(), -1), /*minor_to_major_only=*/true));
+  return OkStatus();
+}
+
 absl::StatusOr<HloInstruction*> LayoutAssignment::CreateCopyWithNewLayout(
     const Shape& shape_with_layout, HloInstruction* instruction) {
   TF_RET_CHECK(LayoutUtil::HasLayout(shape_with_layout));
@@ -2004,9 +2017,14 @@ Status LayoutAssignment::PropagateBufferConstraintToUses(
       VLOG(3) << "Propagating layout through backedge"
               << buffer_constraint.layout().ToString();
       int64_t index = user->operand_index(buffer.instruction());
-      TF_ASSIGN_OR_RETURN(
-          auto buffer, points_to_analysis_->GetBufferDefinedAt(
-                           user->parent()->parameter_instruction(0), {index}));
+
+      const HloInstruction* inputs = user->parent()->parameter_instruction(0);
+
+      ShapeIndex used_index = buffer.index();
+      used_index.push_front(index);
+
+      TF_ASSIGN_OR_RETURN(auto buffer, points_to_analysis_->GetBufferDefinedAt(
+                                           inputs, used_index));
 
       TF_RETURN_IF_ERROR(SetBufferLayout(buffer_constraint.layout(), *buffer,
                                          /*mandatory=*/false));
@@ -2019,12 +2037,28 @@ Status LayoutAssignment::PropagateBufferConstraintToUses(
 Status LayoutAssignment::PropagateResultConstraint(
     const ComputationLayoutConstraint& layout_constraint,
     LayoutConstraints* constraints) {
+  ShapeLayout result_layout =
+      layout_constraint.computation_layout().result_layout();
+  // Clear out memory space in layout for entry computation root. Host offloader
+  // will do the analysis later and add back the memory space for host outputs.
+  if (constraints->computation()->IsEntryComputation()) {
+    Shape result_shape = result_layout.shape();
+    TF_RETURN_IF_ERROR(ShapeUtil::ForEachMutableSubshapeWithStatus(
+        &result_shape, [](Shape* subshape, const ShapeIndex& shape_index) {
+          if (subshape->has_layout() && subshape->IsArray()) {
+            subshape->mutable_layout()->set_memory_space(
+                Layout::kDefaultMemorySpace);
+          }
+          return OkStatus();
+        }));
+    TF_RETURN_IF_ERROR(result_layout.CopyLayoutFromShape(result_shape));
+  }
+
   // Propagate the use constraint of the root instruction up to the logical
   // buffers which make up the result.
   return PropagateUseConstraintToDefs(
-      layout_constraint.computation_layout().result_layout(),
-      constraints->computation()->root_instruction(), constraints,
-      current_priority_);
+      result_layout, constraints->computation()->root_instruction(),
+      constraints, current_priority_);
 }
 
 // Infers the layout of the array at the given index in the given instruction's
@@ -2516,7 +2550,8 @@ Status LayoutAssignment::PropagateComputationLayouts(
           }
           const auto& computed_subshape = ShapeUtil::GetSubshape(
               computed_computation_layout.parameter_shape(i), shape_index);
-          if (subshape.layout() != computed_subshape.layout()) {
+          if (!Layout::Equal().IgnoreMemorySpace()(
+                  subshape.layout(), computed_subshape.layout())) {
             return Internal(
                 "Assigned parameter shape %s does not match layout of "
                 "computation shape: %s",
diff --git a/third_party/xla/xla/service/layout_assignment.h b/third_party/xla/xla/service/layout_assignment.h
index b855b315acd75f..22586493917105 100644
--- a/third_party/xla/xla/service/layout_assignment.h
+++ b/third_party/xla/xla/service/layout_assignment.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef XLA_SERVICE_LAYOUT_ASSIGNMENT_H_
 #define XLA_SERVICE_LAYOUT_ASSIGNMENT_H_
 
+#include <cstdint>
 #include <iosfwd>
 #include <map>
 #include <memory>
@@ -28,19 +29,26 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/container/node_hash_map.h"
+#include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/layout.h"
 #include "xla/layout_util.h"
+#include "xla/map_util.h"
 #include "xla/service/call_graph.h"
 #include "xla/service/computation_layout.h"
 #include "xla/service/hlo_pass_interface.h"
 #include "xla/service/logical_buffer.h"
 #include "xla/service/tuple_points_to_analysis.h"
+#include "xla/shape.h"
 #include "xla/shape_layout.h"
 #include "xla/shape_util.h"
+#include "xla/status.h"
 #include "xla/statusor.h"
 #include "xla/types.h"
+#include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/status.h"
 
@@ -117,7 +125,7 @@ class OperandLayoutConstraint : public LayoutConstraint {
 
   const ShapeLayout& shape_layout() const { return shape_layout_[0]; }
   const HloInstruction* instruction() const { return instruction_; }
-  const int64_t operand_no() const { return operand_no_; }
+  int64_t operand_no() const { return operand_no_; }
   const HloInstruction* operand() const {
     return instruction_->operand(operand_no_);
   }
@@ -196,7 +204,7 @@ class ComputationLayoutConstraint : public LayoutConstraint {
 class ChannelLayoutConstraints {
  public:
   // Construct an empty constraint set.
-  ChannelLayoutConstraints() {}
+  ChannelLayoutConstraints() = default;
 
   // Returns true if channel_id has a layout constraint.
   bool IsChannelConstrained(int64_t channel_id) const {
@@ -516,6 +524,31 @@ class LayoutAssignment : public HloModulePass {
   virtual bool InstructionCanChangeLayoutInstance(
       const HloInstruction* instruction);
 
+  // The shapes in caller can be different from the shapes in callee. For
+  // example, a shape (1024, 128) of an array can be distributed to four threads
+  // so the shape for each thread is (256, 128). When verifying the callee's
+  // shapes based on the caller, we should use this function to compute the
+  // expected shape. The param_id should be the parameter id of the shape or -1
+  // for the result output or unknown.
+  virtual Shape ShardedShape(const HloInstruction* call, const Shape& shape,
+                             int param_id) {
+    return shape;
+  }
+  // When verifying the caller's shapes based on the callee, we should use this
+  // function to compute the expected shape.
+  // The param_id should be the parameter id of the shape or -1 for the result
+  // output or unknown.
+  virtual Shape UnShardedShape(const HloInstruction* call, const Shape& shape,
+                               int param_id) {
+    return shape;
+  }
+
+  // The operands of a call must match the layouts of parameters in the
+  // ComputationLayout, and the call instruction itself must match the result
+  // layout in the ComputationLayout.
+  Status CheckCallLayout(HloInstruction* call,
+                         const ComputationLayout& computation_layout);
+
  private:
   // Initializes the layout assignment object for a new Run() call.
   Status Init(HloModule* module);
diff --git a/third_party/xla/xla/service/layout_assignment_test.cc b/third_party/xla/xla/service/layout_assignment_test.cc
index 9332f9390f43e3..9a0766e8bc92e4 100644
--- a/third_party/xla/xla/service/layout_assignment_test.cc
+++ b/third_party/xla/xla/service/layout_assignment_test.cc
@@ -1612,7 +1612,7 @@ ENTRY main {
 TEST_F(LayoutAssignmentTest, PropagateOperandLayout2) {
   const char* module_str = R"(
  HloModule TensorFlowGather, entry_computation_layout={(f32[32,650]{1,0},s32[16,1,18]{0,1,2})->f32[16,1,18,32]{3,1,2,0}}
- 
+
  ENTRY %main (operand: f32[32,650], indices: s32[16,1,18]) -> f32[16,1,18,32] {
    %operand = f32[32,650]{1,0} parameter(0)
    %transpose = f32[650,32]{0,1} transpose(f32[32,650]{1,0} %operand), dimensions={1,0}
@@ -1638,7 +1638,7 @@ TEST_F(LayoutAssignmentTest, PropagateOperandLayout2) {
 TEST_F(LayoutAssignmentTest, PreserveInstructionLayout) {
   const char* module_str = R"(
  HloModule TensorFlowGather, entry_computation_layout={(f32[32,650]{1,0},s32[16,1,18]{0,1,2})->(f32[16,1,18,32]{3,1,2,0})}
- 
+
  ENTRY %main  {
    %operand = f32[32,650]{1,0} parameter(0)
    %transpose = f32[650,32]{0,1} transpose(f32[32,650]{1,0} %operand), dimensions={1,0}
@@ -1697,7 +1697,7 @@ ENTRY main {
 TEST_F(LayoutAssignmentTest, PartialEntryParameterLayout) {
   const char* module_str = R"(
  HloModule EntryLayout, entry_computation_layout={(f32[32,650]{1,0},s32[16,1,18]{0,1,2})->(f32[650,32]{1,0},s32[18,16,1]{0,1,2})}
- 
+
  ENTRY %main {
    operand = f32[32,650] parameter(0)
    transpose = transpose(operand), dimensions={1,0}
@@ -1726,7 +1726,7 @@ TEST_F(LayoutAssignmentTest, PartialEntryParameterLayout) {
 TEST_F(LayoutAssignmentTest, AliasParameterAndOutput) {
   const char* module_str = R"(
  HloModule EntryAlias, input_output_alias={ {}: (0, {}, may-alias) }
- 
+
  ENTRY %main {
    p0 = f32[65,65] parameter(0)
    p1 = f32[4225] parameter(1)
@@ -1752,7 +1752,7 @@ TEST_F(LayoutAssignmentTest, AliasParameterAndOutput) {
 TEST_F(LayoutAssignmentTest, AliasUnconstrainedParamterWithConstrainedOutput) {
   const char* module_str = R"(
  HloModule EntryAlias, input_output_alias={ {}: (0, {}, may-alias) }
- 
+
  ENTRY %main {
    p0 = f32[65,65] parameter(0)
    p1 = f32[4225] parameter(1)
@@ -1776,7 +1776,7 @@ TEST_F(LayoutAssignmentTest, AliasUnconstrainedParamterWithConstrainedOutput) {
 TEST_F(LayoutAssignmentTest, AliasConstrainedParamterWithUnconstrainedOutput) {
   const char* module_str = R"(
  HloModule EntryAlias, input_output_alias={ {}: (0, {}, may-alias) }
- 
+
  ENTRY %main {
    p0 = f32[65,65] parameter(0)
    p1 = f32[4225] parameter(1)
@@ -1797,5 +1797,57 @@ TEST_F(LayoutAssignmentTest, AliasConstrainedParamterWithUnconstrainedOutput) {
             m->entry_computation_layout().parameter_layout(0).shape());
 }
 
+TEST_F(LayoutAssignmentTest, NestedTupleInLoop) {
+  const char* module_str = R"(
+HloModule Module
+
+condition  {
+    p = (f32[100,100], (f32[100,100], u32[], token[])) parameter(0)
+    ROOT lt = pred[] constant(1)
+}
+
+body {
+    p = (f32[100,100], (f32[100,100], u32[], token[])) parameter(0)
+
+    t1 = f32[100,100] get-tuple-element(p), index=0
+    t = (f32[100,100], u32[], token[]) get-tuple-element(p), index=1
+    sdone = token[] send-done(t), channel_id=1,
+      frontend_attributes={
+       _xla_send_recv_pipeline="0"
+      }
+    tk = token[] after-all()
+    snd = (f32[100,100], u32[], token[]) send(t1, tk), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
+    a = add(t1, t1)
+    ROOT tup =  tuple(a, snd)
+}
+
+ENTRY %main {
+    p0 = f32[100,100] parameter(0)
+    tk = token[] after-all()
+    snd = (f32[100,100], u32[], token[]) send(p0, tk), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
+    t = tuple(p0, snd)
+    loop = while(t), condition=condition, body=body
+    ssend = (f32[100,100], u32[], token[]) get-tuple-element(loop), index=1
+    sdone = token[] send-done(ssend), channel_id=1,
+      frontend_attributes={
+       _xla_send_recv_pipeline="0"
+      }
+    ROOT result = f32[100,100] get-tuple-element(loop), index=0
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
+
+  LayoutAssignment layout_assignment(m->mutable_entry_computation_layout(),
+                                     nullptr);
+  EXPECT_IS_OK(layout_assignment.Run(m.get()).status());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/third_party/xla/xla/service/llvm_ir/BUILD b/third_party/xla/xla/service/llvm_ir/BUILD
index a706957cb6f960..068c4ea8d98709 100644
--- a/third_party/xla/xla/service/llvm_ir/BUILD
+++ b/third_party/xla/xla/service/llvm_ir/BUILD
@@ -1,10 +1,10 @@
 # Description:
 #    Libraries for helping construct LLVM IR for XLA backends.
 
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load("@local_tsl//tsl:tsl.default.bzl", "filegroup")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/service/memory_space_assignment/BUILD b/third_party/xla/xla/service/memory_space_assignment/BUILD
index 1ffb2c1655a73a..9e95c1e61e3b53 100644
--- a/third_party/xla/xla/service/memory_space_assignment/BUILD
+++ b/third_party/xla/xla/service/memory_space_assignment/BUILD
@@ -1,16 +1,16 @@
 # Description:
 #   Memory Space Assignment service implementation.
 
-load(
-    "//xla:xla.bzl",
-    "xla_cc_test",
-)
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load(
     "@local_tsl//tsl/platform:build_config.bzl",
     "tf_proto_library",
 )
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load(
+    "//xla:xla.bzl",
+    "xla_cc_test",
+)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/service/memory_space_assignment/allocation.cc b/third_party/xla/xla/service/memory_space_assignment/allocation.cc
index 54cf00f9a31f39..49808709832f50 100644
--- a/third_party/xla/xla/service/memory_space_assignment/allocation.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/allocation.cc
@@ -201,7 +201,7 @@ Allocation::Allocation(HloPosition defining_position, MemorySpace memory_space,
                        std::optional<int64_t> cross_program_prefetch_index)
     : original_defining_position_(std::move(defining_position)),
       memory_space_(memory_space),
-      chunk_(std::move(chunk)),
+      chunk_(chunk),
       start_time_(start_time),
       end_time_(end_time),
       is_scoped_allocation_(is_scoped_allocation),
diff --git a/third_party/xla/xla/service/memory_space_assignment/best_fit_repacker.cc b/third_party/xla/xla/service/memory_space_assignment/best_fit_repacker.cc
index eb8e1378114fa8..e99e4ed085c289 100644
--- a/third_party/xla/xla/service/memory_space_assignment/best_fit_repacker.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/best_fit_repacker.cc
@@ -495,7 +495,7 @@ class BestFitRepacker
     LOG(FATAL) << "We should never get here.";
   }
 
-  StatusOr<Result> Finish() override {
+  absl::StatusOr<Result> Finish() override {
     std::vector<BufferInterval> sorted_buffer_intervals =
         GetSortedBufferIntervals();
 
@@ -632,7 +632,7 @@ class BestFitRepacker
 
 namespace memory_space_assignment {
 
-StatusOr<bool> MemorySpaceAssignmentBestFitRepacker::Repack(
+absl::StatusOr<bool> MemorySpaceAssignmentBestFitRepacker::Repack(
     absl::Span<AllocationBlock*> allocations) {
   BestFitRepacker best_fit_repacker = BestFitRepacker(
       options_, slice_time_permutation_iterator_type_, max_size_, alignment_);
diff --git a/third_party/xla/xla/service/memory_space_assignment/best_fit_repacker.h b/third_party/xla/xla/service/memory_space_assignment/best_fit_repacker.h
index dacb7803d303cb..816031d383359f 100644
--- a/third_party/xla/xla/service/memory_space_assignment/best_fit_repacker.h
+++ b/third_party/xla/xla/service/memory_space_assignment/best_fit_repacker.h
@@ -62,7 +62,8 @@ class MemorySpaceAssignmentBestFitRepacker
         slice_time_permutation_iterator_type_(
             slice_time_permutation_iterator_type) {}
 
-  StatusOr<bool> Repack(absl::Span<AllocationBlock*> allocations) override;
+  absl::StatusOr<bool> Repack(
+      absl::Span<AllocationBlock*> allocations) override;
 
  private:
   BestFitRepackOptions options_;
diff --git a/third_party/xla/xla/service/memory_space_assignment/cost_analysis.cc b/third_party/xla/xla/service/memory_space_assignment/cost_analysis.cc
index 2e5c07ac7546c1..72f1c226e9652c 100644
--- a/third_party/xla/xla/service/memory_space_assignment/cost_analysis.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/cost_analysis.cc
@@ -43,7 +43,7 @@ limitations under the License.
 
 namespace xla {
 namespace memory_space_assignment {
-/*static*/ StatusOr<std::unique_ptr<CostAnalysis>> CostAnalysis::Create(
+/*static*/ absl::StatusOr<std::unique_ptr<CostAnalysis>> CostAnalysis::Create(
     const HloCostAnalysis& cost_analysis, const CostAnalysisOptions& options,
     const HloModule& module) {
   TF_ASSIGN_OR_RETURN(auto alias_analysis, HloAliasAnalysis::Run(&module));
diff --git a/third_party/xla/xla/service/memory_space_assignment/cost_analysis.h b/third_party/xla/xla/service/memory_space_assignment/cost_analysis.h
index 8463f7f914564b..b8152dbc23fda7 100644
--- a/third_party/xla/xla/service/memory_space_assignment/cost_analysis.h
+++ b/third_party/xla/xla/service/memory_space_assignment/cost_analysis.h
@@ -84,7 +84,7 @@ class CostAnalysis {
 
   virtual ~CostAnalysis() = default;
 
-  static StatusOr<std::unique_ptr<CostAnalysis>> Create(
+  static absl::StatusOr<std::unique_ptr<CostAnalysis>> Create(
       const HloCostAnalysis& cost_analysis, const CostAnalysisOptions& options,
       const HloModule& module);
 
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.cc b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.cc
index 4768f5d8261d33..022591adf1e6a1 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.cc
@@ -343,7 +343,8 @@ Status InsertInstructionAndEnsureOperandsInserted(
   return OkStatus();
 }
 
-StatusOr<xla::HloLiveRange::LogicalTime> GetScheduleTimeFromInstructionName(
+absl::StatusOr<xla::HloLiveRange::LogicalTime>
+GetScheduleTimeFromInstructionName(
     absl::string_view name,
     const absl::flat_hash_map<const xla::HloInstruction*,
                               xla::HloLiveRange::LogicalTime>& schedule) {
@@ -381,7 +382,7 @@ bool DoesOperandMatchFilter(const HloOperandFilter& filter,
   return true;
 }
 
-StatusOr<std::optional<int64_t>> GetPrefetchTimeByEagerness(
+absl::StatusOr<std::optional<int64_t>> GetPrefetchTimeByEagerness(
     float prefetch_eagerness, int64_t earliest_prefetch_time,
     int64_t latest_prefetch_time) {
   CHECK_GE(prefetch_eagerness, 0.0);
@@ -394,7 +395,7 @@ StatusOr<std::optional<int64_t>> GetPrefetchTimeByEagerness(
       (latest_prefetch_time - earliest_prefetch_time) * prefetch_eagerness);
 }
 
-StatusOr<std::optional<int64_t>> GetPrefetchTimeAfterInstruction(
+absl::StatusOr<std::optional<int64_t>> GetPrefetchTimeAfterInstruction(
     const std::string& after_instruction_name,
     const absl::flat_hash_map<const xla::HloInstruction*,
                               xla::HloLiveRange::LogicalTime>& schedule) {
@@ -404,7 +405,7 @@ StatusOr<std::optional<int64_t>> GetPrefetchTimeAfterInstruction(
   return static_cast<std::optional<int64_t>>(reference_instruction_time);
 }
 
-StatusOr<std::optional<int64_t>> GetPrefetchTimeBeforeInstruction(
+absl::StatusOr<std::optional<int64_t>> GetPrefetchTimeBeforeInstruction(
     const std::string& before_instruction_name,
     const absl::flat_hash_map<const xla::HloInstruction*,
                               xla::HloLiveRange::LogicalTime>& schedule) {
@@ -414,7 +415,7 @@ StatusOr<std::optional<int64_t>> GetPrefetchTimeBeforeInstruction(
   return static_cast<std::optional<int64_t>>(reference_instruction_time - 1);
 }
 
-StatusOr<std::optional<int64_t>> GetPrefetchTime(
+absl::StatusOr<std::optional<int64_t>> GetPrefetchTime(
     const PreferredPrefetchOverrideOptions& override_options,
     int64_t earliest_prefetch_time, int64_t latest_prefetch_time,
     const absl::flat_hash_map<const HloInstruction*, HloLiveRange::LogicalTime>&
@@ -436,7 +437,7 @@ StatusOr<std::optional<int64_t>> GetPrefetchTime(
   return static_cast<StatusOr<std::optional<int64_t>>>(std::nullopt);
 }
 
-StatusOr<std::optional<int64_t>> GetOverriddenPreferredPrefetchTime(
+absl::StatusOr<std::optional<int64_t>> GetOverriddenPreferredPrefetchTime(
     const PreferredPrefetchOverrides& preferred_prefetch_overrides,
     int64_t operand_size, const HloUse& hlo_use,
     const absl::flat_hash_map<const HloInstruction*, HloLiveRange::LogicalTime>&
@@ -1338,11 +1339,13 @@ std::function<int(const HloInstruction*)> GetOperandDistanceFunction(
     const HloLiveRange& hlo_live_range, const HloInstruction* use_inst) {
   const int use_idx = hlo_live_range.instruction_schedule().at(use_inst);
   return [&, use_idx](const HloInstruction* operand) -> int {
-    // We just use -1 for parameter, tuple, and gte instructions. We could make
-    // this "see through" the gtes if we get too many false positives.
+    // We just use -1 for parameter, tuple, gte and constant instructions. We
+    // could make this "see through" the gtes if we get too many false
+    // positives.
     if (operand->opcode() == HloOpcode::kParameter ||
         operand->opcode() == HloOpcode::kTuple ||
-        operand->opcode() == HloOpcode::kGetTupleElement) {
+        operand->opcode() == HloOpcode::kGetTupleElement ||
+        operand->opcode() == HloOpcode::kConstant) {
       return -1;
     }
     return use_idx - hlo_live_range.instruction_schedule().at(operand);
@@ -1568,7 +1571,8 @@ void AlternateMemoryBestFitHeap::IdentifyAndOptimizeMemoryBoundLoops() {
   }
 }
 
-StatusOr<HeapSimulator::Result<HloValue>> AlternateMemoryBestFitHeap::Finish() {
+absl::StatusOr<HeapSimulator::Result<HloValue>>
+AlternateMemoryBestFitHeap::Finish() {
   if (options_.autotuning_config.has_value()) {
     CHECK_EQ((*options_.autotuning_config).size(), buffer_intervals_.size());
   }
@@ -2140,7 +2144,7 @@ void AlternateMemoryBestFitHeap::CreateAllocationValuesFromColocatedIntervals(
   FindAliases(&allocation_values);
 }
 
-StatusOr<AlternateMemoryBestFitHeap::Result>
+absl::StatusOr<AlternateMemoryBestFitHeap::Result>
 AlternateMemoryBestFitHeap::AllocateAllocationValues(
     absl::Span<MemorySpaceAssignment::AllocationValue> allocation_values) {
   const auto& instruction_schedule = hlo_live_range_.instruction_schedule();
@@ -5329,7 +5333,7 @@ AlternateMemoryBestFitHeap::FindBestChunkCandidates(
   return {};
 }
 
-StatusOr<MemorySpaceAssignment::AsyncCopyStats>
+absl::StatusOr<MemorySpaceAssignment::AsyncCopyStats>
 MemorySpaceAssignment::CalculateAsyncCopyStats() const {
   AsyncCopyStats stats;
   int64_t current_copies = 0;
@@ -5373,7 +5377,7 @@ MemorySpaceAssignment::CalculateAsyncCopyStats() const {
   return stats;
 }
 
-/*static*/ StatusOr<std::unique_ptr<PresetAssignments>>
+/*static*/ absl::StatusOr<std::unique_ptr<PresetAssignments>>
 MemorySpaceAssignment::Run(HloModule* module,
                            const HloLiveRange& hlo_live_range,
                            const HloAliasAnalysis& alias_analysis,
@@ -5389,7 +5393,7 @@ MemorySpaceAssignment::Run(HloModule* module,
                                                           alias_analysis);
 }
 
-StatusOr<std::unique_ptr<PresetAssignments>>
+absl::StatusOr<std::unique_ptr<PresetAssignments>>
 MemorySpaceAssignment::RunMemorySpaceAssignment(
     const HloLiveRange& hlo_live_range,
     const HloAliasAnalysis& alias_analysis) {
@@ -6006,7 +6010,7 @@ Status MemorySpaceAssignment::FixSchedule() {
 
     VLOG(4) << "Scheduling: " << computation->ToString();
 
-    for (int64_t instruction_index = 0;; ++instruction_index) {
+    for (int64_t instruction_index = -1;; ++instruction_index) {
       auto insts_before_iter = schedule_before_.find(instruction_index);
       if (insts_before_iter != schedule_before_.end()) {
         for (HloInstruction* new_instruction : insts_before_iter->second) {
@@ -6018,25 +6022,32 @@ Status MemorySpaceAssignment::FixSchedule() {
           }
         }
       }
-      // We allow scheduling copy dones past the root instruction (for
-      // end-of-program cross-program prefetch). So the loop exit condition is
-      // actually here.
-      if (instruction_index >= flattened_instructions_.size()) {
-        break;
-      }
-      HloInstruction* instruction = flattened_instructions_[instruction_index];
-      // Insert only if it is not deleted (SimplifyGraph sets it to nullptr if
-      // it was deleted) and not previously inserted. Also bitcasts and tuples
-      // are treated specially and only inserted as a result of operand
-      // dependencies.
-      if (instruction != nullptr && instruction->parent() == computation &&
-          instruction->opcode() != HloOpcode::kBitcast &&
-          instruction->opcode() != HloOpcode::kTuple &&
-          !inserted_instructions.contains(instruction)) {
-        VLOG(4) << "inst " << instruction_index << ": " << instruction->name();
-        TF_RETURN_IF_ERROR(InsertInstructionAndEnsureOperandsInserted(
-            instruction, &new_sequence, &inserted_instructions));
+
+      if (instruction_index != -1) {
+        // We allow scheduling copy dones past the root instruction (for
+        // end-of-program cross-program prefetch). So the loop exit condition is
+        // actually here.
+        if (instruction_index >= flattened_instructions_.size()) {
+          break;
+        }
+
+        HloInstruction* instruction =
+            flattened_instructions_[instruction_index];
+        // Insert only if it is not deleted (SimplifyGraph sets it to nullptr if
+        // it was deleted) and not previously inserted. Also bitcasts and tuples
+        // are treated specially and only inserted as a result of operand
+        // dependencies.
+        if (instruction != nullptr && instruction->parent() == computation &&
+            instruction->opcode() != HloOpcode::kBitcast &&
+            instruction->opcode() != HloOpcode::kTuple &&
+            !inserted_instructions.contains(instruction)) {
+          VLOG(4) << "inst " << instruction_index << ": "
+                  << instruction->name();
+          TF_RETURN_IF_ERROR(InsertInstructionAndEnsureOperandsInserted(
+              instruction, &new_sequence, &inserted_instructions));
+        }
       }
+
       auto insts_after_iter = schedule_after_.find(instruction_index);
       if (insts_after_iter != schedule_after_.end()) {
         for (HloInstruction* new_instruction : insts_after_iter->second) {
@@ -6049,6 +6060,7 @@ Status MemorySpaceAssignment::FixSchedule() {
         }
       }
     }
+
     // For rare cases where the original sequence is empty, ensure the root
     // instruction and its dependencies are scheduled.
     TF_RETURN_IF_ERROR(EnsureInstructionAndOperandsInserted(
@@ -6322,10 +6334,9 @@ DefaultCrossProgramPrefetchBufferIntervalComparator::GetTuple(
       sort_data.cumulative_use_size +=
           ShapeUtil::ElementsInRecursive(use.instruction->shape());
     });
-    sort_data_it = additional_sort_data_
-                       .insert(std::make_pair(buffer_interval.buffer,
-                                              std::move(sort_data)))
-                       .first;
+    sort_data_it =
+        additional_sort_data_.try_emplace(buffer_interval.buffer, sort_data)
+            .first;
   }
 
   return std::make_tuple(
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.h b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.h
index 72100f56b1425b..e262349fbb5cbd 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.h
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.h
@@ -477,12 +477,12 @@ class MemorySpaceAssignment {
   virtual ~MemorySpaceAssignment() = default;
 
   // Runs the MemorySpaceAssignment pass.
-  static StatusOr<std::unique_ptr<PresetAssignments>> Run(
+  static absl::StatusOr<std::unique_ptr<PresetAssignments>> Run(
       HloModule* module, const HloLiveRange& hlo_live_range,
       const HloAliasAnalysis& alias_analysis, const Options& options);
 
   // Calculates asynchronous copy statistics.
-  StatusOr<AsyncCopyStats> CalculateAsyncCopyStats() const;
+  absl::StatusOr<AsyncCopyStats> CalculateAsyncCopyStats() const;
 
   // Verify that the memory space assignment is free of overlapping buffers and
   // export heap simulator trace to be used by buffer_assignment.
@@ -490,9 +490,9 @@ class MemorySpaceAssignment {
 
  protected:
   // Main driver of the memory space assignment pass.
-  virtual StatusOr<std::unique_ptr<PresetAssignments>> RunMemorySpaceAssignment(
-      const HloLiveRange& hlo_live_range,
-      const HloAliasAnalysis& alias_analysis);
+  virtual absl::StatusOr<std::unique_ptr<PresetAssignments>>
+  RunMemorySpaceAssignment(const HloLiveRange& hlo_live_range,
+                           const HloAliasAnalysis& alias_analysis);
 
   // Finds an AllocationSequence for placing buffers in alternate memory using
   // the AlternateMemoryBestFitHeap algorithm. Must be set before Process() is
@@ -832,7 +832,7 @@ class AlternateMemoryBestFitHeap
   void AllocateCrossProgramPrefetchBuffer(
       HloModule* module, const BufferInterval& prefetch_candidate);
 
-  StatusOr<HeapSimulator::Result<HloValue>> Finish() override;
+  absl::StatusOr<HeapSimulator::Result<HloValue>> Finish() override;
 
  protected:
   // Given a buffer interval, returns the colocated intervals. Unlike the
@@ -1166,7 +1166,7 @@ class AlternateMemoryBestFitHeap
   // All of the allocation values have a must-alias relationship with each
   // other. Returns either kSuccess if all of the sites could be placed in the
   // alternate memory or a bitwise OR of failure reasons why they couldn't
-  StatusOr<Result> AllocateAllocationValues(
+  absl::StatusOr<Result> AllocateAllocationValues(
       absl::Span<AllocationValue> allocation_values);
 
   // Finds an allocation for an allocation request for a segment (see the
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc
index 172f565f16975f..5c9cc1e1321ba8 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc
@@ -274,7 +274,8 @@ class MemorySpaceAssignmentTestBase : public HloTestBase {
     return std::move(status_or.value());
   }
 
-  StatusOr<std::unique_ptr<PresetAssignments>> AssignMemorySpaceAndReturnStatus(
+  absl::StatusOr<std::unique_ptr<PresetAssignments>>
+  AssignMemorySpaceAndReturnStatus(
       HloModule* module, std::optional<Options> options_override,
       std::optional<MsaBufferIntervalCompare> buffer_interval_compare,
       PrefetchIntervalPicker* prefetch_interval_picker) {
@@ -6696,7 +6697,8 @@ class FakeMemorySpaceAssignmentRepacker : public MemorySpaceAssignmentRepacker {
         check_fun_(check_fun),
         always_return_modified_(always_return_modified) {}
 
-  StatusOr<bool> Repack(absl::Span<AllocationBlock*> allocations) override {
+  absl::StatusOr<bool> Repack(
+      absl::Span<AllocationBlock*> allocations) override {
     bool modified = false;
     for (AllocationBlock* block : allocations) {
       absl::flat_hash_set<int64_t> colocations;
@@ -7884,6 +7886,66 @@ ENTRY main {
       kAlternateMemorySpace);
 }
 
+// This test seeks to test that MSA will schedule async copy operations with
+// schedule_after=-1 at the very beginning of the program.
+//
+// The machinery for this is a little opaque from the public API, so we attempt
+// to get MSA to self-assign an async copies with schedule_after=-1 by
+// exploiting how the hidden algorithm works. This is brittle and subject to
+// inadvertent breakage in the future.
+TEST_P(MemorySpaceAssignmentTest, HoistCopyStart) {
+  absl::string_view hlo_string = R"(
+  HloModule cross_program_prefetch, is_scheduled=true
+
+  ENTRY cross_program_prefetch {
+    p0 = (f32[8,8]{1,0}, f32[8,2]{1,0}) parameter(0)
+    get-tuple-element.0 = f32[8,8]{1,0} get-tuple-element(p0), index=0
+    add.0 = f32[8,8]{1,0} add(get-tuple-element.0, get-tuple-element.0)
+    get-tuple-element.1 = f32[8,2]{1,0} get-tuple-element(p0), index=1
+    dot.0 = f32[8,2]{1,0} dot(add.0, get-tuple-element.1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    negate.1 = f32[8,2]{1,0} negate(dot.0)
+    negate.2 = f32[8,2]{1,0} negate(negate.1)
+    negate.3 = f32[8,2]{1,0} negate(negate.2)
+    negate.4 = f32[8,2]{1,0} negate(negate.3)
+    negate.5 = f32[8,2]{1,0} negate(negate.4)
+    negate.6 = f32[8,2]{1,0} negate(negate.5)
+    negate.7 = f32[8,2]{1,0} negate(negate.6)
+    negate.8 = f32[8,2]{1,0} negate(negate.7)
+    ROOT dot.1 = f32[2,2]{1,0} dot(negate.8, get-tuple-element.1), lhs_contracting_dims={0}, rhs_contracting_dims={0}
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  Options options = DefaultMemorySpaceOptions();
+  options.enable_cross_program_prefetch = true;
+  AssignMemorySpace(module.get(), options);
+
+  // Ensure that get-tuple-element.1 is chosen for cross-program prefetch.
+  auto cross_program_prefetches = module->CrossProgramPrefetches();
+  ASSERT_EQ(cross_program_prefetches.size(), 1);
+  ASSERT_EQ(cross_program_prefetches[0].parameter, 0);
+  ASSERT_EQ(cross_program_prefetches[0].index, ShapeIndex({1}));
+
+  // Check that the async copy-start for get-tuple-element.1 is hoisted
+  // after MSA (get-tuple-element.1 was initially the third operation of the
+  // original schedule).
+  //
+  // We expect the only instructions before it are declaring parameter(0) and
+  // get-tuple-element.1.
+  for (auto* instruction : module->schedule()
+                               .sequence(module->entry_computation())
+                               .instructions()) {
+    auto p0 = op::Parameter(0);
+    auto get_tuple_element_1 = op::GetTupleElement(p0, 1);
+    auto copy_start = op::CopyStart(get_tuple_element_1);
+    EXPECT_THAT(instruction, AnyOf(p0, get_tuple_element_1, copy_start));
+    if (::testing::Matches(copy_start)(instruction)) {
+      EXPECT_TRUE(instruction->cross_program_prefetch_index().has_value());
+      break;
+    }
+  }
+}
+
 INSTANTIATE_TEST_SUITE_P(MemorySpaceAssignmentInstantiation,
                          MemorySpaceAssignmentTest,
                          ::testing::Values(false, true));
@@ -9833,13 +9895,13 @@ class SlicedPrefetchTest : public MemorySpaceAssignmentTestBase {
     SliceProposer() = default;
     virtual ~SliceProposer() = default;
 
-    virtual StatusOr<SliceProposalCollection> ProposeSlices(
+    virtual absl::StatusOr<SliceProposalCollection> ProposeSlices(
         const Shape& shape, const SlicedPrefetchOptions& options) = 0;
   };
 
   class MockSliceProposer : public SliceProposer {
    public:
-    MOCK_METHOD(StatusOr<SliceProposalCollection>, ProposeSlices,
+    MOCK_METHOD(absl::StatusOr<SliceProposalCollection>, ProposeSlices,
                 (const Shape& shape, const SlicedPrefetchOptions& options),
                 (override));
   };
@@ -10197,7 +10259,7 @@ class SlicedPrefetchTest : public MemorySpaceAssignmentTestBase {
   }
 
   // Returns the index of the first instruction with the given name.
-  static StatusOr<int> FindScheduleIndexOfInstruction(
+  static absl::StatusOr<int> FindScheduleIndexOfInstruction(
       const std::vector<HloInstruction*>& schedule, std::string_view name,
       InstructionClass c) {
     for (int i = 0; i < schedule.size(); ++i) {
@@ -10223,7 +10285,7 @@ class SlicedPrefetchTest : public MemorySpaceAssignmentTestBase {
     return nullptr;
   }
 
-  static StatusOr<std::vector<int>> GetSliceStartIndicies(
+  static absl::StatusOr<std::vector<int>> GetSliceStartIndicies(
       const std::vector<HloInstruction*>& schedule,
       const HloInstruction* concat_bitcast) {
     std::vector<int> indicies;
@@ -10923,7 +10985,7 @@ ENTRY main {
   EXPECT_CALL(slice_proposer_,
               ProposeSlices(f32_8_8_, EqualsSlicedPrefetchOptions(
                                           options_.sliced_prefetch_options)))
-      .WillRepeatedly(Return(StatusOr<SliceProposalCollection>(
+      .WillRepeatedly(Return(absl::StatusOr<SliceProposalCollection>(
           FailedPrecondition("%s", "Cannot slice."))));
 
   TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_text));
@@ -11116,7 +11178,7 @@ class MockRepacker : public MemorySpaceAssignmentRepacker {
   MockRepacker()
       : MemorySpaceAssignmentRepacker(std::numeric_limits<int64_t>::max(), 1) {}
 
-  MOCK_METHOD(StatusOr<bool>, Repack, (absl::Span<AllocationBlock*>),
+  MOCK_METHOD(absl::StatusOr<bool>, Repack, (absl::Span<AllocationBlock*>),
               (override));
 };
 
@@ -11240,7 +11302,7 @@ ENTRY main {
   absl::flat_hash_map<std::pair<int64_t, int64_t>, int64_t> repack_map;
   EXPECT_CALL(repacker, Repack(_))
       .WillRepeatedly([](absl::Span<AllocationBlock*> allocations)
-                          -> StatusOr<bool> {
+                          -> absl::StatusOr<bool> {
         bool found_p2 = false;
         bool found_p3 = false;
         for (AllocationBlock* block : allocations) {
@@ -11461,7 +11523,7 @@ ENTRY main {
   // Define a lambda for running MSA on the specified HLO, with the
   // configuration above.
   auto run_msa =
-      [&](std::string_view hlo_text) -> StatusOr<ModuleAndAssignments> {
+      [&](std::string_view hlo_text) -> absl::StatusOr<ModuleAndAssignments> {
     ModuleAndAssignments module_and_assignments;
     TF_ASSIGN_OR_RETURN(module_and_assignments.module,
                         ParseAndReturnVerifiedModule(hlo_text));
diff --git a/third_party/xla/xla/service/memory_space_assignment/repacking.h b/third_party/xla/xla/service/memory_space_assignment/repacking.h
index 8100f78d365df8..095fd8ded056ed 100644
--- a/third_party/xla/xla/service/memory_space_assignment/repacking.h
+++ b/third_party/xla/xla/service/memory_space_assignment/repacking.h
@@ -35,7 +35,8 @@ class MemorySpaceAssignmentRepacker {
   // Repack the AllocationBlocks provided in the parameter. Returns true if
   // allocations have been modified and false if not. Returns a non-ok status if
   // there was an error.
-  virtual StatusOr<bool> Repack(absl::Span<AllocationBlock*> allocations) = 0;
+  virtual absl::StatusOr<bool> Repack(
+      absl::Span<AllocationBlock*> allocations) = 0;
 
  protected:
   int64_t max_size_;
diff --git a/third_party/xla/xla/service/memory_space_assignment/slice.h b/third_party/xla/xla/service/memory_space_assignment/slice.h
index ca67dd29faf7b1..3d1fe279e36fe3 100644
--- a/third_party/xla/xla/service/memory_space_assignment/slice.h
+++ b/third_party/xla/xla/service/memory_space_assignment/slice.h
@@ -95,8 +95,9 @@ struct SliceProposal {
 // A SliceProposalCollection is generated from a SliceProposalFunction and is
 // used when we want to slice a prefetch.
 using SliceProposalCollection = std::vector<SliceProposal>;
-using SliceProposalFunction = std::function<StatusOr<SliceProposalCollection>(
-    const Shape& shape, const SlicedPrefetchOptions& options)>;
+using SliceProposalFunction =
+    std::function<absl::StatusOr<SliceProposalCollection>(
+        const Shape& shape, const SlicedPrefetchOptions& options)>;
 
 // A SliceDecision is a SliceProposal that we've determined where and when to
 // allocate.
diff --git a/third_party/xla/xla/service/memory_space_assignment/testing_utils.h b/third_party/xla/xla/service/memory_space_assignment/testing_utils.h
index b1d3c94cc0f421..ccea37b88b470c 100644
--- a/third_party/xla/xla/service/memory_space_assignment/testing_utils.h
+++ b/third_party/xla/xla/service/memory_space_assignment/testing_utils.h
@@ -41,7 +41,7 @@ namespace memory_space_assignment {
 // elapsed times of each HLO and asynchronous copy.
 class FakeCostAnalysis : public CostAnalysis {
  public:
-  static StatusOr<std::unique_ptr<FakeCostAnalysis>> Create(
+  static absl::StatusOr<std::unique_ptr<FakeCostAnalysis>> Create(
       const HloCostAnalysis& cost_analysis, const HloModule& module,
       const CostAnalysisOptions& options) {
     TF_ASSIGN_OR_RETURN(auto alias_analysis, HloAliasAnalysis::Run(&module));
diff --git a/third_party/xla/xla/service/op_expander_pass.cc b/third_party/xla/xla/service/op_expander_pass.cc
index 11f4dfb5c42330..318211dce1f08a 100644
--- a/third_party/xla/xla/service/op_expander_pass.cc
+++ b/third_party/xla/xla/service/op_expander_pass.cc
@@ -15,13 +15,17 @@ limitations under the License.
 
 #include "xla/service/op_expander_pass.h"
 
-#include <utility>
+#include <iterator>
+#include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/log/check.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/service/hlo_creation_utils.h"
-#include "xla/statusor.h"
 #include "xla/util.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla {
 
@@ -45,7 +49,11 @@ absl::StatusOr<bool> OpExpanderPass::Run(
     if (expanded_root == nullptr) {
       continue;
     }
-    TF_RETURN_IF_ERROR(inst->parent()->ReplaceInstruction(inst, expanded_root));
+    TF_ASSIGN_OR_RETURN(bool changed,
+                        inst->parent()->ReplaceInstruction(
+                            inst, expanded_root, preserve_sharding_,
+                            relay_control_dependency_));
+    DCHECK(changed);
   }
 
   return !matching_instructions.empty();
diff --git a/third_party/xla/xla/service/op_expander_pass.h b/third_party/xla/xla/service/op_expander_pass.h
index bac8e736928fc7..c86c3a44f55633 100644
--- a/third_party/xla/xla/service/op_expander_pass.h
+++ b/third_party/xla/xla/service/op_expander_pass.h
@@ -34,8 +34,14 @@ class OpExpanderPass : public HloModulePass {
 
   // extra_filter: Optional extra filtering criteria for matching instructions,
   // used in conjunction with InstructionMatchesPattern.
-  explicit OpExpanderPass(HloPredicate extra_filter = nullptr)
-      : extra_filter_(std::move(extra_filter)) {}
+  // preserve_sharding and relay_control_dependency: If we preserve sharding and
+  // relay control dependency when replacing the matched instructions.
+  explicit OpExpanderPass(HloPredicate extra_filter = nullptr,
+                          bool preserve_sharding = false,
+                          bool relay_control_dependency = false)
+      : extra_filter_(std::move(extra_filter)),
+        preserve_sharding_(preserve_sharding),
+        relay_control_dependency_(relay_control_dependency) {}
 
  protected:
   // Returns `true` if `instruction` should be expanded by this pass.
@@ -48,6 +54,8 @@ class OpExpanderPass : public HloModulePass {
       HloInstruction* instruction) = 0;
 
   HloPredicate extra_filter_;
+  const bool preserve_sharding_;
+  const bool relay_control_dependency_;
 };
 
 }  // namespace xla
diff --git a/third_party/xla/xla/service/p2p_schedule_preparation.cc b/third_party/xla/xla/service/p2p_schedule_preparation.cc
index 197699f66e436a..1771a99ea2227a 100644
--- a/third_party/xla/xla/service/p2p_schedule_preparation.cc
+++ b/third_party/xla/xla/service/p2p_schedule_preparation.cc
@@ -52,11 +52,6 @@ bool IsP2POp(const HloInstruction* op) {
   return p2p != nullptr && !p2p->is_host_transfer();
 }
 
-bool IsP2PDoneOp(const HloInstruction* op) {
-  return IsP2POp(op) && (op->opcode() == HloOpcode::kRecvDone ||
-                         op->opcode() == HloOpcode::kSendDone);
-}
-
 // Returns whether the instruction is a collective operation, for the purpose
 // of detecting whether the computation directly invokes collective
 // operations. As such, we only need to detect one of the instructions for a
@@ -111,21 +106,38 @@ struct P2PGroupNode {
     return computation == parent;
   }
 
-  bool RecordDoneOp(HloSendRecvInstruction* p2p) {
+  bool RecordP2POp(HloSendRecvInstruction* p2p) {
     if (!RecordParentComputation(p2p->parent())) {
       return false;
     }
 
-    if (p2p->opcode() == HloOpcode::kRecvDone) {
-      if (recv_done == nullptr) {
-        recv_done = Cast<HloRecvDoneInstruction>(p2p);
-        return true;
-      }
-    } else if (p2p->opcode() == HloOpcode::kSendDone) {
-      if (send_done == nullptr) {
-        send_done = Cast<HloSendDoneInstruction>(p2p);
-        return true;
-      }
+    switch (p2p->opcode()) {
+      case HloOpcode::kRecvDone:
+        if (recv_done == nullptr) {
+          recv_done = Cast<HloRecvDoneInstruction>(p2p);
+          return true;
+        }
+        break;
+      case HloOpcode::kSendDone:
+        if (send_done == nullptr) {
+          send_done = Cast<HloSendDoneInstruction>(p2p);
+          return true;
+        }
+        break;
+      case HloOpcode::kRecv:
+        if (recv == nullptr) {
+          recv = Cast<HloRecvInstruction>(p2p);
+          return true;
+        }
+        break;
+      case HloOpcode::kSend:
+        if (send == nullptr) {
+          send = Cast<HloSendInstruction>(p2p);
+          return true;
+        }
+        break;
+      default:
+        break;
     }
     return false;
   }
@@ -142,50 +154,14 @@ struct P2PGroupNode {
   }
 
   bool Incomplete() const {
-    return recv_done == nullptr || send_done == nullptr;
+    return recv_done == nullptr || send_done == nullptr || recv == nullptr ||
+           send == nullptr;
   }
 
   bool IncompletePipelinedParent() const {
     return Incomplete() || while_loop == nullptr;
   }
 
-  // Returns the Send/Recv instruction for the given Send-done/Recv-done
-  // instruction. The HLO verifier ensures that the operand of such a done
-  // instruction can only be a Send/Recv, or a value in a while-op result, or
-  // a parameter of a while-body.
-  HloSendRecvInstruction* GetAsyncStart(HloSendRecvInstruction* done) const {
-    HloInstruction* op = done->mutable_operand(0);
-    HloSendRecvInstruction* start = DynCast<HloSendRecvInstruction>(op);
-    if (start != nullptr) {
-      return start;
-    }
-
-    HloGetTupleElementInstruction* gte =
-        DynCast<HloGetTupleElementInstruction>(op);
-    int64_t tuple_index = gte->tuple_index();
-    if (gte->operand(0)->opcode() == HloOpcode::kWhile) {
-      // The op is a while-result, so the start-op should be a value in the
-      // while-op operands.
-      start = DynCast<HloSendRecvInstruction>(
-          gte->mutable_operand(0)->mutable_operand(0)->mutable_operand(
-              tuple_index));
-    } else {
-      // The op is a while-body parameter, so the start-op should be a value in
-      // the while-body result.
-      start = DynCast<HloSendRecvInstruction>(
-          computation->root_instruction()->mutable_operand(tuple_index));
-    }
-    return start;
-  }
-
-  HloSendInstruction* GetSend() const {
-    return DynCast<HloSendInstruction>(GetAsyncStart(send_done));
-  }
-
-  HloRecvInstruction* GetRecv() const {
-    return DynCast<HloRecvInstruction>(GetAsyncStart(recv_done));
-  }
-
   // Returns the pipeline stream used to execute the P2P instructions in the
   // group.
   P2PPipelineStream GetPipelineStream(const HloInstruction* start) const {
@@ -205,8 +181,8 @@ struct P2PGroupNode {
   // the pipeline group node, verifies they both have the same value and returns
   // the stream.
   P2PPipelineStream GetPipelineStream() const {
-    P2PPipelineStream send_stream = GetPipelineStream(GetSend());
-    P2PPipelineStream recv_stream = GetPipelineStream(GetRecv());
+    P2PPipelineStream send_stream = GetPipelineStream(send);
+    P2PPipelineStream recv_stream = GetPipelineStream(recv);
     if (send_stream != recv_stream) {
       return kUnknown;
     }
@@ -215,6 +191,8 @@ struct P2PGroupNode {
 
   HloRecvDoneInstruction* recv_done = nullptr;
   HloSendDoneInstruction* send_done = nullptr;
+  HloRecvInstruction* recv = nullptr;
+  HloSendInstruction* send = nullptr;
   // The computation that contains the Send and Recv instructions.
   HloComputation* computation = nullptr;
   // The while-loop instruction that calls the while-body with the pipelined
@@ -254,7 +232,7 @@ static constexpr int kPipelinedParentNodeIdx = 1;
 // If a group forms a cycle with another group, records the other group as a
 // complement group.
 struct P2PGroup {
-  Status RecordDoneOpForUnpipelinedGroup(HloSendRecvInstruction* p2p) {
+  Status RecordP2POpForUnpipelinedGroup(HloSendRecvInstruction* p2p) {
     if (kind == kUnrecognized) {
       // Leave unrecognized P2P groups alone.
       return OkStatus();
@@ -263,13 +241,13 @@ struct P2PGroup {
       return Internal("Expected unpipelined group");
     }
     P2PGroupNode& node = nodes[kUnpipelinedNodeIdx];
-    if (!node.RecordDoneOp(p2p)) {
+    if (!node.RecordP2POp(p2p)) {
       kind = kUnrecognized;
     }
     return OkStatus();
   }
 
-  Status RecordDoneOpForPipelinedGroup(HloSendRecvInstruction* p2p) {
+  Status RecordP2POpForPipelinedGroup(HloSendRecvInstruction* p2p) {
     if (kind == kUnrecognized) {
       // Leave unrecognized P2P groups alone.
       return OkStatus();
@@ -281,7 +259,7 @@ struct P2PGroup {
       kind = kPipelined;
     }
     P2PGroupNode& node = nodes[kPipelinedParentNodeIdx];
-    if (!node.RecordDoneOp(p2p)) {
+    if (!node.RecordP2POp(p2p)) {
       kind = kUnrecognized;
     }
     return OkStatus();
@@ -362,7 +340,7 @@ struct P2PGroup {
   // collectives should be scheduled to.
   ChainStartEnd GetChainStartEnd(HloComputation* computation) const {
     if (kind == kUnpipelined) {
-      return std::make_pair(GetChild().GetRecv(), GetChild().send_done);
+      return std::make_pair(GetChild().recv, GetChild().send_done);
     }
 
     CHECK(kind == kPipelined);
@@ -370,18 +348,18 @@ struct P2PGroup {
       // For the child computation of a pipelined group, we return the start
       // and end of the instruction where we can put other collectives.
       if (complement_group == nullptr) {
-        return std::make_pair(GetChild().send_done, GetChild().GetRecv());
+        return std::make_pair(GetChild().send_done, GetChild().recv);
       }
       CHECK(pipeline_stream == kPipeline1);
-      return std::make_pair(GetChild().send_done, GetChild().GetRecv());
+      return std::make_pair(GetChild().send_done, GetChild().recv);
     }
 
     CHECK(computation == ParentComputation());
     if (complement_group == nullptr) {
-      return std::make_pair(GetParent().GetRecv(), GetParent().send_done);
+      return std::make_pair(GetParent().recv, GetParent().send_done);
     }
     CHECK(pipeline_stream == kPipeline1);
-    return std::make_pair(complement_group->GetParent().GetRecv(),
+    return std::make_pair(complement_group->GetParent().recv,
                           GetParent().send_done);
   }
 
@@ -419,13 +397,45 @@ bool MayInvokeCollectiveOp(
 Status MayAddWhileOpToPipelinedGroup(HloInstruction* while_op,
                                      P2PInComputation& p2p_in_computation,
                                      P2PGroupMap& p2p_group_map) {
+  if (while_op->while_init()->opcode() != HloOpcode::kTuple) {
+    // A while-init should contain the loop index variable. So if a while-init
+    // is not a tuple, it only contains the loop index variable and shouldn't
+    // contain any pipelined Send operand.
+    return OkStatus();
+  }
   HloComputation* body = while_op->called_computations()[0];
   auto p2p_in_while = p2p_in_computation.find(body);
   if (p2p_in_while == p2p_in_computation.end()) {
     return OkStatus();
   }
   int pipelined_group = 0;
-  for (auto hlo : while_op->operand(0)->operands()) {
+  // Check whether the while-op init contains a token from a Send result.
+  for (auto hlo : while_op->while_init()->operands()) {
+    if (hlo->opcode() == HloOpcode::kTuple) {
+      // A send has a tuple as its result, the tuple contains a token.
+      // If a send is pipelined, then, the while-init either contains
+      // a send-result, or contains a tuple with a token element from the
+      // send result. As such, if a tuple represent a pipelined send, it is
+      // either a direct send result, or a tuple with this code pattern:
+      ///
+      //   send = (..., token) send(...)
+      //   send.token = token[] get-tuple-element(send) index=...
+      //   send.tuple.reconstruct = tuple(..., send.token)
+      //   while-init =  tuple(..., send.tuple.reconstruct)
+      //   while-result =  while(while-init), ...
+      //
+      // So if the tuple contains a token, we make `hlo` point-to the producer
+      // of the token so that we can check whether the producer is a send after.
+      for (auto ele : hlo->operands()) {
+        if (ele->shape().IsToken()) {
+          // Assure that the token is part of an instruction result and not
+          // generated by a copy as we currently don't copy token.
+          CHECK(ele->opcode() == HloOpcode::kGetTupleElement);
+          hlo = ele->mutable_operand(0);
+          break;
+        }
+      }
+    }
     if (hlo->opcode() != HloOpcode::kSend) {
       continue;
     }
@@ -459,9 +469,9 @@ Status OrderBefore(HloInstruction* i1, HloInstruction* i2) {
 Status ConnectUnpipelinedP2P(const P2PGroup& p2p_group) {
   const P2PGroupNode& node = p2p_group.GetChild();
   HloRecvDoneInstruction* recv_done = node.recv_done;
-  HloRecvInstruction* recv = node.GetRecv();
+  HloRecvInstruction* recv = node.recv;
   HloSendDoneInstruction* send_done = node.send_done;
-  HloSendInstruction* send = node.GetSend();
+  HloSendInstruction* send = node.send;
   TF_RETURN_IF_ERROR(OrderBefore(recv, send));
   TF_RETURN_IF_ERROR(OrderBefore(send, recv_done));
   TF_RETURN_IF_ERROR(OrderBefore(recv_done, send_done));
@@ -474,9 +484,9 @@ Status ConnectUnpipelinedP2P(const P2PGroup& p2p_group) {
 Status ConnectPipelined1P2PChild(const P2PGroup& p2p_group) {
   const P2PGroupNode& node = p2p_group.GetChild();
   HloSendRecvInstruction* recv_done = node.recv_done;
-  HloRecvInstruction* recv = node.GetRecv();
+  HloRecvInstruction* recv = node.recv;
   HloSendRecvInstruction* send_done = node.send_done;
-  HloSendInstruction* send = node.GetSend();
+  HloSendInstruction* send = node.send;
   TF_RETURN_IF_ERROR(OrderBefore(recv_done, send_done));
   TF_RETURN_IF_ERROR(OrderBefore(send_done, recv));
   TF_RETURN_IF_ERROR(OrderBefore(recv, send));
@@ -491,13 +501,13 @@ Status ConnectPipelined2P2PChild(const P2PGroup& p2p_group) {
   const P2PGroupNode& node0 = p2p_group.complement_group->GetChild();
   const P2PGroupNode& node1 = p2p_group.GetChild();
   HloSendRecvInstruction* recv_done0 = node0.recv_done;
-  HloRecvInstruction* recv0 = node0.GetRecv();
+  HloRecvInstruction* recv0 = node0.recv;
   HloSendRecvInstruction* send_done0 = node0.send_done;
-  HloSendInstruction* send0 = node0.GetSend();
+  HloSendInstruction* send0 = node0.send;
   HloSendRecvInstruction* recv_done1 = node1.recv_done;
-  HloRecvInstruction* recv1 = node1.GetRecv();
+  HloRecvInstruction* recv1 = node1.recv;
   HloSendRecvInstruction* send_done1 = node1.send_done;
-  HloSendInstruction* send1 = node1.GetSend();
+  HloSendInstruction* send1 = node1.send;
 
   TF_RETURN_IF_ERROR(OrderBefore(recv_done0, send_done0));
   TF_RETURN_IF_ERROR(OrderBefore(send_done0, recv_done1));
@@ -516,9 +526,9 @@ Status ConnectPipelined2P2PChild(const P2PGroup& p2p_group) {
 Status ConnectPipelined1P2PParent(const P2PGroup& p2p_group) {
   const P2PGroupNode& node = p2p_group.GetParent();
   HloSendRecvInstruction* recv_done = node.recv_done;
-  HloRecvInstruction* recv = node.GetRecv();
+  HloRecvInstruction* recv = node.recv;
   HloSendRecvInstruction* send_done = node.send_done;
-  HloSendInstruction* send = node.GetSend();
+  HloSendInstruction* send = node.send;
   TF_RETURN_IF_ERROR(OrderBefore(recv, send));
   TF_RETURN_IF_ERROR(OrderBefore(recv_done, send_done));
   return OkStatus();
@@ -532,13 +542,13 @@ Status ConnectPipelined2P2PParent(const P2PGroup& p2p_group) {
   const P2PGroupNode& node0 = p2p_group.complement_group->GetParent();
   const P2PGroupNode& node1 = p2p_group.GetParent();
   HloSendRecvInstruction* recv_done0 = node0.recv_done;
-  HloRecvInstruction* recv0 = node0.GetRecv();
+  HloRecvInstruction* recv0 = node0.recv;
   HloSendRecvInstruction* send_done0 = node0.send_done;
-  HloSendInstruction* send0 = node0.GetSend();
+  HloSendInstruction* send0 = node0.send;
   HloSendRecvInstruction* recv_done1 = node1.recv_done;
-  HloRecvInstruction* recv1 = node1.GetRecv();
+  HloRecvInstruction* recv1 = node1.recv;
   HloSendRecvInstruction* send_done1 = node1.send_done;
-  HloSendInstruction* send1 = node1.GetSend();
+  HloSendInstruction* send1 = node1.send;
 
   TF_RETURN_IF_ERROR(OrderBefore(recv0, send0));
   TF_RETURN_IF_ERROR(OrderBefore(send0, recv1));
@@ -580,7 +590,7 @@ Status GatherP2PGroupsAndCollectiveInfo(
       while_ops.push_back(hlo);
       continue;
     }
-    if (!IsP2PDoneOp(hlo)) {
+    if (!IsP2POp(hlo)) {
       continue;
     }
     HloSendRecvInstruction* p2p = Cast<HloSendRecvInstruction>(hlo);
@@ -591,15 +601,15 @@ Status GatherP2PGroupsAndCollectiveInfo(
       // P2P group and may turn it into a kPipelined group or kUnrecognized
       // group.
       P2PGroup group;
-      TF_RETURN_IF_ERROR(group.RecordDoneOpForUnpipelinedGroup(p2p));
+      TF_RETURN_IF_ERROR(group.RecordP2POpForUnpipelinedGroup(p2p));
       p2p_group_map[channel] = group;
     } else {
       P2PGroup& group = p2p_group->second;
       if (group.ChildComputation() == computation) {
-        TF_RETURN_IF_ERROR(group.RecordDoneOpForUnpipelinedGroup(p2p));
+        TF_RETURN_IF_ERROR(group.RecordP2POpForUnpipelinedGroup(p2p));
       } else {
         // We are at the parent computation for a pipelined P2P group.
-        TF_RETURN_IF_ERROR(group.RecordDoneOpForPipelinedGroup(p2p));
+        TF_RETURN_IF_ERROR(group.RecordP2POpForPipelinedGroup(p2p));
       }
     }
     // We can't rely on the operation on p2p_group_map above to find out
diff --git a/third_party/xla/xla/service/p2p_schedule_preparation_test.cc b/third_party/xla/xla/service/p2p_schedule_preparation_test.cc
index 393734a8c41f8b..f4d04a56556399 100644
--- a/third_party/xla/xla/service/p2p_schedule_preparation_test.cc
+++ b/third_party/xla/xla/service/p2p_schedule_preparation_test.cc
@@ -412,10 +412,24 @@ std::string GetPipelinedP2PModuleString(bool nested_p2p_in_main = false,
     param = (u32[], (f32[1, 1024, 1024], token[]),
       (f32[1, 1024, 1024], token[])) parameter(0)
     count = get-tuple-element(param), index=0
-    send.1.q = (f32[1, 1024, 1024], token[]) get-tuple-element(param), index=2
+
+    // Mimic the code transformation done by copy-insertion to complicate
+    // the code pattern.
+    send.1.q.t = (f32[1,1024,1024], token[]) get-tuple-element(param), index=1
+    send.1.q.data = f32[1,1024,1024] get-tuple-element(send.1.q.t), index=0
+    send.1.q.data.copy = f32[1,1024,1024] copy(send.1.q.data)
+    send.1.q.token = token[] get-tuple-element(send.1.q.t), index=1
+    send.1.q = (f32[1, 1024, 1024], token[]) tuple(send.1.q.data.copy, send.1.q.token)
+
     recv.1.q = (f32[1, 1024, 1024], token[])get-tuple-element(param), index=1
-    send-done.1 = token[] send-done(send.1.q), channel_id=1
-    recv-done.1 = token[] recv-done(recv.1.q), channel_id=1
+    send-done.1 = token[] send-done(send.1.q), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
+    recv-done.1 = token[] recv-done(recv.1.q), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
     recv-data = f32[1, 1024, 1024] get-tuple-element(recv-done.1), index=0
 
     c1 = u32[] constant(1)
@@ -441,11 +455,18 @@ std::string GetPipelinedP2PModuleString(bool nested_p2p_in_main = false,
       _xla_send_recv_source_target_pairs="{{0,1}, {1,2}, {2,3}, {3,4}}",
       _xla_send_recv_pipeline="0"
     }
-    recv.1 = (f32[1, 1024, 1024], u32[], token[]) recv(after-all.1), channel_id=1,
+
+    // Mimic the code transformation done by copy-insertion to complicate
+    // the code pattern.
+    recv.1 = (f32[1, 1024, 1024], token[]) recv(after-all.1), channel_id=1,
       frontend_attributes={
        _xla_send_recv_source_target_pairs="{{0,1}, {1,2}, {2,3}, {3,4}}",
        _xla_send_recv_pipeline="0"
     }
+    recv.1.data = f32[1,1024,1024] get-tuple-element(recv.1), index=0
+    recv.1.data.copy = f32[1,1024,1024] copy(recv.1.data)
+    recv.1.token = token[] get-tuple-element(recv.1), index=1
+    recv.1.tuple = (f32[1,1024,1024], token[]) tuple(recv.1.data.copy, recv.1.token)
 
     ROOT body-result = (u32[], (f32[1, 1024, 1024], token[]),
       (f32[1, 1024, 1024], token[])) tuple(new-count, recv.1, send.1)
@@ -458,8 +479,14 @@ std::string GetPipelinedP2PModuleString(bool nested_p2p_in_main = false,
     count = get-tuple-element(param), index=0
     send.1.q = (f32[1, 1024, 1024], token[]) get-tuple-element(param), index=2
     recv.1.q = (f32[1, 1024, 1024], token[])get-tuple-element(param), index=1
-    send-done.1 = token[] send-done(send.1.q), channel_id=1
-    recv-done.1 = token[] recv-done(recv.1.q), channel_id=1
+    send-done.1 = token[] send-done(send.1.q), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
+    recv-done.1 = token[] recv-done(recv.1.q), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
     recv-data = f32[1, 1024, 1024] get-tuple-element(recv-done.1), index=0
 
     c1 = u32[] constant(1)
@@ -536,24 +563,37 @@ std::string GetPipelinedP2PModuleString(bool nested_p2p_in_main = false,
        _xla_send_recv_source_target_pairs="{{0,1}, {1,2}, {2,3}, {3,4}}",
        _xla_send_recv_pipeline="0"
     }
+
+    // Mimic the code transformation done by copy-insertion to complicate
+    // the code pattern.
     send.2 = (f32[1, 1024, 1024], token[]) send(init, after-all.2),
       channel_id=1, frontend_attributes={
       _xla_send_recv_source_target_pairs="{{0,1}, {1,2}, {2,3}, {3,4}}",
       _xla_send_recv_pipeline="0"
     }
+    send.2.data = f32[1,1024,1024] get-tuple-element(send.2), index=0
+    send.2.data.copy = f32[1,1024,1024] copy(send.2.data)
+    send.2.token = token[] get-tuple-element(send.2), index=1
+    send.2.tuple = (f32[1,1024,1024], token[]) tuple(send.2.data.copy, send.2.token)
 
     while-init =  (u32[], (f32[1, 1024, 1024], token[]),
-      (f32[1, 1024, 1024], token[])) tuple(c0, recv.2, send.2)
+      (f32[1, 1024, 1024], token[])) tuple(c0, recv.2, send.2.tuple)
     while-result =  (u32[], (f32[1, 1024, 1024], token[]),
       (f32[1, 1024, 1024], token[])) while(while-init),
       body=while-body, condition=while-cond,
       backend_config={"known_trip_count":{"n":"25"}}
 
     recv.2.q = (f32[1, 1024, 1024], token[]) get-tuple-element(while-result), index=1
-    recv-done.2 = (f32[1, 1024, 1024], token[]) recv-done(recv.2.q), channel_id=1
+    recv-done.2 = (f32[1, 1024, 1024], token[]) recv-done(recv.2.q), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
     recv-data.2.q = f32[1, 1024, 1024] get-tuple-element(recv-done.2), index=0
     send.2.q = (f32[1, 1024, 1024], token[]) get-tuple-element(while-result), index=2
-    send-done.2 = token[] send-done(send.2.q), channel_id=1
+    send-done.2 = token[] send-done(send.2.q), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
 
     // The code for the computation result goes here.
     %s
@@ -692,11 +732,17 @@ body {
     count = get-tuple-element(param), index=0
 
     recv.0.f = (u32[2], u32[], token[]) get-tuple-element(param), index=1
-    recv-done.0 = (u32[2], token[]) recv-done(recv.0.f), channel_id=1
+    recv-done.0 = (u32[2], token[]) recv-done(recv.0.f), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
     recv-data.0 = u32[2] get-tuple-element(recv-done.0), index=0
 
     recv.1.f = (u32[2], u32[], token[]) get-tuple-element(param), index=2
-    recv-done.1 = (u32[2], token[]) recv-done(recv.1.f), channel_id=2
+    recv-done.1 = (u32[2], token[]) recv-done(recv.1.f), channel_id=2,
+      frontend_attributes={
+        _xla_send_recv_pipeline="1"
+      }
     recv-data.1 = u32[2] get-tuple-element(recv-done.1), index=0
 
     replica = u32[] replica-id()
@@ -712,9 +758,15 @@ body {
     s = u32[2] add(r, recv-data)
 
     send.0.f = (u32[2], u32[], token[]) get-tuple-element(param), index=3
-    send-done.0 = token[] send-done(send.0.f), channel_id=1
+    send-done.0 = token[] send-done(send.0.f), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
     send.1.f = (u32[2], u32[], token[]) get-tuple-element(param), index=4
-    send-done.1 = token[] send-done(send.1.f), channel_id=2
+    send-done.1 = token[] send-done(send.1.f), channel_id=2,
+      frontend_attributes={
+        _xla_send_recv_pipeline="1"
+      }
 
     // The Recv "rotated" from the beginning of the loop to the end of the loop.
     after-all.0.n = token[] after-all()
@@ -792,11 +844,17 @@ body {
     // Use .q as suffix for HLO name.
 
      recv.0.q = (u32[2], u32[], token[]) get-tuple-element(while_result), index=1
-     recv-done.2 = (u32[2], token[]) recv-done(recv.0.q), channel_id=1
+     recv-done.2 = (u32[2], token[]) recv-done(recv.0.q), channel_id=1,
+       frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
      recv-data.0.q = u32[2] get-tuple-element(recv-done.2), index=0
 
      recv.1.q = (u32[2], u32[], token[]) get-tuple-element(while_result), index=2
-     recv-done.3 = (u32[2], token[]) recv-done(recv.1.q), channel_id=2
+     recv-done.3 = (u32[2], token[]) recv-done(recv.1.q), channel_id=2,
+       frontend_attributes={
+        _xla_send_recv_pipeline="1"
+      }
      recv-data.1.q = u32[2] get-tuple-element(recv-done.2), index=0
 
     replica = u32[] replica-id()
@@ -808,9 +866,15 @@ body {
     s = u32[2] add(c1, recv-data)
 
     send.0.q = (u32[2], u32[], token[]) get-tuple-element(while_result), index=3
-    send-done.2 = token[] send-done(send.0.q), channel_id=1
+    send-done.2 = token[] send-done(send.0.q), channel_id=1,
+      frontend_attributes={
+        _xla_send_recv_pipeline="0"
+      }
     send.1.q = (u32[2], u32[], token[]) get-tuple-element(while_result), index=4
-    send-done.3 = token[] send-done(send.1.q), channel_id=2
+    send-done.3 = token[] send-done(send.1.q), channel_id=2,
+      frontend_attributes={
+        _xla_send_recv_pipeline="1"
+      }
 
     ROOT result = u32[2] add(s, recv-data)
   }
diff --git a/third_party/xla/xla/service/qr_expander.cc b/third_party/xla/xla/service/qr_expander.cc
index cc9b10b0905fec..e817b66b61d2c8 100644
--- a/third_party/xla/xla/service/qr_expander.cc
+++ b/third_party/xla/xla/service/qr_expander.cc
@@ -509,9 +509,12 @@ bool QrExpander::InstructionMatchesPattern(HloInstruction* instruction) {
 
 absl::StatusOr<HloInstruction*> QrExpander::ExpandInstruction(
     HloInstruction* instruction) {
-  const std::string name =
+  std::string name =
       absl::StrFormat("xla.%s_%s", instruction->custom_call_target(),
                       instruction->operand(0)->shape().ToString());
+  if (instruction->custom_call_target() == kHouseholderProductCustomCallName) {
+    name += "_" + instruction->operand(1)->shape().ToString();
+  }
 
   HloModule* module = instruction->GetModule();
 
diff --git a/third_party/xla/xla/service/scatter_expander.cc b/third_party/xla/xla/service/scatter_expander.cc
index f8faafa963e35d..f76c88cabda01f 100644
--- a/third_party/xla/xla/service/scatter_expander.cc
+++ b/third_party/xla/xla/service/scatter_expander.cc
@@ -497,7 +497,7 @@ bool IsCombinerAssociative(const HloComputation* combiner) {
     case HloOpcode::kMinimum:
     case HloOpcode::kMaximum:
       return true;
-    // Other common combiners are associative at least for interger arithmetic.
+    // Other common combiners are associative at least for integer arithmetic.
     case HloOpcode::kAdd:
     case HloOpcode::kMultiply:
     case HloOpcode::kOr:
diff --git a/third_party/xla/xla/service/service.cc b/third_party/xla/xla/service/service.cc
index 3c48e36215bb86..9017774ce2e51d 100644
--- a/third_party/xla/xla/service/service.cc
+++ b/third_party/xla/xla/service/service.cc
@@ -790,10 +790,10 @@ absl::StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
       buffer_assignment_proto_after_opt != nullptr) {
     CHECK(DumpingEnabledForHloModule(executable->module()));
     *hlo_proto_before_opt->mutable_buffer_assignment() =
-        std::move(*buffer_assignment_proto_after_opt);
+        *buffer_assignment_proto_after_opt;
     executable->set_hlo_proto(std::move(hlo_proto_before_opt));
   }
-  return std::move(executable);
+  return executable;
 }
 
 Status Service::Compile(const CompileRequest* arg, CompileResponse* result) {
diff --git a/third_party/xla/xla/service/service_executable_run_options.h b/third_party/xla/xla/service/service_executable_run_options.h
index 6013ca758d42ba..a2f57e6bbae635 100644
--- a/third_party/xla/xla/service/service_executable_run_options.h
+++ b/third_party/xla/xla/service/service_executable_run_options.h
@@ -42,9 +42,9 @@ class ServiceExecutableRunOptions {
   ServiceExecutableRunOptions()
       : ServiceExecutableRunOptions(ExecutableRunOptions()) {}
 
-  explicit ServiceExecutableRunOptions(ExecutableRunOptions run_options,
+  explicit ServiceExecutableRunOptions(const ExecutableRunOptions& run_options,
                                        StreamBorrower stream_borrower = nullptr)
-      : run_options_(std::move(run_options)),
+      : run_options_(run_options),
         stream_borrower_(std::move(stream_borrower)) {}
 
   // Returns reference or pointer to `ExecutableRunOptions` member.
diff --git a/third_party/xla/xla/service/shape_inference_test.cc b/third_party/xla/xla/service/shape_inference_test.cc
index d73e5d1a364a6d..781852cd41d9dd 100644
--- a/third_party/xla/xla/service/shape_inference_test.cc
+++ b/third_party/xla/xla/service/shape_inference_test.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "xla/test_helpers.h"
 #include "xla/types.h"
 #include "xla/xla_data.pb.h"
+#include "tsl/platform/errors.h"
 #include "tsl/platform/statusor.h"
 
 namespace xla {
@@ -82,10 +83,11 @@ class ReduceShapeInferenceTest : public ShapeInferenceTest {
       const Shape& expected_inferred_shape, const Shape& arg,
       absl::Span<const int64_t> dimensions_to_reduce) {
     ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_, f32_}, f32_);
-    auto inferred_status = ShapeInference::InferReduceShape(
-        {&arg, &f32_}, dimensions_to_reduce, to_apply);
-    EXPECT_IS_OK(inferred_status.status());
-    EXPECT_TRUE(ShapeUtil::Equal(expected_inferred_shape, *inferred_status));
+    const absl::StatusOr<Shape> inferred_shape =
+        ShapeInference::InferReduceShape({&arg, &f32_}, dimensions_to_reduce,
+                                         to_apply);
+    EXPECT_IS_OK(inferred_shape.status());
+    EXPECT_TRUE(ShapeUtil::Equal(expected_inferred_shape, *inferred_shape));
   }
 };
 
@@ -127,8 +129,8 @@ struct BinaryOpTestCase {
   std::optional<std::string_view> error_message;
 };
 
-// Subclass for testing unbounded dynamic and op
-class UnboundedAndOpShapeInferenceTest
+// Subclass for testing unbounded dynamic logical ops
+class UnboundedLogicalOpShapeInferenceTest
     : public ::testing::TestWithParam<BinaryOpTestCase> {};
 
 // Subclass for testing unbounded dynamic binary ops
@@ -162,125 +164,139 @@ class UnboundedSelectOpShapeInferenceTest
     : public ::testing::TestWithParam<std::vector<std::string>> {};
 
 TEST_F(ShapeInferenceTest, UnaryNegateMatrix) {
-  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
-  auto inferred_status =
+  const Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferUnaryOpShape(HloOpcode::kNegate, matrix_shape);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(matrix_shape, *inferred_status));
+  ASSERT_IS_OK(inferred_shape.status());
+  ASSERT_TRUE(ShapeUtil::Equal(matrix_shape, *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, SelectScalarPredBetweenTuples) {
-  Shape tuple = ShapeUtil::MakeTupleShape({s32_, f32_});
-  auto inferred_status = ShapeInference::InferTernaryOpShape(
-      HloOpcode::kSelect, pred_, tuple, tuple);
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_THAT(inferred_status.status().message(),
+  const Shape tuple = ShapeUtil::MakeTupleShape({s32_, f32_});
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferTernaryOpShape(HloOpcode::kSelect, pred_, tuple,
+                                          tuple);
+  ASSERT_FALSE(inferred_shape.ok());
+  ASSERT_THAT(inferred_shape.status().message(),
               HasSubstr("Expected array argument for select"));
 }
 
 TEST_F(ShapeInferenceTest, SelectScalarPredBetweenArrays) {
-  auto inferred_status = ShapeInference::InferTernaryOpShape(
-      HloOpcode::kSelect, pred_, matrix_64_48_, matrix_64_48_);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, *inferred_status));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferTernaryOpShape(HloOpcode::kSelect, pred_,
+                                          matrix_64_48_, matrix_64_48_);
+  ASSERT_IS_OK(inferred_shape.status());
+  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, SelectArrayPredBetweenArrays) {
-  auto predarray = ShapeUtil::MakeShape(PRED, {64, 48});
-  auto inferred_status = ShapeInference::InferTernaryOpShape(
-      HloOpcode::kSelect, predarray, matrix_64_48_, matrix_64_48_);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, *inferred_status));
+  const Shape predarray = ShapeUtil::MakeShape(PRED, {64, 48});
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferTernaryOpShape(HloOpcode::kSelect, predarray,
+                                          matrix_64_48_, matrix_64_48_);
+  ASSERT_IS_OK(inferred_shape.status());
+  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, SelectBadShapes) {
-  auto inferred_status_error1 = ShapeInference::InferTernaryOpShape(
-      HloOpcode::kSelect, pred_, matrix_64_48_, matrix_32_64_);
-  ASSERT_FALSE(inferred_status_error1.ok());
-  ASSERT_THAT(inferred_status_error1.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error1 =
+      ShapeInference::InferTernaryOpShape(HloOpcode::kSelect, pred_,
+                                          matrix_64_48_, matrix_32_64_);
+  ASSERT_FALSE(inferred_shape_error1.ok());
+  ASSERT_THAT(inferred_shape_error1.status().message(),
               HasSubstr("Operands to select must be the same shape"));
 
-  auto inferred_status_error2 = ShapeInference::InferTernaryOpShape(
-      HloOpcode::kSelect, s32_, matrix_64_48_, matrix_64_48_);
-  ASSERT_FALSE(inferred_status_error2.ok());
-  ASSERT_THAT(inferred_status_error2.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error2 =
+      ShapeInference::InferTernaryOpShape(HloOpcode::kSelect, s32_,
+                                          matrix_64_48_, matrix_64_48_);
+  ASSERT_FALSE(inferred_shape_error2.ok());
+  ASSERT_THAT(inferred_shape_error2.status().message(),
               HasSubstr("pred operand must have PRED"));
 
-  auto inferred_status_error3 = ShapeInference::InferTernaryOpShape(
-      HloOpcode::kSelect, ShapeUtil::MakeShape(PRED, {64}), matrix_64_48_,
-      matrix_64_48_);
-  ASSERT_FALSE(inferred_status_error3.ok());
+  const absl::StatusOr<Shape> inferred_shape_error3 =
+      ShapeInference::InferTernaryOpShape(HloOpcode::kSelect,
+                                          ShapeUtil::MakeShape(PRED, {64}),
+                                          matrix_64_48_, matrix_64_48_);
+  ASSERT_FALSE(inferred_shape_error3.ok());
   ASSERT_THAT(
-      inferred_status_error3.status().message(),
+      inferred_shape_error3.status().message(),
       HasSubstr("Operands to select and predicate must be the same shape"));
 
   // Tuples have a TUPLE element type and cannot be the pred of a select.
-  auto inferred_status_error4 = ShapeInference::InferTernaryOpShape(
-      HloOpcode::kSelect, ShapeUtil::MakeTupleShape({pred_, pred_}),
-      ShapeUtil::MakeTupleShape({f32_, f32_}),
-      ShapeUtil::MakeTupleShape({f32_, f32_}));
-  ASSERT_FALSE(inferred_status_error4.ok());
-  ASSERT_THAT(inferred_status_error4.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error4 =
+      ShapeInference::InferTernaryOpShape(
+          HloOpcode::kSelect, ShapeUtil::MakeTupleShape({pred_, pred_}),
+          ShapeUtil::MakeTupleShape({f32_, f32_}),
+          ShapeUtil::MakeTupleShape({f32_, f32_}));
+  ASSERT_FALSE(inferred_shape_error4.ok());
+  ASSERT_THAT(inferred_shape_error4.status().message(),
               HasSubstr("Expected array argument for select pred"));
 }
 
 TEST_F(ShapeInferenceTest, ClampAllMatrix) {
-  auto inferred_status = ShapeInference::InferTernaryOpShape(
-      HloOpcode::kClamp, matrix_64_48_, matrix_64_48_, matrix_64_48_);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, *inferred_status));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, matrix_64_48_,
+                                          matrix_64_48_, matrix_64_48_);
+  ASSERT_IS_OK(inferred_shape.status());
+  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, ClampAllScalar) {
-  auto inferred_status =
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, f32_, f32_, f32_);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(f32_, *inferred_status));
+  ASSERT_IS_OK(inferred_shape.status());
+  ASSERT_TRUE(ShapeUtil::Equal(f32_, *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, ClampMinScalar) {
-  auto inferred_status = ShapeInference::InferTernaryOpShape(
-      HloOpcode::kClamp, f32_, matrix_64_48_, matrix_64_48_);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, *inferred_status));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, f32_,
+                                          matrix_64_48_, matrix_64_48_);
+  ASSERT_IS_OK(inferred_shape.status());
+  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, ClampMaxScalar) {
-  auto inferred_status = ShapeInference::InferTernaryOpShape(
-      HloOpcode::kClamp, matrix_64_48_, matrix_64_48_, f32_);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, *inferred_status));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, matrix_64_48_,
+                                          matrix_64_48_, f32_);
+  ASSERT_IS_OK(inferred_shape.status());
+  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, ClampOperandScalar) {
-  auto inferred_status = ShapeInference::InferTernaryOpShape(
-      HloOpcode::kClamp, matrix_64_48_, f32_, matrix_64_48_);
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_THAT(inferred_status.status().message(),
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, matrix_64_48_,
+                                          f32_, matrix_64_48_);
+  ASSERT_FALSE(inferred_shape.ok());
+  ASSERT_THAT(inferred_shape.status().message(),
               HasSubstr("Clamp with incompatible shapes"));
 }
 
 TEST_F(ShapeInferenceTest, ClampMinMatrix) {
-  auto inferred_status = ShapeInference::InferTernaryOpShape(
-      HloOpcode::kClamp, matrix_64_48_, f32_, f32_);
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_THAT(inferred_status.status().message(),
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, matrix_64_48_,
+                                          f32_, f32_);
+  ASSERT_FALSE(inferred_shape.ok());
+  ASSERT_THAT(inferred_shape.status().message(),
               HasSubstr("Clamp with incompatible shapes"));
 }
 
 TEST_F(ShapeInferenceTest, ClampMaxMatrix) {
-  auto inferred_status = ShapeInference::InferTernaryOpShape(
-      HloOpcode::kClamp, f32_, f32_, matrix_64_48_);
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_THAT(inferred_status.status().message(),
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, f32_, f32_,
+                                          matrix_64_48_);
+  ASSERT_FALSE(inferred_shape.ok());
+  ASSERT_THAT(inferred_shape.status().message(),
               HasSubstr("Clamp with incompatible shapes"));
 }
 
 TEST_F(ShapeInferenceTest, ClampOperandMatrix) {
-  auto inferred_status = ShapeInference::InferTernaryOpShape(
-      HloOpcode::kClamp, f32_, matrix_64_48_, f32_);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, *inferred_status));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, f32_,
+                                          matrix_64_48_, f32_);
+  ASSERT_IS_OK(inferred_shape.status());
+  ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, ClampBadShapes) {
@@ -317,8 +333,8 @@ TEST_F(ShapeInferenceTest, ClampBadShapes) {
 }
 
 TEST_F(ShapeInferenceTest, Complex) {
-  auto complex_shape = [&](const Shape& lhs, const Shape& rhs,
-                           absl::Span<const int64_t> bcast) {
+  const auto complex_shape = [&](const Shape& lhs, const Shape& rhs,
+                                 absl::Span<const int64_t> bcast) {
     return ShapeInference::InferBinaryOpShape(HloOpcode::kComplex, lhs, rhs,
                                               bcast);
   };
@@ -330,7 +346,7 @@ TEST_F(ShapeInferenceTest, Complex) {
   // Only F32->C64 and F64->C128 supported.
   ASSERT_FALSE(complex_shape(f16_, f16_, {}).ok());
   // Validate correct uses.
-  Shape c64_32 = ShapeUtil::MakeShape(C64, {32});
+  const Shape c64_32 = ShapeUtil::MakeShape(C64, {32});
   TF_ASSERT_OK_AND_ASSIGN(Shape result, complex_shape(f32_, f32_, {}));
   ASSERT_TRUE(ShapeUtil::Equal(result, ShapeUtil::MakeShape(C64, {})));
   TF_ASSERT_OK_AND_ASSIGN(result, complex_shape(vector_32_, f32_, {}));
@@ -340,7 +356,7 @@ TEST_F(ShapeInferenceTest, Complex) {
   TF_ASSERT_OK_AND_ASSIGN(result, complex_shape(vector_32_, f32_, {}));
   ASSERT_TRUE(ShapeUtil::Equal(result, c64_32));
 
-  Shape c64_32_64 = ShapeUtil::MakeShape(C64, {32, 64});
+  const Shape c64_32_64 = ShapeUtil::MakeShape(C64, {32, 64});
   TF_ASSERT_OK_AND_ASSIGN(result,
                           complex_shape(vector_64_, matrix_32_64_, {1}));
   ASSERT_TRUE(ShapeUtil::Equal(result, c64_32_64));
@@ -358,7 +374,7 @@ TEST_F(ShapeInferenceTest, Complex) {
 }
 
 TEST_F(ShapeInferenceTest, VariadicOpTuplify) {
-  absl::StatusOr<Shape> result =
+  const absl::StatusOr<Shape> result =
       ShapeInference::InferVariadicOpShape(HloOpcode::kTuple, {&s32_, &f32_});
   ASSERT_IS_OK(result.status());
   ASSERT_TRUE(
@@ -366,7 +382,7 @@ TEST_F(ShapeInferenceTest, VariadicOpTuplify) {
 }
 
 TEST_F(ShapeInferenceTest, ReduceWindowInHalf) {
-  Shape matrix_shape = ShapeUtil::MakeShape(F32, {8, 8});
+  const Shape matrix_shape = ShapeUtil::MakeShape(F32, {8, 8});
   Window window;
   WindowDimension dim;
   dim.set_size(2);
@@ -377,78 +393,85 @@ TEST_F(ShapeInferenceTest, ReduceWindowInHalf) {
   dim.set_base_dilation(1);
   *window.add_dimensions() = dim;
   *window.add_dimensions() = dim;
-  Shape window_shape = ShapeUtil::MakeShape(F32, {2, 2});
-  Shape init_value_shape = ShapeUtil::MakeShape(F32, {});
-  Shape float_scalar = ShapeUtil::MakeShape(F32, {});
+  const Shape window_shape = ShapeUtil::MakeShape(F32, {2, 2});
+  const Shape init_value_shape = ShapeUtil::MakeShape(F32, {});
+  const Shape float_scalar = ShapeUtil::MakeShape(F32, {});
   ProgramShape to_apply = ShapeUtil::MakeProgramShape(
       {ShapeUtil::MakeShape(F32, {}), ShapeUtil::MakeShape(F32, {})}, f32_);
-  auto inferred_status = ShapeInference::InferReduceWindowShape(
-      matrix_shape, init_value_shape, window, to_apply);
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferReduceWindowShape(matrix_shape, init_value_shape,
+                                             window, to_apply);
 
-  ASSERT_IS_OK(inferred_status.status());
+  ASSERT_IS_OK(inferred_shape.status());
   ASSERT_TRUE(
-      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {4, 4}), *inferred_status));
+      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {4, 4}), *inferred_shape));
 }
 
 TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterProperShapes) {
-  auto inferred_status_ok = ShapeInference::InferSelectAndScatterShape(
-      operand_shape_, select_program_shape_, window_, source_shape_,
-      init_value_shape_, scatter_program_shape_);
-  ASSERT_IS_OK(inferred_status_ok.status());
-  ASSERT_TRUE(ShapeUtil::Equal(operand_shape_, *inferred_status_ok));
+  const absl::StatusOr<Shape> inferred_shape_ok =
+      ShapeInference::InferSelectAndScatterShape(
+          operand_shape_, select_program_shape_, window_, source_shape_,
+          init_value_shape_, scatter_program_shape_);
+  ASSERT_IS_OK(inferred_shape_ok.status());
+  ASSERT_TRUE(ShapeUtil::Equal(operand_shape_, *inferred_shape_ok));
 }
 
 TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSourceShape) {
-  Shape source_shape_fail = ShapeUtil::MakeShape(F32, {4, 6});
-  auto inferred_status_fail = ShapeInference::InferSelectAndScatterShape(
-      operand_shape_, select_program_shape_, window_, source_shape_fail,
-      init_value_shape_, scatter_program_shape_);
-  ASSERT_FALSE(inferred_status_fail.ok());
-  ASSERT_THAT(inferred_status_fail.status().message(),
+  const Shape source_shape_fail = ShapeUtil::MakeShape(F32, {4, 6});
+  const absl::StatusOr<Shape> inferred_shape_fail =
+      ShapeInference::InferSelectAndScatterShape(
+          operand_shape_, select_program_shape_, window_, source_shape_fail,
+          init_value_shape_, scatter_program_shape_);
+  ASSERT_FALSE(inferred_shape_fail.ok());
+  ASSERT_THAT(inferred_shape_fail.status().message(),
               HasSubstr("Source shape does not match"));
 }
 
 TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape1) {
   ProgramShape select_program_shape_fail =
       ShapeUtil::MakeProgramShape({ShapeUtil::MakeShape(F32, {})}, pred_);
-  auto inferred_status_fail = ShapeInference::InferSelectAndScatterShape(
-      operand_shape_, select_program_shape_fail, window_, source_shape_,
-      init_value_shape_, scatter_program_shape_);
-  ASSERT_FALSE(inferred_status_fail.ok());
-  ASSERT_THAT(inferred_status_fail.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_fail =
+      ShapeInference::InferSelectAndScatterShape(
+          operand_shape_, select_program_shape_fail, window_, source_shape_,
+          init_value_shape_, scatter_program_shape_);
+  ASSERT_FALSE(inferred_shape_fail.ok());
+  ASSERT_THAT(inferred_shape_fail.status().message(),
               HasSubstr("Select function must take 2 parameters"));
 }
 
 TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape2) {
   ProgramShape select_program_shape_fail = ShapeUtil::MakeProgramShape(
       {ShapeUtil::MakeShape(F32, {}), ShapeUtil::MakeShape(F32, {})}, f32_);
-  auto inferred_status_fail = ShapeInference::InferSelectAndScatterShape(
-      operand_shape_, select_program_shape_fail, window_, source_shape_,
-      init_value_shape_, scatter_program_shape_);
-  ASSERT_FALSE(inferred_status_fail.ok());
-  ASSERT_THAT(inferred_status_fail.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_fail =
+      ShapeInference::InferSelectAndScatterShape(
+          operand_shape_, select_program_shape_fail, window_, source_shape_,
+          init_value_shape_, scatter_program_shape_);
+  ASSERT_FALSE(inferred_shape_fail.ok());
+  ASSERT_THAT(inferred_shape_fail.status().message(),
               HasSubstr("Select function must have rank-0 PRED"));
 }
 
 TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape3) {
   ProgramShape select_program_shape_fail = ShapeUtil::MakeProgramShape(
       {ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(F32, {})}, pred_);
-  auto inferred_status_fail = ShapeInference::InferSelectAndScatterShape(
-      operand_shape_, select_program_shape_fail, window_, source_shape_,
-      init_value_shape_, scatter_program_shape_);
-  ASSERT_FALSE(inferred_status_fail.ok());
-  ASSERT_THAT(inferred_status_fail.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_fail =
+      ShapeInference::InferSelectAndScatterShape(
+          operand_shape_, select_program_shape_fail, window_, source_shape_,
+          init_value_shape_, scatter_program_shape_);
+  ASSERT_FALSE(inferred_shape_fail.ok());
+  ASSERT_THAT(inferred_shape_fail.status().message(),
               HasSubstr("Select function's first parameter"));
 }
 
 TEST_F(SelectAndScatterShapeInferenceTest, SelectAndScatterWrongSelectShape4) {
   ProgramShape select_program_shape_fail = ShapeUtil::MakeProgramShape(
       {ShapeUtil::MakeShape(F32, {}), ShapeUtil::MakeShape(U32, {})}, pred_);
-  auto inferred_status_fail = ShapeInference::InferSelectAndScatterShape(
-      operand_shape_, select_program_shape_fail, window_, source_shape_,
-      init_value_shape_, scatter_program_shape_);
-  ASSERT_FALSE(inferred_status_fail.ok());
-  ASSERT_THAT(inferred_status_fail.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_fail =
+      ShapeInference::InferSelectAndScatterShape(
+          operand_shape_, select_program_shape_fail, window_, source_shape_,
+          init_value_shape_, scatter_program_shape_);
+  ASSERT_FALSE(inferred_shape_fail.ok());
+  ASSERT_THAT(inferred_shape_fail.status().message(),
               HasSubstr("Select function's second parameter"));
 }
 
@@ -457,8 +480,9 @@ TEST_F(ShapeInferenceTest, AllGatherStart) {
   const Shape expected_shape = ShapeUtil::MakeTupleShape(
       {operand, ShapeUtil::MakeShape(F32, {8, 8, 4})});
 
-  auto inferred_ag_shape = ShapeInference::InferAllGatherStartShape(
-      {&operand}, /*all_gather_dimension=*/0, /*shard_count=*/8);
+  const absl::StatusOr<Shape> inferred_ag_shape =
+      ShapeInference::InferAllGatherStartShape(
+          {&operand}, /*all_gather_dimension=*/0, /*shard_count=*/8);
   EXPECT_TRUE(inferred_ag_shape.ok());
   EXPECT_TRUE(ShapeUtil::Equal(*inferred_ag_shape, expected_shape));
 }
@@ -475,8 +499,10 @@ TEST_F(ShapeInferenceTest, AllGatherStartMultiOperand) {
        ShapeUtil::MakeTupleShape(
            {expected_output0_shape, expected_output1_shape})});
 
-  auto inferred_ag_shape = ShapeInference::InferAllGatherStartShape(
-      {&operand0, &operand1}, /*all_gather_dimension=*/0, /*shard_count=*/8);
+  const absl::StatusOr<Shape> inferred_ag_shape =
+      ShapeInference::InferAllGatherStartShape({&operand0, &operand1},
+                                               /*all_gather_dimension=*/0,
+                                               /*shard_count=*/8);
   EXPECT_TRUE(inferred_ag_shape.ok());
   EXPECT_TRUE(ShapeUtil::Equal(*inferred_ag_shape, expected_shape));
 }
@@ -487,7 +513,7 @@ TEST_F(ShapeInferenceTest, AllGatherDone) {
                                  ShapeUtil::MakeShape(F32, {8, 8, 4})});
   const Shape expected_shape = ShapeUtil::MakeShape(F32, {8, 8, 4});
 
-  auto inferred_ag_done_shape =
+  const absl::StatusOr<Shape> inferred_ag_done_shape =
       ShapeInference::InferAllGatherDoneShape(input_shape);
   EXPECT_TRUE(inferred_ag_done_shape.ok());
   EXPECT_TRUE(ShapeUtil::Equal(*inferred_ag_done_shape, expected_shape));
@@ -508,7 +534,7 @@ TEST_F(ShapeInferenceTest, AllGatherDoneMultiOperand) {
   const Shape expected_shape = ShapeUtil::MakeTupleShape(
       {expected_output0_shape, expected_output1_shape});
 
-  auto inferred_ag_done_shape =
+  const absl::StatusOr<Shape> inferred_ag_done_shape =
       ShapeInference::InferAllGatherDoneShape(input_shape);
   EXPECT_TRUE(inferred_ag_done_shape.ok());
   EXPECT_TRUE(ShapeUtil::Equal(*inferred_ag_done_shape, expected_shape));
@@ -518,7 +544,7 @@ TEST_F(ShapeInferenceTest, Convolve) {
   ConvolutionDimensionNumbers dnums;
 
   // Dimension order: batch, feature, x0, x1
-  Shape lhs_shape = ShapeUtil::MakeShape(F32, {10, 11, 3, 4});
+  const Shape lhs_shape = ShapeUtil::MakeShape(F32, {10, 11, 3, 4});
   dnums.set_input_batch_dimension(0);
   dnums.set_output_batch_dimension(0);
   dnums.set_input_feature_dimension(1);
@@ -529,15 +555,15 @@ TEST_F(ShapeInferenceTest, Convolve) {
   dnums.add_output_spatial_dimensions(3);
 
   // Dimension order: x1, batch, feature, x0
-  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 12, 11, 3});
+  const Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 12, 11, 3});
   dnums.set_kernel_input_feature_dimension(2);
   dnums.set_kernel_output_feature_dimension(1);
   dnums.add_kernel_spatial_dimensions(3);
   dnums.add_kernel_spatial_dimensions(0);
 
   Window window;
-  auto dim0 = window.add_dimensions();
-  auto dim1 = window.add_dimensions();
+  const auto dim0 = window.add_dimensions();
+  const auto dim1 = window.add_dimensions();
   dim0->set_size(3);
   dim0->set_stride(2);
   dim0->set_padding_low(1);
@@ -550,19 +576,21 @@ TEST_F(ShapeInferenceTest, Convolve) {
   dim1->set_padding_high(0);
   dim1->set_window_dilation(1);
   dim1->set_base_dilation(1);
-  auto inferred_status = ShapeInference::InferConvolveShape(
-      lhs_shape, rhs_shape, /*feature_group_count=*/1, /*batch_group_count=*/1,
-      window, dnums, /*preferred_element_type=*/std::nullopt);
-  ASSERT_IS_OK(inferred_status.status());
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferConvolveShape(
+          lhs_shape, rhs_shape, /*feature_group_count=*/1,
+          /*batch_group_count=*/1, window, dnums,
+          /*preferred_element_type=*/std::nullopt);
+  ASSERT_IS_OK(inferred_shape.status());
   ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {10, 12, 2, 3}),
-                               *inferred_status));
+                               *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, ConvolveWithWindowDilation) {
   ConvolutionDimensionNumbers dnums;
 
   // Dimension order: batch, feature, x0, x1
-  Shape lhs_shape = ShapeUtil::MakeShape(F32, {10, 11, 103, 4});
+  const Shape lhs_shape = ShapeUtil::MakeShape(F32, {10, 11, 103, 4});
   dnums.set_input_batch_dimension(0);
   dnums.set_output_batch_dimension(0);
   dnums.set_input_feature_dimension(1);
@@ -573,14 +601,14 @@ TEST_F(ShapeInferenceTest, ConvolveWithWindowDilation) {
   dnums.add_output_spatial_dimensions(3);
 
   // Dimension order: x1, batch, feature, x0
-  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 12, 11, 3});
+  const Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 12, 11, 3});
   dnums.set_kernel_input_feature_dimension(2);
   dnums.set_kernel_output_feature_dimension(1);
   dnums.add_kernel_spatial_dimensions(3);
   dnums.add_kernel_spatial_dimensions(0);
 
   Window window;
-  auto dim0 = window.add_dimensions();
+  const auto dim0 = window.add_dimensions();
   dim0->set_size(3);
   dim0->set_stride(3);
   dim0->set_padding_low(0);
@@ -588,26 +616,28 @@ TEST_F(ShapeInferenceTest, ConvolveWithWindowDilation) {
   dim0->set_window_dilation(6);
   dim0->set_base_dilation(1);
 
-  auto dim1 = window.add_dimensions();
+  const auto dim1 = window.add_dimensions();
   dim1->set_size(2);
   dim1->set_stride(1);
   dim1->set_padding_low(2);
   dim1->set_padding_high(1);
   dim1->set_window_dilation(2);
   dim1->set_base_dilation(1);
-  auto inferred_status = ShapeInference::InferConvolveShape(
-      lhs_shape, rhs_shape, /*feature_group_count=*/1, /*batch_group_count=*/1,
-      window, dnums, /*preferred_element_type=*/std::nullopt);
-  ASSERT_IS_OK(inferred_status.status());
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferConvolveShape(
+          lhs_shape, rhs_shape, /*feature_group_count=*/1,
+          /*batch_group_count=*/1, window, dnums,
+          /*preferred_element_type=*/std::nullopt);
+  ASSERT_IS_OK(inferred_shape.status());
   ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {10, 12, 31, 5}),
-                               *inferred_status));
+                               *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, ConvolveWithBaseDilation) {
   ConvolutionDimensionNumbers dnums;
 
   // Dimension order: batch, feature, x0, x1
-  Shape lhs_shape = ShapeUtil::MakeShape(F32, {10, 11, 3, 4});
+  const Shape lhs_shape = ShapeUtil::MakeShape(F32, {10, 11, 3, 4});
   dnums.set_input_batch_dimension(0);
   dnums.set_output_batch_dimension(0);
   dnums.set_input_feature_dimension(1);
@@ -618,14 +648,14 @@ TEST_F(ShapeInferenceTest, ConvolveWithBaseDilation) {
   dnums.add_output_spatial_dimensions(3);
 
   // Dimension order: x1, batch, feature, x0
-  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 12, 11, 4});
+  const Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 12, 11, 4});
   dnums.set_kernel_input_feature_dimension(2);
   dnums.set_kernel_output_feature_dimension(1);
   dnums.add_kernel_spatial_dimensions(3);
   dnums.add_kernel_spatial_dimensions(0);
 
   Window window;
-  auto dim0 = window.add_dimensions();
+  const auto dim0 = window.add_dimensions();
   dim0->set_size(4);
   dim0->set_stride(3);
   dim0->set_padding_low(0);
@@ -633,25 +663,27 @@ TEST_F(ShapeInferenceTest, ConvolveWithBaseDilation) {
   dim0->set_window_dilation(1);
   dim0->set_base_dilation(6);
 
-  auto dim1 = window.add_dimensions();
+  const auto dim1 = window.add_dimensions();
   dim1->set_size(2);
   dim1->set_stride(1);
   dim1->set_padding_low(2);
   dim1->set_padding_high(1);
   dim1->set_window_dilation(1);
   dim1->set_base_dilation(2);
-  auto inferred_status = ShapeInference::InferConvolveShape(
-      lhs_shape, rhs_shape, /*feature_group_count=*/1, /*batch_group_count=*/1,
-      window, dnums, /*preferred_element_type=*/std::nullopt);
-  ASSERT_IS_OK(inferred_status.status());
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferConvolveShape(
+          lhs_shape, rhs_shape, /*feature_group_count=*/1,
+          /*batch_group_count=*/1, window, dnums,
+          /*preferred_element_type=*/std::nullopt);
+  ASSERT_IS_OK(inferred_shape.status());
   ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {10, 12, 4, 9}),
-                               *inferred_status));
+                               *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, ConvolveDimensionNumbersOverlapError) {
   // Dimension order for this test: batch, feature, x0, x1
-  Shape lhs_shape = ShapeUtil::MakeShape(F32, {10, 11, 3, 4});
-  Shape rhs_shape = ShapeUtil::MakeShape(F32, {12, 11, 3, 2});
+  const Shape lhs_shape = ShapeUtil::MakeShape(F32, {10, 11, 3, 4});
+  const Shape rhs_shape = ShapeUtil::MakeShape(F32, {12, 11, 3, 2});
 
   ConvolutionDimensionNumbers dnums;
   dnums.set_input_batch_dimension(3);
@@ -668,8 +700,8 @@ TEST_F(ShapeInferenceTest, ConvolveDimensionNumbersOverlapError) {
   dnums.add_kernel_spatial_dimensions(1);
 
   Window window;
-  auto dim0 = window.add_dimensions();
-  auto dim1 = window.add_dimensions();
+  const auto dim0 = window.add_dimensions();
+  const auto dim1 = window.add_dimensions();
   dim0->set_size(2);
   dim0->set_stride(1);
   dim0->set_padding_low(0);
@@ -678,11 +710,13 @@ TEST_F(ShapeInferenceTest, ConvolveDimensionNumbersOverlapError) {
   dim1->set_stride(2);
   dim1->set_padding_low(1);
   dim1->set_padding_high(1);
-  auto inferred_status = ShapeInference::InferConvolveShape(
-      lhs_shape, rhs_shape, /*feature_group_count=*/1, /*batch_group_count=*/1,
-      window, dnums, /*preferred_element_type=*/std::nullopt);
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_THAT(inferred_status.status().message(),
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferConvolveShape(
+          lhs_shape, rhs_shape, /*feature_group_count=*/1,
+          /*batch_group_count=*/1, window, dnums,
+          /*preferred_element_type=*/std::nullopt);
+  ASSERT_FALSE(inferred_shape.ok());
+  ASSERT_THAT(inferred_shape.status().message(),
               HasSubstr("each dimension exactly once"));
 }
 
@@ -700,11 +734,11 @@ TEST_F(ShapeInferenceTest, ConvolveBatchGroupCountUnequalOutputFeature) {
   dnums.set_output_feature_dimension(1);
   dnums.add_output_spatial_dimensions(2);
   dnums.add_output_spatial_dimensions(3);
-  Shape lhs_shape = ShapeUtil::MakeShape(F32, {60, 38, 17, 13});
-  Shape rhs_shape = ShapeUtil::MakeShape(F32, {38, 10, 4, 4});
+  const Shape lhs_shape = ShapeUtil::MakeShape(F32, {60, 38, 17, 13});
+  const Shape rhs_shape = ShapeUtil::MakeShape(F32, {38, 10, 4, 4});
   Window window;
-  auto dim0 = window.add_dimensions();
-  auto dim1 = window.add_dimensions();
+  const auto dim0 = window.add_dimensions();
+  const auto dim1 = window.add_dimensions();
   dim0->set_size(4);
   dim1->set_size(4);
   dim0->set_padding_low(0);
@@ -715,11 +749,13 @@ TEST_F(ShapeInferenceTest, ConvolveBatchGroupCountUnequalOutputFeature) {
   dim1->set_stride(1);
   dim0->set_window_dilation(3);
   dim1->set_window_dilation(2);
-  auto inferred_status = ShapeInference::InferConvolveShape(
-      lhs_shape, rhs_shape, /*feature_group_count=*/1, /*batch_group_count=*/6,
-      window, dnums, /*preferred_element_type=*/std::nullopt);
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_THAT(inferred_status.status().message(),
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferConvolveShape(
+          lhs_shape, rhs_shape, /*feature_group_count=*/1,
+          /*batch_group_count=*/6, window, dnums,
+          /*preferred_element_type=*/std::nullopt);
+  ASSERT_FALSE(inferred_shape.ok());
+  ASSERT_THAT(inferred_shape.status().message(),
               HasSubstr("to be a multiple of batch group count"));
 }
 
@@ -772,7 +808,7 @@ ConvolveArgs MakeConvolveArgs(PrimitiveType lhs_type, PrimitiveType rhs_type) {
 TEST_F(ShapeInferenceTest, ConvolveWithBF16_F16) {
   ConvolveArgs args = MakeConvolveArgs(BF16, F16);
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape inferred_shape,
+      const Shape inferred_shape,
       ShapeInference::InferConvolveShape(
           args.lhs_shape, args.rhs_shape, /*feature_group_count=*/1,
           /*batch_group_count=*/1, args.window, args.dnums,
@@ -784,7 +820,7 @@ TEST_F(ShapeInferenceTest, ConvolveWithBF16_F16) {
 TEST_F(ShapeInferenceTest, ConvolveWithF16_BF16) {
   ConvolveArgs args = MakeConvolveArgs(F16, BF16);
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape inferred_shape,
+      const Shape inferred_shape,
       ShapeInference::InferConvolveShape(
           args.lhs_shape, args.rhs_shape, /*feature_group_count=*/1,
           /*batch_group_count=*/1, args.window, args.dnums,
@@ -796,7 +832,7 @@ TEST_F(ShapeInferenceTest, ConvolveWithF16_BF16) {
 TEST_F(ShapeInferenceTest, ConvolveWithS32_U32) {
   ConvolveArgs args = MakeConvolveArgs(S32, U32);
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape inferred_shape,
+      const Shape inferred_shape,
       ShapeInference::InferConvolveShape(
           args.lhs_shape, args.rhs_shape, /*feature_group_count=*/1,
           /*batch_group_count=*/1, args.window, args.dnums,
@@ -808,7 +844,7 @@ TEST_F(ShapeInferenceTest, ConvolveWithS32_U32) {
 TEST_F(ShapeInferenceTest, ConvolveWithU32_S32) {
   ConvolveArgs args = MakeConvolveArgs(U32, S32);
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape inferred_shape,
+      const Shape inferred_shape,
       ShapeInference::InferConvolveShape(
           args.lhs_shape, args.rhs_shape, /*feature_group_count=*/1,
           /*batch_group_count=*/1, args.window, args.dnums,
@@ -820,7 +856,7 @@ TEST_F(ShapeInferenceTest, ConvolveWithU32_S32) {
 TEST_F(ShapeInferenceTest, ConvolveWithPreferredElementType) {
   ConvolveArgs args = MakeConvolveArgs(S8, S16);
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape inferred_shape,
+      const Shape inferred_shape,
       ShapeInference::InferConvolveShape(
           args.lhs_shape, args.rhs_shape, /*feature_group_count=*/1,
           /*batch_group_count=*/1, args.window, args.dnums,
@@ -832,7 +868,7 @@ TEST_F(ShapeInferenceTest, ConvolveWithPreferredElementType) {
 TEST_F(ShapeInferenceTest, ConvolveWithPreferredElementTypeSameAsInferredType) {
   ConvolveArgs args = MakeConvolveArgs(S8, S16);
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape inferred_shape,
+      const Shape inferred_shape,
       ShapeInference::InferConvolveShape(
           args.lhs_shape, args.rhs_shape, /*feature_group_count=*/1,
           /*batch_group_count=*/1, args.window, args.dnums,
@@ -845,7 +881,7 @@ TEST_F(ShapeInferenceTest,
        FloatingPointConvolveWithNarrowerPreferredElementType) {
   ConvolveArgs args = MakeConvolveArgs(F32, F32);
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape inferred_shape,
+      const Shape inferred_shape,
       ShapeInference::InferConvolveShape(
           args.lhs_shape, args.rhs_shape, /*feature_group_count=*/1,
           /*batch_group_count=*/1, args.window, args.dnums,
@@ -858,7 +894,7 @@ TEST_F(ShapeInferenceTest,
        FloatingPointConvolveWithIntegralPreferredElementType) {
   ConvolveArgs args = MakeConvolveArgs(BF16, BF16);
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape inferred_shape,
+      const Shape inferred_shape,
       ShapeInference::InferConvolveShape(
           args.lhs_shape, args.rhs_shape, /*feature_group_count=*/1,
           /*batch_group_count=*/1, args.window, args.dnums,
@@ -871,7 +907,7 @@ TEST_F(ShapeInferenceTest,
        IntegralConvolveWithFloatingPointPreferredElementType) {
   ConvolveArgs args = MakeConvolveArgs(S8, S16);
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape inferred_shape,
+      const Shape inferred_shape,
       ShapeInference::InferConvolveShape(
           args.lhs_shape, args.rhs_shape, /*feature_group_count=*/1,
           /*batch_group_count=*/1, args.window, args.dnums,
@@ -884,7 +920,7 @@ TEST_F(ShapeInferenceTest,
        ConvolveWithPreferredElementTypeWithDifferentSignedness) {
   ConvolveArgs args = MakeConvolveArgs(S8, S16);
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape inferred_shape,
+      const Shape inferred_shape,
       ShapeInference::InferConvolveShape(
           args.lhs_shape, args.rhs_shape, /*feature_group_count=*/1,
           /*batch_group_count=*/1, args.window, args.dnums,
@@ -896,7 +932,7 @@ TEST_F(ShapeInferenceTest,
 TEST_F(ShapeInferenceTest, ConvolveWithNarrowerPreferredElementType) {
   ConvolveArgs args = MakeConvolveArgs(S8, S16);
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape inferred_shape,
+      const Shape inferred_shape,
       ShapeInference::InferConvolveShape(
           args.lhs_shape, args.rhs_shape, /*feature_group_count=*/1,
           /*batch_group_count=*/1, args.window, args.dnums,
@@ -918,16 +954,18 @@ static const char* innermost_dimension_matches =
 static void Pass(const Shape& shape, FftType type,
                  absl::Span<const int64_t> length,
                  const Shape& expected_shape) {
-  auto inferred_status = ShapeInference::InferFftShape(shape, type, length);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(expected_shape, *inferred_status));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferFftShape(shape, type, length);
+  ASSERT_IS_OK(inferred_shape.status());
+  ASSERT_TRUE(ShapeUtil::Equal(expected_shape, *inferred_shape));
 }
 
 static void Fail(const Shape& shape, FftType type,
                  absl::Span<const int64_t> length, absl::string_view message) {
-  auto inferred_status = ShapeInference::InferFftShape(shape, type, length);
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_THAT(inferred_status.status().message(),
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferFftShape(shape, type, length);
+  ASSERT_FALSE(inferred_shape.ok());
+  ASSERT_THAT(inferred_shape.status().message(),
               HasSubstr(std::string(message)));
 }
 
@@ -935,7 +973,7 @@ static void Fail(const Shape& shape, FftType type,
 
 TEST_F(ShapeInferenceTest, InferFftShapeTestFftRanks) {
   FftType type = FftType::FFT;
-  Shape shape = ShapeUtil::MakeShape(C64, {16, 8});
+  const Shape shape = ShapeUtil::MakeShape(C64, {16, 8});
   fft::Fail(shape, type, {}, fft::unsupported_rank);
   fft::Pass(shape, type, {8}, shape);
   fft::Pass(shape, type, {16, 8}, shape);
@@ -945,15 +983,15 @@ TEST_F(ShapeInferenceTest, InferFftShapeTestFftRanks) {
 
 TEST_F(ShapeInferenceTest, InferFftShapeTestFftTypes) {
   FftType type = FftType::FFT;
-  Shape shape_f32 = ShapeUtil::MakeShape(F32, {16, 8});
-  Shape shape_c128 = ShapeUtil::MakeShape(C128, {16, 8});
+  const Shape shape_f32 = ShapeUtil::MakeShape(F32, {16, 8});
+  const Shape shape_c128 = ShapeUtil::MakeShape(C128, {16, 8});
   fft::Fail(shape_f32, type, {16, 8}, fft::requires_complex_input);
   fft::Pass(shape_c128, type, {16, 8}, shape_c128);
 }
 
 TEST_F(ShapeInferenceTest, InferFftShapeTestIfftRanks) {
   FftType type = FftType::IFFT;
-  Shape shape = ShapeUtil::MakeShape(C64, {16, 8});
+  const Shape shape = ShapeUtil::MakeShape(C64, {16, 8});
   fft::Fail(shape, type, {}, fft::unsupported_rank);
   fft::Pass(shape, type, {8}, shape);
   fft::Pass(shape, type, {16, 8}, shape);
@@ -963,16 +1001,16 @@ TEST_F(ShapeInferenceTest, InferFftShapeTestIfftRanks) {
 
 TEST_F(ShapeInferenceTest, InferFftShapeTestIfftTypes) {
   FftType type = FftType::IFFT;
-  Shape shape_f32 = ShapeUtil::MakeShape(F32, {16, 8});
-  Shape shape_c128 = ShapeUtil::MakeShape(C128, {16, 8});
+  const Shape shape_f32 = ShapeUtil::MakeShape(F32, {16, 8});
+  const Shape shape_c128 = ShapeUtil::MakeShape(C128, {16, 8});
   fft::Fail(shape_f32, type, {16, 8}, fft::requires_complex_input);
   fft::Pass(shape_c128, type, {16, 8}, shape_c128);
 }
 
 TEST_F(ShapeInferenceTest, InferFftShapeTestRfftRanks) {
   FftType type = FftType::RFFT;
-  Shape shape_in = ShapeUtil::MakeShape(F32, {16, 8});
-  Shape shape_out = ShapeUtil::MakeShape(C64, {16, 5});
+  const Shape shape_in = ShapeUtil::MakeShape(F32, {16, 8});
+  const Shape shape_out = ShapeUtil::MakeShape(C64, {16, 5});
   fft::Fail(shape_in, type, {}, fft::unsupported_rank);
   fft::Pass(shape_in, type, {8}, shape_out);
   fft::Pass(shape_in, type, {16, 8}, shape_out);
@@ -982,36 +1020,36 @@ TEST_F(ShapeInferenceTest, InferFftShapeTestRfftRanks) {
 
 TEST_F(ShapeInferenceTest, InferFftShapeTestRfftDimensions) {
   FftType type = FftType::RFFT;
-  Shape shape = ShapeUtil::MakeShape(F32, {16, 8});
+  const Shape shape = ShapeUtil::MakeShape(F32, {16, 8});
   fft::Fail(shape, type, {4}, fft::dimensions_match);
   fft::Fail(shape, type, {16, 4}, fft::dimensions_match);
   fft::Fail(shape, type, {8, 8}, fft::dimensions_match);
   fft::Fail(shape, type, {8, 16}, fft::dimensions_match);
 
-  Shape zero_shape_in = ShapeUtil::MakeShape(F32, {16, 0});
-  Shape zero_shape_out = ShapeUtil::MakeShape(C64, {16, 0});
+  const Shape zero_shape_in = ShapeUtil::MakeShape(F32, {16, 0});
+  const Shape zero_shape_out = ShapeUtil::MakeShape(C64, {16, 0});
   fft::Pass(zero_shape_in, type, {0}, zero_shape_out);
   fft::Pass(zero_shape_in, type, {16, 0}, zero_shape_out);
 
-  Shape even_shape_in = ShapeUtil::MakeShape(F32, {16, 8});
-  Shape odd_shape_in = ShapeUtil::MakeShape(F32, {16, 9});
-  Shape shape_out = ShapeUtil::MakeShape(C64, {16, 5});
+  const Shape even_shape_in = ShapeUtil::MakeShape(F32, {16, 8});
+  const Shape odd_shape_in = ShapeUtil::MakeShape(F32, {16, 9});
+  const Shape shape_out = ShapeUtil::MakeShape(C64, {16, 5});
   fft::Pass(even_shape_in, type, {16, 8}, shape_out);
   fft::Pass(odd_shape_in, type, {16, 9}, shape_out);
 }
 
 TEST_F(ShapeInferenceTest, InferFftShapeTestRfftTypes) {
   FftType type = FftType::RFFT;
-  Shape shape_c64 = ShapeUtil::MakeShape(C64, {16, 8});
-  Shape shape_c128 = ShapeUtil::MakeShape(C128, {16, 8});
+  const Shape shape_c64 = ShapeUtil::MakeShape(C64, {16, 8});
+  const Shape shape_c128 = ShapeUtil::MakeShape(C128, {16, 8});
   fft::Fail(shape_c64, type, {16, 8}, fft::requires_f32_input);
   fft::Fail(shape_c128, type, {16, 8}, fft::requires_f32_input);
 }
 
 TEST_F(ShapeInferenceTest, InferFftShapeTestIrfftRanks) {
   FftType type = FftType::IRFFT;
-  Shape shape_in = ShapeUtil::MakeShape(C64, {16, 5});
-  Shape shape_out = ShapeUtil::MakeShape(F32, {16, 8});
+  const Shape shape_in = ShapeUtil::MakeShape(C64, {16, 5});
+  const Shape shape_out = ShapeUtil::MakeShape(F32, {16, 8});
   fft::Fail(shape_in, type, {}, fft::unsupported_rank);
   fft::Pass(shape_in, type, {8}, shape_out);
   fft::Pass(shape_in, type, {16, 8}, shape_out);
@@ -1021,142 +1059,151 @@ TEST_F(ShapeInferenceTest, InferFftShapeTestIrfftRanks) {
 
 TEST_F(ShapeInferenceTest, InferFftShapeTestIrfftDimensions) {
   FftType type = FftType::IRFFT;
-  Shape shape = ShapeUtil::MakeShape(C64, {16, 5});
+  const Shape shape = ShapeUtil::MakeShape(C64, {16, 5});
   fft::Fail(shape, type, {5}, fft::innermost_dimension_matches);
   fft::Fail(shape, type, {16, 5}, fft::innermost_dimension_matches);
   fft::Fail(shape, type, {8, 8}, fft::dimensions_match);
   fft::Fail(shape, type, {8, 9}, fft::dimensions_match);
 
-  Shape zero_shape_in = ShapeUtil::MakeShape(C64, {16, 0});
-  Shape zero_shape_out = ShapeUtil::MakeShape(F32, {16, 0});
+  const Shape zero_shape_in = ShapeUtil::MakeShape(C64, {16, 0});
+  const Shape zero_shape_out = ShapeUtil::MakeShape(F32, {16, 0});
   fft::Pass(zero_shape_in, type, {0}, zero_shape_out);
   fft::Pass(zero_shape_in, type, {16, 0}, zero_shape_out);
 
-  Shape even_shape_out = ShapeUtil::MakeShape(F32, {16, 8});
-  Shape odd_shape_out = ShapeUtil::MakeShape(F32, {16, 9});
+  const Shape even_shape_out = ShapeUtil::MakeShape(F32, {16, 8});
+  const Shape odd_shape_out = ShapeUtil::MakeShape(F32, {16, 9});
   fft::Pass(shape, type, {16, 8}, even_shape_out);
   fft::Pass(shape, type, {16, 9}, odd_shape_out);
 }
 
 TEST_F(ShapeInferenceTest, InferFftShapeTestIrfftTypes) {
   FftType type = FftType::IRFFT;
-  Shape shape_f32 = ShapeUtil::MakeShape(F32, {16, 8});
-  Shape shape_c128 = ShapeUtil::MakeShape(C128, {16, 5});
-  Shape shape_f64_out = ShapeUtil::MakeShape(F64, {16, 8});
+  const Shape shape_f32 = ShapeUtil::MakeShape(F32, {16, 8});
+  const Shape shape_c128 = ShapeUtil::MakeShape(C128, {16, 5});
+  const Shape shape_f64_out = ShapeUtil::MakeShape(F64, {16, 8});
   fft::Fail(shape_f32, type, {16, 8}, fft::requires_complex_input);
   fft::Pass(shape_c128, type, {16, 8}, shape_f64_out);
 }
 
 TEST_F(ShapeInferenceTest, MapThatChangesElementType) {
-  Shape arg = ShapeUtil::MakeShape(F32, {20});
+  const Shape arg = ShapeUtil::MakeShape(F32, {20});
   ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_}, s32_);
-  auto inferred_status = ShapeInference::InferMapShape({&arg}, to_apply, {0});
-  EXPECT_IS_OK(inferred_status.status());
-  Shape expected = ShapeUtil::MakeShape(S32, {20});
-  EXPECT_TRUE(ShapeUtil::Equal(expected, *inferred_status));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferMapShape({&arg}, to_apply, {0});
+  EXPECT_IS_OK(inferred_shape.status());
+  const Shape expected = ShapeUtil::MakeShape(S32, {20});
+  EXPECT_TRUE(ShapeUtil::Equal(expected, *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, Map) {
-  auto inferred_status_r1f32 = ShapeInference::InferMapShape(
-      {&vector_32_, &vector_32_},
-      ShapeUtil::MakeProgramShape({f32_, f32_}, f32_), {0});
-  EXPECT_IS_OK(inferred_status_r1f32.status());
-  EXPECT_TRUE(ShapeUtil::Equal(vector_32_, *inferred_status_r1f32));
+  const absl::StatusOr<Shape> inferred_shape_r1f32 =
+      ShapeInference::InferMapShape(
+          {&vector_32_, &vector_32_},
+          ShapeUtil::MakeProgramShape({f32_, f32_}, f32_), {0});
+  EXPECT_IS_OK(inferred_shape_r1f32.status());
+  EXPECT_TRUE(ShapeUtil::Equal(vector_32_, *inferred_shape_r1f32));
 
   // It's OK to provide a single argument, as long as the applied arity matches
   // (this degenerates to a Map).
-  auto inferred_status_r1f32_one = ShapeInference::InferMapShape(
-      {&vector_32_}, ShapeUtil::MakeProgramShape({f32_}, f32_), {0});
-  EXPECT_IS_OK(inferred_status_r1f32_one.status());
-  EXPECT_TRUE(ShapeUtil::Equal(vector_32_, *inferred_status_r1f32_one));
-
-  auto inferred_status_r2s32 = ShapeInference::InferMapShape(
-      {&s32matrix_64_64_, &s32matrix_64_64_, &s32matrix_64_64_},
-      ShapeUtil::MakeProgramShape({s32_, s32_, s32_}, s32_), {0, 1});
-  EXPECT_IS_OK(inferred_status_r2s32.status());
-  EXPECT_TRUE(ShapeUtil::Equal(s32matrix_64_64_, *inferred_status_r2s32));
-
-  auto no_args_error = ShapeInference::InferMapShape(
+  const absl::StatusOr<Shape> inferred_shape_r1f32_one =
+      ShapeInference::InferMapShape(
+          {&vector_32_}, ShapeUtil::MakeProgramShape({f32_}, f32_), {0});
+  EXPECT_IS_OK(inferred_shape_r1f32_one.status());
+  EXPECT_TRUE(ShapeUtil::Equal(vector_32_, *inferred_shape_r1f32_one));
+
+  const absl::StatusOr<Shape> inferred_shape_r2s32 =
+      ShapeInference::InferMapShape(
+          {&s32matrix_64_64_, &s32matrix_64_64_, &s32matrix_64_64_},
+          ShapeUtil::MakeProgramShape({s32_, s32_, s32_}, s32_), {0, 1});
+  EXPECT_IS_OK(inferred_shape_r2s32.status());
+  EXPECT_TRUE(ShapeUtil::Equal(s32matrix_64_64_, *inferred_shape_r2s32));
+
+  const auto no_args_error = ShapeInference::InferMapShape(
       {}, ShapeUtil::MakeProgramShape({f32_, f32_}, f32_), {});
   ASSERT_FALSE(no_args_error.ok());
   ASSERT_THAT(no_args_error.status().message(),
               HasSubstr("expects at least one argument"));
 
-  auto args_diff_shapes_error = ShapeInference::InferMapShape(
+  const auto args_diff_shapes_error = ShapeInference::InferMapShape(
       {&vector_32_, &vector_64_},
       ShapeUtil::MakeProgramShape({f32_, f32_}, f32_), {0});
   ASSERT_FALSE(args_diff_shapes_error.ok());
   ASSERT_THAT(args_diff_shapes_error.status().message(),
               HasSubstr("requires all operands to have the same shape"));
 
-  auto arity_error = ShapeInference::InferMapShape(
+  const auto arity_error = ShapeInference::InferMapShape(
       {&vector_32_, &vector_32_}, ShapeUtil::MakeProgramShape({f32_}, f32_),
       {0});
   ASSERT_FALSE(arity_error.ok());
   ASSERT_THAT(arity_error.status().message(),
               HasSubstr("function arity must match"));
 
-  auto output_shape_error = ShapeInference::InferMapShape(
+  const auto output_shape_error = ShapeInference::InferMapShape(
       {&vector_32_, &vector_32_},
       ShapeUtil::MakeProgramShape({f32_, f32_}, vector_32_), {0});
   ASSERT_FALSE(output_shape_error.ok());
   ASSERT_THAT(output_shape_error.status().message(),
               HasSubstr("result has to be a scalar"));
 
-  auto param_shape_error = ShapeInference::InferMapShape(
+  const auto param_shape_error = ShapeInference::InferMapShape(
       {&vector_32_, &vector_32_},
       ShapeUtil::MakeProgramShape({vector_32_, f32_}, f32_), {0});
   ASSERT_FALSE(param_shape_error.ok());
   ASSERT_THAT(param_shape_error.status().message(),
               HasSubstr("parameter has to be a scalar"));
 
-  auto param_element_type_error = ShapeInference::InferMapShape(
+  const auto param_element_type_error = ShapeInference::InferMapShape(
       {&vector_32_, &vector_32_},
       ShapeUtil::MakeProgramShape({f32_, s32_}, f32_), {0});
   ASSERT_FALSE(param_element_type_error.ok());
   ASSERT_THAT(param_element_type_error.status().message(),
               HasSubstr("parameter type has to match argument"));
 
-  Shape arg = ShapeUtil::MakeShape(F32, {20});
+  const Shape arg = ShapeUtil::MakeShape(F32, {20});
   ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_}, f32_);
-  auto inferred_status = ShapeInference::InferMapShape({&arg}, to_apply, {0});
-  EXPECT_IS_OK(inferred_status.status());
-  EXPECT_TRUE(ShapeUtil::Equal(arg, *inferred_status));
-
-  auto inferred_status_error1 = ShapeInference::InferMapShape(
-      {&arg}, ShapeUtil::MakeProgramShape({f32_, f32_}, f32_), {0});
-  ASSERT_FALSE(inferred_status_error1.ok());
-  ASSERT_THAT(inferred_status_error1.status().message(),
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferMapShape({&arg}, to_apply, {0});
+  EXPECT_IS_OK(inferred_shape.status());
+  EXPECT_TRUE(ShapeUtil::Equal(arg, *inferred_shape));
+
+  const absl::StatusOr<Shape> inferred_shape_error1 =
+      ShapeInference::InferMapShape(
+          {&arg}, ShapeUtil::MakeProgramShape({f32_, f32_}, f32_), {0});
+  ASSERT_FALSE(inferred_shape_error1.ok());
+  ASSERT_THAT(inferred_shape_error1.status().message(),
               HasSubstr("arity must match number of arguments"));
 
-  auto inferred_status_error2 = ShapeInference::InferMapShape(
-      {&arg}, ShapeUtil::MakeProgramShape({vector_32_}, f32_), {0});
-  ASSERT_FALSE(inferred_status_error2.ok());
-  ASSERT_THAT(inferred_status_error2.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error2 =
+      ShapeInference::InferMapShape(
+          {&arg}, ShapeUtil::MakeProgramShape({vector_32_}, f32_), {0});
+  ASSERT_FALSE(inferred_shape_error2.ok());
+  ASSERT_THAT(inferred_shape_error2.status().message(),
               HasSubstr("has to be a scalar"));
 
-  auto inferred_status_error3 = ShapeInference::InferMapShape(
-      {&arg}, ShapeUtil::MakeProgramShape({f32_}, vector_32_), {0});
-  ASSERT_FALSE(inferred_status_error3.ok());
-  ASSERT_THAT(inferred_status_error3.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error3 =
+      ShapeInference::InferMapShape(
+          {&arg}, ShapeUtil::MakeProgramShape({f32_}, vector_32_), {0});
+  ASSERT_FALSE(inferred_shape_error3.ok());
+  ASSERT_THAT(inferred_shape_error3.status().message(),
               HasSubstr("has to be a scalar"));
 
-  auto inferred_status_error5 = ShapeInference::InferMapShape(
-      {&arg}, ShapeUtil::MakeProgramShape({s32_}, s32_), {0});
-  ASSERT_FALSE(inferred_status_error5.ok());
-  ASSERT_THAT(inferred_status_error5.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error5 =
+      ShapeInference::InferMapShape(
+          {&arg}, ShapeUtil::MakeProgramShape({s32_}, s32_), {0});
+  ASSERT_FALSE(inferred_shape_error5.ok());
+  ASSERT_THAT(inferred_shape_error5.status().message(),
               HasSubstr("parameter type has to match argument"));
 }
 
 TEST_F(ShapeInferenceTest, MapWithDifferentInputTypes) {
-  Shape arg0 = ShapeUtil::MakeShape(F32, {20});
-  Shape arg1 = ShapeUtil::MakeShape(S32, {20});
+  const Shape arg0 = ShapeUtil::MakeShape(F32, {20});
+  const Shape arg1 = ShapeUtil::MakeShape(S32, {20});
   ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_, s32_}, s32_);
-  auto inferred_status =
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferMapShape({&arg0, &arg1}, to_apply, {0});
-  EXPECT_IS_OK(inferred_status.status());
-  Shape expected = ShapeUtil::MakeShape(S32, {20});
-  EXPECT_TRUE(ShapeUtil::Equal(expected, *inferred_status));
+  EXPECT_IS_OK(inferred_shape.status());
+  const Shape expected = ShapeUtil::MakeShape(S32, {20});
+  EXPECT_TRUE(ShapeUtil::Equal(expected, *inferred_shape));
 }
 
 TEST_F(ReduceShapeInferenceTest, ReduceVectorToScalar) {
@@ -1205,20 +1252,20 @@ TEST_F(ReduceShapeInferenceTest, ReduceCubeAmongAllDimensions) {
 }
 
 TEST_F(ReduceShapeInferenceTest, ReduceMultiOutput) {
-  Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
-  Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3});
+  const Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
+  const Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3});
   ProgramShape to_apply = ShapeUtil::MakeProgramShape(
       {f32_, s32_, f32_, s32_}, ShapeUtil::MakeTupleShape({f32_, s32_}));
-  auto inferred_status = ShapeInference::InferReduceShape(
+  const absl::StatusOr<Shape> inferred_shape = ShapeInference::InferReduceShape(
       {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply);
-  EXPECT_IS_OK(inferred_status.status());
+  EXPECT_IS_OK(inferred_shape.status());
   EXPECT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeTupleShape({f32_, s32_}),
-                               *inferred_status));
+                               *inferred_shape));
 }
 
 TEST_F(ReduceShapeInferenceTest, ReduceWindowMultiOutput) {
-  Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3, 1});
-  Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3, 1});
+  const Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3, 1});
+  const Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3, 1});
   std::vector<const Shape*> args = {&f32_arg_shape, &s32_arg_shape};
   std::vector<const Shape*> inits = {&f32_, &s32_};
   ProgramShape to_apply = ShapeUtil::MakeProgramShape(
@@ -1229,42 +1276,43 @@ TEST_F(ReduceShapeInferenceTest, ReduceWindowMultiOutput) {
       MakePadding(f32_arg_shape.dimensions(), window_dimensions, window_strides,
                   Padding::kValid);
   TF_ASSERT_OK_AND_ASSIGN(
-      Window window,
+      const Window window,
       ShapeInference::InferWindowFromDimensions(
           window_dimensions, window_strides, padding_values, {}, {}));
-  auto inferred_status = ShapeInference::InferReduceWindowShape(
-      absl::MakeSpan(args), absl::MakeSpan(inits), window, to_apply);
-  VLOG(2) << inferred_status->ToString() << "\n";
-  EXPECT_IS_OK(inferred_status.status());
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferReduceWindowShape(
+          absl::MakeSpan(args), absl::MakeSpan(inits), window, to_apply);
+  VLOG(2) << inferred_shape->ToString() << "\n";
+  EXPECT_IS_OK(inferred_shape.status());
   EXPECT_TRUE(ShapeUtil::Equal(
       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {5, 2, 0}),
                                  ShapeUtil::MakeShape(S32, {5, 2, 0})}),
-      *inferred_status));
+      *inferred_shape));
 }
 
 TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerInput1) {
-  Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
-  Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3});
+  const Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
+  const Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3});
   ProgramShape to_apply =
       ShapeUtil::MakeProgramShape({f32_, s32_, f32_, s32_, f32_, s32_},
                                   ShapeUtil::MakeTupleShape({f32_, s32_}));
-  auto inferred_status = ShapeInference::InferReduceShape(
+  const absl::StatusOr<Shape> inferred_shape = ShapeInference::InferReduceShape(
       {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply);
-  EXPECT_FALSE(inferred_status.ok());
-  EXPECT_THAT(inferred_status.status().message(),
+  EXPECT_FALSE(inferred_shape.ok());
+  EXPECT_THAT(inferred_shape.status().message(),
               HasSubstr("must take 4 parameters, but takes 6 parameter(s)"));
 }
 
 TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerInput2) {
-  Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
-  Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3});
+  const Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
+  const Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3});
   ProgramShape to_apply = ShapeUtil::MakeProgramShape(
       {s32_, s32_, f32_, s32_}, ShapeUtil::MakeTupleShape({f32_, s32_}));
-  auto inferred_status = ShapeInference::InferReduceShape(
+  const absl::StatusOr<Shape> inferred_shape = ShapeInference::InferReduceShape(
       {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply);
-  EXPECT_FALSE(inferred_status.ok());
+  EXPECT_FALSE(inferred_shape.ok());
   EXPECT_THAT(
-      inferred_status.status().message(),
+      inferred_shape.status().message(),
       HasSubstr(
           "parameter shape differs from the result shape: s32[] vs f32[]"));
 }
@@ -1272,15 +1320,16 @@ TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerInput2) {
 TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerInput3) {
   ProgramShape to_apply = ShapeUtil::MakeProgramShape(
       {s32_, s32_, f32_, s32_}, ShapeUtil::MakeTupleShape({f32_, s32_}));
-  auto inferred_status = ShapeInference::InferReduceShape({}, {0, 1}, to_apply);
-  EXPECT_FALSE(inferred_status.ok());
-  EXPECT_THAT(inferred_status.status().message(),
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferReduceShape({}, {0, 1}, to_apply);
+  EXPECT_FALSE(inferred_shape.ok());
+  EXPECT_THAT(inferred_shape.status().message(),
               HasSubstr("must have at least 2 arguments, has 0"));
 }
 
 TEST_F(ReduceShapeInferenceTest, ErrorBadReduceWindowInput) {
-  Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3, 1});
-  Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3, 1});
+  const Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3, 1});
+  const Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3, 1});
   std::vector<const Shape*> args = {&f32_arg_shape, &s32_arg_shape};
   std::vector<const Shape*> inits = {&f32_, &s32_};
   ProgramShape to_apply = ShapeUtil::MakeProgramShape(
@@ -1291,164 +1340,165 @@ TEST_F(ReduceShapeInferenceTest, ErrorBadReduceWindowInput) {
       MakePadding(f32_arg_shape.dimensions(), window_dimensions, window_strides,
                   Padding::kValid);
   TF_ASSERT_OK_AND_ASSIGN(
-      Window window,
+      const Window window,
       ShapeInference::InferWindowFromDimensions(
           window_dimensions, window_strides, padding_values, {}, {}));
-  auto inferred_status = ShapeInference::InferReduceWindowShape(
-      absl::MakeSpan(args), absl::MakeSpan(inits), window, to_apply);
-  EXPECT_FALSE(inferred_status.status().ok());
-  EXPECT_THAT(inferred_status.status().message(), HasSubstr("f32[] vs s32[]"));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferReduceWindowShape(
+          absl::MakeSpan(args), absl::MakeSpan(inits), window, to_apply);
+  EXPECT_FALSE(inferred_shape.status().ok());
+  EXPECT_THAT(inferred_shape.status().message(), HasSubstr("f32[] vs s32[]"));
 }
 
 TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerOutput1) {
-  Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
-  Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3});
+  const Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
+  const Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3});
   ProgramShape to_apply =
       ShapeUtil::MakeProgramShape({f32_, s32_, f32_, s32_}, f32_);
-  auto inferred_status = ShapeInference::InferReduceShape(
+  const absl::StatusOr<Shape> inferred_shape = ShapeInference::InferReduceShape(
       {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply);
-  EXPECT_FALSE(inferred_status.ok());
+  EXPECT_FALSE(inferred_shape.ok());
   EXPECT_THAT(
-      inferred_status.status().message(),
+      inferred_shape.status().message(),
       HasSubstr("must produce a tuple with 2 elements, but produces a scalar"));
 }
 
 TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerOutput2) {
-  Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
-  Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3});
+  const Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
+  const Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3});
   ProgramShape to_apply = ShapeUtil::MakeProgramShape(
       {f32_, s32_, f32_, s32_}, ShapeUtil::MakeTupleShape({f32_, s32_, s32_}));
-  auto inferred_status = ShapeInference::InferReduceShape(
+  const absl::StatusOr<Shape> inferred_shape = ShapeInference::InferReduceShape(
       {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply);
-  EXPECT_FALSE(inferred_status.ok());
+  EXPECT_FALSE(inferred_shape.ok());
   EXPECT_THAT(
-      inferred_status.status().message(),
+      inferred_shape.status().message(),
       HasSubstr("must produce a tuple with 2 elements, but has 3 elements"));
 }
 
 TEST_F(ReduceShapeInferenceTest, ErrorMultiOutputBadReducerBoth) {
-  Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
-  Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3});
+  const Shape f32_arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
+  const Shape s32_arg_shape = ShapeUtil::MakeShape(S32, {5, 3});
   ProgramShape to_apply = ShapeUtil::MakeProgramShape(
       {s32_, s32_, s32_, s32_}, ShapeUtil::MakeTupleShape({s32_, s32_}));
-  auto inferred_status = ShapeInference::InferReduceShape(
+  const absl::StatusOr<Shape> inferred_shape = ShapeInference::InferReduceShape(
       {&f32_arg_shape, &s32_arg_shape, &f32_, &s32_}, {0, 1}, to_apply);
-  EXPECT_FALSE(inferred_status.ok());
-  EXPECT_THAT(inferred_status.status().message(),
+  EXPECT_FALSE(inferred_shape.ok());
+  EXPECT_THAT(inferred_shape.status().message(),
               HasSubstr("accumulator shape at index 0 differs from the "
                         "init_value shape: s32[] vs f32[]"));
 }
 
 TEST_F(ReduceShapeInferenceTest, ErrorOutOfBoundsDimension) {
   ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_, f32_}, f32_);
-  Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
-  auto inferred_status = ShapeInference::InferReduceShape(
+  const Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
+  const absl::StatusOr<Shape> inferred_shape = ShapeInference::InferReduceShape(
       {&arg_shape, &f32_},
       /*dimensions_to_reduce=*/{3, 4}, to_apply);
-  EXPECT_FALSE(inferred_status.ok());
-  EXPECT_THAT(inferred_status.status().message(),
+  EXPECT_FALSE(inferred_shape.ok());
+  EXPECT_THAT(inferred_shape.status().message(),
               HasSubstr("out-of-bounds dimension"));
 }
 
 TEST_F(ReduceShapeInferenceTest, ErrorToApplyArity) {
   ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_, f32_, f32_}, f32_);
-  Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
-  auto inferred_status =
+  const Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferReduceShape({&arg_shape, &f32_},
                                        /*dimensions_to_reduce=*/{0}, to_apply);
-  EXPECT_FALSE(inferred_status.ok());
-  EXPECT_THAT(inferred_status.status().message(),
+  EXPECT_FALSE(inferred_shape.ok());
+  EXPECT_THAT(inferred_shape.status().message(),
               HasSubstr("take 2 parameters"));
 }
 
 TEST_F(ReduceShapeInferenceTest, ErrorElementTypeVsApplyType) {
   ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_, f32_}, s32_);
-  Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
-  auto inferred_status =
+  const Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferReduceShape({&arg_shape, &f32_},
                                        /*dimensions_to_reduce=*/{0}, to_apply);
-  EXPECT_FALSE(inferred_status.ok());
-  EXPECT_THAT(inferred_status.status().message(),
+  EXPECT_FALSE(inferred_shape.ok());
+  EXPECT_THAT(inferred_shape.status().message(),
               HasSubstr("0-th parameter shape differs"));
 }
 
 TEST_F(ReduceShapeInferenceTest, ReduceWithRepeatedReduceDimension) {
   ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_, f32_}, f32_);
-  Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
-  auto inferred_status = ShapeInference::InferReduceShape(
+  const Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
+  const absl::StatusOr<Shape> inferred_shape = ShapeInference::InferReduceShape(
       {&arg_shape, &f32_},
       /*dimensions_to_reduce=*/{0, 0}, to_apply);
-  EXPECT_FALSE(inferred_status.ok());
-  EXPECT_THAT(inferred_status.status().message(),
+  EXPECT_FALSE(inferred_shape.ok());
+  EXPECT_THAT(inferred_shape.status().message(),
               HasSubstr("Duplicate reduction dimension: 0"));
 }
 
 TEST_F(ShapeInferenceTest, InferSliceShapeRank2) {
-  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
-  auto inferred_status =
+  const Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {64, 64}, {1, 1});
-  ASSERT_IS_OK(inferred_status.status());
+  ASSERT_IS_OK(inferred_shape.status());
   ASSERT_TRUE(
-      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {32, 64}), *inferred_status));
+      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {32, 64}), *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, InferSliceWithDynamicDimensions) {
-  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64}, {true, true});
-  auto inferred_status =
+  const Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64}, {true, true});
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {33, 64}, {1, 1});
-  ASSERT_IS_OK(inferred_status.status());
+  ASSERT_IS_OK(inferred_shape.status());
   ASSERT_TRUE(ShapeUtil::Equal(
-      ShapeUtil::MakeShape(F32, {1, 64}, {false, true}), *inferred_status));
+      ShapeUtil::MakeShape(F32, {1, 64}, {false, true}), *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, InferSliceShapeRank2WithStrides) {
-  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
-  auto inferred_status =
+  const Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferSliceShape(matrix_shape, {32, 0}, {64, 64}, {2, 4});
-  ASSERT_IS_OK(inferred_status.status());
+  ASSERT_IS_OK(inferred_shape.status());
   ASSERT_TRUE(
-      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {16, 16}), *inferred_status));
+      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {16, 16}), *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, InferSliceShapeRank2WithStridesNotIntegral) {
-  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
-  auto inferred_status =
+  const Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferSliceShape(matrix_shape, {15, 0}, {20, 13}, {2, 4});
-  ASSERT_IS_OK(inferred_status.status());
+  ASSERT_IS_OK(inferred_shape.status());
   ASSERT_TRUE(
-      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {3, 4}), *inferred_status));
+      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {3, 4}), *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, InferInvalidStride) {
-  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
-  auto inferred_status =
+  const Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferSliceShape(matrix_shape, {127, 0}, {129, 2}, {0, 1});
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_EQ(tsl::error::INVALID_ARGUMENT, inferred_status.status().code());
+  ASSERT_FALSE(inferred_shape.ok());
+  ASSERT_EQ(tsl::error::INVALID_ARGUMENT, inferred_shape.status().code());
 }
 
 TEST_F(ShapeInferenceTest, InferOobSliceShapeRank2) {
-  Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
-  auto inferred_status =
+  const Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferSliceShape(matrix_shape, {127, 0}, {129, 2}, {1, 1});
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_EQ(tsl::error::INVALID_ARGUMENT, inferred_status.status().code());
+  ASSERT_FALSE(inferred_shape.ok());
+  ASSERT_EQ(tsl::error::INVALID_ARGUMENT, inferred_shape.status().code());
 }
 
 TEST_F(ShapeInferenceTest, InferSliceShapeRank1) {
-  Shape vector_shape = ShapeUtil::MakeShape(F32, {17});
-  auto inferred_status =
+  const Shape vector_shape = ShapeUtil::MakeShape(F32, {17});
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferSliceShape(vector_shape, {2}, {4}, {1});
-  ASSERT_TRUE(inferred_status.ok());
+  ASSERT_TRUE(inferred_shape.ok());
   ASSERT_TRUE(
-      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {2}), *inferred_status));
+      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {2}), *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, InferConstIndexShape) {
-  Shape tuple_shape = ShapeUtil::MakeTupleShape({f32_, s32_});
-  auto inferred0_status =
+  const Shape tuple_shape = ShapeUtil::MakeTupleShape({f32_, s32_});
+  const absl::StatusOr<Shape> inferred0_status =
       ShapeInference::InferGetTupleElementShape(tuple_shape, 0);
-  auto inferred1_status =
+  const absl::StatusOr<Shape> inferred1_status =
       ShapeInference::InferGetTupleElementShape(tuple_shape, 1);
   ASSERT_IS_OK(inferred0_status.status());
   ASSERT_IS_OK(inferred1_status.status());
@@ -1457,10 +1507,10 @@ TEST_F(ShapeInferenceTest, InferConstIndexShape) {
 }
 
 TEST_F(ShapeInferenceTest, InferTupleElementShapeOutOfBound) {
-  Shape tuple_shape = ShapeUtil::MakeTupleShape({f32_, s32_});
-  auto inferredNegative_status =
+  const Shape tuple_shape = ShapeUtil::MakeTupleShape({f32_, s32_});
+  const absl::StatusOr<Shape> inferredNegative_status =
       ShapeInference::InferGetTupleElementShape(tuple_shape, -1);
-  auto inferred2_status =
+  const absl::StatusOr<Shape> inferred2_status =
       ShapeInference::InferGetTupleElementShape(tuple_shape, 2);
   ASSERT_FALSE(inferredNegative_status.ok());
   ASSERT_FALSE(inferred2_status.ok());
@@ -1471,20 +1521,22 @@ TEST_F(ShapeInferenceTest, InferTupleElementShapeOutOfBound) {
 }
 
 TEST_F(ShapeInferenceTest, InferPowShape) {
-  auto ten_floats = ShapeUtil::MakeShape(F32, {10});
-  auto inferred_status = ShapeInference::InferBinaryOpShape(
-      HloOpcode::kPower, ten_floats, f32_, {});
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(ten_floats, *inferred_status));
+  const Shape ten_floats = ShapeUtil::MakeShape(F32, {10});
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kPower, ten_floats, f32_,
+                                         {});
+  ASSERT_IS_OK(inferred_shape.status());
+  ASSERT_TRUE(ShapeUtil::Equal(ten_floats, *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, InferCompareShape) {
-  auto ten_floats = ShapeUtil::MakeShape(F32, {10});
-  auto inferred_status = ShapeInference::InferBinaryOpShape(
-      HloOpcode::kCompare, ten_floats, f32_, {});
-  ASSERT_IS_OK(inferred_status.status());
+  const Shape ten_floats = ShapeUtil::MakeShape(F32, {10});
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kCompare, ten_floats, f32_,
+                                         {});
+  ASSERT_IS_OK(inferred_shape.status());
   ASSERT_TRUE(
-      ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}), *inferred_status));
+      ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}), *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, InferReshapeDegenerateCombine) {
@@ -1493,9 +1545,10 @@ TEST_F(ShapeInferenceTest, InferReshapeDegenerateCombine) {
   // [<=1]
   //
   // Both output dimension can be dynamic, use inferred_dimension to tie-break.
-  auto operand = ShapeUtil::MakeShape(F32, {1, 1}, {false, true});
-  auto status = ShapeInference::InferReshapeShape(operand, {1, 0}, {1},
-                                                  /*inferred_dimension=*/-1);
+  const Shape operand = ShapeUtil::MakeShape(F32, {1, 1}, {false, true});
+  const auto status =
+      ShapeInference::InferReshapeShape(operand, {1, 0}, {1},
+                                        /*inferred_dimension=*/-1);
   ASSERT_EQ(ShapeUtil::MakeShape(F32, {1}, {true}), *status);
 }
 
@@ -1505,9 +1558,10 @@ TEST_F(ShapeInferenceTest, InferReshapeSplit) {
   // [1, 10]
   //
   // Both output dimension can be dynamic, use inferred_dimension to tie-break.
-  auto operand = ShapeUtil::MakeShape(F32, {10}, {true});
-  auto status = ShapeInference::InferReshapeShape(operand, {0}, {1, 10},
-                                                  /*inferred_dimension=*/0);
+  const Shape operand = ShapeUtil::MakeShape(F32, {10}, {true});
+  const auto status =
+      ShapeInference::InferReshapeShape(operand, {0}, {1, 10},
+                                        /*inferred_dimension=*/0);
   ASSERT_EQ(ShapeUtil::MakeShape(F32, {1, 10}, {true, false}), *status);
 }
 
@@ -1515,9 +1569,10 @@ TEST_F(ShapeInferenceTest, InferReshapeCombine) {
   // [6, <=10]
   //   | reshape
   // [<=60]
-  auto operand = ShapeUtil::MakeShape(F32, {6, 10}, {false, true});
-  auto status = ShapeInference::InferReshapeShape(operand, {1, 0}, {60},
-                                                  /*inferred_dimension=*/-11);
+  const Shape operand = ShapeUtil::MakeShape(F32, {6, 10}, {false, true});
+  const auto status =
+      ShapeInference::InferReshapeShape(operand, {1, 0}, {60},
+                                        /*inferred_dimension=*/-11);
   ASSERT_EQ(ShapeUtil::MakeShape(F32, {60}, {true}), *status);
 }
 
@@ -1525,9 +1580,10 @@ TEST_F(ShapeInferenceTest, UnchangedDimension) {
   // [6, <=10]
   //   | reshape
   // [2, 3, <=10]
-  auto operand = ShapeUtil::MakeShape(F32, {6, 10}, {false, true});
-  auto status = ShapeInference::InferReshapeShape(operand, {1, 0}, {2, 3, 10},
-                                                  /*inferred_dimension=*/-11);
+  const Shape operand = ShapeUtil::MakeShape(F32, {6, 10}, {false, true});
+  const auto status =
+      ShapeInference::InferReshapeShape(operand, {1, 0}, {2, 3, 10},
+                                        /*inferred_dimension=*/-11);
   ASSERT_EQ(ShapeUtil::MakeShape(F32, {2, 3, 10}, {false, false, true}),
             *status);
 }
@@ -1536,41 +1592,43 @@ TEST_F(ShapeInferenceTest, InferDynamicBroadcast) {
   // CHECK:
   // %broadcast = s32[15,<=15]{1,0} broadcast(s32[<=15]{0}), dimensions={1}
 
-  auto operand_shape = ShapeUtil::MakeShape(F32, {15}, {true});
-  auto inferred_status =
+  const Shape operand_shape = ShapeUtil::MakeShape(F32, {15}, {true});
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferBroadcastShape(operand_shape, {15});
-  ASSERT_IS_OK(inferred_status.status());
+  ASSERT_IS_OK(inferred_shape.status());
   ASSERT_EQ(ShapeUtil::MakeShape(F32, {15, 15}, {false, true}),
-            *inferred_status);
+            *inferred_shape);
 }
 
 TEST_F(ShapeInferenceTest, BroadcastScalar) {
   for (auto element_type : {F32, U32, S8}) {
     const Shape scalar_shape = ShapeUtil::MakeShape(element_type, {});
     {  // no-op scalar broadcast
-      auto status = ShapeInference::InferBroadcastShape(scalar_shape, {});
+      const auto status = ShapeInference::InferBroadcastShape(scalar_shape, {});
       ASSERT_IS_OK(status.status());
       ASSERT_TRUE(ShapeUtil::Equal(scalar_shape, *status));
     }
     const Shape oned_shape = ShapeUtil::MakeShape(element_type, {3});
     {  // scalar -> 1d broadcast
-      auto status = ShapeInference::InferBroadcastShape(scalar_shape, {3});
+      const auto status =
+          ShapeInference::InferBroadcastShape(scalar_shape, {3});
       ASSERT_IS_OK(status.status());
       ASSERT_TRUE(ShapeUtil::Equal(oned_shape, *status));
     }
     {  // no-op 1d broadcast
-      auto status = ShapeInference::InferBroadcastShape(oned_shape, {});
+      const auto status = ShapeInference::InferBroadcastShape(oned_shape, {});
       ASSERT_IS_OK(status.status());
       ASSERT_TRUE(ShapeUtil::Equal(oned_shape, *status));
     }
     const Shape twod_shape = ShapeUtil::MakeShape(element_type, {2, 3});
     {  // scalar -> 2d broadcast
-      auto status = ShapeInference::InferBroadcastShape(scalar_shape, {2, 3});
+      const auto status =
+          ShapeInference::InferBroadcastShape(scalar_shape, {2, 3});
       ASSERT_IS_OK(status.status());
       ASSERT_TRUE(ShapeUtil::Equal(twod_shape, *status));
     }
     {  // 1d -> 2d broadcast
-      auto status = ShapeInference::InferBroadcastShape(oned_shape, {2});
+      const auto status = ShapeInference::InferBroadcastShape(oned_shape, {2});
       ASSERT_IS_OK(status.status());
       ASSERT_TRUE(ShapeUtil::Equal(twod_shape, *status));
     }
@@ -1580,10 +1638,10 @@ TEST_F(ShapeInferenceTest, BroadcastScalar) {
 // scalar <dot> vector: ok
 TEST_F(ShapeInferenceTest, ScalarDotVector) {
   DotDimensionNumbers dot_dnums;
-  auto inferred_status = ShapeInference::InferDotOpShape(
+  const absl::StatusOr<Shape> inferred_shape = ShapeInference::InferDotOpShape(
       f32_, vector_32_, dot_dnums, /*preferred_element_type=*/std::nullopt);
-  EXPECT_TRUE(inferred_status.ok());
-  EXPECT_EQ(*inferred_status, vector_32_);
+  EXPECT_TRUE(inferred_shape.ok());
+  EXPECT_EQ(*inferred_shape, vector_32_);
 }
 
 // 3D <dot> 2D: error
@@ -1591,11 +1649,11 @@ TEST_F(ShapeInferenceTest, DotWithRankHigherThanTwo) {
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
   dot_dnums.add_rhs_contracting_dimensions(0);
-  auto inferred_status = ShapeInference::InferDotOpShape(
+  const absl::StatusOr<Shape> inferred_shape = ShapeInference::InferDotOpShape(
       ShapeUtil::MakeShape(F32, {32, 32, 32}), matrix_32_64_, dot_dnums,
       /*preferred_element_type=*/std::nullopt);
-  EXPECT_TRUE(inferred_status.ok());
-  EXPECT_TRUE(ShapeUtil::Equal(*inferred_status,
+  EXPECT_TRUE(inferred_shape.ok());
+  EXPECT_TRUE(ShapeUtil::Equal(*inferred_shape,
                                ShapeUtil::MakeShape(F32, {32, 32, 64})));
 }
 
@@ -1604,15 +1662,15 @@ TEST_F(ShapeInferenceTest, VectorDotVector) {
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(0);
   dot_dnums.add_rhs_contracting_dimensions(0);
-  auto inferred_status =
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferDotOpShape(vector_64_, vector_64_, dot_dnums,
                                       /*preferred_element_type=*/std::nullopt);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(f32_, *inferred_status));
-  auto inferred_status_mismatch =
+  ASSERT_IS_OK(inferred_shape.status());
+  ASSERT_TRUE(ShapeUtil::Equal(f32_, *inferred_shape));
+  const absl::StatusOr<Shape> inferred_shape_mismatch =
       ShapeInference::InferDotOpShape(vector_64_, vector_32_, dot_dnums,
                                       /*preferred_element_type=*/std::nullopt);
-  ASSERT_FALSE(inferred_status_mismatch.ok());
+  ASSERT_FALSE(inferred_shape_mismatch.ok());
 }
 
 // matrix <dot> vector -> vector
@@ -1620,15 +1678,15 @@ TEST_F(ShapeInferenceTest, MatrixDotVector) {
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
   dot_dnums.add_rhs_contracting_dimensions(0);
-  auto inferred_status =
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferDotOpShape(matrix_32_64_, vector_64_, dot_dnums,
                                       /*preferred_element_type=*/std::nullopt);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(*inferred_status, vector_32_));
-  auto inferred_status_mismatch =
+  ASSERT_IS_OK(inferred_shape.status());
+  ASSERT_TRUE(ShapeUtil::Equal(*inferred_shape, vector_32_));
+  const absl::StatusOr<Shape> inferred_shape_mismatch =
       ShapeInference::InferDotOpShape(matrix_32_64_, vector_32_, dot_dnums,
                                       /*preferred_element_type=*/std::nullopt);
-  ASSERT_FALSE(inferred_status_mismatch.ok());
+  ASSERT_FALSE(inferred_shape_mismatch.ok());
 }
 
 // vector <dot> matrix -> vector
@@ -1636,15 +1694,15 @@ TEST_F(ShapeInferenceTest, VectorDotMatrix) {
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(0);
   dot_dnums.add_rhs_contracting_dimensions(0);
-  auto inferred_status =
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferDotOpShape(vector_32_, matrix_32_64_, dot_dnums,
                                       /*preferred_element_type=*/std::nullopt);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(*inferred_status, vector_64_));
-  auto inferred_status_mismatch =
+  ASSERT_IS_OK(inferred_shape.status());
+  ASSERT_TRUE(ShapeUtil::Equal(*inferred_shape, vector_64_));
+  const absl::StatusOr<Shape> inferred_shape_mismatch =
       ShapeInference::InferDotOpShape(vector_64_, matrix_32_64_, dot_dnums,
                                       /*preferred_element_type=*/std::nullopt);
-  ASSERT_FALSE(inferred_status_mismatch.ok());
+  ASSERT_FALSE(inferred_shape_mismatch.ok());
 }
 
 // matrix <dot> matrix -> matrix
@@ -1652,24 +1710,24 @@ TEST_F(ShapeInferenceTest, MatrixDotMatrix) {
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
   dot_dnums.add_rhs_contracting_dimensions(0);
-  auto inferred_status_match =
+  const absl::StatusOr<Shape> inferred_shape_match =
       ShapeInference::InferDotOpShape(matrix_32_64_, matrix_64_48_, dot_dnums,
                                       /*preferred_element_type=*/std::nullopt);
-  ASSERT_IS_OK(inferred_status_match.status());
-  ASSERT_TRUE(ShapeUtil::Equal(*inferred_status_match, matrix_32_48_))
-      << "inferred: " << ShapeUtil::HumanString(*inferred_status_match)
+  ASSERT_IS_OK(inferred_shape_match.status());
+  ASSERT_TRUE(ShapeUtil::Equal(*inferred_shape_match, matrix_32_48_))
+      << "inferred: " << ShapeUtil::HumanString(*inferred_shape_match)
       << " expected: " << ShapeUtil::HumanString(matrix_64_48_);
-  auto inferred_status_mismatch =
+  const absl::StatusOr<Shape> inferred_shape_mismatch =
       ShapeInference::InferDotOpShape(matrix_32_64_, matrix_32_64_, dot_dnums,
                                       /*preferred_element_type=*/std::nullopt);
-  ASSERT_FALSE(inferred_status_mismatch.ok());
+  ASSERT_FALSE(inferred_shape_mismatch.ok());
 }
 
 // BatchMatMul with two batch dimensions and one contracting dimension.
 TEST_F(ShapeInferenceTest, DotGeneral) {
-  Shape lhs_shape = ShapeUtil::MakeShape(F32, {5, 2, 11, 3});
-  Shape rhs_shape = ShapeUtil::MakeShape(F32, {5, 2, 3, 14});
-  Shape output_shape = ShapeUtil::MakeShape(F32, {5, 2, 11, 14});
+  const Shape lhs_shape = ShapeUtil::MakeShape(F32, {5, 2, 11, 3});
+  const Shape rhs_shape = ShapeUtil::MakeShape(F32, {5, 2, 3, 14});
+  const Shape output_shape = ShapeUtil::MakeShape(F32, {5, 2, 11, 14});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(3);
@@ -1680,19 +1738,19 @@ TEST_F(ShapeInferenceTest, DotGeneral) {
   dot_dnums.add_rhs_batch_dimensions(0);
   dot_dnums.add_rhs_batch_dimensions(1);
 
-  auto inferred_status_match =
+  const absl::StatusOr<Shape> inferred_shape_match =
       ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums,
                                       /*preferred_element_type=*/std::nullopt);
-  ASSERT_IS_OK(inferred_status_match.status());
-  ASSERT_TRUE(ShapeUtil::Equal(*inferred_status_match, output_shape))
-      << "inferred: " << ShapeUtil::HumanString(*inferred_status_match)
+  ASSERT_IS_OK(inferred_shape_match.status());
+  ASSERT_TRUE(ShapeUtil::Equal(*inferred_shape_match, output_shape))
+      << "inferred: " << ShapeUtil::HumanString(*inferred_shape_match)
       << " expected: " << ShapeUtil::HumanString(output_shape);
 }
 
 // BatchMatMul with two contracting dimensions fails.
 TEST_F(ShapeInferenceTest, DotWithTwoContractingDimsFails) {
-  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3, 2});
-  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 14});
+  const Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3, 2});
+  const Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 14});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(2);
@@ -1702,19 +1760,19 @@ TEST_F(ShapeInferenceTest, DotWithTwoContractingDimsFails) {
   dot_dnums.add_rhs_contracting_dimensions(1);
   dot_dnums.add_rhs_batch_dimensions(0);
 
-  auto inferred_status =
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums,
                                       /*preferred_element_type=*/std::nullopt);
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_THAT(inferred_status.status().message(),
+  ASSERT_FALSE(inferred_shape.ok());
+  ASSERT_THAT(inferred_shape.status().message(),
               HasSubstr("Must specify the same number of contracting "
                         "dimensions for lhs and rhs."));
 }
 
 TEST_F(ShapeInferenceTest, DotWithTwoContractingDimsPasses) {
-  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3, 2});
-  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 2, 14});
-  Shape output_shape = ShapeUtil::MakeShape(F32, {2, 11, 14});
+  const Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3, 2});
+  const Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 2, 14});
+  const Shape output_shape = ShapeUtil::MakeShape(F32, {2, 11, 14});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(2);
@@ -1725,39 +1783,41 @@ TEST_F(ShapeInferenceTest, DotWithTwoContractingDimsPasses) {
   dot_dnums.add_rhs_contracting_dimensions(2);
   dot_dnums.add_rhs_batch_dimensions(0);
 
-  auto inferred_status =
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums,
                                       /*preferred_element_type=*/std::nullopt);
-  EXPECT_TRUE(inferred_status.ok());
-  EXPECT_TRUE(ShapeUtil::Equal(*inferred_status, output_shape));
+  EXPECT_TRUE(inferred_shape.ok());
+  EXPECT_TRUE(ShapeUtil::Equal(*inferred_shape, output_shape));
 }
 
 TEST_F(ShapeInferenceTest, ErrorSetDimensionSize) {
-  Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
-  Shape val_shape = ShapeUtil::MakeShape(S32, {1});
-  auto inferred_status = ShapeInference::InferSetDimensionSizeShape(
-      arg_shape, val_shape, /*dimension=*/0);
-
-  EXPECT_FALSE(inferred_status.ok());
-  EXPECT_THAT(inferred_status.status().message(),
+  const Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
+  const Shape val_shape = ShapeUtil::MakeShape(S32, {1});
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferSetDimensionSizeShape(arg_shape, val_shape,
+                                                 /*dimension=*/0);
+
+  EXPECT_FALSE(inferred_shape.ok());
+  EXPECT_THAT(inferred_shape.status().message(),
               HasSubstr("value has to be S32 scalar"));
 }
 
 TEST_F(ShapeInferenceTest, ErrorSetDimensionSizeWrongType) {
-  Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
-  Shape val_shape = ShapeUtil::MakeShape(U32, {});
-  auto inferred_status = ShapeInference::InferSetDimensionSizeShape(
-      arg_shape, val_shape, /*dimension=*/0);
-
-  EXPECT_FALSE(inferred_status.ok());
-  EXPECT_THAT(inferred_status.status().message(),
+  const Shape arg_shape = ShapeUtil::MakeShape(F32, {5, 3});
+  const Shape val_shape = ShapeUtil::MakeShape(U32, {});
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferSetDimensionSizeShape(arg_shape, val_shape,
+                                                 /*dimension=*/0);
+
+  EXPECT_FALSE(inferred_shape.ok());
+  EXPECT_THAT(inferred_shape.status().message(),
               HasSubstr("value has to be S32 scalar"));
 }
 
 // BatchMatMul with different batch dimension sizes fails.
 TEST_F(ShapeInferenceTest, DotWithMismatchedBatchDimSizesFails) {
-  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
-  Shape rhs_shape = ShapeUtil::MakeShape(F32, {3, 3, 14});
+  const Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
+  const Shape rhs_shape = ShapeUtil::MakeShape(F32, {3, 3, 14});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(2);
@@ -1766,18 +1826,18 @@ TEST_F(ShapeInferenceTest, DotWithMismatchedBatchDimSizesFails) {
   dot_dnums.add_rhs_contracting_dimensions(1);
   dot_dnums.add_rhs_batch_dimensions(0);
 
-  auto inferred_status =
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums,
                                       /*preferred_element_type=*/std::nullopt);
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_THAT(inferred_status.status().message(),
+  ASSERT_FALSE(inferred_shape.ok());
+  ASSERT_THAT(inferred_shape.status().message(),
               HasSubstr("Batch dimension sizes are not compatible"));
 }
 
 // BatchMatMul with different batch dimension numbers passes
 TEST_F(ShapeInferenceTest, DotWithMismatchedBatchDimNumbersPasses) {
-  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
-  Shape rhs_shape = ShapeUtil::MakeShape(F32, {3, 2, 14});
+  const Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
+  const Shape rhs_shape = ShapeUtil::MakeShape(F32, {3, 2, 14});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(2);
@@ -1786,18 +1846,18 @@ TEST_F(ShapeInferenceTest, DotWithMismatchedBatchDimNumbersPasses) {
   dot_dnums.add_rhs_contracting_dimensions(0);
   dot_dnums.add_rhs_batch_dimensions(1);
 
-  auto inferred_status =
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums,
                                       /*preferred_element_type=*/std::nullopt);
-  ASSERT_TRUE(inferred_status.ok());
-  ASSERT_TRUE(ShapeUtil::Equal(*inferred_status,
+  ASSERT_TRUE(inferred_shape.ok());
+  ASSERT_TRUE(ShapeUtil::Equal(*inferred_shape,
                                ShapeUtil::MakeShape(F32, {2, 11, 14})));
 }
 
 // BatchMatMul with out-of-range dimension numbers fails.
 TEST_F(ShapeInferenceTest, DotWithContractingDimNumberOutOfRange) {
-  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
-  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 14});
+  const Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
+  const Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 14});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(3);
@@ -1806,18 +1866,18 @@ TEST_F(ShapeInferenceTest, DotWithContractingDimNumberOutOfRange) {
   dot_dnums.add_rhs_contracting_dimensions(0);
   dot_dnums.add_rhs_batch_dimensions(1);
 
-  auto inferred_status =
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums,
                                       /*preferred_element_type=*/std::nullopt);
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_THAT(inferred_status.status().message(),
+  ASSERT_FALSE(inferred_shape.ok());
+  ASSERT_THAT(inferred_shape.status().message(),
               HasSubstr("A dimension number is out of range"));
 }
 
 // BatchMatMul with non-unique dimension numbers fails.
 TEST_F(ShapeInferenceTest, DotWithContractingNonUniqueDimNumber) {
-  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
-  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 14});
+  const Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
+  const Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 14});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(0);
@@ -1826,11 +1886,11 @@ TEST_F(ShapeInferenceTest, DotWithContractingNonUniqueDimNumber) {
   dot_dnums.add_rhs_contracting_dimensions(0);
   dot_dnums.add_rhs_batch_dimensions(1);
 
-  auto inferred_status =
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums,
                                       /*preferred_element_type=*/std::nullopt);
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_THAT(inferred_status.status().message(),
+  ASSERT_FALSE(inferred_shape.ok());
+  ASSERT_THAT(inferred_shape.status().message(),
               HasSubstr("A dimension number is not unique"));
 }
 
@@ -1838,7 +1898,7 @@ TEST_F(ShapeInferenceTest, DotWithIntegralPreferredElementType) {
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
   dot_dnums.add_rhs_contracting_dimensions(0);
-  TF_ASSERT_OK_AND_ASSIGN(Shape inferred_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape inferred_shape,
                           ShapeInference::InferDotOpShape(
                               ShapeUtil::MakeShape(S8, {32, 32}),
                               ShapeUtil::MakeShape(S16, {32, 32}), dot_dnums,
@@ -1851,7 +1911,7 @@ TEST_F(ShapeInferenceTest, DotWithPreferredElementTypeSameAsInferredType) {
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
   dot_dnums.add_rhs_contracting_dimensions(0);
-  TF_ASSERT_OK_AND_ASSIGN(Shape inferred_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape inferred_shape,
                           ShapeInference::InferDotOpShape(
                               ShapeUtil::MakeShape(BF16, {32, 32}),
                               ShapeUtil::MakeShape(F32, {32, 32}), dot_dnums,
@@ -1864,7 +1924,7 @@ TEST_F(ShapeInferenceTest, FloatingPointDotWithNarrowerPreferredElementType) {
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
   dot_dnums.add_rhs_contracting_dimensions(0);
-  TF_ASSERT_OK_AND_ASSIGN(Shape inferred_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape inferred_shape,
                           ShapeInference::InferDotOpShape(
                               ShapeUtil::MakeShape(BF16, {32, 32}),
                               ShapeUtil::MakeShape(F32, {32, 32}), dot_dnums,
@@ -1877,7 +1937,7 @@ TEST_F(ShapeInferenceTest, FloatingPointDotWithIntegralPreferredElementType) {
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
   dot_dnums.add_rhs_contracting_dimensions(0);
-  TF_ASSERT_OK_AND_ASSIGN(Shape inferred_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape inferred_shape,
                           ShapeInference::InferDotOpShape(
                               ShapeUtil::MakeShape(BF16, {32, 32}),
                               ShapeUtil::MakeShape(BF16, {32, 32}), dot_dnums,
@@ -1890,7 +1950,7 @@ TEST_F(ShapeInferenceTest, IntegralDotWithFloatingPointPreferredElementType) {
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
   dot_dnums.add_rhs_contracting_dimensions(0);
-  TF_ASSERT_OK_AND_ASSIGN(Shape inferred_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape inferred_shape,
                           ShapeInference::InferDotOpShape(
                               ShapeUtil::MakeShape(S8, {32, 32}),
                               ShapeUtil::MakeShape(S16, {32, 32}), dot_dnums,
@@ -1903,7 +1963,7 @@ TEST_F(ShapeInferenceTest, DotWithPreferredElementTypeWithDifferentSignedness) {
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
   dot_dnums.add_rhs_contracting_dimensions(0);
-  TF_ASSERT_OK_AND_ASSIGN(Shape inferred_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape inferred_shape,
                           ShapeInference::InferDotOpShape(
                               ShapeUtil::MakeShape(S8, {32, 32}),
                               ShapeUtil::MakeShape(S16, {32, 32}), dot_dnums,
@@ -1916,7 +1976,7 @@ TEST_F(ShapeInferenceTest, DotWithNarrowerPreferredElementType) {
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
   dot_dnums.add_rhs_contracting_dimensions(0);
-  TF_ASSERT_OK_AND_ASSIGN(Shape inferred_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape inferred_shape,
                           ShapeInference::InferDotOpShape(
                               ShapeUtil::MakeShape(S8, {32, 32}),
                               ShapeUtil::MakeShape(S16, {32, 32}), dot_dnums,
@@ -1938,7 +1998,7 @@ TEST_F(ShapeInferenceTest, DotWithSparseLhs) {
 
   std::vector<SparsityDescriptor> sparsity = {sparsity_descriptor};
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape inferred_shape,
+      const Shape inferred_shape,
       ShapeInference::InferDotOpShape(
           ShapeUtil::MakeShape(F32, {10, 16}),
           ShapeUtil::MakeShape(F32, {32, 20}), dot_dnums,
@@ -1960,7 +2020,7 @@ TEST_F(ShapeInferenceTest, DotWithSparseRhs) {
 
   std::vector<SparsityDescriptor> sparsity = {sparsity_descriptor};
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape inferred_shape,
+      const Shape inferred_shape,
       ShapeInference::InferDotOpShape(
           ShapeUtil::MakeShape(F32, {10, 32}),
           ShapeUtil::MakeShape(F32, {16, 20}), dot_dnums,
@@ -1985,7 +2045,7 @@ TEST_F(ShapeInferenceTest, DotWithSparseBothOperands) {
 
   std::vector<SparsityDescriptor> sparsity = {sparsity_lhs, sparsity_rhs};
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape inferred_shape,
+      const Shape inferred_shape,
       ShapeInference::InferDotOpShape(
           ShapeUtil::MakeShape(F32, {10, 16}),
           ShapeUtil::MakeShape(F32, {16, 20}), dot_dnums,
@@ -2006,13 +2066,13 @@ TEST_F(ShapeInferenceTest, DotWithIncorrectSparseDimensionSizeRatio) {
   sparsity_descriptor.set_dimension(1);
 
   std::vector<SparsityDescriptor> sparsity = {sparsity_descriptor};
-  auto inferred_status = ShapeInference::InferDotOpShape(
+  const absl::StatusOr<Shape> inferred_shape = ShapeInference::InferDotOpShape(
       ShapeUtil::MakeShape(F32, {10, 32}), ShapeUtil::MakeShape(F32, {32, 20}),
       dot_dnums, /*preferred_element_type=*/std::nullopt,
       absl::MakeSpan(sparsity));
-  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_FALSE(inferred_shape.ok());
   ASSERT_THAT(
-      inferred_status.status().message(),
+      inferred_shape.status().message(),
       HasSubstr("Sparse dimension size ratio doesn't match the descriptor"));
 }
 
@@ -2027,7 +2087,7 @@ TEST_F(ShapeInferenceTest, SparseDotMetadata) {
   sparsity_descriptor.set_index(0);
   sparsity_descriptor.set_dimension(2);
 
-  TF_ASSERT_OK_AND_ASSIGN(Shape inferred_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape inferred_shape,
                           ShapeInference::InferSparseDotMetadataShape(
                               ShapeUtil::MakeShape(F32, {5, 10, 16}), dot_dnums,
                               sparsity_descriptor));
@@ -2042,23 +2102,23 @@ TEST_F(ShapeInferenceTest, BinOpBroadcastMatrixVector) {
   const Shape vec8 = ShapeUtil::MakeShape(F32, {8});
   const Shape vec16 = ShapeUtil::MakeShape(F32, {16});
 
-  auto inferred_status_match =
+  absl::StatusOr<Shape> inferred_shape_match =
       ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, mat, vec8, {1});
-  ASSERT_IS_OK(inferred_status_match.status());
-  ASSERT_TRUE(ShapeUtil::Equal(*inferred_status_match, mat));
+  ASSERT_IS_OK(inferred_shape_match.status());
+  ASSERT_TRUE(ShapeUtil::Equal(*inferred_shape_match, mat));
 
-  auto inferred_status_mismatch =
+  absl::StatusOr<Shape> inferred_shape_mismatch =
       ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, mat, vec8, {0});
-  ASSERT_FALSE(inferred_status_mismatch.ok());
+  ASSERT_FALSE(inferred_shape_mismatch.ok());
 
-  inferred_status_match =
+  inferred_shape_match =
       ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, mat, vec16, {0});
-  ASSERT_IS_OK(inferred_status_match.status());
-  ASSERT_TRUE(ShapeUtil::Equal(*inferred_status_match, mat));
+  ASSERT_IS_OK(inferred_shape_match.status());
+  ASSERT_TRUE(ShapeUtil::Equal(*inferred_shape_match, mat));
 
-  inferred_status_mismatch =
+  inferred_shape_mismatch =
       ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, mat, vec16, {1});
-  ASSERT_FALSE(inferred_status_mismatch.ok());
+  ASSERT_FALSE(inferred_shape_mismatch.ok());
 }
 
 TEST_F(ShapeInferenceTest, BinOpBroadcastCubeMatrix) {
@@ -2068,20 +2128,21 @@ TEST_F(ShapeInferenceTest, BinOpBroadcastCubeMatrix) {
   const Shape matrix16_4 = ShapeUtil::MakeShape(F32, {16, 4});
   const Shape matrix16_8 = ShapeUtil::MakeShape(F32, {16, 8});
 
-  auto inferred_status_match = ShapeInference::InferBinaryOpShape(
-      HloOpcode::kAdd, cube, matrix8_4, {1, 2});
-  ASSERT_IS_OK(inferred_status_match.status());
-  ASSERT_TRUE(ShapeUtil::Equal(*inferred_status_match, cube));
+  absl::StatusOr<Shape> inferred_shape_match =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, cube, matrix8_4,
+                                         {1, 2});
+  ASSERT_IS_OK(inferred_shape_match.status());
+  ASSERT_TRUE(ShapeUtil::Equal(*inferred_shape_match, cube));
 
-  inferred_status_match = ShapeInference::InferBinaryOpShape(
+  inferred_shape_match = ShapeInference::InferBinaryOpShape(
       HloOpcode::kAdd, cube, matrix16_4, {0, 2});
-  ASSERT_IS_OK(inferred_status_match.status());
-  ASSERT_TRUE(ShapeUtil::Equal(*inferred_status_match, cube));
+  ASSERT_IS_OK(inferred_shape_match.status());
+  ASSERT_TRUE(ShapeUtil::Equal(*inferred_shape_match, cube));
 
-  inferred_status_match = ShapeInference::InferBinaryOpShape(
+  inferred_shape_match = ShapeInference::InferBinaryOpShape(
       HloOpcode::kAdd, cube, matrix16_8, {0, 1});
-  ASSERT_IS_OK(inferred_status_match.status());
-  ASSERT_TRUE(ShapeUtil::Equal(*inferred_status_match, cube));
+  ASSERT_IS_OK(inferred_shape_match.status());
+  ASSERT_TRUE(ShapeUtil::Equal(*inferred_shape_match, cube));
 }
 
 TEST_F(ShapeInferenceTest, BinOpBroadcastBadDimension) {
@@ -2093,213 +2154,228 @@ TEST_F(ShapeInferenceTest, BinOpBroadcastBadDimension) {
   const Shape matrix8_8 = ShapeUtil::MakeShape(F32, {8, 8});
 
   // "magical" broadcast rejected
-  auto inferred_status_error1 =
+  const absl::StatusOr<Shape> inferred_shape_error1 =
       ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, tensor, vec8, {});
-  ASSERT_FALSE(inferred_status_error1.ok());
-  ASSERT_THAT(inferred_status_error1.status().message(),
+  ASSERT_FALSE(inferred_shape_error1.ok());
+  ASSERT_THAT(inferred_shape_error1.status().message(),
               HasSubstr("Shapes must be equal rank"));
 
   // broadcast_dimension out of bounds for tensor's rank
-  auto inferred_status_error2 =
+  const absl::StatusOr<Shape> inferred_shape_error2 =
       ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, tensor, vec8, {3});
-  ASSERT_FALSE(inferred_status_error2.ok());
-  ASSERT_THAT(inferred_status_error2.status().message(),
+  ASSERT_FALSE(inferred_shape_error2.ok());
+  ASSERT_THAT(inferred_shape_error2.status().message(),
               ContainsRegex("Broadcast dimension number .* too large"));
 
   // broadcast_dimension doesn't match corresponding dimension
-  auto inferred_status_error3 =
+  const absl::StatusOr<Shape> inferred_shape_error3 =
       ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, tensor, vec8, {0});
-  ASSERT_FALSE(inferred_status_error3.ok());
-  ASSERT_THAT(inferred_status_error3.status().message(),
+  ASSERT_FALSE(inferred_shape_error3.ok());
+  ASSERT_THAT(inferred_shape_error3.status().message(),
               HasSubstr("Broadcast dimension 0 mismatch"));
 
   // broadcast_dimensions list too long
-  auto inferred_status_error4 = ShapeInference::InferBinaryOpShape(
-      HloOpcode::kAdd, tensor, matrix8_4, {0, 1, 2});
-  ASSERT_FALSE(inferred_status_error4.ok());
-  ASSERT_THAT(inferred_status_error4.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error4 =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, tensor, matrix8_4,
+                                         {0, 1, 2});
+  ASSERT_FALSE(inferred_shape_error4.ok());
+  ASSERT_THAT(inferred_shape_error4.status().message(),
               HasSubstr("broadcast_dimensions has to match"));
 
   // there's a dimension above the rank of the tensor
-  auto inferred_status_error5 = ShapeInference::InferBinaryOpShape(
-      HloOpcode::kAdd, tensor, matrix8_4, {3, 0});
-  ASSERT_FALSE(inferred_status_error5.ok());
-  ASSERT_THAT(inferred_status_error5.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error5 =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, tensor, matrix8_4,
+                                         {3, 0});
+  ASSERT_FALSE(inferred_shape_error5.ok());
+  ASSERT_THAT(inferred_shape_error5.status().message(),
               ContainsRegex("dimension number .* too large"));
 
   // broadcasting dimensions don't match in this order
-  auto inferred_status_error6 = ShapeInference::InferBinaryOpShape(
-      HloOpcode::kAdd, tensor, matrix8_4, {2, 1});
-  ASSERT_FALSE(inferred_status_error6.ok());
-  ASSERT_THAT(inferred_status_error6.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error6 =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, tensor, matrix8_4,
+                                         {2, 1});
+  ASSERT_FALSE(inferred_shape_error6.ok());
+  ASSERT_THAT(inferred_shape_error6.status().message(),
               HasSubstr("dimension 0 mismatch"));
 
   // The following two tests make sure that broadcasting dimensions are listed
   // in a proper (strictly increasing) order, even if the lower-rank array
   // matches the higher-rank array in many different ways.
-  auto inferred_status_error7 = ShapeInference::InferBinaryOpShape(
-      HloOpcode::kAdd, tensor8_8_8, matrix8_8, {0, 0});
-  ASSERT_FALSE(inferred_status_error7.ok());
-  ASSERT_THAT(inferred_status_error7.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error7 =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, tensor8_8_8,
+                                         matrix8_8, {0, 0});
+  ASSERT_FALSE(inferred_shape_error7.ok());
+  ASSERT_THAT(inferred_shape_error7.status().message(),
               HasSubstr("dimensions order is wrong"));
 
-  auto inferred_status_error8 = ShapeInference::InferBinaryOpShape(
-      HloOpcode::kAdd, tensor8_8_8, matrix8_8, {1, 0});
-  ASSERT_FALSE(inferred_status_error8.ok());
-  ASSERT_THAT(inferred_status_error8.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error8 =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, tensor8_8_8,
+                                         matrix8_8, {1, 0});
+  ASSERT_FALSE(inferred_shape_error8.ok());
+  ASSERT_THAT(inferred_shape_error8.status().message(),
               HasSubstr("dimensions order is wrong"));
 }
 
 // Tests for the while instruction with proper shapes.
 TEST_F(ShapeInferenceTest, WhileWithCorrectShapes) {
-  Shape result_shape = ShapeUtil::MakeTupleShape({s32_, vector_32_});
+  const Shape result_shape = ShapeUtil::MakeTupleShape({s32_, vector_32_});
   ProgramShape cond = ShapeUtil::MakeProgramShape({result_shape}, pred_);
   ProgramShape body = ShapeUtil::MakeProgramShape({result_shape}, result_shape);
-  auto inferred_status =
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferWhileShape(cond, body, result_shape);
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(result_shape, *inferred_status));
+  ASSERT_IS_OK(inferred_shape.status());
+  ASSERT_TRUE(ShapeUtil::Equal(result_shape, *inferred_shape));
 }
 
 // Tests for the while instruction with wrong shapes.
 TEST_F(ShapeInferenceTest, WhileWithBadShapes) {
-  Shape result_shape = ShapeUtil::MakeTupleShape({s32_, vector_32_});
-  ProgramShape cond = ShapeUtil::MakeProgramShape({result_shape}, pred_);
-  ProgramShape body = ShapeUtil::MakeProgramShape({result_shape}, result_shape);
-
-  auto bad_shape_1 = ShapeUtil::MakeProgramShape({s32_, result_shape}, pred_);
-  auto inferred_status_error1 =
-      ShapeInference::InferWhileShape(bad_shape_1, body, result_shape);
-  ASSERT_FALSE(inferred_status_error1.ok());
-  ASSERT_THAT(inferred_status_error1.status().message(),
+  const Shape inferred_shape = ShapeUtil::MakeTupleShape({s32_, vector_32_});
+  ProgramShape cond = ShapeUtil::MakeProgramShape({inferred_shape}, pred_);
+  ProgramShape body =
+      ShapeUtil::MakeProgramShape({inferred_shape}, inferred_shape);
+
+  const auto bad_shape_1 =
+      ShapeUtil::MakeProgramShape({s32_, inferred_shape}, pred_);
+  const absl::StatusOr<Shape> inferred_shape_error1 =
+      ShapeInference::InferWhileShape(bad_shape_1, body, inferred_shape);
+  ASSERT_FALSE(inferred_shape_error1.ok());
+  ASSERT_THAT(inferred_shape_error1.status().message(),
               HasSubstr("Condition must take 1 arguments"));
 
-  auto bad_shape_2 =
-      ShapeUtil::MakeProgramShape({s32_, result_shape}, result_shape);
-  auto inferred_status_error2 =
-      ShapeInference::InferWhileShape(cond, bad_shape_2, result_shape);
-  ASSERT_FALSE(inferred_status_error2.ok());
-  ASSERT_THAT(inferred_status_error2.status().message(),
+  const auto bad_shape_2 =
+      ShapeUtil::MakeProgramShape({s32_, inferred_shape}, inferred_shape);
+  const absl::StatusOr<Shape> inferred_shape_error2 =
+      ShapeInference::InferWhileShape(cond, bad_shape_2, inferred_shape);
+  ASSERT_FALSE(inferred_shape_error2.ok());
+  ASSERT_THAT(inferred_shape_error2.status().message(),
               HasSubstr("Body must take 1 arguments"));
 
-  auto bad_shape_3 = ShapeUtil::MakeProgramShape({result_shape}, s32_);
-  auto inferred_status_error3 =
-      ShapeInference::InferWhileShape(bad_shape_3, body, result_shape);
-  ASSERT_FALSE(inferred_status_error3.ok());
-  ASSERT_THAT(inferred_status_error3.status().message(),
+  const auto bad_shape_3 = ShapeUtil::MakeProgramShape({inferred_shape}, s32_);
+  const absl::StatusOr<Shape> inferred_shape_error3 =
+      ShapeInference::InferWhileShape(bad_shape_3, body, inferred_shape);
+  ASSERT_FALSE(inferred_shape_error3.ok());
+  ASSERT_THAT(inferred_shape_error3.status().message(),
               HasSubstr("Condition must return a boolean"));
 
-  auto bad_shape_4 = ShapeUtil::MakeProgramShape({result_shape}, vector_32_);
-  auto inferred_status_error4 =
-      ShapeInference::InferWhileShape(cond, bad_shape_4, result_shape);
-  ASSERT_FALSE(inferred_status_error4.ok());
-  ASSERT_THAT(inferred_status_error4.status().message(),
+  const auto bad_shape_4 =
+      ShapeUtil::MakeProgramShape({inferred_shape}, vector_32_);
+  const absl::StatusOr<Shape> inferred_shape_error4 =
+      ShapeInference::InferWhileShape(cond, bad_shape_4, inferred_shape);
+  ASSERT_FALSE(inferred_shape_error4.ok());
+  ASSERT_THAT(inferred_shape_error4.status().message(),
               HasSubstr("parameter of condition and body"));
 }
 
 // Tests for the concatenate instruction with dynamic shapes.
 TEST_F(ShapeInferenceTest, ConcatenateWithDynamicShapes) {
-  auto dynamic_shape_1 =
+  const auto dynamic_shape_1 =
       ShapeUtil::MakeShape(F32, {32, 160, 10}, {true, false, false});
-  auto dynamic_shape_2 =
+  const auto dynamic_shape_2 =
       ShapeUtil::MakeShape(F32, {32, 160, 10}, {false, true, false});
-  auto inferred_status = ShapeInference::InferConcatOpShape(
-      {&dynamic_shape_1, &dynamic_shape_2}, /*dimension=*/0);
-  ASSERT_IS_OK(inferred_status.status());
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferConcatOpShape({&dynamic_shape_1, &dynamic_shape_2},
+                                         /*dimension=*/0);
+  ASSERT_IS_OK(inferred_shape.status());
   ASSERT_TRUE(ShapeUtil::Equal(
       ShapeUtil::MakeShape(F32, {64, 160, 10}, {true, true, false}),
-      *inferred_status));
+      *inferred_shape));
 }
 
 // Tests for the concatenate instruction with proper shapes.
 TEST_F(ShapeInferenceTest, ConcatenateWithCorrectShapes) {
-  auto inferred_status_1 = ShapeInference::InferConcatOpShape(
-      {&vector_32_, &vector_64_}, /*dimension=*/0);
-  ASSERT_IS_OK(inferred_status_1.status());
+  const absl::StatusOr<Shape> inferred_shape_1 =
+      ShapeInference::InferConcatOpShape({&vector_32_, &vector_64_},
+                                         /*dimension=*/0);
+  ASSERT_IS_OK(inferred_shape_1.status());
   ASSERT_TRUE(
-      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {96}), *inferred_status_1));
+      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {96}), *inferred_shape_1));
 
-  auto inferred_status_2 = ShapeInference::InferConcatOpShape(
-      {&vector_32_, &vector_64_, &vector_32_}, /*dimension=*/0);
-  ASSERT_IS_OK(inferred_status_2.status());
+  const absl::StatusOr<Shape> inferred_shape_2 =
+      ShapeInference::InferConcatOpShape(
+          {&vector_32_, &vector_64_, &vector_32_}, /*dimension=*/0);
+  ASSERT_IS_OK(inferred_shape_2.status());
   ASSERT_TRUE(
-      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {128}), *inferred_status_2));
+      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {128}), *inferred_shape_2));
 
-  auto inferred_status_3 = ShapeInference::InferConcatOpShape(
-      {&matrix_32_48_, &matrix_32_64_, &matrix_32_48_}, /*dimension=*/1);
-  ASSERT_IS_OK(inferred_status_3.status());
+  const absl::StatusOr<Shape> inferred_shape_3 =
+      ShapeInference::InferConcatOpShape(
+          {&matrix_32_48_, &matrix_32_64_, &matrix_32_48_}, /*dimension=*/1);
+  ASSERT_IS_OK(inferred_shape_3.status());
   ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {32, 160}),
-                               *inferred_status_3));
+                               *inferred_shape_3));
 }
 
 // Tests for the concatenate instruction with wrong shapes.
 TEST_F(ShapeInferenceTest, ConcatenateWithBadShapes) {
-  auto inferred_status_error1 =
+  const absl::StatusOr<Shape> inferred_shape_error1 =
       ShapeInference::InferConcatOpShape({}, /*dimension=*/0);
-  ASSERT_FALSE(inferred_status_error1.ok());
-  ASSERT_THAT(inferred_status_error1.status().message(),
+  ASSERT_FALSE(inferred_shape_error1.ok());
+  ASSERT_THAT(inferred_shape_error1.status().message(),
               HasSubstr("Concatenate expects at least one argument"));
 
-  auto inferred_status_error2 =
+  const absl::StatusOr<Shape> inferred_shape_error2 =
       ShapeInference::InferConcatOpShape({&vector_32_}, /*dimension=*/-1);
-  ASSERT_FALSE(inferred_status_error2.ok());
-  ASSERT_THAT(inferred_status_error2.status().message(),
+  ASSERT_FALSE(inferred_shape_error2.ok());
+  ASSERT_THAT(inferred_shape_error2.status().message(),
               HasSubstr("dimension out of bounds: -1"));
 
-  auto inferred_status_error3 =
+  const absl::StatusOr<Shape> inferred_shape_error3 =
       ShapeInference::InferConcatOpShape({&vector_32_}, /*dimension=*/1);
-  ASSERT_FALSE(inferred_status_error3.ok());
-  ASSERT_THAT(inferred_status_error3.status().message(),
+  ASSERT_FALSE(inferred_shape_error3.ok());
+  ASSERT_THAT(inferred_shape_error3.status().message(),
               HasSubstr("dimension out of bounds: 1"));
 
-  Shape tuple = ShapeUtil::MakeTupleShape({vector_32_});
-  auto inferred_status_error4 = ShapeInference::InferConcatOpShape(
-      {&vector_32_, &tuple}, /*dimension=*/0);
-  ASSERT_FALSE(inferred_status_error4.ok());
+  const Shape tuple = ShapeUtil::MakeTupleShape({vector_32_});
+  const absl::StatusOr<Shape> inferred_shape_error4 =
+      ShapeInference::InferConcatOpShape({&vector_32_, &tuple},
+                                         /*dimension=*/0);
+  ASSERT_FALSE(inferred_shape_error4.ok());
   ASSERT_THAT(
-      inferred_status_error4.status().message(),
+      inferred_shape_error4.status().message(),
       HasSubstr("Expected array argument for operand of concatenation"));
 
   const Shape vector_s32 = ShapeUtil::MakeShape(S32, {32});
-  auto inferred_status_error5 = ShapeInference::InferConcatOpShape(
-      {&vector_32_, &vector_s32}, /*dimension=*/0);
-  ASSERT_FALSE(inferred_status_error5.ok());
-  ASSERT_THAT(inferred_status_error5.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error5 =
+      ShapeInference::InferConcatOpShape({&vector_32_, &vector_s32},
+                                         /*dimension=*/0);
+  ASSERT_FALSE(inferred_shape_error5.ok());
+  ASSERT_THAT(inferred_shape_error5.status().message(),
               HasSubstr("concatenate arrays with different element types"));
 
-  auto inferred_status_error6 = ShapeInference::InferConcatOpShape(
-      {&matrix_32_48_, &matrix_32_64_}, /*dimension=*/0);
-  ASSERT_FALSE(inferred_status_error6.ok());
-  ASSERT_THAT(inferred_status_error6.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error6 =
+      ShapeInference::InferConcatOpShape({&matrix_32_48_, &matrix_32_64_},
+                                         /*dimension=*/0);
+  ASSERT_FALSE(inferred_shape_error6.ok());
+  ASSERT_THAT(inferred_shape_error6.status().message(),
               HasSubstr("concatenate arrays that differ in "
                         "dimensions other than the one being "
                         "concatenated"));
 }
 
 TEST_F(ShapeInferenceTest, Pad) {
-  Shape input_shape = ShapeUtil::MakeShape(F32, {10, 25});
-  Shape padding_value_shape = ShapeUtil::MakeShape(F32, {});
+  const Shape input_shape = ShapeUtil::MakeShape(F32, {10, 25});
+  const Shape padding_value_shape = ShapeUtil::MakeShape(F32, {});
   // Padding for dimension 0: {low: 0, high: 2, interior: 3}
   // Padding for dimension 1: {low: 1, high: 5, interior: 0}
   PaddingConfig padding_config;
-  auto dimension0 = padding_config.add_dimensions();
+  const auto dimension0 = padding_config.add_dimensions();
   dimension0->set_edge_padding_low(0);
   dimension0->set_edge_padding_high(2);
   dimension0->set_interior_padding(3);
-  auto dimension1 = padding_config.add_dimensions();
+  const auto dimension1 = padding_config.add_dimensions();
   dimension1->set_edge_padding_low(1);
   dimension1->set_edge_padding_high(5);
   dimension1->set_interior_padding(0);
 
-  auto inferred_status = ShapeInference::InferPadShape(
+  const absl::StatusOr<Shape> inferred_shape = ShapeInference::InferPadShape(
       input_shape, padding_value_shape, padding_config);
-  ASSERT_IS_OK(inferred_status.status());
+  ASSERT_IS_OK(inferred_shape.status());
   ASSERT_TRUE(
-      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {39, 31}), *inferred_status));
+      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {39, 31}), *inferred_shape));
 
   dimension1->set_edge_padding_low(-20);
   dimension1->set_edge_padding_high(-10);
-  auto negative_dimension_size = ShapeInference::InferPadShape(
+  const auto negative_dimension_size = ShapeInference::InferPadShape(
       input_shape, padding_value_shape, padding_config);
   ASSERT_FALSE(negative_dimension_size.ok());
   ASSERT_THAT(negative_dimension_size.status().message(),
@@ -2307,77 +2383,82 @@ TEST_F(ShapeInferenceTest, Pad) {
 }
 
 TEST_F(ShapeInferenceTest, Reverse) {
-  Shape input_shape = ShapeUtil::MakeShape(F32, {10, 25});
+  const Shape input_shape = ShapeUtil::MakeShape(F32, {10, 25});
 
-  auto inferred_status = ShapeInference::InferReverseShape(input_shape, {0, 1});
-  ASSERT_IS_OK(inferred_status.status());
-  ASSERT_TRUE(ShapeUtil::Equal(input_shape, *inferred_status));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferReverseShape(input_shape, {0, 1});
+  ASSERT_IS_OK(inferred_shape.status());
+  ASSERT_TRUE(ShapeUtil::Equal(input_shape, *inferred_shape));
 }
 
 TEST_F(ShapeInferenceTest, ReverseInvalidDimension) {
-  Shape input_shape = ShapeUtil::MakeShape(F32, {10, 25});
+  const Shape input_shape = ShapeUtil::MakeShape(F32, {10, 25});
 
-  auto inferred_status_error0 =
+  const absl::StatusOr<Shape> inferred_shape_error0 =
       ShapeInference::InferReverseShape(input_shape, {0, 2});
-  ASSERT_FALSE(inferred_status_error0.ok());
-  ASSERT_THAT(inferred_status_error0.status().message(),
+  ASSERT_FALSE(inferred_shape_error0.ok());
+  ASSERT_THAT(inferred_shape_error0.status().message(),
               HasSubstr("out-of-bounds"));
 
-  auto inferred_status_error1 =
+  const absl::StatusOr<Shape> inferred_shape_error1 =
       ShapeInference::InferReverseShape(input_shape, {0, -1});
-  ASSERT_FALSE(inferred_status_error1.ok());
-  ASSERT_THAT(inferred_status_error1.status().message(),
+  ASSERT_FALSE(inferred_shape_error1.ok());
+  ASSERT_THAT(inferred_shape_error1.status().message(),
               HasSubstr("out-of-bounds"));
 
-  auto inferred_status_error2 =
+  const absl::StatusOr<Shape> inferred_shape_error2 =
       ShapeInference::InferReverseShape(input_shape, {0, 0});
-  ASSERT_FALSE(inferred_status_error2.ok());
-  ASSERT_THAT(inferred_status_error2.status().message(),
+  ASSERT_FALSE(inferred_shape_error2.ok());
+  ASSERT_THAT(inferred_shape_error2.status().message(),
               HasSubstr("duplicated"));
 
-  Shape tuple_shape = ShapeUtil::MakeTupleShape({input_shape, input_shape});
-  auto inferred_status_error3 =
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({input_shape, input_shape});
+  const absl::StatusOr<Shape> inferred_shape_error3 =
       ShapeInference::InferReverseShape(tuple_shape, {0});
-  ASSERT_FALSE(inferred_status_error3.ok());
-  ASSERT_THAT(inferred_status_error3.status().message(),
+  ASSERT_FALSE(inferred_shape_error3.ok());
+  ASSERT_THAT(inferred_shape_error3.status().message(),
               HasSubstr("Expected array argument"));
 }
 
 TEST_F(ShapeInferenceTest, Call) {
-  auto inferred_status0 =
+  const absl::StatusOr<Shape> inferred_shape0 =
       ShapeInference::InferCallShape({}, ShapeUtil::MakeProgramShape({}, f32_));
-  EXPECT_IS_OK(inferred_status0.status());
-  EXPECT_TRUE(ShapeUtil::Equal(f32_, *inferred_status0));
+  EXPECT_IS_OK(inferred_shape0.status());
+  EXPECT_TRUE(ShapeUtil::Equal(f32_, *inferred_shape0));
 
-  auto inferred_status1 = ShapeInference::InferCallShape(
+  const absl::StatusOr<Shape> inferred_shape1 = ShapeInference::InferCallShape(
       {&f32_, &s32_, &pred_, &vector_32_, &matrix_32_48_},
       ShapeUtil::MakeProgramShape(
           {f32_, s32_, pred_, vector_32_, matrix_32_48_}, s32matrix_64_64_));
-  EXPECT_IS_OK(inferred_status1.status());
-  EXPECT_TRUE(ShapeUtil::Equal(s32matrix_64_64_, *inferred_status1));
-
-  auto inferred_status_error0 = ShapeInference::InferCallShape(
-      {}, ShapeUtil::MakeProgramShape({f32_}, f32_));
-  EXPECT_FALSE(inferred_status_error0.ok());
-  EXPECT_THAT(inferred_status_error0.status().message(),
+  EXPECT_IS_OK(inferred_shape1.status());
+  EXPECT_TRUE(ShapeUtil::Equal(s32matrix_64_64_, *inferred_shape1));
+
+  const absl::StatusOr<Shape> inferred_shape_error0 =
+      ShapeInference::InferCallShape({},
+                                     ShapeUtil::MakeProgramShape({f32_}, f32_));
+  EXPECT_FALSE(inferred_shape_error0.ok());
+  EXPECT_THAT(inferred_shape_error0.status().message(),
               HasSubstr("arity must match"));
 
-  auto inferred_status_error1 = ShapeInference::InferCallShape(
-      {&f32_}, ShapeUtil::MakeProgramShape({}, f32_));
-  EXPECT_FALSE(inferred_status_error1.ok());
-  EXPECT_THAT(inferred_status_error1.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error1 =
+      ShapeInference::InferCallShape({&f32_},
+                                     ShapeUtil::MakeProgramShape({}, f32_));
+  EXPECT_FALSE(inferred_shape_error1.ok());
+  EXPECT_THAT(inferred_shape_error1.status().message(),
               HasSubstr("arity must match"));
 
-  auto inferred_status_error2 = ShapeInference::InferCallShape(
-      {&f32_}, ShapeUtil::MakeProgramShape({s32_}, f32_));
-  EXPECT_FALSE(inferred_status_error2.ok());
-  EXPECT_THAT(inferred_status_error2.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error2 =
+      ShapeInference::InferCallShape({&f32_},
+                                     ShapeUtil::MakeProgramShape({s32_}, f32_));
+  EXPECT_FALSE(inferred_shape_error2.ok());
+  EXPECT_THAT(inferred_shape_error2.status().message(),
               HasSubstr("parameter must match argument"));
 }
 
 TEST_F(ShapeInferenceTest, Transpose) {
-  Shape a_shape = ShapeUtil::MakeShape(F32, {2, 3, 4, 5});
-  auto inferred_shape_and_status =
+  const Shape a_shape = ShapeUtil::MakeShape(F32, {2, 3, 4, 5});
+  const absl::StatusOr<Shape> inferred_shape_and_status =
       ShapeInference::InferTransposeShape(a_shape, {1, 2, 3, 0});
   EXPECT_IS_OK(inferred_shape_and_status);
   EXPECT_TRUE(ShapeUtil::Compatible(ShapeUtil::MakeShape(F32, {3, 4, 5, 2}),
@@ -2385,8 +2466,8 @@ TEST_F(ShapeInferenceTest, Transpose) {
 }
 
 TEST_F(ShapeInferenceTest, Rank1Transpose) {
-  Shape a_shape = ShapeUtil::MakeShape(F32, {5});
-  auto inferred_shape_and_status =
+  const Shape a_shape = ShapeUtil::MakeShape(F32, {5});
+  const absl::StatusOr<Shape> inferred_shape_and_status =
       ShapeInference::InferTransposeShape(a_shape, {0});
   EXPECT_IS_OK(inferred_shape_and_status);
   EXPECT_TRUE(ShapeUtil::Compatible(ShapeUtil::MakeShape(F32, {5}),
@@ -2394,193 +2475,211 @@ TEST_F(ShapeInferenceTest, Rank1Transpose) {
 }
 
 TEST_F(ShapeInferenceTest, ConditionalPred) {
-  auto inferred_status0 = ShapeInference::InferConditionalShape(
-      pred_,
-      {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
-       ShapeUtil::MakeProgramShape({vector_64_}, f32_)},
-      {vector_32_, vector_64_});
-  EXPECT_IS_OK(inferred_status0.status());
-  EXPECT_TRUE(ShapeUtil::Equal(f32_, *inferred_status0));
-
-  auto inferred_status1 = ShapeInference::InferConditionalShape(
-      pred_,
-      {ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_64_),
-       ShapeUtil::MakeProgramShape({vector_32_}, vector_64_)},
-      {matrix_32_48_, vector_32_});
-  EXPECT_IS_OK(inferred_status1.status());
-  EXPECT_TRUE(ShapeUtil::Equal(vector_64_, *inferred_status1));
-
-  auto tuple_f32_v32 = ShapeUtil::MakeTupleShape({f32_, vector_32_});
-  auto inferred_status2 = ShapeInference::InferConditionalShape(
-      pred_,
-      {ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_),
-       ShapeUtil::MakeProgramShape({tuple_f32_v32}, vector_32_)},
-      {matrix_32_48_, tuple_f32_v32});
-  EXPECT_IS_OK(inferred_status2.status());
-  EXPECT_TRUE(ShapeUtil::Equal(vector_32_, *inferred_status2));
-
-  auto inferred_status_error0 = ShapeInference::InferConditionalShape(
-      f32_,
-      {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
-       ShapeUtil::MakeProgramShape({vector_64_}, f32_)},
-      {vector_32_, vector_64_});
-  EXPECT_FALSE(inferred_status_error0.ok());
-  EXPECT_THAT(inferred_status_error0.status().message(),
+  const absl::StatusOr<Shape> inferred_shape0 =
+      ShapeInference::InferConditionalShape(
+          pred_,
+          {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+           ShapeUtil::MakeProgramShape({vector_64_}, f32_)},
+          {vector_32_, vector_64_});
+  EXPECT_IS_OK(inferred_shape0.status());
+  EXPECT_TRUE(ShapeUtil::Equal(f32_, *inferred_shape0));
+
+  const absl::StatusOr<Shape> inferred_shape1 =
+      ShapeInference::InferConditionalShape(
+          pred_,
+          {ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_64_),
+           ShapeUtil::MakeProgramShape({vector_32_}, vector_64_)},
+          {matrix_32_48_, vector_32_});
+  EXPECT_IS_OK(inferred_shape1.status());
+  EXPECT_TRUE(ShapeUtil::Equal(vector_64_, *inferred_shape1));
+
+  const auto tuple_f32_v32 = ShapeUtil::MakeTupleShape({f32_, vector_32_});
+  const absl::StatusOr<Shape> inferred_shape2 =
+      ShapeInference::InferConditionalShape(
+          pred_,
+          {ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_),
+           ShapeUtil::MakeProgramShape({tuple_f32_v32}, vector_32_)},
+          {matrix_32_48_, tuple_f32_v32});
+  EXPECT_IS_OK(inferred_shape2.status());
+  EXPECT_TRUE(ShapeUtil::Equal(vector_32_, *inferred_shape2));
+
+  const absl::StatusOr<Shape> inferred_shape_error0 =
+      ShapeInference::InferConditionalShape(
+          f32_,
+          {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+           ShapeUtil::MakeProgramShape({vector_64_}, f32_)},
+          {vector_32_, vector_64_});
+  EXPECT_FALSE(inferred_shape_error0.ok());
+  EXPECT_THAT(inferred_shape_error0.status().message(),
               HasSubstr("must be bool or int32_t"));
 
-  auto inferred_status_error1 = ShapeInference::InferConditionalShape(
-      pred_,
-      {ShapeUtil::MakeProgramShape({f32_, vector_32_}, vector_32_),
-       ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_)},
-      {ShapeUtil::MakeTupleShape({f32_, vector_32_}), matrix_32_48_});
-  EXPECT_FALSE(inferred_status_error1.ok());
-  EXPECT_THAT(inferred_status_error1.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error1 =
+      ShapeInference::InferConditionalShape(
+          pred_,
+          {ShapeUtil::MakeProgramShape({f32_, vector_32_}, vector_32_),
+           ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_)},
+          {ShapeUtil::MakeTupleShape({f32_, vector_32_}), matrix_32_48_});
+  EXPECT_FALSE(inferred_shape_error1.ok());
+  EXPECT_THAT(inferred_shape_error1.status().message(),
               HasSubstr("branch computation 0 must take 1 argument"));
 
-  auto inferred_status_error2 = ShapeInference::InferConditionalShape(
-      pred_,
-      {ShapeUtil::MakeProgramShape({vector_64_}, f32_),
-       ShapeUtil::MakeProgramShape({vector_64_}, f32_)},
-      {vector_32_, vector_64_});
-  EXPECT_FALSE(inferred_status_error2.ok());
-  EXPECT_THAT(inferred_status_error2.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error2 =
+      ShapeInference::InferConditionalShape(
+          pred_,
+          {ShapeUtil::MakeProgramShape({vector_64_}, f32_),
+           ShapeUtil::MakeProgramShape({vector_64_}, f32_)},
+          {vector_32_, vector_64_});
+  EXPECT_FALSE(inferred_shape_error2.ok());
+  EXPECT_THAT(inferred_shape_error2.status().message(),
               HasSubstr("branch operand 0 must match the shape of the only "
                         "parameter of branch computation 0"));
 
-  auto inferred_status_error3 = ShapeInference::InferConditionalShape(
-      pred_,
-      {ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_),
-       ShapeUtil::MakeProgramShape({f32_, vector_32_}, vector_32_)},
-      {matrix_32_48_, ShapeUtil::MakeTupleShape({f32_, vector_32_})});
-  EXPECT_FALSE(inferred_status_error3.ok());
-  EXPECT_THAT(inferred_status_error3.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error3 =
+      ShapeInference::InferConditionalShape(
+          pred_,
+          {ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_),
+           ShapeUtil::MakeProgramShape({f32_, vector_32_}, vector_32_)},
+          {matrix_32_48_, ShapeUtil::MakeTupleShape({f32_, vector_32_})});
+  EXPECT_FALSE(inferred_shape_error3.ok());
+  EXPECT_THAT(inferred_shape_error3.status().message(),
               HasSubstr("branch computation 1 must take 1 argument"));
 
-  auto inferred_status_error4 = ShapeInference::InferConditionalShape(
-      pred_,
-      {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
-       ShapeUtil::MakeProgramShape({vector_32_}, f32_)},
-      {vector_32_, vector_64_});
-  EXPECT_FALSE(inferred_status_error4.ok());
-  EXPECT_THAT(inferred_status_error4.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error4 =
+      ShapeInference::InferConditionalShape(
+          pred_,
+          {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+           ShapeUtil::MakeProgramShape({vector_32_}, f32_)},
+          {vector_32_, vector_64_});
+  EXPECT_FALSE(inferred_shape_error4.ok());
+  EXPECT_THAT(inferred_shape_error4.status().message(),
               HasSubstr("branch operand 1 must match the shape of the only "
                         "parameter of branch computation 1"));
 
-  auto inferred_status_error5 = ShapeInference::InferConditionalShape(
-      pred_,
-      {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
-       ShapeUtil::MakeProgramShape({vector_64_}, vector_32_)},
-      {vector_32_, vector_64_});
-  EXPECT_FALSE(inferred_status_error5.ok());
-  EXPECT_THAT(inferred_status_error5.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error5 =
+      ShapeInference::InferConditionalShape(
+          pred_,
+          {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+           ShapeUtil::MakeProgramShape({vector_64_}, vector_32_)},
+          {vector_32_, vector_64_});
+  EXPECT_FALSE(inferred_shape_error5.ok());
+  EXPECT_THAT(inferred_shape_error5.status().message(),
               HasSubstr("the result of branch 0 computation and branch 1 "
                         "computation must have the same shape"));
 }
 
 TEST_F(ShapeInferenceTest, ConditionalIndexed) {
-  auto r0s32 = ShapeUtil::MakeShape(S32, {});
-  auto inferred_status0 = ShapeInference::InferConditionalShape(
-      r0s32,
-      {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
-       ShapeUtil::MakeProgramShape({vector_64_}, f32_),
-       ShapeUtil::MakeProgramShape({vector_64_}, f32_)},
-      {vector_32_, vector_64_, vector_64_});
-  EXPECT_IS_OK(inferred_status0.status());
-  EXPECT_TRUE(ShapeUtil::Equal(f32_, *inferred_status0));
-
-  auto inferred_status1 = ShapeInference::InferConditionalShape(
-      r0s32,
-      {ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_64_),
-       ShapeUtil::MakeProgramShape({vector_32_}, vector_64_),
-       ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_64_)},
-      {matrix_32_48_, vector_32_, matrix_32_48_});
-  EXPECT_IS_OK(inferred_status1.status());
-  EXPECT_TRUE(ShapeUtil::Equal(vector_64_, *inferred_status1));
-
-  auto tuple_f32_v32 = ShapeUtil::MakeTupleShape({f32_, vector_32_});
-  auto inferred_status2 = ShapeInference::InferConditionalShape(
-      r0s32, {ShapeUtil::MakeProgramShape({tuple_f32_v32}, vector_32_)},
-      {tuple_f32_v32});
-  EXPECT_IS_OK(inferred_status2.status());
-  EXPECT_TRUE(ShapeUtil::Equal(vector_32_, *inferred_status2));
-
-  auto inferred_status_error0 = ShapeInference::InferConditionalShape(
-      pred_,
-      {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
-       ShapeUtil::MakeProgramShape({vector_32_}, f32_),
-       ShapeUtil::MakeProgramShape({vector_64_}, f32_)},
-      {vector_32_, vector_32_, vector_64_});
-  EXPECT_FALSE(inferred_status_error0.ok());
-  EXPECT_THAT(inferred_status_error0.status().message(),
+  const Shape r0s32 = ShapeUtil::MakeShape(S32, {});
+  const absl::StatusOr<Shape> inferred_shape0 =
+      ShapeInference::InferConditionalShape(
+          r0s32,
+          {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+           ShapeUtil::MakeProgramShape({vector_64_}, f32_),
+           ShapeUtil::MakeProgramShape({vector_64_}, f32_)},
+          {vector_32_, vector_64_, vector_64_});
+  EXPECT_IS_OK(inferred_shape0.status());
+  EXPECT_TRUE(ShapeUtil::Equal(f32_, *inferred_shape0));
+
+  const absl::StatusOr<Shape> inferred_shape1 =
+      ShapeInference::InferConditionalShape(
+          r0s32,
+          {ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_64_),
+           ShapeUtil::MakeProgramShape({vector_32_}, vector_64_),
+           ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_64_)},
+          {matrix_32_48_, vector_32_, matrix_32_48_});
+  EXPECT_IS_OK(inferred_shape1.status());
+  EXPECT_TRUE(ShapeUtil::Equal(vector_64_, *inferred_shape1));
+
+  const auto tuple_f32_v32 = ShapeUtil::MakeTupleShape({f32_, vector_32_});
+  const absl::StatusOr<Shape> inferred_shape2 =
+      ShapeInference::InferConditionalShape(
+          r0s32, {ShapeUtil::MakeProgramShape({tuple_f32_v32}, vector_32_)},
+          {tuple_f32_v32});
+  EXPECT_IS_OK(inferred_shape2.status());
+  EXPECT_TRUE(ShapeUtil::Equal(vector_32_, *inferred_shape2));
+
+  const absl::StatusOr<Shape> inferred_shape_error0 =
+      ShapeInference::InferConditionalShape(
+          pred_,
+          {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+           ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+           ShapeUtil::MakeProgramShape({vector_64_}, f32_)},
+          {vector_32_, vector_32_, vector_64_});
+  EXPECT_FALSE(inferred_shape_error0.ok());
+  EXPECT_THAT(inferred_shape_error0.status().message(),
               HasSubstr("2 == branch_computations.size()"));
 
-  auto inferred_status_error1 = ShapeInference::InferConditionalShape(
-      r0s32,
-      {ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_),
-       ShapeUtil::MakeProgramShape({f32_, vector_32_}, vector_32_),
-       ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_)},
-      {matrix_32_48_, ShapeUtil::MakeTupleShape({f32_, vector_32_}),
-       matrix_32_48_});
-  EXPECT_FALSE(inferred_status_error1.ok());
-  EXPECT_THAT(inferred_status_error1.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error1 =
+      ShapeInference::InferConditionalShape(
+          r0s32,
+          {ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_),
+           ShapeUtil::MakeProgramShape({f32_, vector_32_}, vector_32_),
+           ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_)},
+          {matrix_32_48_, ShapeUtil::MakeTupleShape({f32_, vector_32_}),
+           matrix_32_48_});
+  EXPECT_FALSE(inferred_shape_error1.ok());
+  EXPECT_THAT(inferred_shape_error1.status().message(),
               HasSubstr("branch computation 1 must take 1 argument"));
 
-  auto inferred_status_error2 = ShapeInference::InferConditionalShape(
-      r0s32,
-      {ShapeUtil::MakeProgramShape({r0s32}, f32_),
-       ShapeUtil::MakeProgramShape({vector_32_}, f32_),
-       ShapeUtil::MakeProgramShape({vector_32_}, f32_)},
-      {r0s32, vector_32_, vector_64_});
-  EXPECT_FALSE(inferred_status_error2.ok());
-  EXPECT_THAT(inferred_status_error2.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error2 =
+      ShapeInference::InferConditionalShape(
+          r0s32,
+          {ShapeUtil::MakeProgramShape({r0s32}, f32_),
+           ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+           ShapeUtil::MakeProgramShape({vector_32_}, f32_)},
+          {r0s32, vector_32_, vector_64_});
+  EXPECT_FALSE(inferred_shape_error2.ok());
+  EXPECT_THAT(inferred_shape_error2.status().message(),
               HasSubstr("branch operand 2 must match the shape of the only "
                         "parameter of branch computation 2"));
 
-  auto inferred_status_error3 = ShapeInference::InferConditionalShape(
-      r0s32,
-      {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
-       ShapeUtil::MakeProgramShape({vector_32_}, f32_),
-       ShapeUtil::MakeProgramShape({vector_32_}, f32_),
-       ShapeUtil::MakeProgramShape({vector_64_}, vector_32_)},
-      {vector_32_, vector_32_, vector_32_, vector_64_});
-  EXPECT_FALSE(inferred_status_error3.ok());
-  EXPECT_THAT(inferred_status_error3.status().message(),
+  const absl::StatusOr<Shape> inferred_shape_error3 =
+      ShapeInference::InferConditionalShape(
+          r0s32,
+          {ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+           ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+           ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+           ShapeUtil::MakeProgramShape({vector_64_}, vector_32_)},
+          {vector_32_, vector_32_, vector_32_, vector_64_});
+  EXPECT_FALSE(inferred_shape_error3.ok());
+  EXPECT_THAT(inferred_shape_error3.status().message(),
               HasSubstr("the result of branch 0 computation and branch 3 "
                         "computation must have the same shape"));
 
-  auto inferred_status_error4 =
+  const absl::StatusOr<Shape> inferred_shape_error4 =
       ShapeInference::InferConditionalShape(r0s32, {}, {});
-  EXPECT_FALSE(inferred_status_error4.ok());
-  EXPECT_THAT(inferred_status_error4.status().message(),
+  EXPECT_FALSE(inferred_shape_error4.ok());
+  EXPECT_THAT(inferred_shape_error4.status().message(),
               HasSubstr("!branch_computations.empty()"));
 }
 
 TEST_F(ShapeInferenceTest, ConditionalDynamic) {
-  auto r0s32 = ShapeUtil::MakeShape(S32, {});
-  auto static_shape = ShapeUtil::MakeShape(S32, {4}, {false});
-  auto dynamic_shape = ShapeUtil::MakeShape(S32, {4}, {true});
-  auto inferred_status0 = ShapeInference::InferConditionalShape(
-      r0s32,
-      {ShapeUtil::MakeProgramShape({vector_32_}, static_shape),
-       ShapeUtil::MakeProgramShape({vector_64_}, dynamic_shape),
-       ShapeUtil::MakeProgramShape({vector_64_}, dynamic_shape)},
-      {vector_32_, vector_64_, vector_64_});
-  EXPECT_IS_OK(inferred_status0.status());
-  EXPECT_TRUE(ShapeUtil::Equal(dynamic_shape, *inferred_status0));
-
-  auto inferred_status1 = ShapeInference::InferConditionalShape(
-      r0s32,
-      {ShapeUtil::MakeProgramShape({vector_32_}, dynamic_shape),
-       ShapeUtil::MakeProgramShape({vector_64_}, static_shape),
-       ShapeUtil::MakeProgramShape({vector_64_}, dynamic_shape)},
-      {vector_32_, vector_64_, vector_64_});
-  EXPECT_IS_OK(inferred_status1.status());
-  EXPECT_TRUE(ShapeUtil::Equal(dynamic_shape, *inferred_status1));
+  const Shape r0s32 = ShapeUtil::MakeShape(S32, {});
+  const Shape static_shape = ShapeUtil::MakeShape(S32, {4}, {false});
+  const Shape dynamic_shape = ShapeUtil::MakeShape(S32, {4}, {true});
+  const absl::StatusOr<Shape> inferred_shape0 =
+      ShapeInference::InferConditionalShape(
+          r0s32,
+          {ShapeUtil::MakeProgramShape({vector_32_}, static_shape),
+           ShapeUtil::MakeProgramShape({vector_64_}, dynamic_shape),
+           ShapeUtil::MakeProgramShape({vector_64_}, dynamic_shape)},
+          {vector_32_, vector_64_, vector_64_});
+  EXPECT_IS_OK(inferred_shape0.status());
+  EXPECT_TRUE(ShapeUtil::Equal(dynamic_shape, *inferred_shape0));
+
+  const absl::StatusOr<Shape> inferred_shape1 =
+      ShapeInference::InferConditionalShape(
+          r0s32,
+          {ShapeUtil::MakeProgramShape({vector_32_}, dynamic_shape),
+           ShapeUtil::MakeProgramShape({vector_64_}, static_shape),
+           ShapeUtil::MakeProgramShape({vector_64_}, dynamic_shape)},
+          {vector_32_, vector_64_, vector_64_});
+  EXPECT_IS_OK(inferred_shape1.status());
+  EXPECT_TRUE(ShapeUtil::Equal(dynamic_shape, *inferred_shape1));
 }
 
 TEST_F(ShapeInferenceTest, BadSlice) {
-  auto arg = ShapeUtil::MakeShape(F32, {4});
-  absl::StatusOr<Shape> statusor =
+  const Shape arg = ShapeUtil::MakeShape(F32, {4});
+  const absl::StatusOr<Shape> statusor =
       ShapeInference::InferSliceShape(arg, {0}, {5}, {1});
   ASSERT_FALSE(statusor.ok());
 
@@ -2594,9 +2693,9 @@ TEST_F(ShapeInferenceTest, BadSlice) {
 }
 
 TEST_F(ShapeInferenceTest, BadSort) {
-  auto keys = ShapeUtil::MakeShape(F32, {4});
-  auto values = ShapeUtil::MakeShape(F32, {5});
-  absl::StatusOr<Shape> statusor =
+  const Shape keys = ShapeUtil::MakeShape(F32, {4});
+  const Shape values = ShapeUtil::MakeShape(F32, {5});
+  const absl::StatusOr<Shape> statusor =
       ShapeInference::InferVariadicOpShape(HloOpcode::kSort, {&keys, &values});
   EXPECT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().message(), HasSubstr("dimensions must match"))
@@ -2604,10 +2703,10 @@ TEST_F(ShapeInferenceTest, BadSort) {
 }
 
 TEST_F(ShapeInferenceTest, BadSortValuesMismatch) {
-  auto keys = ShapeUtil::MakeShape(F32, {4});
-  auto values_good = ShapeUtil::MakeShape(F32, {4});
-  auto values_bad = ShapeUtil::MakeShape(F32, {5});
-  absl::StatusOr<Shape> statusor = ShapeInference::InferVariadicOpShape(
+  const Shape keys = ShapeUtil::MakeShape(F32, {4});
+  const Shape values_good = ShapeUtil::MakeShape(F32, {4});
+  const Shape values_bad = ShapeUtil::MakeShape(F32, {5});
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferVariadicOpShape(
       HloOpcode::kSort, {&keys, &values_good, &values_bad});
   EXPECT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().message(), HasSubstr("dimensions must match"))
@@ -2615,21 +2714,22 @@ TEST_F(ShapeInferenceTest, BadSortValuesMismatch) {
 }
 
 TEST_F(ShapeInferenceTest, SortManyValues) {
-  auto keys = ShapeUtil::MakeShape(F32, {4});
-  auto values_s32 = ShapeUtil::MakeShape(S32, {4});
-  auto values_u32 = ShapeUtil::MakeShape(U32, {4});
-  absl::StatusOr<Shape> statusor = ShapeInference::InferVariadicOpShape(
+  const Shape keys = ShapeUtil::MakeShape(F32, {4});
+  const Shape values_s32 = ShapeUtil::MakeShape(S32, {4});
+  const Shape values_u32 = ShapeUtil::MakeShape(U32, {4});
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferVariadicOpShape(
       HloOpcode::kSort, {&keys, &values_s32, &values_u32});
   EXPECT_IS_OK(statusor);
-  Shape inferred_shape = *statusor;
+  const Shape inferred_shape = *statusor;
   EXPECT_TRUE(ShapeUtil::Compatible(
       inferred_shape,
       ShapeUtil::MakeTupleShape({keys, values_s32, values_u32})));
 }
 
 TEST_F(ShapeInferenceTest, GoodTopK) {
-  auto input = ShapeUtil::MakeShape(F32, {3, 4, 5});
-  absl::StatusOr<Shape> s = ShapeInference::InferTopKShape(input, /*k=*/2);
+  const Shape input = ShapeUtil::MakeShape(F32, {3, 4, 5});
+  const absl::StatusOr<Shape> s =
+      ShapeInference::InferTopKShape(input, /*k=*/2);
   ASSERT_IS_OK(s.status());
   ASSERT_TRUE(ShapeUtil::Equal(
       *s, ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {3, 4, 2}),
@@ -2637,8 +2737,8 @@ TEST_F(ShapeInferenceTest, GoodTopK) {
 }
 
 TEST_F(ShapeInferenceTest, FailTopKLargeK) {
-  auto input = ShapeUtil::MakeShape(F32, {3, 4, 5});
-  absl::StatusOr<Shape> statusor =
+  const Shape input = ShapeUtil::MakeShape(F32, {3, 4, 5});
+  const absl::StatusOr<Shape> statusor =
       ShapeInference::InferTopKShape(input, /*k=*/10);
   EXPECT_FALSE(statusor.ok());
 }
@@ -2648,7 +2748,7 @@ TEST_F(ShapeInferenceTest, InferStochasticConvertShape) {
   const Shape random = ShapeUtil::MakeShape(U32, {4, 3});
   const Shape expected_shape = ShapeUtil::MakeShape(S8, {4, 3});
 
-  auto inferred_sr_shape =
+  const absl::StatusOr<Shape> inferred_sr_shape =
       ShapeInference::InferStochasticConvertShape(operand, random, S8);
   EXPECT_TRUE(inferred_sr_shape.ok());
   EXPECT_TRUE(ShapeUtil::Equal(*inferred_sr_shape, expected_shape));
@@ -2659,7 +2759,7 @@ TEST_F(ShapeInferenceTest, InvalidStochasticConvert_MismatchRandomElementType) {
   const Shape random = ShapeUtil::MakeShape(U16, {4, 3});
   const Shape expected_shape = ShapeUtil::MakeShape(S8, {4, 3});
 
-  auto status_or =
+  const auto status_or =
       ShapeInference::InferStochasticConvertShape(operand, random, S8);
   ASSERT_FALSE(status_or.ok());
   EXPECT_THAT(
@@ -2674,7 +2774,7 @@ TEST_F(ShapeInferenceTest,
   const Shape random = ShapeUtil::MakeShape(S32, {4, 3});
   const Shape expected_shape = ShapeUtil::MakeShape(S8, {4, 3});
 
-  auto status_or =
+  const auto status_or =
       ShapeInference::InferStochasticConvertShape(operand, random, S8);
   ASSERT_FALSE(status_or.ok());
   EXPECT_THAT(
@@ -2703,7 +2803,7 @@ class GatherShapeInferenceTest : public ShapeInferenceTest {
 };
 
 TEST_F(GatherShapeInferenceTest, TensorFlowGather) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape gather_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape gather_shape,
                           ShapeInference::InferGatherShape(
                               matrix_64_48_, s64_vector_32_,
                               HloGatherInstruction::MakeGatherDimNumbers(
@@ -2718,7 +2818,7 @@ TEST_F(GatherShapeInferenceTest, TensorFlowGather) {
 }
 
 TEST_F(GatherShapeInferenceTest, TensorFlowGatherV2) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape gather_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape gather_shape,
                           ShapeInference::InferGatherShape(
                               matrix_64_48_, s64_vector_32_,
                               HloGatherInstruction::MakeGatherDimNumbers(
@@ -2733,7 +2833,7 @@ TEST_F(GatherShapeInferenceTest, TensorFlowGatherV2) {
 }
 
 TEST_F(GatherShapeInferenceTest, TensorFlowGatherNd) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape gather_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape gather_shape,
                           ShapeInference::InferGatherShape(
                               matrix_64_48_, s64_4d_tensor_10_9_8_7_1_,
                               HloGatherInstruction::MakeGatherDimNumbers(
@@ -2749,7 +2849,7 @@ TEST_F(GatherShapeInferenceTest, TensorFlowGatherNd) {
 
 TEST_F(GatherShapeInferenceTest, TensorFlowBatchDynamicSlice) {
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape gather_shape,
+      const Shape gather_shape,
       ShapeInference::InferGatherShape(
           f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
           HloGatherInstruction::MakeGatherDimNumbers(
@@ -2766,7 +2866,7 @@ TEST_F(GatherShapeInferenceTest, TensorFlowBatchDynamicSlice) {
 
 TEST_F(GatherShapeInferenceTest, DynamicGatherEntireDimension) {
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape gather_shape,
+      const Shape gather_shape,
       ShapeInference::InferGatherShape(
           ShapeUtil::MakeShape(F32, {3, 2, 1}, {false, true, false}),
           ShapeUtil::MakeShape(S64, {}),
@@ -2783,7 +2883,7 @@ TEST_F(GatherShapeInferenceTest, DynamicGatherEntireDimension) {
 
 TEST_F(GatherShapeInferenceTest, DynamicGatherCollapsedDimension) {
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape gather_shape,
+      const Shape gather_shape,
       ShapeInference::InferGatherShape(
           ShapeUtil::MakeShape(F32, {3, 2, 1}, {true, false, false}),
           ShapeUtil::MakeShape(S64, {}),
@@ -2800,7 +2900,7 @@ TEST_F(GatherShapeInferenceTest, DynamicGatherCollapsedDimension) {
 
 TEST_F(GatherShapeInferenceTest, DynamicIndices) {
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape gather_shape,
+      const Shape gather_shape,
       ShapeInference::InferGatherShape(
           ShapeUtil::MakeShape(F32, {3, 2, 2}),
           ShapeUtil::MakeShape(S64, {3, 4, 2}, {false, true, false}),
@@ -2818,7 +2918,7 @@ TEST_F(GatherShapeInferenceTest, DynamicIndices) {
 
 TEST_F(GatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_A) {
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape gather_shape,
+      const Shape gather_shape,
       ShapeInference::InferGatherShape(
           f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_5_7_6_,
           HloGatherInstruction::MakeGatherDimNumbers(
@@ -2836,7 +2936,7 @@ TEST_F(GatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_A) {
 
 TEST_F(GatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_B) {
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape gather_shape,
+      const Shape gather_shape,
       ShapeInference::InferGatherShape(
           f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_5_10_9_7_6_,
           HloGatherInstruction::MakeGatherDimNumbers(
@@ -2854,7 +2954,7 @@ TEST_F(GatherShapeInferenceTest, NonDefaultGatherIndicesLeafDim_B) {
 
 TEST_F(GatherShapeInferenceTest, NoOutputGatherDims) {
   // This is equivalent to a dynamic slice.
-  TF_ASSERT_OK_AND_ASSIGN(Shape gather_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape gather_shape,
                           ShapeInference::InferGatherShape(
                               f32_5d_tensor_50_49_48_47_46_, s64_vector_5_,
                               HloGatherInstruction::MakeGatherDimNumbers(
@@ -2872,7 +2972,7 @@ TEST_F(GatherShapeInferenceTest, NoOutputGatherDims) {
 TEST_F(GatherShapeInferenceTest, ScalarGatherIndices) {
   // The gather indices "tensor" is a scalar S here that's used to slice out
   // [S,0,0,0,0]..[S,30,29,28,27] into a [30,29,28,27] shaped result.
-  TF_ASSERT_OK_AND_ASSIGN(Shape gather_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape gather_shape,
                           ShapeInference::InferGatherShape(
                               f32_5d_tensor_50_49_48_47_46_, s64_scalar_,
                               HloGatherInstruction::MakeGatherDimNumbers(
@@ -2888,7 +2988,7 @@ TEST_F(GatherShapeInferenceTest, ScalarGatherIndices) {
 }
 
 TEST_F(GatherShapeInferenceTest, TupleShapedTensorInput) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       tuple_shape_, s64_vector_32_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{0},
@@ -2903,7 +3003,7 @@ TEST_F(GatherShapeInferenceTest, TupleShapedTensorInput) {
 }
 
 TEST_F(GatherShapeInferenceTest, TupleShapedGatherIndicesInput) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       s64_vector_32_, tuple_shape_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{0},
@@ -2918,7 +3018,7 @@ TEST_F(GatherShapeInferenceTest, TupleShapedGatherIndicesInput) {
 }
 
 TEST_F(GatherShapeInferenceTest, FloatingPointGatherIndicesInput) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       s64_vector_32_, vector_32_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{0},
@@ -2934,7 +3034,7 @@ TEST_F(GatherShapeInferenceTest, FloatingPointGatherIndicesInput) {
 
 TEST_F(GatherShapeInferenceTest,
        InvalidGatherDimNumbers_NonAscendingWindowIndices) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{4, 5, 6, 8, 7},
@@ -2951,7 +3051,7 @@ TEST_F(GatherShapeInferenceTest,
 
 TEST_F(GatherShapeInferenceTest,
        InvalidGatherDimNumbers_RepeatedWindowIndices) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{4, 5, 6, 7, 7},
@@ -2968,7 +3068,7 @@ TEST_F(GatherShapeInferenceTest,
 
 TEST_F(GatherShapeInferenceTest,
        InvalidGatherDimNumbers_WindowIndexOutOfBounds) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{4, 5, 99, 100, 101},
@@ -2984,7 +3084,7 @@ TEST_F(GatherShapeInferenceTest,
 
 TEST_F(GatherShapeInferenceTest,
        InvalidGatherDimNumbers_WindowIndexBarelyOutOfBounds) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{4, 5, 6, 7, 9},
@@ -3000,7 +3100,7 @@ TEST_F(GatherShapeInferenceTest,
 
 TEST_F(GatherShapeInferenceTest,
        InvalidGatherDimNumbers_MismatchingElidedWindowDims) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{4, 5, 6, 7, 8},
@@ -3018,7 +3118,7 @@ TEST_F(GatherShapeInferenceTest,
 
 TEST_F(GatherShapeInferenceTest,
        InvalidGatherDimNumbers_OutOfBoundsWindowToInputMapping) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{4, 5, 6, 7, 8},
@@ -3035,7 +3135,7 @@ TEST_F(GatherShapeInferenceTest,
 
 TEST_F(GatherShapeInferenceTest,
        InvalidGatherDimNumbers_RepeatedWindowToInputMapping) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{4, 5, 6, 7, 8},
@@ -3052,7 +3152,7 @@ TEST_F(GatherShapeInferenceTest,
 
 TEST_F(GatherShapeInferenceTest,
        InvalidGatherDimNumbers_MismatchingGatherToInputMapping) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{4, 5, 6, 7, 8},
@@ -3070,7 +3170,7 @@ TEST_F(GatherShapeInferenceTest,
 
 TEST_F(GatherShapeInferenceTest,
        InvalidGatherDimNumbers_OutOfBoundsGatherToInputMapping) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{4, 5, 6, 7, 8},
@@ -3086,7 +3186,7 @@ TEST_F(GatherShapeInferenceTest,
 
 TEST_F(GatherShapeInferenceTest,
        InvalidGatherDimNumbers_RepeatedGatherToInputMapping) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{4, 5, 6, 7, 8},
@@ -3103,7 +3203,7 @@ TEST_F(GatherShapeInferenceTest,
 
 TEST_F(GatherShapeInferenceTest,
        InvalidGatherDimNumbers_NonAscendingElidedWindowDims) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{4, 5, 6, 7, 8},
@@ -3118,7 +3218,7 @@ TEST_F(GatherShapeInferenceTest,
 }
 
 TEST_F(GatherShapeInferenceTest, InvalidGatherDimNumbers_WindowBoundsTooLarge) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{4, 5, 6, 7},
@@ -3135,7 +3235,7 @@ TEST_F(GatherShapeInferenceTest, InvalidGatherDimNumbers_WindowBoundsTooLarge) {
 
 TEST_F(GatherShapeInferenceTest,
        InvalidGatherDimNumbers_MismatchingNumberOfWindowBounds) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{4, 5, 6, 7, 8},
@@ -3152,7 +3252,7 @@ TEST_F(GatherShapeInferenceTest,
 
 TEST_F(GatherShapeInferenceTest,
        InvalidGatherDimNumbers_WindowBoundsNot1ForElidedDim) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_8_7_5_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{4, 5, 6, 7},
@@ -3169,7 +3269,7 @@ TEST_F(GatherShapeInferenceTest,
 }
 
 TEST_F(GatherShapeInferenceTest, OutOfBoundsGatherIndicesLeafDim) {
-  absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferGatherShape(
       f32_5d_tensor_50_49_48_47_46_, s64_4d_tensor_10_9_5_7_6_,
       HloGatherInstruction::MakeGatherDimNumbers(
           /*offset_dims=*/{4, 5, 6, 7, 8},
@@ -3254,8 +3354,8 @@ class ScatterShapeInferenceTest
 };
 
 TEST_P(ScatterShapeInferenceTest, TfScatterWithFullUpdates) {
-  auto shapes = CreateShapes({64, 48}, s64_vector(32), {64, 32}, types());
-  TF_ASSERT_OK_AND_ASSIGN(Shape scatter_shape,
+  const auto shapes = CreateShapes({64, 48}, s64_vector(32), {64, 32}, types());
+  TF_ASSERT_OK_AND_ASSIGN(const Shape scatter_shape,
                           ShapeInference::InferScatterShape(
                               shapes.ptrs, to_apply(types()),
                               HloScatterInstruction::MakeScatterDimNumbers(
@@ -3268,8 +3368,8 @@ TEST_P(ScatterShapeInferenceTest, TfScatterWithFullUpdates) {
 }
 
 TEST_P(ScatterShapeInferenceTest, TfScatterWithFullUpdatesV2) {
-  auto shapes = CreateShapes({64, 48}, s64_vector(32), {32, 48}, types());
-  TF_ASSERT_OK_AND_ASSIGN(Shape scatter_shape,
+  const auto shapes = CreateShapes({64, 48}, s64_vector(32), {32, 48}, types());
+  TF_ASSERT_OK_AND_ASSIGN(const Shape scatter_shape,
                           ShapeInference::InferScatterShape(
                               shapes.ptrs, to_apply(types()),
                               HloScatterInstruction::MakeScatterDimNumbers(
@@ -3282,8 +3382,8 @@ TEST_P(ScatterShapeInferenceTest, TfScatterWithFullUpdatesV2) {
 }
 
 TEST_P(ScatterShapeInferenceTest, TfScatterWithPartialUpdates) {
-  auto shapes = CreateShapes({64, 48}, s64_vector(32), {10, 32}, types());
-  TF_ASSERT_OK_AND_ASSIGN(Shape scatter_shape,
+  const auto shapes = CreateShapes({64, 48}, s64_vector(32), {10, 32}, types());
+  TF_ASSERT_OK_AND_ASSIGN(const Shape scatter_shape,
                           ShapeInference::InferScatterShape(
                               shapes.ptrs, to_apply(types()),
                               HloScatterInstruction::MakeScatterDimNumbers(
@@ -3296,8 +3396,8 @@ TEST_P(ScatterShapeInferenceTest, TfScatterWithPartialUpdates) {
 }
 
 TEST_P(ScatterShapeInferenceTest, TfScatterWithPartialUpdatesV2) {
-  auto shapes = CreateShapes({64, 48}, s64_vector(32), {32, 8}, types());
-  TF_ASSERT_OK_AND_ASSIGN(Shape scatter_shape,
+  const auto shapes = CreateShapes({64, 48}, s64_vector(32), {32, 8}, types());
+  TF_ASSERT_OK_AND_ASSIGN(const Shape scatter_shape,
                           ShapeInference::InferScatterShape(
                               shapes.ptrs, to_apply(types()),
                               HloScatterInstruction::MakeScatterDimNumbers(
@@ -3310,8 +3410,8 @@ TEST_P(ScatterShapeInferenceTest, TfScatterWithPartialUpdatesV2) {
 }
 
 TEST_P(ScatterShapeInferenceTest, TfScatterWithUpdatesBiggerThanInput) {
-  auto shapes = CreateShapes({64, 48}, s64_vector(32), {65, 32}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes = CreateShapes({64, 48}, s64_vector(32), {65, 32}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{0},
@@ -3327,8 +3427,8 @@ TEST_P(ScatterShapeInferenceTest, TfScatterWithUpdatesBiggerThanInput) {
 }
 
 TEST_P(ScatterShapeInferenceTest, TfScatterWithUpdatesBiggerThanInputV2) {
-  auto shapes = CreateShapes({64, 48}, s64_vector(32), {32, 49}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes = CreateShapes({64, 48}, s64_vector(32), {32, 49}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{1},
@@ -3344,8 +3444,8 @@ TEST_P(ScatterShapeInferenceTest, TfScatterWithUpdatesBiggerThanInputV2) {
 }
 
 TEST_P(ScatterShapeInferenceTest, TfScatterWithUpdatesNotMatchingIndices) {
-  auto shapes = CreateShapes({64, 48}, s64_vector(32), {64, 31}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes = CreateShapes({64, 48}, s64_vector(32), {64, 31}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{0},
@@ -3362,8 +3462,8 @@ TEST_P(ScatterShapeInferenceTest, TfScatterWithUpdatesNotMatchingIndices) {
 }
 
 TEST_P(ScatterShapeInferenceTest, TfScatterWithUpdatesNotMatchingIndicesV2) {
-  auto shapes = CreateShapes({64, 48}, s64_vector(32), {31, 48}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes = CreateShapes({64, 48}, s64_vector(32), {31, 48}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{1},
@@ -3380,9 +3480,9 @@ TEST_P(ScatterShapeInferenceTest, TfScatterWithUpdatesNotMatchingIndicesV2) {
 }
 
 TEST_P(ScatterShapeInferenceTest, TfScatterNdWithFullUpdates) {
-  auto shapes = CreateShapes({64, 48}, s64_tensor({10, 9, 8, 7, 1}),
-                             {10, 9, 8, 7, 48}, types());
-  TF_ASSERT_OK_AND_ASSIGN(Shape scatter_shape,
+  const auto shapes = CreateShapes({64, 48}, s64_tensor({10, 9, 8, 7, 1}),
+                                   {10, 9, 8, 7, 48}, types());
+  TF_ASSERT_OK_AND_ASSIGN(const Shape scatter_shape,
                           ShapeInference::InferScatterShape(
                               shapes.ptrs, to_apply(types()),
                               HloScatterInstruction::MakeScatterDimNumbers(
@@ -3395,9 +3495,9 @@ TEST_P(ScatterShapeInferenceTest, TfScatterNdWithFullUpdates) {
 }
 
 TEST_P(ScatterShapeInferenceTest, TfScatterNdWithFullUpdatesV2) {
-  auto shapes = CreateShapes({64, 48}, s64_tensor({10, 9, 8, 7, 1}),
-                             {10, 9, 8, 7, 64}, types());
-  TF_ASSERT_OK_AND_ASSIGN(Shape scatter_shape,
+  const auto shapes = CreateShapes({64, 48}, s64_tensor({10, 9, 8, 7, 1}),
+                                   {10, 9, 8, 7, 64}, types());
+  TF_ASSERT_OK_AND_ASSIGN(const Shape scatter_shape,
                           ShapeInference::InferScatterShape(
                               shapes.ptrs, to_apply(types()),
                               HloScatterInstruction::MakeScatterDimNumbers(
@@ -3410,9 +3510,9 @@ TEST_P(ScatterShapeInferenceTest, TfScatterNdWithFullUpdatesV2) {
 }
 
 TEST_P(ScatterShapeInferenceTest, TfScatterNdWithPartialUpdates) {
-  auto shapes = CreateShapes({64, 48}, s64_tensor({10, 9, 8, 7, 1}),
-                             {10, 9, 8, 7, 10}, types());
-  TF_ASSERT_OK_AND_ASSIGN(Shape scatter_shape,
+  const auto shapes = CreateShapes({64, 48}, s64_tensor({10, 9, 8, 7, 1}),
+                                   {10, 9, 8, 7, 10}, types());
+  TF_ASSERT_OK_AND_ASSIGN(const Shape scatter_shape,
                           ShapeInference::InferScatterShape(
                               shapes.ptrs, to_apply(types()),
                               HloScatterInstruction::MakeScatterDimNumbers(
@@ -3425,9 +3525,9 @@ TEST_P(ScatterShapeInferenceTest, TfScatterNdWithPartialUpdates) {
 }
 
 TEST_P(ScatterShapeInferenceTest, TfScatterNdWithPartialUpdatesV2) {
-  auto shapes = CreateShapes({64, 48}, s64_tensor({10, 9, 8, 7, 1}),
-                             {10, 9, 8, 7, 12}, types());
-  TF_ASSERT_OK_AND_ASSIGN(Shape scatter_shape,
+  const auto shapes = CreateShapes({64, 48}, s64_tensor({10, 9, 8, 7, 1}),
+                                   {10, 9, 8, 7, 12}, types());
+  TF_ASSERT_OK_AND_ASSIGN(const Shape scatter_shape,
                           ShapeInference::InferScatterShape(
                               shapes.ptrs, to_apply(types()),
                               HloScatterInstruction::MakeScatterDimNumbers(
@@ -3440,9 +3540,9 @@ TEST_P(ScatterShapeInferenceTest, TfScatterNdWithPartialUpdatesV2) {
 }
 
 TEST_P(ScatterShapeInferenceTest, TfScatterNdWithUpdatesBiggerThanInput) {
-  auto shapes = CreateShapes({64, 48}, s64_tensor({10, 9, 8, 7, 1}),
-                             {10, 9, 8, 7, 65}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes = CreateShapes({64, 48}, s64_tensor({10, 9, 8, 7, 1}),
+                                   {10, 9, 8, 7, 65}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{4},
@@ -3458,9 +3558,9 @@ TEST_P(ScatterShapeInferenceTest, TfScatterNdWithUpdatesBiggerThanInput) {
 }
 
 TEST_P(ScatterShapeInferenceTest, TfScatterNdWithUpdatesNotMatchingIndices) {
-  auto shapes = CreateShapes({64, 48}, s64_tensor({10, 9, 8, 7, 1}),
-                             {9, 9, 8, 7, 64}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes = CreateShapes({64, 48}, s64_tensor({10, 9, 8, 7, 1}),
+                                   {9, 9, 8, 7, 64}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{4},
@@ -3477,10 +3577,11 @@ TEST_P(ScatterShapeInferenceTest, TfScatterNdWithUpdatesNotMatchingIndices) {
 }
 
 TEST_P(ScatterShapeInferenceTest, TfBatchDynamicUpdateSlice) {
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
-                             {10, 9, 8, 7, 30, 29, 28, 27, 26}, types());
+  const auto shapes =
+      CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
+                   {10, 9, 8, 7, 30, 29, 28, 27, 26}, types());
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape scatter_shape,
+      const Shape scatter_shape,
       ShapeInference::InferScatterShape(
           shapes.ptrs, to_apply(types()),
           HloScatterInstruction::MakeScatterDimNumbers(
@@ -3494,10 +3595,11 @@ TEST_P(ScatterShapeInferenceTest, TfBatchDynamicUpdateSlice) {
 }
 
 TEST_P(ScatterShapeInferenceTest, NonDefaultScatterIndicesLeafDim) {
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 5, 7, 6}),
-                             {10, 9, 7, 6, 30, 29, 28, 27, 26}, types());
+  const auto shapes =
+      CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 5, 7, 6}),
+                   {10, 9, 7, 6, 30, 29, 28, 27, 26}, types());
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape scatter_shape,
+      const Shape scatter_shape,
       ShapeInference::InferScatterShape(
           shapes.ptrs, to_apply(types()),
           HloScatterInstruction::MakeScatterDimNumbers(
@@ -3512,10 +3614,11 @@ TEST_P(ScatterShapeInferenceTest, NonDefaultScatterIndicesLeafDim) {
 }
 
 TEST_P(ScatterShapeInferenceTest, NonDefaultScatterIndicesLeafDimV2) {
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, s64_tensor({5, 10, 9, 7, 6}),
-                             {10, 9, 7, 6, 30, 29, 28, 27, 26}, types());
+  const auto shapes =
+      CreateShapes({50, 49, 48, 47, 46}, s64_tensor({5, 10, 9, 7, 6}),
+                   {10, 9, 7, 6, 30, 29, 28, 27, 26}, types());
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape scatter_shape,
+      const Shape scatter_shape,
       ShapeInference::InferScatterShape(
           shapes.ptrs, to_apply(types()),
           HloScatterInstruction::MakeScatterDimNumbers(
@@ -3530,11 +3633,11 @@ TEST_P(ScatterShapeInferenceTest, NonDefaultScatterIndicesLeafDimV2) {
 }
 
 TEST_P(ScatterShapeInferenceTest, NoUpdateScatterDims) {
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, s64_vector(5),
-                             {30, 29, 28, 27, 26}, types());
+  const auto shapes = CreateShapes({50, 49, 48, 47, 46}, s64_vector(5),
+                                   {30, 29, 28, 27, 26}, types());
   // This is equivalent to a dynamic update slice.
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape scatter_shape,
+      const Shape scatter_shape,
       ShapeInference::InferScatterShape(
           shapes.ptrs, to_apply(types()),
           HloScatterInstruction::MakeScatterDimNumbers(
@@ -3549,11 +3652,11 @@ TEST_P(ScatterShapeInferenceTest, NoUpdateScatterDims) {
 }
 
 TEST_P(ScatterShapeInferenceTest, ScalarScatterIndices) {
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, scalar(S64),
-                             {30, 29, 28, 27}, types());
+  const auto shapes = CreateShapes({50, 49, 48, 47, 46}, scalar(S64),
+                                   {30, 29, 28, 27}, types());
   // The scalar indices "tensor" is a scalar S here that's used to update a
   // [30,29,28,27] shaped tensor within the operand at position S.
-  TF_ASSERT_OK_AND_ASSIGN(Shape scatter_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape scatter_shape,
                           ShapeInference::InferScatterShape(
                               shapes.ptrs, to_apply(types()),
                               HloScatterInstruction::MakeScatterDimNumbers(
@@ -3568,11 +3671,11 @@ TEST_P(ScatterShapeInferenceTest, ScalarScatterIndices) {
 }
 
 TEST_P(ScatterShapeInferenceTest, ScatterWithTupleShapedTensorInput) {
-  Shape tuple_shape =
+  const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S64, {10, 9, 8, 7, 1}),
                                  ShapeUtil::MakeShape(S64, {10, 9, 8, 7, 1})});
-  Shape s64_vector_32 = s64_vector(32);
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const Shape s64_vector_32 = s64_vector(32);
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       {&tuple_shape, &s64_vector_32, &s64_vector_32}, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{0},
@@ -3586,11 +3689,11 @@ TEST_P(ScatterShapeInferenceTest, ScatterWithTupleShapedTensorInput) {
 }
 
 TEST_P(ScatterShapeInferenceTest, ScatterWithTupleShapedScatterIndicesInput) {
-  Shape tuple_shape =
+  const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S64, {10, 9, 8, 7, 1}),
                                  ShapeUtil::MakeShape(S64, {10, 9, 8, 7, 1})});
-  Shape s64_vector_32 = s64_vector(32);
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const Shape s64_vector_32 = s64_vector(32);
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       {&s64_vector_32, &tuple_shape, &s64_vector_32}, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{0},
@@ -3604,11 +3707,11 @@ TEST_P(ScatterShapeInferenceTest, ScatterWithTupleShapedScatterIndicesInput) {
 }
 
 TEST_P(ScatterShapeInferenceTest, ScatterWithTupleShapedUpdatesInput) {
-  Shape tuple_shape =
+  const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S64, {10, 9, 8, 7, 1}),
                                  ShapeUtil::MakeShape(S64, {10, 9, 8, 7, 1})});
-  Shape s64_vector_32 = s64_vector(32);
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const Shape s64_vector_32 = s64_vector(32);
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       {&s64_vector_32, &s64_vector_32, &tuple_shape}, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{0},
@@ -3622,8 +3725,8 @@ TEST_P(ScatterShapeInferenceTest, ScatterWithTupleShapedUpdatesInput) {
 }
 
 TEST_P(ScatterShapeInferenceTest, FloatingPointScatterIndicesInput) {
-  Shape s64_vector_32 = s64_vector(32);
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const Shape s64_vector_32 = s64_vector(32);
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       {&s64_vector_32, &vector_32_, &s64_vector_32}, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{0},
@@ -3637,9 +3740,10 @@ TEST_P(ScatterShapeInferenceTest, FloatingPointScatterIndicesInput) {
 }
 
 TEST_P(ScatterShapeInferenceTest, OutOfBoundsScatterIndicesLeafDim) {
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
-                             {10, 9, 8, 7, 30, 29, 28}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes =
+      CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
+                   {10, 9, 8, 7, 30, 29, 28}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{4, 5, 6},
@@ -3654,9 +3758,10 @@ TEST_P(ScatterShapeInferenceTest, OutOfBoundsScatterIndicesLeafDim) {
 }
 
 TEST_P(ScatterShapeInferenceTest, InvalidUpdates) {
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
-                             {10, 9, 8, 7, 30, 29, 28, 50}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes =
+      CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
+                   {10, 9, 8, 7, 30, 29, 28, 50}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{4, 5, 6},
@@ -3672,9 +3777,10 @@ TEST_P(ScatterShapeInferenceTest, InvalidUpdates) {
 TEST_P(ScatterShapeInferenceTest, InvalidUpdateComputation) {
   const ProgramShape invalid_update_computation =
       ShapeUtil::MakeProgramShape({f32_}, f32_);
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
-                             {10, 9, 8, 7, 30, 29, 28}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes =
+      CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
+                   {10, 9, 8, 7, 30, 29, 28}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, invalid_update_computation,
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{4, 5, 6},
@@ -3691,9 +3797,10 @@ TEST_P(ScatterShapeInferenceTest, InvalidUpdateComputation) {
 
 TEST_P(ScatterShapeInferenceTest,
        InvalidScatterDimNumbers_NonAscendingUpdateWindowDims) {
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
-                             {10, 9, 8, 7, 30, 29, 28, 27, 26}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes =
+      CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
+                   {10, 9, 8, 7, 30, 29, 28, 27, 26}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{4, 5, 6, 8, 7},
@@ -3708,9 +3815,10 @@ TEST_P(ScatterShapeInferenceTest,
 
 TEST_P(ScatterShapeInferenceTest,
        InvalidScatterDimNumbers_RepeatedUpdateWindowDims) {
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
-                             {10, 9, 8, 7, 30, 29, 28, 27, 26}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes =
+      CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
+                   {10, 9, 8, 7, 30, 29, 28, 27, 26}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{4, 5, 6, 7, 7},
@@ -3725,9 +3833,10 @@ TEST_P(ScatterShapeInferenceTest,
 
 TEST_P(ScatterShapeInferenceTest,
        InvalidScatterDimNumbers_OutOfBoundsUpdateWindowDims) {
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
-                             {10, 9, 8, 7, 30, 29, 28, 27, 26}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes =
+      CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
+                   {10, 9, 8, 7, 30, 29, 28, 27, 26}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{4, 5, 6, 7, 9},
@@ -3743,9 +3852,10 @@ TEST_P(ScatterShapeInferenceTest,
 
 TEST_P(ScatterShapeInferenceTest,
        InvalidScatterDimNumbers_NonAscendingInsertedWindowDims) {
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
-                             {10, 9, 8, 7, 30, 29, 28}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes =
+      CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
+                   {10, 9, 8, 7, 30, 29, 28}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{4, 5, 6},
@@ -3760,9 +3870,10 @@ TEST_P(ScatterShapeInferenceTest,
 
 TEST_P(ScatterShapeInferenceTest,
        InvalidScatterDimNumbers_RepeatedInsertedWindowDims) {
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
-                             {10, 9, 8, 7, 30, 29, 28}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes =
+      CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
+                   {10, 9, 8, 7, 30, 29, 28}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{4, 5, 6},
@@ -3777,9 +3888,10 @@ TEST_P(ScatterShapeInferenceTest,
 
 TEST_P(ScatterShapeInferenceTest,
        InvalidScatterDimNumbers_OutOfBoundsInsertedWindowDims) {
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
-                             {10, 9, 8, 7, 30, 29, 28}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes =
+      CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
+                   {10, 9, 8, 7, 30, 29, 28}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{4, 5, 6},
@@ -3795,9 +3907,10 @@ TEST_P(ScatterShapeInferenceTest,
 
 TEST_P(ScatterShapeInferenceTest,
        InvalidScatterDimNumbers_MismatchingScatterDimsToOperandDims) {
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
-                             {10, 9, 8, 7, 30, 29, 28}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes =
+      CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
+                   {10, 9, 8, 7, 30, 29, 28}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{4, 5, 6},
@@ -3815,9 +3928,10 @@ TEST_P(ScatterShapeInferenceTest,
 
 TEST_P(ScatterShapeInferenceTest,
        InvalidScatterDimNumbers_OutOfBoundsScatterDimsToOperandDims) {
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
-                             {10, 9, 8, 7, 30, 29, 28}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes =
+      CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
+                   {10, 9, 8, 7, 30, 29, 28}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{4, 5, 6},
@@ -3833,9 +3947,10 @@ TEST_P(ScatterShapeInferenceTest,
 
 TEST_P(ScatterShapeInferenceTest,
        InvalidScatterDimNumbers_RepeatedValuesInScatterDimsToOperandDims) {
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
-                             {10, 9, 8, 7, 30, 29, 28}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes =
+      CreateShapes({50, 49, 48, 47, 46}, s64_tensor({10, 9, 8, 7, 5}),
+                   {10, 9, 8, 7, 30, 29, 28}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{4, 5, 6},
@@ -3852,9 +3967,9 @@ TEST_P(ScatterShapeInferenceTest,
 
 TEST_P(ScatterShapeInferenceTest,
        InvalidScatterDimNumbers_InsufficientWindowDims) {
-  auto shapes = CreateShapes({50, 49, 48, 47, 46}, scalar(S64),
-                             {30, 29, 28, 27}, types());
-  absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+  const auto shapes = CreateShapes({50, 49, 48, 47, 46}, scalar(S64),
+                                   {30, 29, 28, 27}, types());
+  const absl::StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
       shapes.ptrs, to_apply(types()),
       HloScatterInstruction::MakeScatterDimNumbers(
           /*update_window_dims=*/{0, 1, 2, 3},
@@ -3883,8 +3998,9 @@ INSTANTIATE_TEST_SUITE_P(All, ScatterShapeInferenceTest,
                          ScatterTestName());
 
 TEST_P(UnboundedUnaryOpShapeInferenceTest, UnboundedUnaryOps) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape(GetParam().operand));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape(GetParam().expected));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape(GetParam().operand));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
+                          ParseShape(GetParam().expected));
   TF_ASSERT_OK_AND_ASSIGN(
       const Shape inferred,
       ShapeInference::InferUnaryOpShape(GetParam().opcode, operand));
@@ -3894,249 +4010,286 @@ TEST_P(UnboundedUnaryOpShapeInferenceTest, UnboundedUnaryOps) {
 }
 
 TEST_P(UnboundedBinaryOpShapeInferenceTest, UnboundedAdd) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape(GetParam().lhs));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape(GetParam().rhs));
-  absl::StatusOr<Shape> inferred_status = ShapeInference::InferBinaryOpShape(
-      HloOpcode::kAdd, lhs, rhs, GetParam().broadcast_dimensions);
-  if (inferred_status.ok()) {
-    TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape(GetParam().expected));
-    EXPECT_TRUE(ShapeUtil::Equal(*inferred_status, expected))
-        << "inferred: " << ShapeUtil::HumanString(*inferred_status)
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape(GetParam().lhs));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape(GetParam().rhs));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, lhs, rhs,
+                                         GetParam().broadcast_dimensions);
+  if (inferred_shape.ok()) {
+    TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
+                            ParseShape(GetParam().expected));
+    EXPECT_TRUE(ShapeUtil::Equal(*inferred_shape, expected))
+        << "inferred: " << ShapeUtil::HumanString(*inferred_shape)
         << " expected: " << ShapeUtil::HumanString(expected);
   } else {
     ASSERT_TRUE(GetParam().error_message.has_value());
-    EXPECT_THAT(inferred_status.status().message(),
+    EXPECT_THAT(inferred_shape.status().message(),
                 HasSubstr(*GetParam().error_message));
   }
 }
 
-TEST_P(UnboundedAndOpShapeInferenceTest, UnboundedAnd) {
+TEST_P(UnboundedLogicalOpShapeInferenceTest, UnboundedAnd) {
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape(GetParam().lhs));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape(GetParam().rhs));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kAnd, lhs, rhs,
+                                         GetParam().broadcast_dimensions);
+  if (inferred_shape.ok()) {
+    TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
+                            ParseShape(GetParam().expected));
+    EXPECT_TRUE(ShapeUtil::Equal(*inferred_shape, expected))
+        << "inferred: " << ShapeUtil::HumanString(*inferred_shape)
+        << " expected: " << ShapeUtil::HumanString(expected);
+  } else {
+    ASSERT_TRUE(GetParam().error_message.has_value());
+    EXPECT_THAT(inferred_shape.status().message(),
+                HasSubstr(*GetParam().error_message));
+  }
+}
+
+TEST_P(UnboundedBinaryOpShapeInferenceTest, UnboundedAtan2) {
   TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape(GetParam().lhs));
   TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape(GetParam().rhs));
-  absl::StatusOr<Shape> inferred_status = ShapeInference::InferBinaryOpShape(
-      HloOpcode::kAnd, lhs, rhs, GetParam().broadcast_dimensions);
-  if (inferred_status.ok()) {
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kAtan2, lhs, rhs,
+                                         GetParam().broadcast_dimensions);
+  if (inferred_shape.ok()) {
     TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape(GetParam().expected));
-    EXPECT_TRUE(ShapeUtil::Equal(*inferred_status, expected))
-        << "inferred: " << ShapeUtil::HumanString(*inferred_status)
+    EXPECT_TRUE(ShapeUtil::Equal(*inferred_shape, expected))
+        << "inferred: " << ShapeUtil::HumanString(*inferred_shape)
         << " expected: " << ShapeUtil::HumanString(expected);
   } else {
     ASSERT_TRUE(GetParam().error_message.has_value());
-    EXPECT_THAT(inferred_status.status().message(),
+    EXPECT_THAT(inferred_shape.status().message(),
                 HasSubstr(*GetParam().error_message));
   }
 }
 
+TEST_F(ShapeInferenceTest, UnboundedBitcastConvert) {
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(
+      const Shape inferred_shape,
+      ShapeInference::InferBitcastConvertShape(operand, PrimitiveType::F16));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f16[?, 10, 2]"));
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, expected))
+      << "inferred: " << ShapeUtil::HumanString(inferred_shape)
+      << " expected: " << ShapeUtil::HumanString(expected);
+}
+
 TEST_F(ShapeInferenceTest, UnboundedBatchNormGrad) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[?, ?, 7]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape grad_operand, ParseShape("f32[?, ?, 7]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape scale, ParseShape("f32[5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape mean, ParseShape("f32[?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape variance, ParseShape("f32[?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape grad_scale, ParseShape("f32[?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape grad_offset, ParseShape("f32[?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape grad_output, ParseShape("f32[5, ?, 7]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape result_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[?, ?, 7]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape grad_operand, ParseShape("f32[?, ?, 7]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape scale, ParseShape("f32[5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape mean, ParseShape("f32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape variance, ParseShape("f32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape grad_scale, ParseShape("f32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape grad_offset, ParseShape("f32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape grad_output, ParseShape("f32[5, ?, 7]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape inferred_shape,
                           ShapeInference::InferBatchNormGradShape(
                               operand, scale, mean, variance, grad_output, 1));
-  Shape expected_tuple_shape =
+  const Shape expected_tuple_shape =
       ShapeUtil::MakeTupleShape({grad_operand, grad_scale, grad_offset});
-  EXPECT_TRUE(ShapeUtil::Equal(result_shape, expected_tuple_shape))
-      << "inferred: " << ShapeUtil::HumanString(result_shape)
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, expected_tuple_shape))
+      << "inferred: " << ShapeUtil::HumanString(inferred_shape)
       << " expected: " << ShapeUtil::HumanString(expected_tuple_shape);
 }
 
 TEST_F(ShapeInferenceTest, UnboundedBatchNormInference) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[?, ?, 7]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape scale, ParseShape("f32[5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape offset, ParseShape("f32[5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape mean, ParseShape("f32[5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape variance, ParseShape("f32[5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape result_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[?, ?, 7]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape scale, ParseShape("f32[5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape offset, ParseShape("f32[5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape mean, ParseShape("f32[5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape variance, ParseShape("f32[5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape inferred_shape,
                           ShapeInference::InferBatchNormInferenceShape(
                               operand, scale, offset, mean, variance, 1));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, ?, 7]"));
-  EXPECT_TRUE(ShapeUtil::Equal(result_shape, expected))
-      << "inferred: " << ShapeUtil::HumanString(result_shape)
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, ?, 7]"));
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, expected))
+      << "inferred: " << ShapeUtil::HumanString(inferred_shape)
       << " expected: " << ShapeUtil::HumanString(expected);
 }
 
 TEST_F(ShapeInferenceTest, UnboundedBatchNormTraining) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[?, ?, 7]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape output, ParseShape("f32[?, ?, 7]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape scale, ParseShape("f32[5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape offset, ParseShape("f32[5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape batch_mean, ParseShape("f32[?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape batch_var, ParseShape("f32[?]"));
-  Shape expected_tuple_shape =
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[?, ?, 7]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape output, ParseShape("f32[?, ?, 7]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape scale, ParseShape("f32[5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape offset, ParseShape("f32[5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape batch_mean, ParseShape("f32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape batch_var, ParseShape("f32[?]"));
+  const Shape expected_tuple_shape =
       ShapeUtil::MakeTupleShape({output, batch_mean, batch_var});
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape result_shape,
+      const Shape inferred_shape,
       ShapeInference::InferBatchNormTrainingShape(operand, scale, offset, 1));
-  EXPECT_TRUE(ShapeUtil::Equal(result_shape, expected_tuple_shape))
-      << "inferred: " << ShapeUtil::HumanString(result_shape)
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, expected_tuple_shape))
+      << "inferred: " << ShapeUtil::HumanString(inferred_shape)
       << " expected: " << ShapeUtil::HumanString(expected_tuple_shape);
 }
 
 TEST_F(ShapeInferenceTest, UnboundedBroadcastUnsupportedOperand) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[<=2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[1, <=2, ?]"));
-  absl::StatusOr<Shape> inferred_status =
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[<=2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[1, <=2, ?]"));
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferBroadcastShape(operand, /*broadcast_sizes=*/{1});
-  EXPECT_THAT(inferred_status.status().message(),
+  EXPECT_THAT(inferred_shape.status().message(),
               HasSubstr("is_unbounded_dynamic"));
 }
 
 TEST_F(ShapeInferenceTest, UnboundedBroadcastUnsupportedBroadcastSize) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[<=2, 4]"));
-  absl::StatusOr<Shape> inferred_status = ShapeInference::InferBroadcastShape(
-      operand, /*broadcast_sizes=*/{Shape::kUnboundedSize});
-  EXPECT_THAT(inferred_status.status().message(),
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[<=2, 4]"));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferBroadcastShape(
+          operand, /*broadcast_sizes=*/{Shape::kUnboundedSize});
+  EXPECT_THAT(inferred_shape.status().message(),
               HasSubstr("Non-broadcast dimensions must not be dynamic."));
 }
 
 TEST_F(ShapeInferenceTest, UnboundedBroadcastInDim) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[<=2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[<=2, 3, 4]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[<=2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[<=2, 3, 4]"));
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape inferred_status,
+      const Shape inferred_shape,
       ShapeInference::InferBroadcastShape(operand, expected,
                                           /*broadcast_dimensions=*/{0, 2}));
-  EXPECT_TRUE(ShapeUtil::Equal(inferred_status, expected))
-      << "inferred: " << ShapeUtil::HumanString(inferred_status)
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, expected))
+      << "inferred: " << ShapeUtil::HumanString(inferred_shape)
       << " expected: " << ShapeUtil::HumanString(expected);
 }
 
 TEST_F(ShapeInferenceTest, UnboundedBroadcastInDimToBounded) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[<=2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[<=2, 3, <=4]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[<=2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[<=2, 3, <=4]"));
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape inferred_status,
+      const Shape inferred_shape,
       ShapeInference::InferBroadcastShape(operand, expected,
                                           /*broadcast_dimensions=*/{0, 2}));
-  EXPECT_TRUE(ShapeUtil::Equal(inferred_status, expected))
-      << "inferred: " << ShapeUtil::HumanString(inferred_status)
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, expected))
+      << "inferred: " << ShapeUtil::HumanString(inferred_shape)
       << " expected: " << ShapeUtil::HumanString(expected);
 }
 
 TEST_F(ShapeInferenceTest, UnboundedBroadcastInDimUnsupportedOutput) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[<=2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[<=2, 3, ?]"));
-  absl::StatusOr<Shape> inferred_status =
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[<=2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[<=2, 3, ?]"));
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferBroadcastShape(operand, expected,
                                           /*broadcast_dimensions=*/{0, 2});
-  EXPECT_THAT(inferred_status.status().message(),
+  EXPECT_THAT(inferred_shape.status().message(),
               HasSubstr("is_unbounded_dynamic"));
 }
 
 TEST_F(ShapeInferenceTest, UnboundedBroadcastInDimUnsupported) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[<=2, 4]"));
-  absl::StatusOr<Shape> inferred_status = ShapeInference::InferBroadcastShape(
-      operand, /*broadcast_sizes=*/{2, Shape::kUnboundedSize, 4});
-  EXPECT_THAT(inferred_status.status().message(),
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[<=2, 4]"));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferBroadcastShape(
+          operand, /*broadcast_sizes=*/{2, Shape::kUnboundedSize, 4});
+  EXPECT_THAT(inferred_shape.status().message(),
               HasSubstr("Non-broadcast dimensions must not be dynamic."));
 }
 
 TEST_P(UnboundedClampOpShapeInferenceTest, UnboundedClamp) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape(GetParam()[0]));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape(GetParam()[1]));
-  TF_ASSERT_OK_AND_ASSIGN(Shape ehs, ParseShape(GetParam()[2]));
-  absl::StatusOr<Shape> inferred_status =
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape(GetParam()[0]));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape(GetParam()[1]));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape ehs, ParseShape(GetParam()[2]));
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, lhs, rhs, ehs);
-  if (inferred_status.ok()) {
-    TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape(GetParam()[3]));
-    EXPECT_TRUE(ShapeUtil::Equal(*inferred_status, expected))
-        << "inferred: " << ShapeUtil::HumanString(*inferred_status)
+  if (inferred_shape.ok()) {
+    TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape(GetParam()[3]));
+    EXPECT_TRUE(ShapeUtil::Equal(*inferred_shape, expected))
+        << "inferred: " << ShapeUtil::HumanString(*inferred_shape)
         << " expected: " << ShapeUtil::HumanString(expected);
   } else {
-    EXPECT_EQ(inferred_status.status().message(), GetParam()[4]);
+    EXPECT_EQ(inferred_shape.status().message(), GetParam()[4]);
   }
 }
 
 TEST_F(ShapeInferenceTest, UnboundedClampWithTuple) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("(f32[2], f32[?])"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("(f32[?], f32[2])"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape ehs, ParseShape("(f32[2], f32[?])"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("(f32[?], f32[2])"));
-  absl::StatusOr<Shape> inferred_status =
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("(f32[2], f32[?])"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("(f32[?], f32[2])"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape ehs, ParseShape("(f32[2], f32[?])"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("(f32[?], f32[2])"));
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, lhs, rhs, ehs);
   EXPECT_THAT(
-      inferred_status.status().message(),
+      inferred_shape.status().message(),
       HasSubstr(
           "Expected array argument for clamp min, but got (f32[2], f32[?])."));
 }
 
 TEST_P(UnboundedCompareOpShapeInferenceTest, UnboundedCompare) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape(GetParam().lhs));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape(GetParam().rhs));
-  absl::StatusOr<Shape> inferred_status = ShapeInference::InferBinaryOpShape(
-      HloOpcode::kCompare, lhs, rhs, GetParam().broadcast_dimensions);
-  if (inferred_status.ok()) {
-    TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape(GetParam().expected));
-    EXPECT_TRUE(ShapeUtil::Equal(*inferred_status, expected))
-        << "inferred: " << ShapeUtil::HumanString(*inferred_status)
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape(GetParam().lhs));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape(GetParam().rhs));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kCompare, lhs, rhs,
+                                         GetParam().broadcast_dimensions);
+  if (inferred_shape.ok()) {
+    TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
+                            ParseShape(GetParam().expected));
+    EXPECT_TRUE(ShapeUtil::Equal(*inferred_shape, expected))
+        << "inferred: " << ShapeUtil::HumanString(*inferred_shape)
         << " expected: " << ShapeUtil::HumanString(expected);
   } else {
     ASSERT_TRUE(GetParam().error_message.has_value());
-    EXPECT_THAT(inferred_status.status().message(),
+    EXPECT_THAT(inferred_shape.status().message(),
                 HasSubstr(*GetParam().error_message));
   }
 }
 
 TEST_P(UnboundedConcatenateOpShapeInferenceTest, UnboundedConcatenate) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand1, ParseShape(GetParam()[0]));
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand2, ParseShape(GetParam()[1]));
-  absl::StatusOr<Shape> inferred_status =
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand1, ParseShape(GetParam()[0]));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand2, ParseShape(GetParam()[1]));
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferConcatOpShape({&operand1, &operand2},
                                          /*dimension=*/0);
-  if (inferred_status.ok()) {
-    TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape(GetParam()[2]));
-    EXPECT_TRUE(ShapeUtil::Equal(*inferred_status, expected))
-        << "inferred: " << ShapeUtil::HumanString(*inferred_status)
+  if (inferred_shape.ok()) {
+    TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape(GetParam()[2]));
+    EXPECT_TRUE(ShapeUtil::Equal(*inferred_shape, expected))
+        << "inferred: " << ShapeUtil::HumanString(*inferred_shape)
         << " expected: " << ShapeUtil::HumanString(expected);
   } else {
-    EXPECT_EQ(inferred_status.status().message(), GetParam()[3]);
+    EXPECT_EQ(inferred_shape.status().message(), GetParam()[3]);
   }
 }
 
 TEST_F(UnboundedConcatenateOpShapeInferenceTest,
        UnboundedConcatenateMismatchedDimensions) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand1, ParseShape("f32[2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand2, ParseShape("f32[2, 3]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand3, ParseShape("f32[2, 4]"));
-  absl::StatusOr<Shape> inferred_status =
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand1, ParseShape("f32[2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand2, ParseShape("f32[2, 3]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand3, ParseShape("f32[2, 4]"));
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferConcatOpShape({&operand1, &operand2, &operand3},
                                          /*dimension=*/0);
-  EXPECT_THAT(inferred_status.status().message(),
+  EXPECT_THAT(inferred_shape.status().message(),
               HasSubstr("Mismatched dimension sizes 3 and 4 in dimension 1"));
 }
 
 TEST_F(UnboundedConcatenateOpShapeInferenceTest,
        UnboundedConcatenateMismatchedBoundSizes) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand1, ParseShape("f32[2, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand2, ParseShape("f32[2, <=3]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand3, ParseShape("f32[2, <=4]"));
-  absl::StatusOr<Shape> inferred_status =
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand1, ParseShape("f32[2, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand2, ParseShape("f32[2, <=3]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand3, ParseShape("f32[2, <=4]"));
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferConcatOpShape({&operand1, &operand2, &operand3},
                                          /*dimension=*/0);
-  EXPECT_THAT(inferred_status.status().message(),
+  EXPECT_THAT(inferred_shape.status().message(),
               HasSubstr("Mismatched bound sizes 3 and 4 in dimension 1"));
 }
 
 TEST_F(ShapeInferenceTest, UnboundedConvert) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f64[?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape result, ShapeInference::InferConvertShape(
-                                            operand, PrimitiveType::F64));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f64[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape result, ShapeInference::InferConvertShape(
+                                                  operand, PrimitiveType::F64));
   EXPECT_TRUE(ShapeUtil::Equal(result, expected))
       << "inferred: " << ShapeUtil::HumanString(result)
       << " expected: " << ShapeUtil::HumanString(expected);
 }
 
 TEST_F(ShapeInferenceTest, UnboundedConvolution) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[?, 2, ?, 128]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[2, 2, <=128, 8]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 1, ?, 8]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("f32[?, 2, ?, 128]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[2, 2, <=128, 8]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 1, ?, 8]"));
 
   ConvolutionDimensionNumbers dnums;
   dnums.set_input_batch_dimension(0);
@@ -4153,62 +4306,64 @@ TEST_F(ShapeInferenceTest, UnboundedConvolution) {
   dnums.set_kernel_output_feature_dimension(3);
 
   TF_ASSERT_OK_AND_ASSIGN(
-      Window window,
+      const Window window,
       ShapeInference::InferWindowFromDimensions(
           /*window_dimensions=*/{2, 2}, /*window_strides=*/{1, 1},
           MakePadding(/*input_dimensions=*/{2, Shape::kUnboundedSize},
                       /*window_dimensions=*/{2, 2},
                       /*window_strides=*/{1, 1}, Padding::kValid),
           /*lhs_dilation=*/{}, /*rhs_dilation=*/{}));
-  TF_ASSERT_OK_AND_ASSIGN(Shape result_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape inferred_shape,
                           ShapeInference::InferConvolveShape(
                               lhs, rhs, /*feature_group_count=*/1,
                               /*batch_group_count=*/1, window, dnums,
                               /*preferred_element_type=*/std::nullopt));
-  EXPECT_TRUE(ShapeUtil::Equal(result_shape, expected))
-      << "inferred: " << ShapeUtil::HumanString(result_shape)
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, expected))
+      << "inferred: " << ShapeUtil::HumanString(inferred_shape)
       << " expected: " << ShapeUtil::HumanString(expected);
 }
 
 TEST_P(UnboundedBinaryOpShapeInferenceTest, UnboundedDiv) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape(GetParam().lhs));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape(GetParam().rhs));
-  absl::StatusOr<Shape> inferred_status = ShapeInference::InferBinaryOpShape(
-      HloOpcode::kDivide, lhs, rhs, GetParam().broadcast_dimensions);
-  if (inferred_status.ok()) {
-    TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape(GetParam().expected));
-    EXPECT_TRUE(ShapeUtil::Equal(*inferred_status, expected))
-        << "inferred: " << ShapeUtil::HumanString(*inferred_status)
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape(GetParam().lhs));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape(GetParam().rhs));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kDivide, lhs, rhs,
+                                         GetParam().broadcast_dimensions);
+  if (inferred_shape.ok()) {
+    TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
+                            ParseShape(GetParam().expected));
+    EXPECT_TRUE(ShapeUtil::Equal(*inferred_shape, expected))
+        << "inferred: " << ShapeUtil::HumanString(*inferred_shape)
         << " expected: " << ShapeUtil::HumanString(expected);
   } else {
     ASSERT_TRUE(GetParam().error_message.has_value());
-    EXPECT_THAT(inferred_status.status().message(),
+    EXPECT_THAT(inferred_shape.status().message(),
                 HasSubstr(*GetParam().error_message));
   }
 }
 
 TEST_F(ShapeInferenceTest, UnboundedDot) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 10]"));
 
   DotDimensionNumbers dnums;
   dnums.add_lhs_contracting_dimensions(1);
   dnums.add_rhs_contracting_dimensions(0);
 
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape result_shape,
+      const Shape inferred_shape,
       ShapeInference::InferDotOpShape(lhs, rhs, dnums,
                                       /*preferred_element_type=*/std::nullopt));
-  EXPECT_TRUE(ShapeUtil::Equal(result_shape, expected))
-      << "inferred: " << ShapeUtil::HumanString(result_shape)
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, expected))
+      << "inferred: " << ShapeUtil::HumanString(inferred_shape)
       << " expected: " << ShapeUtil::HumanString(expected);
 }
 
 TEST_F(ShapeInferenceTest, UnboundedDotGeneral) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("f32[?, <=3, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("f32[2, 4, 5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, <=3, 5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("f32[?, <=3, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("f32[2, 4, 5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, <=3, 5]"));
 
   DotDimensionNumbers dnums;
   dnums.add_lhs_batch_dimensions(0);
@@ -4217,18 +4372,19 @@ TEST_F(ShapeInferenceTest, UnboundedDotGeneral) {
   dnums.add_rhs_contracting_dimensions(1);
 
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape result_shape,
+      const Shape inferred_shape,
       ShapeInference::InferDotOpShape(lhs, rhs, dnums,
                                       /*preferred_element_type=*/std::nullopt));
-  EXPECT_TRUE(ShapeUtil::Equal(result_shape, expected))
-      << "inferred: " << ShapeUtil::HumanString(result_shape)
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, expected))
+      << "inferred: " << ShapeUtil::HumanString(inferred_shape)
       << " expected: " << ShapeUtil::HumanString(expected);
 }
 
 TEST_F(ShapeInferenceTest, UnboundedGather) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[3, 4, 2]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape start_indices, ParseShape("s32[?, ?, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, ?, 2, 2]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[3, 4, 2]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape start_indices,
+                          ParseShape("s32[?, ?, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, ?, 2, 2]"));
 
   GatherDimensionNumbers dimension_numbers;
   dimension_numbers.add_offset_dims(2);
@@ -4238,123 +4394,148 @@ TEST_F(ShapeInferenceTest, UnboundedGather) {
   dimension_numbers.add_start_index_map(0);
   dimension_numbers.set_index_vector_dim(2);
 
-  TF_ASSERT_OK_AND_ASSIGN(Shape result_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape inferred_shape,
                           ShapeInference::InferGatherShape(
                               operand, start_indices, dimension_numbers,
                               /*slice_sizes=*/{1, 2, 2}));
-  EXPECT_TRUE(ShapeUtil::Equal(result_shape, expected))
-      << "inferred: " << ShapeUtil::HumanString(result_shape)
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, expected))
+      << "inferred: " << ShapeUtil::HumanString(inferred_shape)
       << " expected: " << ShapeUtil::HumanString(expected);
 }
 
 TEST_P(UnboundedBinaryOpShapeInferenceTest, UnboundedMax) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape(GetParam().lhs));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape(GetParam().rhs));
-  absl::StatusOr<Shape> inferred_status = ShapeInference::InferBinaryOpShape(
-      HloOpcode::kMaximum, lhs, rhs, GetParam().broadcast_dimensions);
-  if (inferred_status.ok()) {
-    TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape(GetParam().expected));
-    EXPECT_TRUE(ShapeUtil::Equal(*inferred_status, expected))
-        << "inferred: " << ShapeUtil::HumanString(*inferred_status)
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape(GetParam().lhs));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape(GetParam().rhs));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kMaximum, lhs, rhs,
+                                         GetParam().broadcast_dimensions);
+  if (inferred_shape.ok()) {
+    TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
+                            ParseShape(GetParam().expected));
+    EXPECT_TRUE(ShapeUtil::Equal(*inferred_shape, expected))
+        << "inferred: " << ShapeUtil::HumanString(*inferred_shape)
         << " expected: " << ShapeUtil::HumanString(expected);
   } else {
     ASSERT_TRUE(GetParam().error_message.has_value());
-    EXPECT_THAT(inferred_status.status().message(),
+    EXPECT_THAT(inferred_shape.status().message(),
                 HasSubstr(*GetParam().error_message));
   }
 }
 
 TEST_P(UnboundedBinaryOpShapeInferenceTest, UnboundedMul) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape(GetParam().lhs));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape(GetParam().rhs));
-  absl::StatusOr<Shape> inferred_status = ShapeInference::InferBinaryOpShape(
-      HloOpcode::kMultiply, lhs, rhs, GetParam().broadcast_dimensions);
-  if (inferred_status.ok()) {
-    TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape(GetParam().expected));
-    EXPECT_TRUE(ShapeUtil::Equal(*inferred_status, expected))
-        << "inferred: " << ShapeUtil::HumanString(*inferred_status)
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape(GetParam().lhs));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape(GetParam().rhs));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kMultiply, lhs, rhs,
+                                         GetParam().broadcast_dimensions);
+  if (inferred_shape.ok()) {
+    TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
+                            ParseShape(GetParam().expected));
+    EXPECT_TRUE(ShapeUtil::Equal(*inferred_shape, expected))
+        << "inferred: " << ShapeUtil::HumanString(*inferred_shape)
         << " expected: " << ShapeUtil::HumanString(expected);
   } else {
     ASSERT_TRUE(GetParam().error_message.has_value());
-    EXPECT_THAT(inferred_status.status().message(),
+    EXPECT_THAT(inferred_shape.status().message(),
+                HasSubstr(*GetParam().error_message));
+  }
+}
+
+TEST_P(UnboundedLogicalOpShapeInferenceTest, UnboundedOr) {
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape(GetParam().lhs));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape(GetParam().rhs));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kOr, lhs, rhs,
+                                         GetParam().broadcast_dimensions);
+  if (inferred_shape.ok()) {
+    TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
+                            ParseShape(GetParam().expected));
+    EXPECT_TRUE(ShapeUtil::Equal(*inferred_shape, expected))
+        << "inferred: " << ShapeUtil::HumanString(*inferred_shape)
+        << " expected: " << ShapeUtil::HumanString(expected);
+  } else {
+    ASSERT_TRUE(GetParam().error_message.has_value());
+    EXPECT_THAT(inferred_shape.status().message(),
                 HasSubstr(*GetParam().error_message));
   }
 }
 
 TEST_F(ShapeInferenceTest, UnboundedPad) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[?, 10]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape padding_value, ParseShape("f32[]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 21]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[?, 10]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape padding_value, ParseShape("f32[]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 21]"));
 
   PaddingConfig padding_config;
   for (int i = 0; i < 2; i++) {
-    auto dimension = padding_config.add_dimensions();
+    const auto dimension = padding_config.add_dimensions();
     dimension->set_edge_padding_low(1);
     dimension->set_edge_padding_high(1);
     dimension->set_interior_padding(1);
   }
 
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape result_shape,
+      const Shape inferred_shape,
       ShapeInference::InferPadShape(operand, padding_value, padding_config));
-  EXPECT_TRUE(ShapeUtil::Equal(result_shape, expected))
-      << "inferred: " << ShapeUtil::HumanString(result_shape)
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, expected))
+      << "inferred: " << ShapeUtil::HumanString(inferred_shape)
       << " expected: " << ShapeUtil::HumanString(expected);
 }
 
 TEST_P(UnboundedBinaryOpShapeInferenceTest, UnboundedPow) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape(GetParam().lhs));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape(GetParam().rhs));
-  absl::StatusOr<Shape> inferred_status = ShapeInference::InferBinaryOpShape(
-      HloOpcode::kPower, lhs, rhs, GetParam().broadcast_dimensions);
-  if (inferred_status.ok()) {
-    TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape(GetParam().expected));
-    EXPECT_TRUE(ShapeUtil::Equal(*inferred_status, expected))
-        << "inferred: " << ShapeUtil::HumanString(*inferred_status)
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape(GetParam().lhs));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape(GetParam().rhs));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kPower, lhs, rhs,
+                                         GetParam().broadcast_dimensions);
+  if (inferred_shape.ok()) {
+    TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
+                            ParseShape(GetParam().expected));
+    EXPECT_TRUE(ShapeUtil::Equal(*inferred_shape, expected))
+        << "inferred: " << ShapeUtil::HumanString(*inferred_shape)
         << " expected: " << ShapeUtil::HumanString(expected);
   } else {
     ASSERT_TRUE(GetParam().error_message.has_value());
-    EXPECT_THAT(inferred_status.status().message(),
+    EXPECT_THAT(inferred_shape.status().message(),
                 HasSubstr(*GetParam().error_message));
   }
 }
 
 TEST_F(ShapeInferenceTest, UnboundedReduce) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape input0, ParseShape("f32[7, 5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape input1, ParseShape("f32[?, 5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape input2, ParseShape("f32[7, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape input0, ParseShape("f32[7, 5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape input1, ParseShape("f32[?, 5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape input2, ParseShape("f32[7, ?]"));
 
   ProgramShape to_apply = ShapeUtil::MakeProgramShape(
       {f32_, f32_, f32_, f32_, f32_, f32_},
       ShapeUtil::MakeTupleShape({f32_, f32_, f32_}));
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape result_shape,
+      const Shape inferred_shape,
       ShapeInference::InferReduceShape(
           {&input0, &input1, &input2, &f32_, &f32_, &f32_}, {1}, to_apply));
-  Shape shape = ShapeUtil::MakeShape(F32, {7});
-  Shape expected = ShapeUtil::MakeTupleShape({shape, shape, shape});
-  EXPECT_TRUE(ShapeUtil::Equal(result_shape, expected))
-      << "inferred: " << ShapeUtil::HumanString(result_shape)
+  const Shape shape = ShapeUtil::MakeShape(F32, {7});
+  const Shape expected = ShapeUtil::MakeTupleShape({shape, shape, shape});
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, expected))
+      << "inferred: " << ShapeUtil::HumanString(inferred_shape)
       << " expected: " << ShapeUtil::HumanString(expected);
 }
 
 TEST_F(ShapeInferenceTest, UnboundedReduceInvalidReduceDimension) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape input0, ParseShape("f32[7, 5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape input1, ParseShape("f32[?, 5]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape input2, ParseShape("f32[5, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape input0, ParseShape("f32[7, 5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape input1, ParseShape("f32[?, 5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape input2, ParseShape("f32[5, ?]"));
 
   ProgramShape to_apply = ShapeUtil::MakeProgramShape(
       {f32_, f32_, f32_, f32_, f32_, f32_},
       ShapeUtil::MakeTupleShape({f32_, f32_, f32_}));
-  absl::StatusOr<Shape> inferred_status = ShapeInference::InferReduceShape(
+  const absl::StatusOr<Shape> inferred_shape = ShapeInference::InferReduceShape(
       {&input0, &input1, &input2, &f32_, &f32_, &f32_}, {1}, to_apply);
-  EXPECT_THAT(inferred_status.status().message(),
+  EXPECT_THAT(inferred_shape.status().message(),
               HasSubstr("All reduced tensors must have compatible dimension"));
 }
 
 TEST_F(ShapeInferenceTest, UnboundedReduceWindow) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape input, ParseShape("f32[?, 4, 8]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, 3, 5]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape input, ParseShape("f32[?, 4, 8]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, 3, 5]"));
 
   Window window;
   WindowDimension dim0, dim1, dim2;
@@ -4372,108 +4553,114 @@ TEST_F(ShapeInferenceTest, UnboundedReduceWindow) {
   *window.add_dimensions() = dim2;
 
   ProgramShape body = ShapeUtil::MakeProgramShape({f32_, f32_}, f32_);
-  TF_ASSERT_OK_AND_ASSIGN(Shape infered_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape inferred_shape,
                           ShapeInference::InferReduceWindowShape(
                               input, /*init_value=*/f32_, window, body));
-  EXPECT_TRUE(ShapeUtil::Equal(infered_shape, expected))
-      << "inferred: " << ShapeUtil::HumanString(infered_shape)
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, expected))
+      << "inferred: " << ShapeUtil::HumanString(inferred_shape)
       << " expected: " << ShapeUtil::HumanString(expected);
 }
 
 TEST_F(ShapeInferenceTest, UnboundedReshape) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[2,3]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape inferred, ShapeInference::InferReshapeShape(
-                                              operand, /*dimensions=*/{0},
-                                              /*new_sizes=*/{2, 3}, -1));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[2,3]"));
+  TF_ASSERT_OK_AND_ASSIGN(
+      const Shape inferred,
+      ShapeInference::InferReshapeShape(operand, /*dimensions=*/{0},
+                                        /*new_sizes=*/{2, 3}, -1));
   ASSERT_TRUE(ShapeUtil::Equal(inferred, expected))
       << "inferred: " << ShapeUtil::HumanString(inferred)
       << " expected: " << ShapeUtil::HumanString(expected);
 }
 
 TEST_F(ShapeInferenceTest, UnboundedReshapeUnsupportedOutputShape) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[6]"));
-  absl::StatusOr<Shape> inferred_status = ShapeInference::InferReshapeShape(
-      operand, /*dimensions=*/{0},
-      /*new_sizes=*/{Shape::kUnboundedSize, Shape::kUnboundedSize}, -1);
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[6]"));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferReshapeShape(
+          operand, /*dimensions=*/{0},
+          /*new_sizes=*/{Shape::kUnboundedSize, Shape::kUnboundedSize}, -1);
   EXPECT_THAT(
-      inferred_status.status().message(),
+      inferred_shape.status().message(),
       HasSubstr("Reshaping with unbounded result shape is not supported."));
 }
 
 TEST_F(ShapeInferenceTest, UnboundedReshapeUnsupportedMixOfDynamism) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[?, <=3]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[<=3]"));
-  auto inferred_status =
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[?, <=3]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[<=3]"));
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferReshapeShape(operand, /*dimensions=*/{0},
                                         /*new_sizes=*/{3}, -1);
-  ASSERT_THAT(inferred_status.status().message(),
+  ASSERT_THAT(inferred_shape.status().message(),
               HasSubstr("Reshape operand with bounded and unbounded dynamism "
                         "not supported."));
 }
 
 TEST_P(UnboundedSelectOpShapeInferenceTest, UnboundedSelect) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape(GetParam()[0]));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape(GetParam()[1]));
-  TF_ASSERT_OK_AND_ASSIGN(Shape ehs, ParseShape(GetParam()[2]));
-  absl::StatusOr<Shape> inferred_status =
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape(GetParam()[0]));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape(GetParam()[1]));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape ehs, ParseShape(GetParam()[2]));
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferTernaryOpShape(HloOpcode::kSelect, lhs, rhs, ehs);
-  if (inferred_status.ok()) {
-    TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape(GetParam()[3]));
-    EXPECT_TRUE(ShapeUtil::Equal(*inferred_status, expected))
-        << "inferred: " << ShapeUtil::HumanString(*inferred_status)
+  if (inferred_shape.ok()) {
+    TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape(GetParam()[3]));
+    EXPECT_TRUE(ShapeUtil::Equal(*inferred_shape, expected))
+        << "inferred: " << ShapeUtil::HumanString(*inferred_shape)
         << " expected: " << ShapeUtil::HumanString(expected);
   } else {
-    EXPECT_EQ(inferred_status.status().message(), GetParam()[4]);
+    EXPECT_EQ(inferred_shape.status().message(), GetParam()[4]);
   }
 }
 
 TEST_F(ShapeInferenceTest, UnboundedSelectWithTupleUnsupported) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape("(pred[2], pred[?])"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape("(f32[?], f32[2])"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape ehs, ParseShape("(f32[2], f32[?])"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("(f32[?], f32[2])"));
-  absl::StatusOr<Shape> inferred_status =
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape("(pred[2], pred[?])"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape("(f32[?], f32[2])"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape ehs, ParseShape("(f32[2], f32[?])"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("(f32[?], f32[2])"));
+  const absl::StatusOr<Shape> inferred_shape =
       ShapeInference::InferTernaryOpShape(HloOpcode::kSelect, lhs, rhs, ehs);
-  EXPECT_THAT(inferred_status.status().message(),
+  EXPECT_THAT(inferred_shape.status().message(),
               HasSubstr("Expected array argument for select pred, but got "
                         "(pred[2], pred[?])."));
 }
 
 TEST_F(ShapeInferenceTest, UnboundedSlice) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[1, <=3, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[1, <=2, 3]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape result_shape, ShapeInference::InferSliceShape(
-                                                  operand, /*starts=*/{0, 1, 2},
-                                                  /*limits=*/{1, 3, 5},
-                                                  /*strides=*/{1, 1, 1}));
-  EXPECT_TRUE(ShapeUtil::Equal(result_shape, expected))
-      << "inferred: " << ShapeUtil::HumanString(result_shape)
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[1, <=3, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[1, <=2, 3]"));
+  TF_ASSERT_OK_AND_ASSIGN(
+      const Shape inferred_shape,
+      ShapeInference::InferSliceShape(operand, /*starts=*/{0, 1, 2},
+                                      /*limits=*/{1, 3, 5},
+                                      /*strides=*/{1, 1, 1}));
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, expected))
+      << "inferred: " << ShapeUtil::HumanString(inferred_shape)
       << " expected: " << ShapeUtil::HumanString(expected);
 }
 
 TEST_P(UnboundedBinaryOpShapeInferenceTest, UnboundedSub) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape lhs, ParseShape(GetParam().lhs));
-  TF_ASSERT_OK_AND_ASSIGN(Shape rhs, ParseShape(GetParam().rhs));
-  absl::StatusOr<Shape> inferred_status = ShapeInference::InferBinaryOpShape(
-      HloOpcode::kSubtract, lhs, rhs, GetParam().broadcast_dimensions);
-  if (inferred_status.ok()) {
-    TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape(GetParam().expected));
-    EXPECT_TRUE(ShapeUtil::Equal(*inferred_status, expected))
-        << "inferred: " << ShapeUtil::HumanString(*inferred_status)
+  TF_ASSERT_OK_AND_ASSIGN(const Shape lhs, ParseShape(GetParam().lhs));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape rhs, ParseShape(GetParam().rhs));
+  const absl::StatusOr<Shape> inferred_shape =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kSubtract, lhs, rhs,
+                                         GetParam().broadcast_dimensions);
+  if (inferred_shape.ok()) {
+    TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
+                            ParseShape(GetParam().expected));
+    EXPECT_TRUE(ShapeUtil::Equal(*inferred_shape, expected))
+        << "inferred: " << ShapeUtil::HumanString(*inferred_shape)
         << " expected: " << ShapeUtil::HumanString(expected);
   } else {
     ASSERT_TRUE(GetParam().error_message.has_value());
-    EXPECT_THAT(inferred_status.status().message(),
+    EXPECT_THAT(inferred_shape.status().message(),
                 HasSubstr(*GetParam().error_message));
   }
 }
 
 TEST_F(ShapeInferenceTest, UnboundedScatter) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape input, ParseShape("f32[?, ?, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape scatter_indices, ParseShape("s32[?, ?, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape updates, ParseShape("f32[?, ?, ?, ?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?, ?, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape input, ParseShape("f32[?, ?, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape scatter_indices,
+                          ParseShape("s32[?, ?, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape updates, ParseShape("f32[?, ?, ?, ?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?, ?, ?]"));
 
   const ProgramShape to_apply = ShapeUtil::MakeProgramShape({f32_, f32_}, f32_);
 
@@ -4486,7 +4673,7 @@ TEST_F(ShapeInferenceTest, UnboundedScatter) {
   dimension_numbers.set_index_vector_dim(2);
 
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape result,
+      const Shape result,
       ShapeInference::InferScatterShape({&input, &scatter_indices, &updates},
                                         to_apply, dimension_numbers));
   EXPECT_TRUE(ShapeUtil::Equal(result, expected))
@@ -4495,30 +4682,31 @@ TEST_F(ShapeInferenceTest, UnboundedScatter) {
 }
 
 TEST_F(ShapeInferenceTest, UnboundedTranspose) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand,
                           ParseShape("f32[1, ?, 2, ?, <=2]{4,3,2,1,0}"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected,
                           ParseShape("f32[<=2, 1, ?, 2, ?]{0,2,3,4,1}"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape result_shape,
+  TF_ASSERT_OK_AND_ASSIGN(const Shape inferred_shape,
                           ShapeInference::InferTransposeShape(
                               operand, /*dimensions=*/{4, 0, 3, 2, 1}));
-  EXPECT_TRUE(ShapeUtil::Equal(result_shape, expected))
-      << "inferred: " << ShapeUtil::HumanString(result_shape)
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, expected))
+      << "inferred: " << ShapeUtil::HumanString(inferred_shape)
       << " expected: " << ShapeUtil::HumanString(expected);
 }
 
 TEST_F(ShapeInferenceTest, UnboundedTransposeRank1) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape operand, ParseShape("f32[?]"));
-  TF_ASSERT_OK_AND_ASSIGN(Shape expected, ParseShape("f32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape operand, ParseShape("f32[?]"));
+  TF_ASSERT_OK_AND_ASSIGN(const Shape expected, ParseShape("f32[?]"));
   TF_ASSERT_OK_AND_ASSIGN(
-      Shape result_shape,
+      const Shape inferred_shape,
       ShapeInference::InferTransposeShape(operand, /*dimensions=*/{0}));
-  EXPECT_TRUE(ShapeUtil::Equal(result_shape, expected))
-      << "inferred: " << ShapeUtil::HumanString(result_shape)
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_shape, expected))
+      << "inferred: " << ShapeUtil::HumanString(inferred_shape)
       << " expected: " << ShapeUtil::HumanString(expected);
 }
 
-INSTANTIATE_TEST_SUITE_P(UnboundedDynamism, UnboundedAndOpShapeInferenceTest,
+INSTANTIATE_TEST_SUITE_P(UnboundedDynamism,
+                         UnboundedLogicalOpShapeInferenceTest,
                          ::testing::ValuesIn<BinaryOpTestCase>(
                              {// LHS | RHS | bdims | Res
                               // 1   | ?   | []    | ?
diff --git a/third_party/xla/xla/service/sharding_propagation_test.cc b/third_party/xla/xla/service/sharding_propagation_test.cc
index a078f630b1df9c..f4a97ff8c1dd57 100644
--- a/third_party/xla/xla/service/sharding_propagation_test.cc
+++ b/third_party/xla/xla/service/sharding_propagation_test.cc
@@ -9306,6 +9306,7 @@ ENTRY %entry {
       HloConstantSplitter(/*split_expressions=*/true).Run(module.get()));
   EXPECT_TRUE(is_split);
   TF_ASSERT_OK_AND_ASSIGN(auto _, HloDCE().Run(module.get()));
+  (void)_;  // Suppress unused variable warning in OSS
   TF_ASSERT_OK_AND_ASSIGN(
       bool changed,
       ShardingPropagation(/*is_spmd=*/true, /*propagate_metadata=*/true)
diff --git a/third_party/xla/xla/service/space_to_batch_converter.cc b/third_party/xla/xla/service/space_to_batch_converter.cc
index 890d734f25e260..f5015217115b5e 100644
--- a/third_party/xla/xla/service/space_to_batch_converter.cc
+++ b/third_party/xla/xla/service/space_to_batch_converter.cc
@@ -895,6 +895,7 @@ absl::StatusOr<bool> ConvolutionVisitor::Run() {
     }
     if (convs_to_visit_.count(conv) > 0) {
       TF_CHECK_OK(PerformSpaceToBatchOnConvolution(conv));
+      changed_ = true;
     }
   }
   conv_visitor_list_.clear();
@@ -3709,8 +3710,6 @@ Status ConvolutionVisitor::PerformSpaceToBatchOnConvolution(
   }
   VLOG(1) << "Handling conv " << convolution->ToString();
 
-  changed_ = false;
-
   ConvolutionDimensionNumbers dim_numbers =
       convolution->convolution_dimension_numbers();
 
@@ -3915,7 +3914,6 @@ Status ConvolutionVisitor::PerformSpaceToBatchOnConvolution(
   }
   TF_CHECK_OK(PropagateOnUsers(original_conv));
 
-  changed_ = true;
 
   return OkStatus();
 }
diff --git a/third_party/xla/xla/service/spmd/BUILD b/third_party/xla/xla/service/spmd/BUILD
index 2e173e2cda1133..93c0c84c6f0944 100644
--- a/third_party/xla/xla/service/spmd/BUILD
+++ b/third_party/xla/xla/service/spmd/BUILD
@@ -1,7 +1,7 @@
 # Description: SPMD partitioning pass.
 
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner_util.h b/third_party/xla/xla/service/spmd/spmd_partitioner_util.h
index 2bea346dbf9f0c..ae59b9f8737220 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner_util.h
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner_util.h
@@ -901,6 +901,7 @@ absl::StatusOr<std::pair<int64_t, int64_t>> EvaluatePartitionCost(
   HloDCE hlo_dce;
   TF_ASSIGN_OR_RETURN(
       auto _, hlo_dce.Run(&fake_module, partitioner->execution_threads()));
+  (void)_;  // Suppress unused variable warning in OSS
   VLOG(5) << "Dry-run partitioning for op: " << original_hlo->ToString() << "\n"
           << fake_module.ToString();
 
diff --git a/third_party/xla/xla/service/stream_pool.cc b/third_party/xla/xla/service/stream_pool.cc
index 17f455869aee0f..54f5c773e76138 100644
--- a/third_party/xla/xla/service/stream_pool.cc
+++ b/third_party/xla/xla/service/stream_pool.cc
@@ -22,8 +22,7 @@ limitations under the License.
 
 namespace xla {
 
-StreamPool::Ptr StreamPool::BorrowStream(se::StreamExecutor* executor,
-                                         se::StreamPriority priority) {
+StreamPool::Ptr StreamPool::BorrowStream(se::StreamPriority priority) {
   std::unique_ptr<se::Stream> stream;
 
   {
@@ -51,7 +50,7 @@ StreamPool::Ptr StreamPool::BorrowStream(se::StreamExecutor* executor,
 
   if (!stream) {
     // Create a new stream.
-    stream = executor->CreateStream(priority).value();
+    stream = executor_->CreateStream(priority).value();
     VLOG(1) << absl::StrFormat("Created new stream (%p) with priority = %s",
                                stream.get(),
                                se::StreamPriorityToString(priority));
diff --git a/third_party/xla/xla/service/stream_pool.h b/third_party/xla/xla/service/stream_pool.h
index f29c97e105a9f2..1610071de6dcff 100644
--- a/third_party/xla/xla/service/stream_pool.h
+++ b/third_party/xla/xla/service/stream_pool.h
@@ -38,15 +38,14 @@ class StreamPool {
   // stream to the pool on destruction.
   using Ptr = std::unique_ptr<se::Stream, PtrDeleter>;
 
-  StreamPool() = default;
+  explicit StreamPool(se::StreamExecutor* executor) : executor_(executor) {}
 
   // Returns a pointer to a stream in the pool, creating a new stream
   // if none are available in the pool. The returned smart pointer
   // returns the stream to the pool on destruction.
   //
   // This method is thread-safe.
-  Ptr BorrowStream(se::StreamExecutor* executor,
-                   se::StreamPriority priority = se::StreamPriority::Default);
+  Ptr BorrowStream(se::StreamPriority priority = se::StreamPriority::Default);
 
  private:
   // Puts a pointer to a stream back into the pool, leaving it free
@@ -61,6 +60,7 @@ class StreamPool {
   std::unordered_map<se::StreamPriority,
                      std::vector<std::unique_ptr<se::Stream>>>
       streams_with_pri_ ABSL_GUARDED_BY(mu_);
+  se::StreamExecutor* executor_;
 };
 
 }  // namespace xla
diff --git a/third_party/xla/xla/service/stream_pool_test.cc b/third_party/xla/xla/service/stream_pool_test.cc
index 551a35cef1843c..fd0a05e5d2f237 100644
--- a/third_party/xla/xla/service/stream_pool_test.cc
+++ b/third_party/xla/xla/service/stream_pool_test.cc
@@ -34,20 +34,23 @@ class StreamPoolTest : public ::testing::Test {
   }
 };
 
-TEST_F(StreamPoolTest, EmptyPool) { StreamPool pool; }
+TEST_F(StreamPoolTest, EmptyPool) {
+  std::unique_ptr<se::StreamExecutor> executor = NewStreamExecutor();
+  StreamPool pool(executor.get());
+}
 
 TEST_F(StreamPoolTest, OneStreamPool) {
   std::unique_ptr<se::StreamExecutor> executor = NewStreamExecutor();
-  StreamPool pool;
+  StreamPool pool(executor.get());
 
   // Borrow and return a stream.
-  StreamPool::Ptr stream1 = pool.BorrowStream(executor.get());
+  StreamPool::Ptr stream1 = pool.BorrowStream();
   se::Stream* stream1_ptr = stream1.get();
   EXPECT_TRUE(stream1->ok());
   stream1 = nullptr;
 
   // Borrow and return another stream.
-  StreamPool::Ptr stream2 = pool.BorrowStream(executor.get());
+  StreamPool::Ptr stream2 = pool.BorrowStream();
   se::Stream* stream2_ptr = stream2.get();
   EXPECT_TRUE(stream2->ok());
   stream2 = nullptr;
@@ -59,13 +62,13 @@ TEST_F(StreamPoolTest, OneStreamPool) {
 
 TEST_F(StreamPoolTest, TwoStreamPool) {
   std::unique_ptr<se::StreamExecutor> executor = NewStreamExecutor();
-  StreamPool pool;
+  StreamPool pool(executor.get());
 
   // Borrow two streams.
-  StreamPool::Ptr stream1 = pool.BorrowStream(executor.get());
+  StreamPool::Ptr stream1 = pool.BorrowStream();
   se::Stream* stream1_ptr = stream1.get();
   EXPECT_TRUE(stream1->ok());
-  StreamPool::Ptr stream2 = pool.BorrowStream(executor.get());
+  StreamPool::Ptr stream2 = pool.BorrowStream();
   se::Stream* stream2_ptr = stream2.get();
   EXPECT_TRUE(stream2->ok());
 
@@ -75,7 +78,7 @@ TEST_F(StreamPoolTest, TwoStreamPool) {
 
   // Return stream1 and borrow stream3.
   stream1 = nullptr;
-  StreamPool::Ptr stream3 = pool.BorrowStream(executor.get());
+  StreamPool::Ptr stream3 = pool.BorrowStream();
   se::Stream* stream3_ptr = stream3.get();
   EXPECT_TRUE(stream3->ok());
 
@@ -85,7 +88,7 @@ TEST_F(StreamPoolTest, TwoStreamPool) {
 
   // Return stream2, and borrow stream4.
   stream2 = nullptr;
-  StreamPool::Ptr stream4 = pool.BorrowStream(executor.get());
+  StreamPool::Ptr stream4 = pool.BorrowStream();
   se::Stream* stream4_ptr = stream4.get();
   EXPECT_TRUE(stream4->ok());
 
diff --git a/third_party/xla/xla/service/transfer_manager.h b/third_party/xla/xla/service/transfer_manager.h
index 3157b37b627443..a229be39be5ebf 100644
--- a/third_party/xla/xla/service/transfer_manager.h
+++ b/third_party/xla/xla/service/transfer_manager.h
@@ -298,6 +298,16 @@ class TransferManager {
       se::Stream* stream, absl::Span<const se::DeviceMemoryBase> elements,
       const Shape& shape, se::DeviceMemoryBase* region) = 0;
 
+  // Returns whether subbyte types (types less than 1 byte, e.g. U4) should
+  // have multiple values packed into a single byte on the device. Subbyte
+  // bytes are never packed on the host. By default, returns false, so a byte
+  // can only hold one value, but subclasses can override this.
+  //
+  // If overridden to return true, subclasses should pack and unpack in their
+  // overridden implementations of TransferLiteralToDeviceAsync and
+  // TransferLiteralFromDevice respectively.
+  virtual bool PackSubbyteTypes() const { return false; }
+
  private:
   // The mutex that guards the platform-to-transfer manager map.
   static absl::Mutex platform_transfer_manager_mutex_;
diff --git a/third_party/xla/xla/service/while_loop_constant_sinking.cc b/third_party/xla/xla/service/while_loop_constant_sinking.cc
index 2a87c3a22cb3bc..bd2cc6a87c7729 100644
--- a/third_party/xla/xla/service/while_loop_constant_sinking.cc
+++ b/third_party/xla/xla/service/while_loop_constant_sinking.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/container/inlined_vector.h"
 #include "xla/service/while_util.h"
+#include "xla/shape_util.h"
 #include "xla/util.h"
 
 namespace xla {
@@ -94,6 +95,12 @@ StatusOr<bool> WhileLoopConstantSinking::TrySinkingConstantsIntoWhileLoop(
       continue;
     }
 
+    if (sink_only_scalar_constants_) {
+      if (!ShapeUtil::IsScalar(init_value.operand(index)->shape())) {
+        continue;
+      }
+    }
+
     // Sink into the while_body.
     // Should have at least one user that's not while_body_root.
     if (invariant_body_gte->user_count() > 1) {
diff --git a/third_party/xla/xla/service/while_loop_constant_sinking.h b/third_party/xla/xla/service/while_loop_constant_sinking.h
index 7cb8405997862f..b6484e280c574b 100644
--- a/third_party/xla/xla/service/while_loop_constant_sinking.h
+++ b/third_party/xla/xla/service/while_loop_constant_sinking.h
@@ -48,8 +48,10 @@ namespace xla {
 //
 class WhileLoopConstantSinking : public HloModulePass {
  public:
-  explicit WhileLoopConstantSinking(bool sink_broadcast_of_constants = false)
-      : sink_broadcast_of_constants_(sink_broadcast_of_constants) {}
+  explicit WhileLoopConstantSinking(bool sink_broadcast_of_constants = false,
+                                    bool sink_only_scalar_constants = false)
+      : sink_broadcast_of_constants_(sink_broadcast_of_constants),
+        sink_only_scalar_constants_(sink_only_scalar_constants) {}
 
   ~WhileLoopConstantSinking() override = default;
 
@@ -66,6 +68,7 @@ class WhileLoopConstantSinking : public HloModulePass {
   StatusOr<bool> TrySinkingConstantsIntoWhileLoop(HloInstruction* while_instr);
 
   const bool sink_broadcast_of_constants_;
+  const bool sink_only_scalar_constants_;
 };
 }  // namespace xla
 
diff --git a/third_party/xla/xla/service/while_loop_constant_sinking_test.cc b/third_party/xla/xla/service/while_loop_constant_sinking_test.cc
index 2299546e13b0e6..3597686e9b9cce 100644
--- a/third_party/xla/xla/service/while_loop_constant_sinking_test.cc
+++ b/third_party/xla/xla/service/while_loop_constant_sinking_test.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "xla/hlo/utils/hlo_matchers.h"
 #include "xla/test.h"
 #include "xla/tests/hlo_test_base.h"
-#include "tsl/lib/core/status_test_util.h"
 
 namespace xla {
 namespace {
@@ -56,8 +55,17 @@ ENTRY entry {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  TF_ASSERT_OK_AND_ASSIGN(bool changed,
-                          WhileLoopConstantSinking{}.Run(module.get()));
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed,
+      WhileLoopConstantSinking(/*sink_broadcast_of_constants=*/false,
+                               /*sink_only_scalar_constants=*/true)
+          .Run(module.get()));
+  ASSERT_FALSE(changed);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      changed, WhileLoopConstantSinking(/*sink_broadcast_of_constants=*/false,
+                                        /*sink_only_scalar_constants=*/false)
+                   .Run(module.get()));
   ASSERT_TRUE(changed);
 
   auto* while_body = module->GetComputationWithName("body");
diff --git a/third_party/xla/xla/service/while_loop_fusible_sinking_test.cc b/third_party/xla/xla/service/while_loop_fusible_sinking_test.cc
index c94d9ecfec26cb..fc457f290ff895 100644
--- a/third_party/xla/xla/service/while_loop_fusible_sinking_test.cc
+++ b/third_party/xla/xla/service/while_loop_fusible_sinking_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "xla/hlo/utils/hlo_matchers.h"
 #include "xla/test.h"
 #include "xla/tests/hlo_test_base.h"
-#include "tsl/lib/core/status_test_util.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla {
 namespace {
diff --git a/third_party/xla/xla/service/while_loop_invariant_code_motion.cc b/third_party/xla/xla/service/while_loop_invariant_code_motion.cc
index 8d132cd001d5a0..f55585f90b978d 100644
--- a/third_party/xla/xla/service/while_loop_invariant_code_motion.cc
+++ b/third_party/xla/xla/service/while_loop_invariant_code_motion.cc
@@ -19,13 +19,23 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/map_util.h"
 #include "xla/service/compile_time_cap.h"
 #include "xla/service/hlo_dce.h"
-#include "xla/service/tuple_util.h"
 #include "xla/service/while_loop_analysis.h"
 #include "xla/service/while_util.h"
+#include "xla/shape.h"
 #include "xla/shape_util.h"
+#include "xla/statusor.h"
 #include "xla/util.h"
+#include "tsl/platform/errors.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla {
 
diff --git a/third_party/xla/xla/service/while_loop_invariant_code_motion.h b/third_party/xla/xla/service/while_loop_invariant_code_motion.h
index e88ffc3ddf9ac8..6cb74435d8106c 100644
--- a/third_party/xla/xla/service/while_loop_invariant_code_motion.h
+++ b/third_party/xla/xla/service/while_loop_invariant_code_motion.h
@@ -16,9 +16,14 @@ limitations under the License.
 #ifndef XLA_SERVICE_WHILE_LOOP_INVARIANT_CODE_MOTION_H_
 #define XLA_SERVICE_WHILE_LOOP_INVARIANT_CODE_MOTION_H_
 
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/compile_time_cap.h"
 #include "xla/service/hlo_pass_interface.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
 #include "xla/statusor.h"
 
 namespace xla {
diff --git a/third_party/xla/xla/service/while_loop_invariant_code_motion_test.cc b/third_party/xla/xla/service/while_loop_invariant_code_motion_test.cc
index 86bc35b887ad11..5a9a35e31fbe9b 100644
--- a/third_party/xla/xla/service/while_loop_invariant_code_motion_test.cc
+++ b/third_party/xla/xla/service/while_loop_invariant_code_motion_test.cc
@@ -15,11 +15,22 @@ limitations under the License.
 
 #include "xla/service/while_loop_invariant_code_motion.h"
 
+#include "absl/log/log.h"
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/hlo/ir/hlo_sharding.h"
 #include "xla/hlo/utils/hlo_matchers.h"
+#include "xla/literal_util.h"
 #include "xla/service/hlo_parser.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
 #include "xla/test.h"
 #include "xla/tests/hlo_test_base.h"
+#include "xla/xla_data.pb.h"
 #include "tsl/lib/core/status_test_util.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla {
 namespace {
diff --git a/third_party/xla/xla/service/while_loop_simplifier.cc b/third_party/xla/xla/service/while_loop_simplifier.cc
index 4aa663341e67b8..a7e5bd524d6a84 100644
--- a/third_party/xla/xla/service/while_loop_simplifier.cc
+++ b/third_party/xla/xla/service/while_loop_simplifier.cc
@@ -22,8 +22,11 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_join.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "xla/comparison_util.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
@@ -38,8 +41,13 @@ limitations under the License.
 #include "xla/service/hlo_dce.h"
 #include "xla/service/pattern_matcher.h"
 #include "xla/service/while_loop_analysis.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
+#include "xla/status_macros.h"
 #include "xla/statusor.h"
 #include "xla/union_find.h"
+#include "xla/util.h"
+#include "xla/xla_data.pb.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/statusor.h"
 
diff --git a/third_party/xla/xla/service/while_loop_simplifier.h b/third_party/xla/xla/service/while_loop_simplifier.h
index 91173c92bbca35..47b698c0d63c45 100644
--- a/third_party/xla/xla/service/while_loop_simplifier.h
+++ b/third_party/xla/xla/service/while_loop_simplifier.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef XLA_SERVICE_WHILE_LOOP_SIMPLIFIER_H_
 #define XLA_SERVICE_WHILE_LOOP_SIMPLIFIER_H_
 
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/hlo_pass_interface.h"
 #include "xla/statusor.h"
diff --git a/third_party/xla/xla/service/while_loop_simplifier_test.cc b/third_party/xla/xla/service/while_loop_simplifier_test.cc
index b3dd15f1180955..2733e1cc69d980 100644
--- a/third_party/xla/xla/service/while_loop_simplifier_test.cc
+++ b/third_party/xla/xla/service/while_loop_simplifier_test.cc
@@ -18,9 +18,11 @@ limitations under the License.
 #include <string>
 
 #include <gtest/gtest.h>
+#include "absl/algorithm/container.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_replace.h"
 #include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/utils/hlo_matchers.h"
 #include "xla/literal_util.h"
 #include "xla/service/hlo_dce.h"
@@ -30,6 +32,7 @@ limitations under the License.
 #include "xla/shape_util.h"
 #include "xla/test.h"
 #include "xla/tests/hlo_test_base.h"
+#include "xla/xla_data.pb.h"
 #include "tsl/lib/core/status_test_util.h"
 
 namespace xla {
diff --git a/third_party/xla/xla/service/while_loop_trip_count_annotator.cc b/third_party/xla/xla/service/while_loop_trip_count_annotator.cc
index e08364de8570c4..80e430e5df39c6 100644
--- a/third_party/xla/xla/service/while_loop_trip_count_annotator.cc
+++ b/third_party/xla/xla/service/while_loop_trip_count_annotator.cc
@@ -14,8 +14,17 @@ limitations under the License.
 ==============================================================================*/
 
 #include "xla/service/while_loop_trip_count_annotator.h"
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/service/while_loop_analysis.h"
+#include "xla/statusor.h"
 #include "xla/xla_data.pb.h"
+#include "tsl/platform/errors.h"
 
 namespace xla {
 
diff --git a/third_party/xla/xla/service/while_loop_trip_count_annotator.h b/third_party/xla/xla/service/while_loop_trip_count_annotator.h
index 5daa8460e4d483..af39b7f0595296 100644
--- a/third_party/xla/xla/service/while_loop_trip_count_annotator.h
+++ b/third_party/xla/xla/service/while_loop_trip_count_annotator.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef XLA_SERVICE_WHILE_LOOP_TRIP_COUNT_ANNOTATOR_H_
 #define XLA_SERVICE_WHILE_LOOP_TRIP_COUNT_ANNOTATOR_H_
 
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/hlo_pass_interface.h"
 #include "xla/statusor.h"
diff --git a/third_party/xla/xla/service/while_loop_trip_count_annotator_test.cc b/third_party/xla/xla/service/while_loop_trip_count_annotator_test.cc
index 26012ddbaea4cc..1b12f3178f4b09 100644
--- a/third_party/xla/xla/service/while_loop_trip_count_annotator_test.cc
+++ b/third_party/xla/xla/service/while_loop_trip_count_annotator_test.cc
@@ -15,12 +15,10 @@ limitations under the License.
 
 #include "xla/service/while_loop_trip_count_annotator.h"
 
-#include "xla/service/pattern_matcher.h"
-#include "xla/service/while_loop_simplifier.h"
-#include "xla/status_macros.h"
 #include "xla/test.h"
 #include "xla/tests/hlo_test_base.h"
-#include "tsl/lib/core/status_test_util.h"
+#include "xla/xla_data.pb.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla {
 namespace {
diff --git a/third_party/xla/xla/service/while_loop_unroller.cc b/third_party/xla/xla/service/while_loop_unroller.cc
index 4f96cc48ef53e1..cf260e9919e256 100644
--- a/third_party/xla/xla/service/while_loop_unroller.cc
+++ b/third_party/xla/xla/service/while_loop_unroller.cc
@@ -24,7 +24,6 @@ limitations under the License.
 
 #include "absl/algorithm/algorithm.h"
 #include "absl/algorithm/container.h"
-#include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
@@ -68,43 +67,6 @@ const int kUnrollTripCountThreshold = 64;
 const int kUnrollInstructionCountThreshold = 800;
 const int kUnrollExpandFactorThreshold = 10000;
 
-// The following sequence of passes are necessary to prepare loops for
-// unrolling. Failure to run these passes will prevent unroller from unrolling
-// loops that would have been otherwise unrollable.
-//
-// Instead of placing these passes in compiler, they are placed
-// here to indicate explicit dependency to these passes.
-StatusOr<bool> PrepareModuleForUnrolling(
-    HloModule* module,
-    const absl::flat_hash_set<absl::string_view>& execution_threads) {
-  bool changed = false;
-  TF_ASSIGN_OR_RETURN(
-      bool applied_cse,
-      HloCSE{/*is_layout_sensitive=*/true}.Run(module, execution_threads));
-  if (applied_cse) {
-    changed = true;
-    VLOG(3) << "Applied hlo cse to module " << module->name();
-  }
-
-  TF_ASSIGN_OR_RETURN(bool applied_tuple_simplifier,
-                      TupleSimplifier{}.Run(module, execution_threads));
-  if (applied_tuple_simplifier) {
-    changed = true;
-    VLOG(3) << "Applied tuple simplifier to module " << module->name();
-  }
-
-  // We apply constant sinking to fix point.
-  HloPassFix<WhileLoopConstantSinking> constant_sinking(
-      /*sink_broadcast_of_constants=*/true);
-  TF_ASSIGN_OR_RETURN(bool applied_constant_sinking,
-                      constant_sinking.Run(module, execution_threads));
-  if (applied_constant_sinking) {
-    changed = true;
-    VLOG(3) << "Applied constant sinking to module " << module->name();
-  }
-  return changed;
-}
-
 // A utility function that decides whether a loop is unrollable or not.
 std::optional<WhileLoopConfig> IsLoopUnrollable(HloInstruction* while_op) {
   CHECK_EQ(while_op->opcode(), HloOpcode::kWhile);
@@ -452,7 +414,40 @@ StatusOr<bool> UnrollInternalWrapped(HloInstruction* while_op,
 
 };  // namespace
 
-absl::flat_hash_map<HloInstruction*, WhileLoopConfig> GetUnrollableLoops(
+absl::StatusOr<bool> PrepareModuleForUnrolling(
+    HloModule* module,
+    const absl::flat_hash_set<absl::string_view>& execution_threads) {
+  bool changed = false;
+  TF_ASSIGN_OR_RETURN(
+      bool applied_cse,
+      HloCSE(/*is_layout_sensitive=*/true, /*only_fusion_computations=*/false,
+             /*ignore_control_dependencies=*/false, /*only_scalars=*/true)
+          .Run(module, execution_threads));
+  if (applied_cse) {
+    changed = true;
+    VLOG(3) << "Applied hlo cse to module " << module->name();
+  }
+  TF_ASSIGN_OR_RETURN(bool applied_tuple_simplifier,
+                      TupleSimplifier{}.Run(module, execution_threads));
+  if (applied_tuple_simplifier) {
+    changed = true;
+    VLOG(3) << "Applied tuple simplifier to module " << module->name();
+  }
+
+  // We apply constant sinking to fix point.
+  HloPassFix<WhileLoopConstantSinking> constant_sinking(
+      /*sink_broadcast_of_constants=*/true,
+      /*sink_only_scalar_constants=*/true);
+  TF_ASSIGN_OR_RETURN(bool applied_constant_sinking,
+                      constant_sinking.Run(module, execution_threads));
+  if (applied_constant_sinking) {
+    changed = true;
+    VLOG(3) << "Applied constant sinking to module " << module->name();
+  }
+  return changed;
+}
+
+std::vector<std::pair<HloInstruction*, WhileLoopConfig>> GetUnrollableLoops(
     HloModule* module,
     const absl::flat_hash_set<absl::string_view>& execution_threads) {
   // Processing the while loops in the reverse topological order. If the body
@@ -463,11 +458,11 @@ absl::flat_hash_map<HloInstruction*, WhileLoopConfig> GetUnrollableLoops(
                     HloPredicateIsOp<HloOpcode::kWhile>);
   }
 
-  absl::flat_hash_map<HloInstruction*, WhileLoopConfig> while_loop_configs;
+  std::vector<std::pair<HloInstruction*, WhileLoopConfig>> while_loop_configs;
   for (HloInstruction* instr : all_while_ops) {
     std::optional<WhileLoopConfig> config = IsLoopUnrollable(instr);
     if (config.has_value()) {
-      while_loop_configs[instr] = *config;
+      while_loop_configs.emplace_back(instr, config.value());
     }
   }
   return while_loop_configs;
@@ -533,8 +528,8 @@ StatusOr<bool> WhileLoopUnroller::Run(
   // Gather a preliminary vector of all the while ops that we think we can
   // unroll. We do this ahead of time so we don't have to worry about mutating
   // the lists of computations or instructions while we iterate.
-  absl::flat_hash_map<HloInstruction*, WhileLoopConfig> unrollable_while_ops =
-      GetUnrollableLoops(module, execution_threads);
+  std::vector<std::pair<HloInstruction*, WhileLoopConfig>>
+      unrollable_while_ops = GetUnrollableLoops(module, execution_threads);
 
   VLOG(3) << "Number of while instructions in the module to unroll: "
           << unrollable_while_ops.size();
diff --git a/third_party/xla/xla/service/while_loop_unroller.h b/third_party/xla/xla/service/while_loop_unroller.h
index bd9bebaa3d16e0..024ea408141762 100644
--- a/third_party/xla/xla/service/while_loop_unroller.h
+++ b/third_party/xla/xla/service/while_loop_unroller.h
@@ -17,10 +17,11 @@ limitations under the License.
 #define XLA_SERVICE_WHILE_LOOP_UNROLLER_H_
 
 #include <cstdint>
-#include <optional>
+#include <utility>
+#include <vector>
 
-#include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
+#include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
@@ -39,14 +40,22 @@ struct WhileLoopConfig {
   int64_t induction_var_idx;
 };
 
+// Runs a sequence of passes that are necessary to prepare loops for unrolling.
+// Failure to run these passes will prevent unroller from unrolling loops that
+// would have been otherwise unrollable.
+absl::StatusOr<bool> PrepareModuleForUnrolling(
+    HloModule* module,
+    const absl::flat_hash_set<absl::string_view>& execution_threads);
+
 // Returns the list of unrollable loops in the given module
-absl::flat_hash_map<HloInstruction*, WhileLoopConfig> GetUnrollableLoops(
+
+std::vector<std::pair<HloInstruction*, WhileLoopConfig>> GetUnrollableLoops(
     HloModule* module,
     const absl::flat_hash_set<absl::string_view>& execution_threads);
 
-// Unrolls the given while loop with the defaul behaviour set to full unroll. If
-// wrap_in_trivial_loop is set, the unrolled body of the loop will be wrapped in
-// a loop with trip count of one.
+// Unrolls the given while loop with the default behaviour set to full unroll.
+// If wrap_in_trivial_loop is set, the unrolled body of the loop will be wrapped
+// in a loop with trip count of one.
 StatusOr<bool> Unroll(HloInstruction* while_op, int64_t unroll_factor = -1,
                       bool wrap_in_trivial_loop = false);
 
diff --git a/third_party/xla/xla/service/while_loop_unroller_test.cc b/third_party/xla/xla/service/while_loop_unroller_test.cc
index d8122067f30bf4..02a6db1be5fe87 100644
--- a/third_party/xla/xla/service/while_loop_unroller_test.cc
+++ b/third_party/xla/xla/service/while_loop_unroller_test.cc
@@ -16,11 +16,9 @@ limitations under the License.
 #include "xla/service/while_loop_unroller.h"
 
 #include <cstdint>
-#include <iterator>
 #include <memory>
 #include <optional>
 #include <string>
-#include <utility>
 #include <vector>
 
 #include <gtest/gtest.h>
@@ -353,6 +351,82 @@ TEST_F(WhileLoopUnrollerTest, SimpleLoopUnroll) {
   UnrollAndCompare(MakeModuleWithSimpleLoop(/*num_iters=*/5), {}, -1, true);
 }
 
+// This test passes because we run WhileLoopConstantSinking before unrolling.
+TEST_F(WhileLoopUnrollerTest, SimpleLoopUnrollNeedPrepare) {
+  std::string hlo_string = R"(
+  HloModule SimpleLoop
+  SimpleLoop.body {
+    loop_var.1 = (s64[], s32[3]{0}, s64[]) parameter(0)
+    get-tuple-element.1 = s64[] get-tuple-element(loop_var.1), index=0
+    get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1
+    get-tuple-element.3 = s64[] get-tuple-element(loop_var.1), index=2
+    add = s64[] add(get-tuple-element.1, get-tuple-element.3)
+    multiply = s32[3]{0} add(get-tuple-element.2, get-tuple-element.2)
+    ROOT tuple = (s64[], s32[3]{0}, s64[]) tuple(add, multiply, get-tuple-element.3)
+  }
+  SimpleLoop.condition {
+    loop_var.2 = (s64[], s32[3]{0}, s64[]) parameter(0)
+    get-tuple-element.3 = s64[] get-tuple-element(loop_var.2), index=0
+    /* number of iterations is 10 */
+    constant.2 = s64[] constant(10)
+    ROOT less-than = pred[] compare(get-tuple-element.3, constant.2), direction=LT
+  }
+  ENTRY SimpleLoop {
+    constant.3 = s64[] constant(0)
+    one = s64[] constant(1)
+    constant.4 = s32[3]{0} constant({0, 1, 2})
+    tuple.1 = (s64[], s32[3]{0}, s64[]) tuple(constant.3, constant.4, one)
+    while = (s64[], s32[3]{0}, s64[]) while(tuple.1), condition=
+      SimpleLoop.condition, body=SimpleLoop.body
+    ROOT result = s32[3]{0} get-tuple-element(while), index=1
+  }
+  )";
+  UnrollAndCompare(ParseAndReturnVerifiedModule(hlo_string).value(), {}, -1,
+                   false);
+  UnrollAndCompare(ParseAndReturnVerifiedModule(hlo_string).value(), {}, -1,
+                   true);
+}
+
+// This test passes because we run TupleSimplifier before unrolling.
+TEST_F(WhileLoopUnrollerTest, SimpleLoopUnrollNeedPrepare2) {
+  std::string hlo_string = R"(
+  HloModule SimpleLoop
+  SimpleLoop.body {
+    loop_var.1 = (s64[], s32[3]{0}, s64[]) parameter(0)
+    get-tuple-element.1 = s64[] get-tuple-element(loop_var.1), index=0
+    get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1
+    get-tuple-element.3 = s64[] get-tuple-element(loop_var.1), index=2
+    add = s64[] add(get-tuple-element.1, get-tuple-element.3)
+    multiply = s32[3]{0} add(get-tuple-element.2, get-tuple-element.2)
+    ROOT tuple = (s64[], s32[3]{0}, s64[]) tuple(add, multiply, get-tuple-element.3)
+  }
+  SimpleLoop.condition {
+    loop_var.2 = (s64[], s32[3]{0}, s64[]) parameter(0)
+    get-tuple-element.3 = s64[] get-tuple-element(loop_var.2), index=0
+    /* number of iterations is 10 */
+    constant.2 = s64[] constant(10)
+    ROOT less-than = pred[] compare(get-tuple-element.3, constant.2), direction=LT
+  }
+  ENTRY SimpleLoop {
+    constant.3 = s64[] constant(0)
+    one = s64[] constant(1)
+    constant.4 = s32[3]{0} constant({0, 1, 2})
+    tuple.1 = (s64[], s32[3]{0}, s64[]) tuple(constant.3, constant.4, one)
+    gte1 = s64[] get-tuple-element(tuple.1), index=0
+    gte2 = s32[3]{0} get-tuple-element(tuple.1), index=1
+    gte3 = s64[] get-tuple-element(tuple.1), index=2
+    tuple = (s64[], s32[3]{0}, s64[]) tuple(gte1, gte2, gte3)
+    while = (s64[], s32[3]{0}, s64[]) while(tuple), condition=
+      SimpleLoop.condition, body=SimpleLoop.body
+    ROOT result = s32[3]{0} get-tuple-element(while), index=1
+  }
+  )";
+  UnrollAndCompare(ParseAndReturnVerifiedModule(hlo_string).value(), {}, -1,
+                   false);
+  UnrollAndCompare(ParseAndReturnVerifiedModule(hlo_string).value(), {}, -1,
+                   true);
+}
+
 TEST_F(WhileLoopUnrollerTest, SimpleLoopNotRoot) {
   std::string hlo_string = R"(
   HloModule SimpleLoop
@@ -456,17 +530,9 @@ TEST_F(WhileLoopUnrollerTest, GetUnrollableLoops) {
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  HloInstruction* while1 =
-      module->entry_computation()->GetInstructionWithName("while1");
-  HloInstruction* while2 =
-      module->entry_computation()->GetInstructionWithName("while2");
-  HloInstruction* while3 =
-      module->entry_computation()->GetInstructionWithName("while3");
-
   auto unrollable_loops = GetUnrollableLoops(module.get(), {});
-  EXPECT_TRUE(unrollable_loops.contains(while1));
-  EXPECT_TRUE(unrollable_loops.contains(while2));
-  EXPECT_FALSE(unrollable_loops.contains(while3));
+  // Only while1 and while2 are unrollable
+  EXPECT_EQ(unrollable_loops.size(), 2);
 }
 
 TEST_F(WhileLoopUnrollerTest, UnrollMutipleLoops) {
diff --git a/third_party/xla/xla/service/while_util.h b/third_party/xla/xla/service/while_util.h
index 63611b3e7eb121..2f56ea074a8781 100644
--- a/third_party/xla/xla/service/while_util.h
+++ b/third_party/xla/xla/service/while_util.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/service/call_inliner.h"
 #include "xla/statusor.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla {
 class WhileUtil {
diff --git a/third_party/xla/xla/service/while_util_test.cc b/third_party/xla/xla/service/while_util_test.cc
index a1c4a9203dd128..a899a89a5215c3 100644
--- a/third_party/xla/xla/service/while_util_test.cc
+++ b/third_party/xla/xla/service/while_util_test.cc
@@ -18,11 +18,15 @@ limitations under the License.
 #include <memory>
 
 #include "absl/algorithm/container.h"
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/utils/hlo_matchers.h"
+#include "xla/statusor.h"
 #include "xla/test.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/tests/verified_hlo_module.h"
 #include "xla/util.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla {
 namespace {
diff --git a/third_party/xla/xla/service/xla_aot_compile_cpu_test.cc b/third_party/xla/xla/service/xla_aot_compile_cpu_test.cc
index 00dc0743f9670e..16ae69b5bf0e18 100644
--- a/third_party/xla/xla/service/xla_aot_compile_cpu_test.cc
+++ b/third_party/xla/xla/service/xla_aot_compile_cpu_test.cc
@@ -16,11 +16,15 @@ limitations under the License.
 #include <string>
 
 #include <gtest/gtest.h>
+#include "absl/types/span.h"
 #include "xla/client/client_library.h"
+#include "xla/client/executable_build_options.h"
 #include "xla/client/local_client.h"
 #include "xla/executable_run_options.h"
+#include "xla/literal.h"
 #include "xla/literal_util.h"
 #include "xla/service/platform_util.h"
+#include "xla/service/shaped_buffer.h"
 #include "tsl/lib/core/status_test_util.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/path.h"
diff --git a/third_party/xla/xla/service/xla_aot_compile_stablehlo_cpu_test.cc b/third_party/xla/xla/service/xla_aot_compile_stablehlo_cpu_test.cc
index e84a16db9813a8..7526cd401c71ce 100644
--- a/third_party/xla/xla/service/xla_aot_compile_stablehlo_cpu_test.cc
+++ b/third_party/xla/xla/service/xla_aot_compile_stablehlo_cpu_test.cc
@@ -17,10 +17,14 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "xla/client/client_library.h"
+#include "xla/client/executable_build_options.h"
 #include "xla/client/local_client.h"
+#include "xla/error_spec.h"
 #include "xla/executable_run_options.h"
+#include "xla/literal.h"
 #include "xla/literal_util.h"
 #include "xla/service/platform_util.h"
+#include "xla/service/shaped_buffer.h"
 #include "xla/tests/literal_test_util.h"
 #include "tsl/lib/core/status_test_util.h"
 #include "tsl/platform/env.h"
diff --git a/third_party/xla/xla/service/xla_aot_compile_test_autotune_results.prototxt b/third_party/xla/xla/service/xla_aot_compile_test_autotune_results.prototxt
index 117a899556fb15..592ea2a9e185fb 100644
--- a/third_party/xla/xla/service/xla_aot_compile_test_autotune_results.prototxt
+++ b/third_party/xla/xla/service/xla_aot_compile_test_autotune_results.prototxt
@@ -15,7 +15,7 @@
 version: 3
 results {
   device: "sm_6.0 with 17071734784B RAM, 56 cores, 1480500KHz clock, 715000KHz mem clock, 4194304B L2$"
-  hlo: "(f32[3,3]{1,0}, s8[72]{0}) custom-call(f32[3,3]{1,0}, f32[3,3]{1,0}), custom_call_target=\"__cublas$gemm\", backend_config={\"operation_queue_id\":\"0\",\"wait_on_operation_queues\":[],\"gemm_backend_config\":{\"alpha_real\":1,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"alpha_imag\":0,\"precision_config\":{\"operand_precision\":[\"DEFAULT\",\"DEFAULT\"],\"algorithm\":\"ALG_UNSET\"},\"epilogue\":\"DEFAULT\",\"lhs_stride\":\"9\",\"rhs_stride\":\"9\",\"grad_x\":false,\"grad_y\":false}}"
+  hlo: "(f32[3,3]{1,0}, s8[72]{0}) custom-call(f32[3,3]{1,0}, f32[3,3]{1,0}), custom_call_target=\"__cublas$gemm\", backend_config={\"operation_queue_id\":\"0\",\"wait_on_operation_queues\":[],\"gemm_backend_config\":{\"alpha_real\":1,\"beta\":0,\"dot_dimension_numbers\":{\"lhs_contracting_dimensions\":[\"1\"],\"rhs_contracting_dimensions\":[\"0\"],\"lhs_batch_dimensions\":[],\"rhs_batch_dimensions\":[]},\"alpha_imag\":0,\"precision_config\":{\"operand_precision\":[\"DEFAULT\",\"DEFAULT\"],\"algorithm\":\"ALG_UNSET\"},\"epilogue\":\"DEFAULT\",\"lhs_stride\":\"9\",\"rhs_stride\":\"9\",\"grad_x\":false,\"grad_y\":false},\"force_earliest_schedule\":false}"
   result {
     gemm {
       algorithm: 13
@@ -24,7 +24,7 @@ results {
 }
 results {
   device: "sm_6.0 with 17071734784B RAM, 56 cores, 1480500KHz clock, 715000KHz mem clock, 4194304B L2$"
-  hlo: "(f32[1,1,2,3]{3,2,1,0}, u8[0]{0}) custom-call(f32[1,2,4,4]{3,2,1,0}, f32[1,2,3,2]{3,2,1,0}), window={size=3x2}, dim_labels=bf01_oi01->bf01, custom_call_target=\"__cudnn$convForward\", backend_config={\"operation_queue_id\":\"0\",\"wait_on_operation_queues\":[],\"cudnn_conv_backend_config\":{\"activation_mode\":\"kNone\",\"conv_result_scale\":1,\"side_input_scale\":0,\"leakyrelu_alpha\":0}}"
+  hlo: "(f32[1,1,2,3]{3,2,1,0}, u8[0]{0}) custom-call(f32[1,2,4,4]{3,2,1,0}, f32[1,2,3,2]{3,2,1,0}), window={size=3x2}, dim_labels=bf01_oi01->bf01, custom_call_target=\"__cudnn$convForward\", backend_config={\"operation_queue_id\":\"0\",\"wait_on_operation_queues\":[],\"cudnn_conv_backend_config\":{\"activation_mode\":\"kNone\",\"conv_result_scale\":1,\"side_input_scale\":0,\"leakyrelu_alpha\":0},\"force_earliest_schedule\":false}"
   result {
     run_time {
       nanos: 8192
diff --git a/third_party/xla/xla/service/xla_compile_main.cc b/third_party/xla/xla/service/xla_compile_main.cc
index 929ae0ec2888d1..b7f97fd800f8af 100644
--- a/third_party/xla/xla/service/xla_compile_main.cc
+++ b/third_party/xla/xla/service/xla_compile_main.cc
@@ -22,10 +22,9 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "xla/status.h"
 #include "xla/tools/xla_compile_lib.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "tsl/platform/init_main.h"
 #include "tsl/platform/types.h"
-#include "tsl/util/command_line_flags.h"
-
 
 namespace xla {
 namespace xla_compile {
diff --git a/third_party/xla/xla/service/xla_debug_info_manager.cc b/third_party/xla/xla/service/xla_debug_info_manager.cc
index 6de403513478dc..b6d5e5ff90d135 100644
--- a/third_party/xla/xla/service/xla_debug_info_manager.cc
+++ b/third_party/xla/xla/service/xla_debug_info_manager.cc
@@ -20,6 +20,10 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/log/check.h"
+#include "absl/synchronization/mutex.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/service/hlo.pb.h"
 #include "xla/service/hlo_proto_util.h"
 
 namespace xla {
diff --git a/third_party/xla/xla/service/xla_debug_info_manager.h b/third_party/xla/xla/service/xla_debug_info_manager.h
index 4a700f5c586e53..0d3ce1ca18a42e 100644
--- a/third_party/xla/xla/service/xla_debug_info_manager.h
+++ b/third_party/xla/xla/service/xla_debug_info_manager.h
@@ -21,7 +21,9 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/base/thread_annotations.h"
 #include "absl/container/flat_hash_map.h"
+#include "absl/synchronization/mutex.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/hlo.pb.h"
 #include "tsl/platform/status.h"
diff --git a/third_party/xla/xla/service/xla_debug_info_manager_test.cc b/third_party/xla/xla/service/xla_debug_info_manager_test.cc
index a3ea0ec375c5fe..f3aaa7c47a41fc 100644
--- a/third_party/xla/xla/service/xla_debug_info_manager_test.cc
+++ b/third_party/xla/xla/service/xla_debug_info_manager_test.cc
@@ -18,8 +18,13 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
 #include "absl/container/flat_hash_set.h"
+#include "absl/synchronization/mutex.h"
+#include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/hlo.pb.h"
+#include "xla/service/hlo_module_config.h"
 #include "xla/tests/hlo_test_base.h"
 
 namespace xla {
diff --git a/third_party/xla/xla/service/zero_sized_hlo_elimination.cc b/third_party/xla/xla/service/zero_sized_hlo_elimination.cc
index 9e8e323b039750..fb7ccc9a63c768 100644
--- a/third_party/xla/xla/service/zero_sized_hlo_elimination.cc
+++ b/third_party/xla/xla/service/zero_sized_hlo_elimination.cc
@@ -15,15 +15,18 @@ limitations under the License.
 
 #include "xla/service/zero_sized_hlo_elimination.h"
 
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/layout_util.h"
 #include "xla/literal.h"
+#include "xla/shape.h"
 #include "xla/shape_util.h"
-#include "xla/status_macros.h"
+#include "xla/statusor.h"
 #include "xla/util.h"
 #include "tsl/platform/errors.h"
-#include "tsl/platform/logging.h"
-#include "tsl/platform/status.h"
 
 namespace xla {
 
diff --git a/third_party/xla/xla/service/zero_sized_hlo_elimination.h b/third_party/xla/xla/service/zero_sized_hlo_elimination.h
index e4296f7c0d07f0..5e908e09095362 100644
--- a/third_party/xla/xla/service/zero_sized_hlo_elimination.h
+++ b/third_party/xla/xla/service/zero_sized_hlo_elimination.h
@@ -16,8 +16,11 @@ limitations under the License.
 #ifndef XLA_SERVICE_ZERO_SIZED_HLO_ELIMINATION_H_
 #define XLA_SERVICE_ZERO_SIZED_HLO_ELIMINATION_H_
 
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/hlo_pass_interface.h"
+#include "xla/statusor.h"
 
 // HLO pass that replaces zero sized Hlos with a zero sized constant literal.
 namespace xla {
diff --git a/third_party/xla/xla/service/zero_sized_hlo_elimination_test.cc b/third_party/xla/xla/service/zero_sized_hlo_elimination_test.cc
index 0137bf51d2e67b..9da305fb978cdb 100644
--- a/third_party/xla/xla/service/zero_sized_hlo_elimination_test.cc
+++ b/third_party/xla/xla/service/zero_sized_hlo_elimination_test.cc
@@ -20,17 +20,15 @@ limitations under the License.
 
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/literal.h"
-#include "xla/service/shape_inference.h"
+#include "xla/literal_util.h"
+#include "xla/shape.h"
 #include "xla/shape_util.h"
-#include "xla/status_macros.h"
 #include "xla/test.h"
-#include "xla/test_helpers.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/logging.h"
+#include "tsl/platform/statusor.h"
 
 namespace xla {
 namespace {
diff --git a/third_party/xla/xla/shape.cc b/third_party/xla/xla/shape.cc
index 33a2179673a6d3..7831898b4a1c0f 100644
--- a/third_party/xla/xla/shape.cc
+++ b/third_party/xla/xla/shape.cc
@@ -200,10 +200,21 @@ bool Shape::Equal::operator()(const Shape& lhs, const Shape& rhs) {
   }
 
   if (!ignore_dimensions_) {
-    if (!ShapeUtil::SameDimensions(lhs, rhs)) {
-      VLOG(3) << "CompareShapes: lhs dimensions != rhs dimensions";
+    if (!ShapeUtil::SameRank(lhs, rhs)) {
+      VLOG(3) << "CompareShapes: lhs rank != rhs rank";
       return false;
     }
+    for (int i = 0; i < lhs.rank(); ++i) {
+      if (ignore_dynamic_dimension_ &&
+          (lhs.is_unbounded_dynamic_dimension(i) ||
+           rhs.is_unbounded_dynamic_dimension(i))) {
+        continue;
+      }
+      if (lhs.dimensions(i) != rhs.dimensions(i)) {
+        VLOG(3) << "CompareShapes: lhs dimensions != rhs dimensions";
+        return false;
+      }
+    }
   } else {
     if (!ShapeUtil::SameRank(lhs, rhs)) {
       VLOG(3) << "CompareShapes: lhs rank != rhs rank";
diff --git a/third_party/xla/xla/shape_util.cc b/third_party/xla/xla/shape_util.cc
index adb7ad910a87aa..132c951cbeec36 100644
--- a/third_party/xla/xla/shape_util.cc
+++ b/third_party/xla/xla/shape_util.cc
@@ -117,7 +117,7 @@ std::ostream& operator<<(std::ostream& out, const ShapeIndex& shape_index) {
 namespace {
 // Constructs and returns the new shape with the given minor_to_major order in
 // its Layout.
-StatusOr<Shape> MakeShapeWithLayoutInternal(
+absl::StatusOr<Shape> MakeShapeWithLayoutInternal(
     PrimitiveType element_type, absl::Span<const int64_t> dimensions,
     absl::Span<const int64_t> minor_to_major,
     absl::Span<const DimLevelType> dim_level_types,
@@ -304,7 +304,7 @@ Shape MakeTupleShapeImpl(absl::Span<ShapePtrOrRef> shapes) {
   return output;
 }
 
-/* static */ StatusOr<Shape> ShapeUtil::MakeValidatedShape(
+/* static */ absl::StatusOr<Shape> ShapeUtil::MakeValidatedShape(
     PrimitiveType element_type, absl::Span<const int64_t> dimensions) {
   Shape shape;
   if (!FillNewShape(element_type, dimensions, &shape)) {
@@ -315,7 +315,7 @@ Shape MakeTupleShapeImpl(absl::Span<ShapePtrOrRef> shapes) {
   return std::move(shape);
 }
 
-/* static */ StatusOr<Shape> ShapeUtil::MakeValidatedShape(
+/* static */ absl::StatusOr<Shape> ShapeUtil::MakeValidatedShape(
     PrimitiveType element_type, absl::Span<const int64_t> dimensions,
     const std::vector<bool>& dynamic_dimensions) {
   if (dynamic_dimensions.size() != dimensions.size()) {
@@ -1100,7 +1100,7 @@ Shape ShapeUtil::PrependMajorDimension(int64_t bound, Shape shape) {
   return *return_shape;
 }
 
-/* static */ StatusOr<const Shape*> ShapeUtil::TryGetSubshape(
+/* static */ absl::StatusOr<const Shape*> ShapeUtil::TryGetSubshape(
     const Shape& shape, ShapeIndexView index) {
   const Shape* return_shape = &shape;
   for (auto i : index) {
@@ -1931,7 +1931,7 @@ struct ParallelState {
     auto indexes_copy = s.indexes;
     pstate.pool->Schedule([indexes_copy, &visitor_function, &pstate] {
       const int thread_id = pstate.pool->CurrentThreadId();
-      StatusOr<bool> result = visitor_function(indexes_copy, thread_id);
+      absl::StatusOr<bool> result = visitor_function(indexes_copy, thread_id);
       if (!result.ok()) {
         absl::MutexLock lock(&pstate.mu);
         if (pstate.status.ok()) {
diff --git a/third_party/xla/xla/shape_util.h b/third_party/xla/xla/shape_util.h
index 61ce32ee992248..548d39961b9ce4 100644
--- a/third_party/xla/xla/shape_util.h
+++ b/third_party/xla/xla/shape_util.h
@@ -417,9 +417,9 @@ class ShapeUtil {
   // dimensions. Method checks if the element type is valid, the shape's
   // size fits in std::numeric_limits<int64_t>::max(), and dynamic size is not
   // marked static.
-  static StatusOr<Shape> MakeValidatedShape(
+  static absl::StatusOr<Shape> MakeValidatedShape(
       PrimitiveType element_type, absl::Span<const int64_t> dimensions);
-  static StatusOr<Shape> MakeValidatedShape(
+  static absl::StatusOr<Shape> MakeValidatedShape(
       PrimitiveType element_type, absl::Span<const int64_t> dimensions,
       const std::vector<bool>& dynamic_dimensions);
 
@@ -554,8 +554,8 @@ class ShapeUtil {
   // Faster version for one index.
   static const Shape& GetSubshapeOneIndex(const Shape& shape, int64_t index);
 
-  static StatusOr<const Shape*> TryGetSubshape(const Shape& shape,
-                                               ShapeIndexView index);
+  static absl::StatusOr<const Shape*> TryGetSubshape(const Shape& shape,
+                                                     ShapeIndexView index);
   static Shape* GetMutableSubshape(Shape* shape, ShapeIndexView index);
 
   // Returns whether the given index in the given shape is a leaf element of the
@@ -871,7 +871,7 @@ class ShapeUtil {
                                        const xla::Shape& bounded_shape);
 
   using ForEachVisitorFunction =
-      absl::FunctionRef<StatusOr<bool>(absl::Span<const int64_t>)>;
+      absl::FunctionRef<absl::StatusOr<bool>(absl::Span<const int64_t>)>;
 
   using ForEachVisitorFunctionNoStatus =
       absl::FunctionRef<bool(absl::Span<const int64_t>)>;
@@ -936,12 +936,12 @@ class ShapeUtil {
   static void ForEachIndex(const Shape& shape,
                            const ForEachVisitorFunction& visitor_function) {
     ForEachIndexWithStatus(shape, [&](absl::Span<const int64_t> indices) {
-      return StatusOr<bool>(visitor_function(indices));
+      return absl::StatusOr<bool>(visitor_function(indices));
     }).IgnoreError();
   }
 
   using ForEachParallelVisitorFunction =
-      absl::FunctionRef<StatusOr<bool>(absl::Span<const int64_t>, int)>;
+      absl::FunctionRef<absl::StatusOr<bool>(absl::Span<const int64_t>, int)>;
 
   // A parallel version of ForEachIndex(WithStatus). This can only be used if
   // the visitor_function is thread-safe and the order of iteration does not
diff --git a/third_party/xla/xla/shape_util_test.cc b/third_party/xla/xla/shape_util_test.cc
index f640ae028ff830..cee69daab1ee44 100644
--- a/third_party/xla/xla/shape_util_test.cc
+++ b/third_party/xla/xla/shape_util_test.cc
@@ -216,6 +216,9 @@ TEST(ShapeUtilTest, EqualDynamicShapes) {
   EXPECT_FALSE(
       ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {4, 3}, {true, false}),
                        ShapeUtil::MakeShape(F32, {4, 3}, {false, false})));
+  EXPECT_FALSE(ShapeUtil::Equal(
+      ShapeUtil::MakeShape(F32, {Shape::kUnboundedSize}, {true}),
+      ShapeUtil::MakeShape(F32, {2}, {true})));
 }
 
 TEST(ShapeUtilTest, CompatibleDynamicShapes) {
@@ -633,7 +636,8 @@ TEST(ShapeUtilTest, ForEachIndexWithStatus) {
   // Increments at every invocation.
   int invocations = 0;
   auto increment_func =
-      [&invocations](absl::Span<const int64_t> indexes) -> StatusOr<bool> {
+      [&invocations](
+          absl::Span<const int64_t> indexes) -> absl::StatusOr<bool> {
     if (++invocations == 5) {
       return Unimplemented("Cannot increment beyond 5.");
     }
@@ -655,7 +659,7 @@ TEST(ShapeUtilTest, GetForEachIndexParallelThreadCount) {
 
   Shape shape = ShapeUtil::MakeShape(F32, {10, 100});
   auto check_func = [kThreadCount](absl::Span<const int64_t> /*indexes*/,
-                                   int thread_id) -> StatusOr<bool> {
+                                   int thread_id) -> absl::StatusOr<bool> {
     EXPECT_GE(thread_id, -1);
     EXPECT_LT(thread_id, kThreadCount);
     return true;
@@ -672,7 +676,7 @@ TEST(ShapeUtilTest, ForEachIndexParallel) {
   int64_t output[10][10];
   int init = 5;
   auto set_func = [&](absl::Span<const int64_t> indexes,
-                      int /*thread_id*/) -> StatusOr<bool> {
+                      int /*thread_id*/) -> absl::StatusOr<bool> {
     output[indexes[0]][indexes[1]] = init + indexes[0] + indexes[1];
     return true;
   };
@@ -690,7 +694,7 @@ TEST(ShapeUtilTest, ForEachIndexParallel_Rank0) {
   Shape shape = ShapeUtil::MakeShape(F32, {});
   int64_t output = -1;
   auto set_func = [&](absl::Span<const int64_t> indexes,
-                      int /*thread_id*/) -> StatusOr<bool> {
+                      int /*thread_id*/) -> absl::StatusOr<bool> {
     output = indexes.size();
     return true;
   };
@@ -705,7 +709,7 @@ TEST(ShapeUtilTest, ForEachIndexParallel_Empty) {
   Shape shape = ShapeUtil::MakeShape(F32, {2, 0});
   bool called = false;
   auto set_func = [&](absl::Span<const int64_t> indexes,
-                      int /*thread_id*/) -> StatusOr<bool> {
+                      int /*thread_id*/) -> absl::StatusOr<bool> {
     called = true;
     return true;
   };
@@ -724,7 +728,7 @@ TEST(ShapeUtilTest, ForEachIndexParallel_DimensionPinnedWithZeros) {
   int64_t output[2][2] = {};
   int init = 5;
   auto set_func = [&](absl::Span<const int64_t> indexes,
-                      int /*thread_id*/) -> StatusOr<bool> {
+                      int /*thread_id*/) -> absl::StatusOr<bool> {
     output[indexes[0]][indexes[1]] = init + indexes[0] + indexes[1];
     return true;
   };
@@ -748,7 +752,7 @@ TEST(ShapeUtilTest, ForEachIndexParallel_WithSkips) {
   int64_t output[10][10] = {};
   int init = 5;
   auto set_func = [&](absl::Span<const int64_t> indexes,
-                      int /*thread_id*/) -> StatusOr<bool> {
+                      int /*thread_id*/) -> absl::StatusOr<bool> {
     output[indexes[0]][indexes[1]] = init + indexes[0] + indexes[1];
     return true;
   };
@@ -772,13 +776,13 @@ TEST(ShapeUtilTest, ForEachIndexParallel_CalledTwice) {
   int64_t output[10][10];
   int init = 5;
   auto set_func = [&](absl::Span<const int64_t> indexes,
-                      int /*thread_id*/) -> StatusOr<bool> {
+                      int /*thread_id*/) -> absl::StatusOr<bool> {
     output[indexes[0]][indexes[1]] = init + indexes[0] + indexes[1];
     return true;
   };
   int init2 = 15;
   auto set_func2 = [&](absl::Span<const int64_t> indexes,
-                       int /*thread_id*/) -> StatusOr<bool> {
+                       int /*thread_id*/) -> absl::StatusOr<bool> {
     output[indexes[0]][indexes[1]] = init2 + indexes[0] + indexes[1];
     return true;
   };
@@ -808,8 +812,9 @@ TEST(ShapeUtilTest, ForEachIndexParallel_CalledFromMultipleThreads) {
                                  kCallingThreads);
     for (int t = 0; t < kCallingThreads; ++t) {
       pool.Schedule([&output, &kShape, t] {
-        auto set_func = [&output, t](absl::Span<const int64_t> indexes,
-                                     int /*thread_id*/) -> StatusOr<bool> {
+        auto set_func = [&output, t](
+                            absl::Span<const int64_t> indexes,
+                            int /*thread_id*/) -> absl::StatusOr<bool> {
           output[t][indexes[0]][indexes[1]] = kInit + indexes[0] + indexes[1];
           return true;
         };
@@ -972,7 +977,7 @@ TEST(ShapeUtilTest, UpdateDynamicDimensions) {
 }
 
 TEST(ShapeUtilTest, InvalidDynamicDimension) {
-  StatusOr<Shape> error_status = ShapeUtil::MakeValidatedShape(
+  absl::StatusOr<Shape> error_status = ShapeUtil::MakeValidatedShape(
       F32, {Shape::kUnboundedSize, Shape::kUnboundedSize}, {true, false});
 
   EXPECT_FALSE(error_status.ok());
@@ -1620,7 +1625,7 @@ void BM_ForEachIndex(::testing::benchmark::State& state) {
   for (auto s : state) {
     int count = 0;
     auto increment_func =
-        [&count](absl::Span<const int64_t> indexes) -> StatusOr<bool> {
+        [&count](absl::Span<const int64_t> indexes) -> absl::StatusOr<bool> {
       count++;
       return true;
     };
diff --git a/third_party/xla/xla/status_macros_test.cc b/third_party/xla/xla/status_macros_test.cc
index 80df11ee1cd3ca..fe09a008143db1 100644
--- a/third_party/xla/xla/status_macros_test.cc
+++ b/third_party/xla/xla/status_macros_test.cc
@@ -59,9 +59,9 @@ TEST(StatusMacros, RetCheckSucceeding) {
   EXPECT_IS_OK(status);
 }
 
-StatusOr<int> CreateIntSuccessfully() { return 42; }
+absl::StatusOr<int> CreateIntSuccessfully() { return 42; }
 
-StatusOr<int> CreateIntUnsuccessfully() {
+absl::StatusOr<int> CreateIntUnsuccessfully() {
   return tsl::errors::Internal("foobar");
 }
 
@@ -76,19 +76,20 @@ Status ReturnStatusError() { return (tsl::errors::Internal("foobar")); }
 
 using StatusReturningFunction = std::function<Status()>;
 
-StatusOr<int> CallStatusReturningFunction(const StatusReturningFunction& func) {
+absl::StatusOr<int> CallStatusReturningFunction(
+    const StatusReturningFunction& func) {
   TF_RETURN_IF_ERROR(func());
   return 42;
 }
 
 TEST(StatusMacros, ReturnIfErrorOnOK) {
-  StatusOr<int> rc = CallStatusReturningFunction(ReturnStatusOK);
+  absl::StatusOr<int> rc = CallStatusReturningFunction(ReturnStatusOK);
   EXPECT_IS_OK(rc);
   EXPECT_EQ(42, std::move(rc).value());
 }
 
 TEST(StatusMacros, ReturnIfErrorOnError) {
-  StatusOr<int> rc = CallStatusReturningFunction(ReturnStatusError);
+  absl::StatusOr<int> rc = CallStatusReturningFunction(ReturnStatusError);
   EXPECT_FALSE(rc.ok());
   EXPECT_EQ(rc.status().code(), tsl::error::INTERNAL);
 }
diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index c512c81b625c34..796e12aff476d5 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -1,10 +1,10 @@
-load("//xla:xla.bzl", "xla_cc_test")
-load("//xla/stream_executor:build_defs.bzl", "stream_executor_friends", "stream_executor_internal")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility", "transitive_hdrs")
 load("@local_tsl//tsl:tsl.default.bzl", "filegroup")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_proto_library")
 load("@local_tsl//tsl/platform:build_config_root.bzl", "if_static")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
+load("//xla/stream_executor:build_defs.bzl", "stream_executor_friends", "stream_executor_internal")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -569,6 +569,7 @@ cc_library(
         ":platform",
         ":stream_executor_headers",
         ":stream_executor_internal",
+        "//xla/tsl/util:env_var",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/log",
@@ -585,7 +586,6 @@ cc_library(
         "@local_tsl//tsl/platform:stacktrace",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/util:env_var",
     ],
     alwayslink = True,
 )
diff --git a/third_party/xla/xla/stream_executor/build_defs.bzl b/third_party/xla/xla/stream_executor/build_defs.bzl
index 4e43fbec8d0c1e..6916574c646edf 100644
--- a/third_party/xla/xla/stream_executor/build_defs.bzl
+++ b/third_party/xla/xla/stream_executor/build_defs.bzl
@@ -88,11 +88,3 @@ def cuda_only_cc_library(name, tags = [], **kwargs):
         restricted_to = kwargs.get("restricted_to"),
         target_compatible_with = kwargs.get("target_compatible_with"),
     )
-
-# TODO(hebecker): Remove this once we've fixed our ARM build
-def if_google_arm_build(
-        if_true,  # @unused
-        if_false = []):
-    return select({
-        "//conditions:default": if_false,
-    })
diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index b2162e309f48b4..1403c0dca73efe 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -1,16 +1,4 @@
 load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
-load(
-    "//xla:xla.bzl",
-    "xla_cc_test",
-)
-load(
-    "//xla/stream_executor:build_defs.bzl",
-    "cuda_only_cc_library",
-    "stream_executor_friends",
-    "tf_additional_cuda_platform_deps",
-    "tf_additional_cudnn_plugin_copts",
-    "tf_additional_gpu_compilation_copts",
-)
 load("@local_tsl//tsl:tsl.bzl", "if_google", "if_nccl", "internal_visibility", "tsl_copts")
 load(
     "@local_tsl//tsl/platform:build_config_root.bzl",
@@ -25,6 +13,18 @@ load(
     "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
     "if_cuda_is_configured",
 )
+load(
+    "//xla:xla.bzl",
+    "xla_cc_test",
+)
+load(
+    "//xla/stream_executor:build_defs.bzl",
+    "cuda_only_cc_library",
+    "stream_executor_friends",
+    "tf_additional_cuda_platform_deps",
+    "tf_additional_cudnn_plugin_copts",
+    "tf_additional_gpu_compilation_copts",
+)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -396,6 +396,7 @@ cuda_only_cc_library(
         "//xla/stream_executor/gpu:gpu_timer_header",
         "//xla/stream_executor/platform",
         "//xla/tsl/cuda:cudnn",
+        "//xla/tsl/util:env_var",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -420,7 +421,6 @@ cuda_only_cc_library(
         "@local_tsl//tsl/platform:tensor_float_32_hdr_lib",
         "@local_tsl//tsl/platform:tensor_float_32_utils",
         "@local_tsl//tsl/protobuf:dnn_proto_cc",
-        "@local_tsl//tsl/util:env_var",
     ],
     alwayslink = True,
 )
@@ -566,10 +566,7 @@ cuda_only_cc_library(
         "//xla:status_macros",
         "//xla/stream_executor:stream_executor_headers",
         "//xla/stream_executor/gpu:asm_compiler_header",
-        "//xla/stream_executor/gpu:gpu_asm_opts",
         "//xla/stream_executor/gpu:gpu_driver_header",
-        "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/base",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/cleanup",
         "@com_google_absl//absl/log",
@@ -577,6 +574,7 @@ cuda_only_cc_library(
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:span",
         "@local_config_cuda//cuda:cuda_headers",
         "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:errors",
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_asm_compiler.cc b/third_party/xla/xla/stream_executor/cuda/cuda_asm_compiler.cc
index 2d435a74ef308e..771d570b5f3f38 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_asm_compiler.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_asm_compiler.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "absl/base/call_once.h"
 #include "absl/base/optimization.h"
 #include "absl/cleanup/cleanup.h"
 #include "absl/log/log.h"
@@ -31,6 +30,8 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
+#include "absl/types/span.h"
+#include "third_party/gpus/cuda/include/cuda.h"
 #include "xla/status_macros.h"
 #include "xla/stream_executor/gpu/asm_compiler.h"
 #include "xla/stream_executor/gpu/gpu_driver.h"
@@ -56,18 +57,32 @@ namespace stream_executor {
     }                                                                         \
   } while (false)
 
+static absl::StatusOr<std::string> FindNvlinkExecutable(
+    std::string_view preferred_cuda_dir) {
+  static constexpr ToolVersion kMinimumNvlinkVersion{11, 8, 0};
+  static constexpr absl::Span<const ToolVersion> kNoExcludedVersions{};
+  static constexpr std::string_view kNvLinkBinaryName = "nvlink";
+
+  return FindCudaExecutable(kNvLinkBinaryName, preferred_cuda_dir,
+                            kMinimumNvlinkVersion, kNoExcludedVersions);
+}
+
+absl::StatusOr<ToolVersion> GetNvLinkVersion(
+    std::string_view preferred_cuda_dir) {
+  // Make sure nvlink exists and is executable.
+  TF_ASSIGN_OR_RETURN(std::string bin_path,
+                      FindNvlinkExecutable(preferred_cuda_dir));
+
+  return GetToolVersion(bin_path);
+}
+
 absl::StatusOr<std::vector<uint8_t>> LinkUsingNvlink(
     absl::string_view preferred_cuda_dir, gpu::GpuContext* context,
     std::vector<CubinOrPTXImage> images) {
-  {
-    static absl::once_flag log_once;
-    absl::call_once(log_once,
-                    [] { LOG(INFO) << "Using nvlink for parallel linking"; });
-  }
+  LOG_FIRST_N(INFO, 1) << "Using nvlink for parallel linking";
 
-  TF_ASSIGN_OR_RETURN(
-      std::string bin_path,
-      FindCudaExecutable("nvlink", std::string(preferred_cuda_dir)));
+  TF_ASSIGN_OR_RETURN(std::string bin_path,
+                      FindNvlinkExecutable(preferred_cuda_dir));
 
   if (images.empty()) {
     return std::vector<uint8>();
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
index 7076e9dae0c375..1f8ce927570e47 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.cc
@@ -69,13 +69,13 @@ limitations under the License.
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_internal.h"
+#include "xla/tsl/util/env_var.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
 #include "tsl/platform/tensor_float_32_utils.h"
 #include "tsl/protobuf/dnn.pb.h"
-#include "tsl/util/env_var.h"
 
 // clang-format off
 #include "third_party/gpus/cuda/include/library_types.h"
@@ -229,6 +229,19 @@ class CudnnHandle {
   cudnnHandle_t handle_;  // Not owned.
 };
 
+// RAII wrapper for temporary cuDNN handles that are used for multithreaded
+// compilation. Unlike with CudnnAccess these are not associated
+// with GPU devices and are not locked.
+class LocalCuDnnHandle {
+ public:
+  explicit LocalCuDnnHandle(cudnnHandle_t handle) : handle_(handle) {}
+  ~LocalCuDnnHandle() { cudnnDestroy(handle_); }
+  cudnnHandle_t handle() { return handle_; }
+
+ private:
+  cudnnHandle_t handle_;
+};
+
 // Major version is neither forward or backward compatible and therefore major
 // versions needs to match between source and library.
 //
@@ -288,6 +301,14 @@ class CudnnAccess {
     return CudnnHandle(executor, std::move(lock), handle_);
   }
 
+  absl::StatusOr<std::unique_ptr<LocalCuDnnHandle>> GetLocalHandle() {
+    cudnnHandle_t handle = nullptr;
+    if (cudnnCreate(&handle) != CUDNN_STATUS_SUCCESS) {
+      return absl::InternalError("Creation of local cudnn handle failed.");
+    }
+    return std::make_unique<LocalCuDnnHandle>(handle);
+  }
+
   void NotifyStreamDestroyed(Stream* stream) {
     CUstream cu_stream = AsGpuStreamValue(stream);
     absl::MutexLock lock(&mutex_);
@@ -6344,7 +6365,8 @@ CreateCudnnFlashAttentionCausalMaskTensor(
 }
 
 absl::StatusOr<CudnnGraph> GetCudnnFlashAttentionOperationGraph(
-    CudnnHandle& cudnn, const dnn::MatmulTensorDescriptor& q_descriptor,
+    dnn::DnnSupport& dnn_support,
+    const dnn::MatmulTensorDescriptor& q_descriptor,
     const dnn::MatmulTensorDescriptor& k_descriptor,
     const dnn::MatmulTensorDescriptor& v_descriptor,
     const dnn::TensorDescriptor& o_descriptor,
@@ -6464,11 +6486,11 @@ absl::StatusOr<CudnnGraph> GetCudnnFlashAttentionOperationGraph(
   }
 
   CudnnGraph cudnnGraph(std::move(graph));
-  TF_ASSIGN_OR_RETURN(bool supported, cudnnGraph.Prepare());
+  TF_ASSIGN_OR_RETURN(bool supported, cudnnGraph.Prepare(dnn_support));
   if (!supported) {
     return absl::InternalError("cuDNN graph is not supported.");
   }
-  TF_RETURN_IF_ERROR(cudnnGraph.Build(/*plan_id=*/0));
+  TF_RETURN_IF_ERROR(cudnnGraph.Build(dnn_support, /*plan_id=*/0));
 
   if (VLOG_IS_ON(4)) {
     VLOG(4) << "\b flash attention operation graph: " << graph;
@@ -9087,7 +9109,7 @@ CudnnSupport::FusedMHARunnerFromDesc(
     TF_ASSIGN_OR_RETURN(
         auto graph,
         GetCudnnFlashAttentionOperationGraph(
-            cudnn, /*q_descriptor=*/bmm1_lhs_descriptor,
+            *this, /*q_descriptor=*/bmm1_lhs_descriptor,
             /*k_descriptor=*/bmm1_rhs_descriptor,
             /*v_descriptor=*/bmm2_rhs_descriptor,
             /*o_descriptor=*/output_descriptor, bias_descriptor,
@@ -10354,31 +10376,11 @@ bool CudnnSupport::DeriveOutputBatchDescriptor(
   return IsStatusOk(status, /*report_error=*/true);
 }
 
-// RAII wrapper for temporary cuDNN handles that are used for multithreaded
-// compilation. Unlike with CudnnAccess these are not associated
-// with GPU devices and are not locked.
-class LocalCuDnnHandle {
- public:
-  explicit LocalCuDnnHandle(cudnnHandle_t handle) : handle_(handle) {}
-  ~LocalCuDnnHandle() { cudnnDestroy(handle_); }
-  cudnnHandle_t handle() { return handle_; }
-  static absl::StatusOr<std::unique_ptr<LocalCuDnnHandle>> create() {
-    cudnnHandle_t handle = nullptr;
-    if (cudnnCreate(&handle) != CUDNN_STATUS_SUCCESS) {
-      return absl::InternalError("Could not create cudnn handle");
-    }
-    return std::make_unique<LocalCuDnnHandle>(handle);
-  }
-
- private:
-  cudnnHandle_t handle_;
-};
-
 #if CUDNN_VERSION >= 8100
 
 absl::StatusOr<std::unique_ptr<dnn::DnnGraph>> CudnnSupport::DeserializeGraph(
     absl::string_view serialized_data) const {
-  TF_ASSIGN_OR_RETURN(auto cudnn, LocalCuDnnHandle::create());
+  TF_ASSIGN_OR_RETURN(auto cudnn, cudnn_->GetLocalHandle());
   cudnn_frontend::graph::Graph graph;
   RETURN_IF_CUDNN_FRONTEND_ERROR(graph.deserialize(
       cudnn->handle(),
@@ -10387,8 +10389,9 @@ absl::StatusOr<std::unique_ptr<dnn::DnnGraph>> CudnnSupport::DeserializeGraph(
   return std::make_unique<CudnnGraph>(std::move(graph));
 }
 
-absl::StatusOr<bool> CudnnGraph::Prepare() {
-  TF_ASSIGN_OR_RETURN(auto cudnn, LocalCuDnnHandle::create());
+absl::StatusOr<bool> CudnnGraph::Prepare(dnn::DnnSupport& dnn_support) {
+  const CudnnSupport& cudnn_support = static_cast<CudnnSupport&>(dnn_support);
+  TF_ASSIGN_OR_RETURN(auto cudnn, cudnn_support.cudnn_->GetLocalHandle());
   RETURN_IF_CUDNN_FRONTEND_ERROR(graph_.validate());
   RETURN_IF_CUDNN_FRONTEND_ERROR(graph_.build_operation_graph(cudnn->handle()));
   RETURN_IF_CUDNN_FRONTEND_ERROR(
@@ -10400,8 +10403,10 @@ absl::StatusOr<bool> CudnnGraph::Prepare() {
   return true;
 }
 
-absl::Status CudnnGraph::Build(const int64_t plan_id) {
-  TF_ASSIGN_OR_RETURN(auto cudnn, LocalCuDnnHandle::create());
+absl::Status CudnnGraph::Build(dnn::DnnSupport& dnn_support,
+                               const int64_t plan_id) {
+  const CudnnSupport& cudnn_support = static_cast<CudnnSupport&>(dnn_support);
+  TF_ASSIGN_OR_RETURN(auto cudnn, cudnn_support.cudnn_->GetLocalHandle());
   RETURN_IF_CUDNN_FRONTEND_ERROR(
       graph_.build_plan_at_index(cudnn->handle(), plan_id));
   return absl::OkStatus();
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.h b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.h
index 40484460b5a8e1..06cc5fec2dd9b4 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_dnn.h
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_dnn.h
@@ -60,9 +60,9 @@ class CudnnGraph : public dnn::DnnGraph {
   explicit CudnnGraph(cudnn_frontend::graph::Graph&& graph)
       : graph_(std::move(graph)) {}
   // Prepares a graph and checks whether it is generally supported.
-  absl::StatusOr<bool> Prepare() override;
+  absl::StatusOr<bool> Prepare(dnn::DnnSupport&) override;
   // Builds single plan of the graph with given ID.
-  absl::Status Build(int64_t plan_id) override;
+  absl::Status Build(dnn::DnnSupport&, int64_t plan_id) override;
   // Builds all the plans
   absl::Status Execute(Stream& stream,
                        absl::Span<DeviceMemoryBase> operands) const override;
diff --git a/third_party/xla/xla/stream_executor/dnn.h b/third_party/xla/xla/stream_executor/dnn.h
index dcae7b0d719cca..f2503a424d9f79 100644
--- a/third_party/xla/xla/stream_executor/dnn.h
+++ b/third_party/xla/xla/stream_executor/dnn.h
@@ -1250,6 +1250,8 @@ class VersionInfo {
   int patch_;
 };
 
+class DnnSupport;
+
 class DnnGraph {
  public:
   DnnGraph() = default;
@@ -1259,12 +1261,14 @@ class DnnGraph {
   // anything else unexpected),
   // false on expected ones (graph is valid but not supported),
   // true on success.
-  virtual absl::StatusOr<bool> Prepare() = 0;
-  virtual absl::Status Build(int64_t plan_id) = 0;
+  virtual absl::StatusOr<bool> Prepare(DnnSupport&) = 0;
+  virtual absl::Status Build(DnnSupport&, int64_t plan_id) = 0;
   virtual absl::Status Execute(Stream& stream,
                                absl::Span<DeviceMemoryBase> operands) const = 0;
 };
 
+using LazyDnnGraph = std::unique_ptr<DnnGraph>;
+
 // Suite of operations typically used for implementing Deep/Convolutional Neural
 // Nets. Note: A false return value of an operation indicates the
 // implementation is not available.
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index ca82bdf6c32e74..c4fed215875896 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -1,28 +1,10 @@
 # Description:
 #   GPU-platform specific StreamExecutor support code.
 
-load(
-    "//xla/tests:build_defs.bzl",
-    "xla_test",
-)
 load(
     "@local_config_cuda//cuda:build_defs.bzl",
     "if_cuda",
 )
-load(
-    "//xla:xla.bzl",
-    "xla_cc_test",
-)
-load(
-    "//xla/service/gpu:build_defs.bzl",
-    "gpu_kernel_library",
-)
-load(
-    "//xla/stream_executor:build_defs.bzl",
-    "gpu_only_cc_library",
-    "if_google_arm_build",
-    "if_gpu_is_configured",
-)
 load(
     "@local_config_rocm//rocm:build_defs.bzl",
     "if_rocm",
@@ -48,6 +30,23 @@ load(
     "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
     "if_cuda_is_configured",
 )
+load(
+    "//xla:xla.bzl",
+    "xla_cc_test",
+)
+load(
+    "//xla/service/gpu:build_defs.bzl",
+    "gpu_kernel_library",
+)
+load(
+    "//xla/stream_executor:build_defs.bzl",
+    "gpu_only_cc_library",
+    "if_gpu_is_configured",
+)
+load(
+    "//xla/tests:build_defs.bzl",
+    "xla_test",
+)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -316,47 +315,11 @@ gpu_only_cc_library(
     ],
 )
 
-gpu_only_cc_library(
-    name = "gpu_timer_kernel_header",
-    hdrs = ["gpu_timer_kernel.h"],
-)
-
-gpu_kernel_library(
-    name = "gpu_timer_kernel",
-    srcs = if_gpu_is_configured(["gpu_timer_kernel.cu.cc"]),
-    deps = [
-        ":gpu_timer_kernel_header",
-    ] + if_cuda_is_configured([
-        "@local_config_cuda//cuda:cuda_headers",
-    ]) + if_rocm_is_configured([
-        "@local_config_rocm//rocm:rocm_headers",
-    ]),
-)
-
-# TODO(hebecker): Remove this once we have fixed our ARM build
-gpu_only_cc_library(
-    name = "gpu_timer_kernel_stub",
-    srcs = [
-        "gpu_timer_kernel_stub.cc",
-    ],
-    deps = [":gpu_timer_kernel_header"],
-)
-
-# TODO(hebecker): Remove this once we have fixed our ARM build
-cc_library(
-    name = "gpu_timer_kernel_not_on_google_arm",
-    deps = if_google_arm_build(
-        [":gpu_timer_kernel_stub"],
-        [":gpu_timer_kernel"],
-    ),
-)
-
 gpu_only_cc_library(
     name = "gpu_timer_header",
     hdrs = ["gpu_timer.h"],
     deps = [
         ":gpu_executor_header",
-        ":gpu_timer_kernel_header",
         ":gpu_types_header",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/time",
@@ -374,7 +337,6 @@ gpu_only_cc_library(
         ":gpu_driver_header",
         ":gpu_executor_header",
         ":gpu_stream",
-        ":gpu_timer_kernel_header",
         ":gpu_types_header",
         "//xla/stream_executor",
         "//xla/stream_executor:stream_executor_internal",
@@ -389,9 +351,7 @@ gpu_only_cc_library(
         "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
-    ] + if_gpu_is_configured([
-        ":gpu_timer_kernel_not_on_google_arm",
-    ]) + if_cuda_is_configured([
+    ] + if_cuda_is_configured([
         "//xla/stream_executor/cuda:cuda_driver",
     ]) + if_rocm_is_configured([
         "//xla/stream_executor/rocm:rocm_driver",
@@ -488,6 +448,8 @@ gpu_only_cc_library(
         "//xla/stream_executor/cuda:ptx_compiler",
         "//xla/stream_executor/cuda:ptx_compiler_support",
         "//xla/stream_executor/platform",
+        "//xla/tsl/util:env_var",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/cleanup",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -632,6 +594,7 @@ tsl_gpu_library(
     deps = [
         ":gpu_init_impl",
         "//xla/stream_executor:stream_executor_headers",
+        "//xla/tsl/util:env_var",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
@@ -639,7 +602,6 @@ tsl_gpu_library(
         "@local_tsl//tsl/framework:device_id",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:mutex",
-        "@local_tsl//tsl/util:env_var",
     ],
 )
 
diff --git a/third_party/xla/xla/stream_executor/gpu/asm_compiler.cc b/third_party/xla/xla/stream_executor/gpu/asm_compiler.cc
index 54641031ee58d0..0b77fb0f2d2862 100644
--- a/third_party/xla/xla/stream_executor/gpu/asm_compiler.cc
+++ b/third_party/xla/xla/stream_executor/gpu/asm_compiler.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/base/const_init.h"
 #include "absl/base/optimization.h"
 #include "absl/base/thread_annotations.h"
@@ -63,7 +64,7 @@ limitations under the License.
 namespace stream_executor {
 
 static absl::StatusOr<std::string> GetToolVersionString(
-    absl::string_view binary_path) {
+    std::string_view binary_path) {
   // If binary_path doesn't exist, then tsl::SubProcess will log a bunch of
   // error messages that have confused users in the past. Therefore we first
   // check whether the binary_path exists and error out early if not.
@@ -103,7 +104,7 @@ static absl::StatusOr<ToolVersion> GetToolVersionImpl(
   }
   static constexpr LazyRE2 kVersionRegex = {R"(\bV(\d+)\.(\d+)\.(\d+)\b)"};
   ToolVersion version{};
-  absl::string_view vmaj_str, vmin_str, vdot_str;
+  std::string_view vmaj_str, vmin_str, vdot_str;
   if (!RE2::PartialMatch(tool_version.value(), *kVersionRegex, &vmaj_str,
                          &vmin_str, &vdot_str) ||
       !absl::SimpleAtoi(vmaj_str, &version[0]) ||
@@ -134,28 +135,6 @@ absl::StatusOr<ToolVersion> GetToolVersion(std::string_view tool_path) {
       .first->second;
 }
 
-// Prints a warning if the ptxas at ptxas_path has known bugs.
-//
-// Only prints a warning the first time it's called for a particular value of
-// ptxas_path.
-//
-// Locks on entry.˝
-static void WarnIfBadPtxasVersion(absl::string_view ptxas_path) {
-  absl::StatusOr<std::array<int64_t, 3>> version = GetToolVersion(ptxas_path);
-  if (!version.ok()) {
-    LOG(WARNING) << "Couldn't get ptxas version : " << version.status();
-    return;
-  }
-
-  if (std::make_tuple((*version)[0], (*version)[1]) < std::make_tuple(11, 1)) {
-    LOG(ERROR) << "*** WARNING *** You are using ptxas " << (*version)[0] << "."
-               << (*version)[1] << "." << (*version)[2]
-               << ", which is older than 11.1. ptxas before 11.1 is known to "
-                  "miscompile XLA code, leading to incorrect results or "
-                  "invalid-address errors.\n";
-  }
-}
-
 absl::StatusOr<absl::Span<const uint8_t>> CompileGpuAsmOrGetCached(
     int device_ordinal, const char* ptx, GpuAsmOpts compilation_options) {
   using PtxCacheKey = std::tuple<int, std::string, GpuAsmOpts::PtxOptionsTuple>;
@@ -201,12 +180,11 @@ absl::StatusOr<std::vector<uint8_t>> CompileGpuAsm(int device_ordinal,
 }
 
 absl::StatusOr<std::string> FindCudaExecutable(
-    std::string_view binary_name, std::string_view preferred_cuda_dir) {
-#if defined(PLATFORM_WINDOWS)
-  const std::string binary_filename = std::string{binary_name} + ".exe";
-#else
-  std::string_view binary_filename = binary_name;
-#endif
+    std::string_view binary_name, std::string_view preferred_cuda_dir,
+    ToolVersion minimum_version,
+    absl::Span<const ToolVersion> excluded_versions) {
+  std::string binary_filename = std::string{binary_name};
+  tsl::io::AppendDotExeIfWindows(binary_filename);
 
   std::vector<std::string> candidates{};
 
@@ -234,18 +212,44 @@ absl::StatusOr<std::string> FindCudaExecutable(
 
   for (const auto& candidate : candidates) {
     VLOG(2) << "Looking for " << candidate;
-    if (GetToolVersion(candidate).ok()) {
-      VLOG(2) << "Using " << candidate;
-      return candidate;
+    auto candidate_version = GetToolVersion(candidate);
+    if (!candidate_version.ok()) {
+      continue;
+    }
+
+    if (candidate_version.value() < minimum_version) {
+      VLOG(2) << candidate << " with version "
+              << absl::StrJoin(minimum_version, ".") << " is too old.";
+      continue;
+    }
+
+    if (absl::c_find(excluded_versions, candidate_version.value()) !=
+        excluded_versions.end()) {
+      VLOG(2) << candidate << " has version "
+              << absl::StrJoin(candidate_version.value(), ".")
+              << " which was explicitly excluded.";
+      continue;
     }
+
+    VLOG(2) << "Using " << candidate << " with version "
+            << absl::StrJoin(candidate_version.value(), ".");
+    return candidate;
   }
 
   return absl::NotFoundError(
-      absl::StrCat("Couldn't find ", binary_name,
+      absl::StrCat("Couldn't find a suitable version of ", binary_name,
                    ". The following locations were considered: ",
                    absl::StrJoin(candidates, ", ")));
 }
 
+absl::StatusOr<std::string> FindCudaExecutable(
+    std::string_view binary_name, std::string_view preferred_cuda_dir) {
+  static constexpr ToolVersion kNoMinimumVersion{0, 0, 0};
+  static constexpr absl::Span<const ToolVersion> kNoExcludedVersions{};
+  return FindCudaExecutable(binary_name, preferred_cuda_dir, kNoMinimumVersion,
+                            kNoExcludedVersions);
+}
+
 static void LogPtxasTooOld(const std::string& ptxas_path, int cc_major,
                            int cc_minor) {
   using AlreadyLoggedSetTy =
@@ -274,29 +278,28 @@ static void AppendArgsFromOptions(GpuAsmOpts options,
               options.extra_flags.end());
 }
 
-absl::StatusOr<std::array<int64_t, 3>> GetAsmCompilerVersion(
-    const std::string& preferred_cuda_dir) {
+static absl::StatusOr<std::string> FindPtxAsExecutable(
+    std::string_view preferred_cuda_dir) {
+  static constexpr ToolVersion kMinimumSupportedPtxAsVersion{11, 8, 0};
+  static constexpr ToolVersion kBuggyPtxAsVersions[] = {{12, 3, 103}};
+  static constexpr std::string_view kPtxAsBinaryName = "ptxas";
+
+  return FindCudaExecutable(kPtxAsBinaryName, preferred_cuda_dir,
+                            kMinimumSupportedPtxAsVersion, kBuggyPtxAsVersions);
+}
+
+absl::StatusOr<ToolVersion> GetAsmCompilerVersion(
+    std::string_view preferred_cuda_dir) {
   TF_ASSIGN_OR_RETURN(std::string ptxas_path,
-                      FindCudaExecutable("ptxas", preferred_cuda_dir));
+                      FindPtxAsExecutable(preferred_cuda_dir));
   return GetToolVersion(ptxas_path);
 }
 
 absl::StatusOr<std::vector<uint8_t>> CompileGpuAsmUsingPtxAs(
     int cc_major, int cc_minor, const char* ptx_contents, GpuAsmOpts options,
     bool cancel_if_reg_spill) {
-  TF_ASSIGN_OR_RETURN(auto ptxas_version_tuple,
-                      GetAsmCompilerVersion(options.preferred_cuda_dir));
-  if (ptxas_version_tuple == std::array<int64_t, 3>{12, 3, 103}) {
-    return absl::InternalError(absl::StrFormat(
-        "ptxas %d.%d.%d has a bug that we think can affect XLA. "
-        "Please use a different version.",
-        std::get<0>(ptxas_version_tuple), std::get<1>(ptxas_version_tuple),
-        std::get<2>(ptxas_version_tuple)));
-  }
   TF_ASSIGN_OR_RETURN(std::string ptxas_path,
-                      FindCudaExecutable("ptxas", options.preferred_cuda_dir));
-
-  WarnIfBadPtxasVersion(ptxas_path);
+                      FindPtxAsExecutable(options.preferred_cuda_dir));
 
   // Write ptx into a temporary file.
   std::string ptx_path;
diff --git a/third_party/xla/xla/stream_executor/gpu/asm_compiler.h b/third_party/xla/xla/stream_executor/gpu/asm_compiler.h
index 78d785f76041b9..5933a218baca13 100644
--- a/third_party/xla/xla/stream_executor/gpu/asm_compiler.h
+++ b/third_party/xla/xla/stream_executor/gpu/asm_compiler.h
@@ -104,16 +104,25 @@ absl::StatusOr<std::vector<uint8_t>> LinkUsingNvlink(
     absl::string_view preferred_cuda_dir, gpu::GpuContext* context,
     std::vector<CubinOrPTXImage> images);
 
+using ToolVersion = std::array<int64_t, 3>;
+absl::StatusOr<std::string> FindCudaExecutable(
+    std::string_view binary_name, std::string_view preferred_cuda_dir,
+    ToolVersion minimum_version,
+    absl::Span<const ToolVersion> excluded_versions);
+
 absl::StatusOr<std::string> FindCudaExecutable(
     std::string_view binary_name, std::string_view preferred_cuda_dir);
 
 // Runs tool --version and parses its version string.
-using ToolVersion = std::array<int64_t, 3>;
 absl::StatusOr<ToolVersion> GetToolVersion(std::string_view tool_path);
 
-// On NVIDIA GPUs, returns the CUDA toolkit version supported by the driver,
+// On NVIDIA GPUs, returns the version of the ptxas command line tool.
 absl::StatusOr<ToolVersion> GetAsmCompilerVersion(
-    const std::string& preferred_cuda_dir);
+    std::string_view preferred_cuda_dir);
+
+// On NVIDIA GPUs, returns the version of the nvlink command line tool.
+absl::StatusOr<ToolVersion> GetNvLinkVersion(
+    std::string_view preferred_cuda_dir);
 
 #if GOOGLE_CUDA
 // Maintains a cache of pointers to loaded kernels
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_cudamallocasync_allocator.cc b/third_party/xla/xla/stream_executor/gpu/gpu_cudamallocasync_allocator.cc
index a97ebdd56b3e77..c7a7ad403e3aa7 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_cudamallocasync_allocator.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_cudamallocasync_allocator.cc
@@ -32,11 +32,11 @@ limitations under the License.
 #include "absl/strings/str_join.h"
 #include "xla/stream_executor/gpu/gpu_init.h"  // IWYU pragma: keep
 #include "xla/stream_executor/stream_executor.h"  // IWYU pragma: keep
+#include "xla/tsl/util/env_var.h"  // IWYU pragma: keep
 #include "tsl/framework/allocator.h"
 #include "tsl/framework/device_id.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/mutex.h"
-#include "tsl/util/env_var.h"  // IWYU pragma: keep
 
 namespace stream_executor {
 
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_timer.cc b/third_party/xla/xla/stream_executor/gpu/gpu_timer.cc
index 923805afbe0920..ecd3f40c6725c9 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_timer.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_timer.cc
@@ -51,25 +51,10 @@ absl::Duration RandomDuration() {
   return absl::Microseconds(distribution(rng));
 }
 
-bool ShouldLaunchDelayKernel() {
-#if GOOGLE_CUDA
-  // Only launch the delay kernel if CUDA_LAUNCH_BLOCKING is not set to 1.
-  static bool value = [] {
-    const char* blocking = std::getenv("CUDA_LAUNCH_BLOCKING");
-    return !blocking || std::string_view{blocking} != "1";
-  }();
-  return value;
-#elif TENSORFLOW_USE_ROCM
-  return false;
-#endif
-}
-
 }  // namespace
 
 /*deprecated*/ /*static*/ absl::StatusOr<GpuTimer> GpuTimer::Create(
     GpuStream* stream) {
-  // This deprecated factory does not launch the delay kernel and may lead to
-  // reduced measurement accuracy.
   GpuExecutor* parent = stream->parent();
   GpuContext* context = parent->gpu_context();
   GpuEventHandle start_event;
@@ -87,8 +72,6 @@ bool ShouldLaunchDelayKernel() {
 
 /*deprecated*/ /*static*/ absl::StatusOr<std::optional<GpuTimer>>
 GpuTimer::CreateIfNeeded(GpuStream* stream, bool is_needed) {
-  // This deprecated factory does not launch the delay kernel and may lead to
-  // reduced measurement accuracy.
   if (is_needed) {
     TF_ASSIGN_OR_RETURN(GpuTimer t, GpuTimer::Create(stream));
     return {std::make_optional(std::move(t))};
@@ -96,82 +79,16 @@ GpuTimer::CreateIfNeeded(GpuStream* stream, bool is_needed) {
   return std::nullopt;
 }
 
-/*static*/ absl::StatusOr<GpuTimer::GpuSemaphore>
-GpuTimer::GpuSemaphore::Create(StreamExecutor* executor) {
-  // Allocate the value in pinned host memory that can be read from both
-  // host and device.
-  TF_ASSIGN_OR_RETURN(auto alloc,
-                      executor->HostMemoryAllocate(sizeof(GpuSemaphoreState)));
-  return GpuSemaphore{std::move(alloc)};
+[[deprecated("So it can quietly call a deprecated method")]] /*static*/ absl::
+    StatusOr<GpuTimer>
+    GpuTimer::Create(Stream* stream) {
+  return GpuTimer::Create(AsGpuStream(stream));
 }
 
-DeviceMemory<GpuSemaphoreState> GpuTimer::GpuSemaphore::device() {
-  // This assumes unified addressing, as we do not explicitly translate the
-  // host pointer into a device pointer.
-  return DeviceMemory<GpuSemaphoreState>::MakeFromByteSize(
-      ptr_->opaque(), sizeof(GpuSemaphoreState));
-}
-
-/*static*/ absl::StatusOr<GpuTimer> GpuTimer::Create(Stream* real_stream) {
-  StreamExecutor* executor = real_stream->parent();
-  GpuStream* stream = AsGpuStream(real_stream);
-  GpuExecutor* parent = stream->parent();
-  GpuContext* context = parent->gpu_context();
-  GpuEventHandle start_event;
-  TF_RETURN_IF_ERROR(GpuDriver::InitEvent(context, &start_event,
-                                          GpuDriver::EventFlags::kDefault));
-  GpuEventHandle stop_event;
-  TF_RETURN_IF_ERROR(GpuDriver::InitEvent(context, &stop_event,
-                                          GpuDriver::EventFlags::kDefault));
-  CHECK(start_event != nullptr && stop_event != nullptr);
-  GpuSemaphore semaphore{};
-  if (ShouldLaunchDelayKernel()) {
-    // Check the assumption that this device supports unified addressing,
-    // otherwise skip the delay kernel
-#if GOOGLE_CUDA
-    auto attr = CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING;
-#elif TENSORFLOW_USE_ROCM
-    auto attr = hipDeviceAttributeUnifiedAddressing;
-#endif
-    TF_ASSIGN_OR_RETURN(int status, GpuDriver::GetDeviceAttribute(
-                                        attr, parent->device()));
-    if (!status) {
-      LOG(WARNING) << "Skipping the delay kernel because the device does not "
-                      "support unified addressing";
-    } else {
-      // Allocate a semaphore value that will be used to signal to the delay
-      // kernel that it may exit.
-      TF_ASSIGN_OR_RETURN(semaphore, GpuSemaphore::Create(executor));
-      *semaphore = GpuSemaphoreState::Hold;
-      // In principle the kernel could be loaded lazily and shared across
-      // multiple GpuTimer objects.
-      TF_ASSIGN_OR_RETURN(
-          auto kernel,
-          (TypedKernel<DeviceMemory<GpuSemaphoreState>,
-                       GpuSemaphoreState>::Create(executor, "DelayKernel",
-                                                  delay_kernel::kernel())));
-      // Launch a delay kernel into this stream, which will spin until
-      // GetElapsedDuration() is called, the timer is destroyed, or the timeout
-      // in the kernel is reached.
-      TF_RETURN_IF_ERROR(real_stream->ThenLaunch(
-          ThreadDim(1, 1, 1), BlockDim(1, 1, 1), kernel, semaphore.device(),
-          GpuSemaphoreState::Release));
-    }
-  }
-  // The start event goes after the delay kernel in the stream
-  TF_RETURN_IF_ERROR(GpuDriver::RecordEvent(parent->gpu_context(), start_event,
-                                            stream->gpu_stream()));
-  return absl::StatusOr<GpuTimer>{absl::in_place, parent, start_event,
-                                  stop_event,     stream, std::move(semaphore)};
-}
-
-/*static*/ absl::StatusOr<std::optional<GpuTimer>> GpuTimer::CreateIfNeeded(
-    Stream* stream, bool is_needed) {
-  if (is_needed) {
-    TF_ASSIGN_OR_RETURN(GpuTimer t, GpuTimer::Create(stream));
-    return {std::make_optional(std::move(t))};
-  }
-  return std::nullopt;
+[[deprecated("So it can quietly call a deprecated method")]] /*static*/ absl::
+    StatusOr<std::optional<GpuTimer>>
+    GpuTimer::CreateIfNeeded(Stream* stream, bool is_needed) {
+  return GpuTimer::CreateIfNeeded(AsGpuStream(stream), is_needed);
 }
 
 /*static*/ void GpuTimer::ReturnRandomDurationsForTesting() {
@@ -180,17 +97,6 @@ DeviceMemory<GpuSemaphoreState> GpuTimer::GpuSemaphore::device() {
 
 GpuTimer::~GpuTimer() {
   GpuContext* context = parent_->gpu_context();
-  if (semaphore_ && !is_stopped_) {
-    // Signal the delay kernel that it can exit
-    *semaphore_ = GpuSemaphoreState::Release;
-    // Wait for the delay kernel to exit before destroying the value that it is
-    // watching.
-    absl::Status status =
-        GpuDriver::SynchronizeStream(context, stream_->gpu_stream());
-    if (!status.ok()) {
-      LOG(ERROR) << status;
-    }
-  }
   if (start_event_ != nullptr) {
     absl::Status status = GpuDriver::DestroyEvent(context, &start_event_);
     if (!status.ok()) {
@@ -211,18 +117,6 @@ absl::StatusOr<absl::Duration> GpuTimer::GetElapsedDuration() {
   }
   TF_RETURN_IF_ERROR(GpuDriver::RecordEvent(parent_->gpu_context(), stop_event_,
                                             stream_->gpu_stream()));
-  // If we launched the delay kernel then check if it already timed out.
-  if (semaphore_) {
-    if (*semaphore_ == GpuSemaphoreState::TimedOut) {
-      // The delay kernel did not achieve the intended result.
-      LOG(ERROR) << "Delay kernel timed out: measured time has sub-optimal "
-                    "accuracy. There may be a missing warmup execution, please "
-                    "investigate in Nsight Systems.";
-    } else {
-      // Signal that the kernel can exit
-      *semaphore_ = GpuSemaphoreState::Release;
-    }
-  }
   float elapsed_milliseconds = NAN;
   if (!GpuDriver::GetEventElapsedTime(parent_->gpu_context(),
                                       &elapsed_milliseconds, start_event_,
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_timer.h b/third_party/xla/xla/stream_executor/gpu/gpu_timer.h
index 251c77ec7ee1ea..8fd83bec6499e3 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_timer.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_timer.h
@@ -22,7 +22,6 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/time/time.h"
 #include "xla/stream_executor/gpu/gpu_executor.h"
-#include "xla/stream_executor/gpu/gpu_timer_kernel.h"
 #include "xla/stream_executor/gpu/gpu_types.h"
 
 namespace xla {
@@ -37,29 +36,9 @@ namespace gpu {
 class GpuExecutor;
 class GpuStream;
 
-// When a timer is created it launches a delay kernel into the given stream and
-// queues a start event immediately afterwards. This delay kernel blocks
-// execution on the stream until GetElapsedDuration() is called, at which point
-// an end event is queued and the delay kernel exits. This allows the device
-// execution time of the tasks queued to the stream while the timer is active
-// to be measured more accurately.
+// Timer is started once it's created, and is stopped once read.
 class GpuTimer {
  public:
-  class GpuSemaphore {
-   public:
-    GpuSemaphore() = default;
-    static absl::StatusOr<GpuSemaphore> Create(StreamExecutor* executor);
-    explicit operator bool() const { return bool{ptr_}; }
-    GpuSemaphoreState& operator*() {
-      return *static_cast<GpuSemaphoreState*>(ptr_->opaque());
-    }
-    DeviceMemory<GpuSemaphoreState> device();
-
-   private:
-    explicit GpuSemaphore(std::unique_ptr<HostMemoryAllocation> alloc)
-        : ptr_{std::move(alloc)} {}
-    std::unique_ptr<HostMemoryAllocation> ptr_;
-  };
   static absl::StatusOr<GpuTimer> Create(Stream* stream);
   [[deprecated("Pass Stream* not GpuStream*")]] static absl::StatusOr<GpuTimer>
   Create(GpuStream* stream);
@@ -74,20 +53,17 @@ class GpuTimer {
   CreateIfNeeded(GpuStream* stream, bool is_needed);
 
   explicit GpuTimer(GpuExecutor* parent, GpuEventHandle start_event,
-                    GpuEventHandle stop_event, GpuStream* stream,
-                    GpuSemaphore semaphore = {})
+                    GpuEventHandle stop_event, GpuStream* stream)
       : parent_(parent),
         start_event_(start_event),
         stop_event_(stop_event),
-        stream_(stream),
-        semaphore_(std::move(semaphore)) {}
+        stream_(stream) {}
 
   GpuTimer(GpuTimer&& other)
       : parent_(other.parent_),
         start_event_(std::exchange(other.start_event_, nullptr)),
         stop_event_(std::exchange(other.stop_event_, nullptr)),
-        stream_(other.stream_),
-        semaphore_(std::move(other.semaphore_)) {}
+        stream_(other.stream_) {}
 
   GpuTimer& operator=(GpuTimer&& other) {
     if (this != &other) {
@@ -95,7 +71,6 @@ class GpuTimer {
       start_event_ = std::exchange(other.start_event_, nullptr);
       stop_event_ = std::exchange(other.stop_event_, nullptr);
       stream_ = other.stream_;
-      semaphore_ = std::move(other.semaphore_);
     }
     return *this;
   }
@@ -111,7 +86,6 @@ class GpuTimer {
   GpuEventHandle start_event_ = nullptr;
   GpuEventHandle stop_event_ = nullptr;
   GpuStream* stream_;
-  GpuSemaphore semaphore_;
   bool is_stopped_ = false;
 
   GpuTimer(const GpuTimer&) = delete;
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel.cu.cc b/third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel.cu.cc
deleted file mode 100644
index 0ce4b1d9fbb323..00000000000000
--- a/third_party/xla/xla/stream_executor/gpu/gpu_timer_kernel.cu.cc
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2024 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "xla/stream_executor/gpu/gpu_timer_kernel.h"
-
-#include <cstddef>
-
-namespace stream_executor::gpu {
-namespace {
-// Wait for the value pointed to by `semaphore` to have value `target`, timing
-// out after approximately `APPROX_TIMEOUT_SECONDS` seconds if that value is
-// not reached. This can happen if, for example, blocking launches are enabled
-// via CUDA_LAUNCH_BLOCKING=1. It can also happen if launching a kernel after
-// this delay kernel causes synchronisation, e.g. because of lazy loading.
-__global__ void DelayKernel(volatile GpuSemaphoreState* semaphore,
-                            GpuSemaphoreState target) {
-  constexpr int64_t WAIT_CYCLES{1024};
-  constexpr int64_t TIMEOUT_CYCLES{200000000};  // 100ms at 2GHz
-  const int64_t tstart{clock64()};
-  bool target_not_reached;
-  while ((target_not_reached = (*semaphore != target)) &&
-         (clock64() - tstart) < TIMEOUT_CYCLES) {
-    int64_t elapsed{};
-    const int64_t t0{clock64()};
-    do {
-      elapsed = clock64() - t0;
-    } while (elapsed < WAIT_CYCLES);
-  }
-  if (target_not_reached) {
-    // We are exiting due to the timeout. Signal this back to the host so that
-    // we can emit a warning, as it probably indicates suboptimal usage.
-    *semaphore = GpuSemaphoreState::TimedOut;
-  }
-}
-}  // namespace
-
-namespace delay_kernel {
-void* kernel() { return reinterpret_cast<void*>(DelayKernel); }
-}  // namespace delay_kernel
-
-}  // namespace stream_executor::gpu
diff --git a/third_party/xla/xla/stream_executor/host/BUILD b/third_party/xla/xla/stream_executor/host/BUILD
index 4909b2d890dd88..04edca1c9e225a 100644
--- a/third_party/xla/xla/stream_executor/host/BUILD
+++ b/third_party/xla/xla/stream_executor/host/BUILD
@@ -1,10 +1,10 @@
 # Description:
 #   Host-platform specific StreamExecutor support code.
 
-load("//xla:xla.bzl", "xla_cc_test")
-load("//xla/stream_executor:build_defs.bzl", "stream_executor_friends")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
+load("//xla/stream_executor:build_defs.bzl", "stream_executor_friends")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -40,7 +40,7 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
-        ":host_gpu_executor",
+        ":host_executor",
         ":host_platform_id",
         "//xla/stream_executor",
         "//xla/stream_executor:platform_manager",
@@ -76,14 +76,47 @@ cc_library(
     ],
 )
 
-# TODO(22689637): Rename this target.
 cc_library(
-    name = "host_gpu_executor",
+    name = "host_kernel_c_api",
+    hdrs = ["host_kernel_c_api.h"],
+)
+
+cc_library(
+    name = "host_kernel",
+    srcs = ["host_kernel.cc"],
+    hdrs = ["host_kernel.h"],
+    deps = [
+        ":host_kernel_c_api",
+        "//xla/stream_executor",
+        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:stream_executor_internal",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+xla_cc_test(
+    name = "host_kernel_test",
+    srcs = ["host_kernel_test.cc"],
+    deps = [
+        ":host_kernel",
+        ":host_kernel_c_api",
+        "//xla/stream_executor",
+        "//xla/stream_executor:device_memory",
+        "@local_tsl//tsl/lib/core:status_test_util",
+        "@local_tsl//tsl/platform:test",
+        "@local_tsl//tsl/platform:test_main",
+    ],
+)
+
+cc_library(
+    name = "host_executor",
     srcs = [
-        "host_gpu_executor.cc",
+        "host_executor.cc",
     ],
     hdrs = [
-        "host_gpu_executor.h",
+        "host_executor.h",
     ],
     deps = [
         ":host_stream",
diff --git a/third_party/xla/xla/stream_executor/host/host_gpu_executor.cc b/third_party/xla/xla/stream_executor/host/host_executor.cc
similarity index 99%
rename from third_party/xla/xla/stream_executor/host/host_gpu_executor.cc
rename to third_party/xla/xla/stream_executor/host/host_executor.cc
index 13f8ec2e0bca9c..33d54a16f3fee1 100644
--- a/third_party/xla/xla/stream_executor/host/host_gpu_executor.cc
+++ b/third_party/xla/xla/stream_executor/host/host_executor.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // Implementation of HostExecutor class [of those methods not defined in the
 // class declaration].
-#include "xla/stream_executor/host/host_gpu_executor.h"
+#include "xla/stream_executor/host/host_executor.h"
 
 #include <stdint.h>
 #include <string.h>
diff --git a/third_party/xla/xla/stream_executor/host/host_gpu_executor.h b/third_party/xla/xla/stream_executor/host/host_executor.h
similarity index 97%
rename from third_party/xla/xla/stream_executor/host/host_gpu_executor.h
rename to third_party/xla/xla/stream_executor/host/host_executor.h
index 0c86d7080c8755..6123e227591fa8 100644
--- a/third_party/xla/xla/stream_executor/host/host_gpu_executor.h
+++ b/third_party/xla/xla/stream_executor/host/host_executor.h
@@ -16,8 +16,8 @@ limitations under the License.
 // Declares the HostExecutor class, which is a CPU-only implementation of
 // the StreamExecutor interface. For now, this is used for testing and to
 // examine the performance of host-based StreamExecutor code.
-#ifndef XLA_STREAM_EXECUTOR_HOST_HOST_GPU_EXECUTOR_H_
-#define XLA_STREAM_EXECUTOR_HOST_HOST_GPU_EXECUTOR_H_
+#ifndef XLA_STREAM_EXECUTOR_HOST_HOST_EXECUTOR_H_
+#define XLA_STREAM_EXECUTOR_HOST_HOST_EXECUTOR_H_
 
 #include <cstddef>
 #include <cstdint>
@@ -147,4 +147,4 @@ class HostExecutor : public internal::StreamExecutorInterface {
 }  // namespace host
 }  // namespace stream_executor
 
-#endif  // XLA_STREAM_EXECUTOR_HOST_HOST_GPU_EXECUTOR_H_
+#endif  // XLA_STREAM_EXECUTOR_HOST_HOST_EXECUTOR_H_
diff --git a/third_party/xla/xla/stream_executor/host/host_kernel.cc b/third_party/xla/xla/stream_executor/host/host_kernel.cc
new file mode 100644
index 00000000000000..e2d37085a1c756
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/host/host_kernel.cc
@@ -0,0 +1,68 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/stream_executor/host/host_kernel.h"
+
+#include <cstdint>
+#include <vector>
+
+#include "absl/status/status.h"
+#include "absl/types/span.h"
+#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/host/host_kernel_c_api.h"
+#include "xla/stream_executor/launch_dim.h"
+
+namespace stream_executor::host {
+
+HostKernel::HostKernel(unsigned arity, SE_HOST_Kernel* kernel)
+    : arity_(arity), kernel_(kernel) {}
+
+absl::Status HostKernel::Launch(const ThreadDim& thread_dims,
+                                absl::Span<const DeviceMemoryBase> buffers) {
+  SE_HOST_KernelThreadDim kernel_thread_dims = {thread_dims.x, thread_dims.y,
+                                                thread_dims.z};
+
+  // Convert buffers to kernel arguments.
+  std::vector<SE_HOST_KernelArg> args(buffers.size());
+  for (int32_t i = 0; i < buffers.size(); ++i) {
+    args[i].data = const_cast<void*>(buffers[i].opaque());
+    args[i].size = buffers[i].size();
+  }
+
+  // TODO(b/331430625): We should be using thread pool to call kernel function
+  // for different threads (blocks) concurrently. For now it's the most trivial
+  // implementation that runs tasks sequentially.
+
+  for (uint64_t z = 0; z < thread_dims.z; ++z) {
+    for (uint64_t y = 0; y < thread_dims.y; ++y) {
+      for (uint64_t x = 0; x < thread_dims.x; ++x) {
+        SE_HOST_KernelThread kernel_thread = {x, y, z};
+
+        SE_HOST_KernelCallFrame call_frame = {
+            &kernel_thread_dims, &kernel_thread, args.size(), args.data()};
+
+        SE_HOST_KernelError* error = (*kernel_)(&call_frame);
+
+        if (error != nullptr) {
+          return absl::InternalError("Failed to call host kernel");
+        }
+      }
+    }
+  }
+
+  return absl::OkStatus();
+}
+
+}  // namespace stream_executor::host
diff --git a/third_party/xla/xla/stream_executor/host/host_kernel.h b/third_party/xla/xla/stream_executor/host/host_kernel.h
new file mode 100644
index 00000000000000..ee8f67738bf08c
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/host/host_kernel.h
@@ -0,0 +1,58 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_STREAM_EXECUTOR_HOST_HOST_KERNEL_H_
+#define XLA_STREAM_EXECUTOR_HOST_HOST_KERNEL_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/types/span.h"
+#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/host/host_kernel_c_api.h"
+#include "xla/stream_executor/kernel.h"
+#include "xla/stream_executor/launch_dim.h"
+
+namespace stream_executor::host {
+
+class HostKernel : public Kernel {
+ public:
+  HostKernel(unsigned arity, SE_HOST_Kernel* kernel);
+
+  // TODO(b/331430625): Connect this API to Launch API defined at StreamExecutor
+  // level, which requires refactoring how arguments passed to kernels, as
+  // current KernelArgs structure tied to the GPU kernel ABI.
+  absl::Status Launch(const ThreadDim& thread_dims,
+                      absl::Span<const DeviceMemoryBase> buffers);
+
+  // For host platform, we assume that a core is a thread, and we can run at
+  // most one instance of a kernel on a given thread.
+  absl::StatusOr<int32_t> GetMaxOccupiedBlocksPerCore(ThreadDim,
+                                                      size_t) const override {
+    return 1;
+  };
+
+  unsigned Arity() const override { return arity_; };
+
+ private:
+  unsigned arity_;
+  SE_HOST_Kernel* kernel_ = nullptr;
+};
+
+}  // namespace stream_executor::host
+
+#endif  // XLA_STREAM_EXECUTOR_HOST_HOST_KERNEL_H_
diff --git a/third_party/xla/xla/stream_executor/host/host_kernel_c_api.h b/third_party/xla/xla/stream_executor/host/host_kernel_c_api.h
new file mode 100644
index 00000000000000..6768706abc2800
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/host/host_kernel_c_api.h
@@ -0,0 +1,88 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_STREAM_EXECUTOR_HOST_HOST_KERNEL_C_API_H_
+#define XLA_STREAM_EXECUTOR_HOST_HOST_KERNEL_C_API_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+//===----------------------------------------------------------------------===//
+// StreamExecutor Host Kernel API
+//===----------------------------------------------------------------------===//
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// StreamExecutor host kernel API is an integration point between a codegen
+// backend and a runtime. XLA:CPU backend compiles fusion regions to native
+// functions (via LLVM backend) that are compatible with a kernel API (and ABI),
+// and the runtime is simply invoking them with user buffers and orchestrates
+// multi-threaded execution.
+
+// WARNING: This API does not provide any backward compatibility guarantees as
+// today XLA:CPU backend is statically linked and we do not plan to load
+// kernels from dynamic libraries. It's defined as C API because we have to
+// match it in the codegen backend (built on top of LLVM) and C structs have
+// trivial layout that can be expressed as llvm stuct (*).
+//
+// (*) https://llvm.org/docs/LangRef.html#structure-types
+
+// Similar to a Gpu backend an XLA:CPU compiler generates a tiled function from
+// an HLO fusion where each tile is responsible for computing a part of the
+// output. It's up to compiler to chose the tiling strategy, from StreamExecutor
+// perspective it's simply an iteration space where each task is independent and
+// can be executed concurrently.
+typedef struct SE_HOST_KernelDim3 {
+  uint64_t x;
+  uint64_t y;
+  uint64_t z;
+} SE_HOST_KernelDim3;
+
+// Kernel grid size roughly corresponds to a CUDA block size.
+typedef struct SE_HOST_KernelDim3 SE_HOST_KernelThreadDim;
+
+// Kernel grid coordinate roughly corresponds to a CUDA block, with an
+// assumption that all kernel invocations can run concurrently.
+typedef struct SE_HOST_KernelDim3 SE_HOST_KernelThread;
+
+// A CPU kernel argument that corresponds to se::DeviceMemoryBase.
+typedef struct SE_HOST_KernelArg {
+  void* data;
+  size_t size;
+} SE_HOST_KernelArg;
+
+// A CPU kernel call frame.
+typedef struct SE_HOST_KernelCallFrame {
+  SE_HOST_KernelThreadDim* thread_dims;
+  SE_HOST_KernelThread* thread;
+
+  size_t num_args;
+  SE_HOST_KernelArg* args;
+} SE_HOST_KernelCallFrame;
+
+// Error reporting for host kernels. NULL means success.
+typedef struct SE_HOST_KernelError SE_HOST_KernelError;
+
+// Host kernel API.
+typedef SE_HOST_KernelError* SE_HOST_Kernel(
+    const SE_HOST_KernelCallFrame* call_frame);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // XLA_STREAM_EXECUTOR_HOST_HOST_KERNEL_C_API_H_
diff --git a/third_party/xla/xla/stream_executor/host/host_kernel_test.cc b/third_party/xla/xla/stream_executor/host/host_kernel_test.cc
new file mode 100644
index 00000000000000..6bf3439d2e95e9
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/host/host_kernel_test.cc
@@ -0,0 +1,62 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/stream_executor/host/host_kernel.h"
+
+#include <cstdint>
+#include <vector>
+
+#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/host/host_kernel_c_api.h"
+#include "xla/stream_executor/launch_dim.h"
+#include "tsl/lib/core/status_test_util.h"
+#include "tsl/platform/test.h"
+
+namespace stream_executor::host {
+
+static SE_HOST_KernelError* AddI32(const SE_HOST_KernelCallFrame* call_frame) {
+  SE_HOST_KernelArg& lhs = call_frame->args[0];
+  SE_HOST_KernelArg& rhs = call_frame->args[1];
+  SE_HOST_KernelArg& out = call_frame->args[2];
+
+  int32_t* lhs_ptr = reinterpret_cast<int32_t*>(lhs.data);
+  int32_t* rhs_ptr = reinterpret_cast<int32_t*>(rhs.data);
+  int32_t* out_ptr = reinterpret_cast<int32_t*>(out.data);
+
+  uint64_t x = call_frame->thread->x;
+  *(out_ptr + x) = *(lhs_ptr + x) + *(rhs_ptr + x);
+
+  return nullptr;
+}
+
+TEST(HostKernelTest, Addition) {
+  HostKernel kernel(/*arity=*/3, AddI32);
+
+  std::vector<int32_t> lhs = {1, 2, 3, 4};
+  std::vector<int32_t> rhs = {5, 6, 7, 8};
+  std::vector<int32_t> out = {0, 0, 0, 0};
+
+  DeviceMemoryBase lhs_mem(lhs.data(), lhs.size() * sizeof(int32_t));
+  DeviceMemoryBase rhs_mem(rhs.data(), rhs.size() * sizeof(int32_t));
+  DeviceMemoryBase out_mem(out.data(), out.size() * sizeof(int32_t));
+  std::vector<DeviceMemoryBase> args = {lhs_mem, rhs_mem, out_mem};
+
+  TF_ASSERT_OK(kernel.Launch(ThreadDim(4), args));
+
+  std::vector<int32_t> expected = {6, 8, 10, 12};
+  EXPECT_EQ(out, expected);
+}
+
+}  // namespace stream_executor::host
diff --git a/third_party/xla/xla/stream_executor/host/host_platform.cc b/third_party/xla/xla/stream_executor/host/host_platform.cc
index 58771670d0b6a1..23112fbecd51aa 100644
--- a/third_party/xla/xla/stream_executor/host/host_platform.cc
+++ b/third_party/xla/xla/stream_executor/host/host_platform.cc
@@ -24,7 +24,7 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/str_format.h"
 #include "xla/stream_executor/device_description.h"
-#include "xla/stream_executor/host/host_gpu_executor.h"
+#include "xla/stream_executor/host/host_executor.h"
 #include "xla/stream_executor/host/host_platform_id.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/platform/initialize.h"
diff --git a/third_party/xla/xla/stream_executor/integrations/BUILD b/third_party/xla/xla/stream_executor/integrations/BUILD
index d52d09e853b058..1b3682d9cce5eb 100644
--- a/third_party/xla/xla/stream_executor/integrations/BUILD
+++ b/third_party/xla/xla/stream_executor/integrations/BUILD
@@ -1,8 +1,8 @@
-load("//xla:xla.bzl", "xla_cc_test")
-load("//xla/stream_executor:build_defs.bzl", "stream_executor_friends")
 load("@local_tsl//tsl:tsl.bzl", "if_google", "internal_visibility")
 load("@local_tsl//tsl:tsl.default.bzl", "filegroup")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
+load("//xla/stream_executor:build_defs.bzl", "stream_executor_friends")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/stream_executor/kernel.h b/third_party/xla/xla/stream_executor/kernel.h
index a88cc59071d099..edf0e24b31a119 100644
--- a/third_party/xla/xla/stream_executor/kernel.h
+++ b/third_party/xla/xla/stream_executor/kernel.h
@@ -312,6 +312,14 @@ class TypedKernel {
                                             absl::string_view kernel_name,
                                             void *symbol);
 
+  // Creates a kernel which can be launched with `stream.ThenLaunch(...)` from
+  // an LLVM IR.
+  static absl::StatusOr<TypedKernel> Create(StreamExecutor *executor,
+                                            absl::string_view ir,
+                                            absl::string_view entrypoint,
+                                            absl::string_view kernel_name,
+                                            absl::Span<std::string> options);
+
   TypedKernel() = default;
 
   Kernel &operator*() { return *kernel_; }
@@ -757,6 +765,17 @@ inline absl::StatusOr<TypedKernel<Args...>> TypedKernel<Args...>::Create(
   return TypedKernel<Args...>::Create(executor, loader_spec);
 }
 
+template <typename... Args>
+inline absl::StatusOr<TypedKernel<Args...>> TypedKernel<Args...>::Create(
+    StreamExecutor *executor, absl::string_view ir,
+    absl::string_view entrypoint, absl::string_view kernel_name,
+    absl::Span<std::string> options) {
+  MultiKernelLoaderSpec loader_spec(TypedKernel<Args...>::kNumberOfParameters);
+  loader_spec.AddLlvmHostKernel(ir, entrypoint, kernel_name, options);
+
+  return TypedKernel<Args...>::Create(executor, loader_spec);
+}
+
 }  // namespace stream_executor
 
 #endif  // XLA_STREAM_EXECUTOR_KERNEL_H_
diff --git a/third_party/xla/xla/stream_executor/kernel_spec.cc b/third_party/xla/xla/stream_executor/kernel_spec.cc
index fe29fabe52643c..5f7077e991bbbc 100644
--- a/third_party/xla/xla/stream_executor/kernel_spec.cc
+++ b/third_party/xla/xla/stream_executor/kernel_spec.cc
@@ -59,6 +59,15 @@ CudaPtxInMemory::CudaPtxInMemory(
   }
 }
 
+LlvmHostKernel::LlvmHostKernel(absl::string_view ir,
+                               absl::string_view entrypoint,
+                               absl::string_view kernel_name,
+                               absl::Span<std::string> options)
+    : KernelLoaderSpec(std::move(kernel_name)),
+      ir_(ir),
+      entrypoint_(entrypoint),
+      options_(options.cbegin(), options.cend()) {}
+
 const char *CudaPtxInMemory::default_text() const {
   if (ptx_by_compute_capability_.empty()) {
     return nullptr;
@@ -84,7 +93,7 @@ MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddInProcessSymbol(
     void *symbol, absl::string_view kernel_name) {
   CHECK(in_process_symbol_ == nullptr);
   in_process_symbol_ =
-      std::make_unique<InProcessSymbol>(symbol, std::string(kernel_name));
+      std::make_shared<InProcessSymbol>(symbol, std::string(kernel_name));
   return this;
 }
 
@@ -102,6 +111,15 @@ MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory(
   return this;
 }
 
+MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddLlvmHostKernel(
+    absl::string_view ir, absl::string_view entrypoint,
+    absl::string_view kernel_name, absl::Span<std::string> options) {
+  CHECK(llvm_host_kernel_ == nullptr);
+  llvm_host_kernel_ =
+      std::make_shared<LlvmHostKernel>(ir, entrypoint, kernel_name, options);
+  return this;
+}
+
 MultiKernelLoaderSpec::MultiKernelLoaderSpec(
     size_t arity, KernelArgsPacking kernel_args_packing)
     : arity_(arity), kernel_args_packing_(std::move(kernel_args_packing)) {}
diff --git a/third_party/xla/xla/stream_executor/kernel_spec.h b/third_party/xla/xla/stream_executor/kernel_spec.h
index aa75e4a7b7454e..d50ac23713dc5e 100644
--- a/third_party/xla/xla/stream_executor/kernel_spec.h
+++ b/third_party/xla/xla/stream_executor/kernel_spec.h
@@ -175,6 +175,25 @@ class CudaCubinInMemory : public KernelLoaderSpec {
   void operator=(const CudaCubinInMemory &) = delete;
 };
 
+class LlvmHostKernel : public KernelLoaderSpec {
+ public:
+  LlvmHostKernel(absl::string_view ir, absl::string_view entrypoint,
+                 absl::string_view kernel_name,
+                 absl::Span<std::string> options);
+
+  absl::string_view ir() const { return ir_; }
+  absl::string_view entrypoint() const { return entrypoint_; }
+  absl::Span<const std::string> options() const { return options_; }
+
+ private:
+  std::string ir_;
+  std::string entrypoint_;
+  std::vector<std::string> options_;
+
+  LlvmHostKernel(const LlvmHostKernel &) = delete;
+  void operator=(const LlvmHostKernel &) = delete;
+};
+
 // Describes how to load a kernel on any subset of a number of target platforms.
 class MultiKernelLoaderSpec {
  public:
@@ -199,6 +218,7 @@ class MultiKernelLoaderSpec {
     return cuda_cubin_in_memory_ != nullptr;
   }
   bool has_cuda_ptx_in_memory() const { return cuda_ptx_in_memory_ != nullptr; }
+  bool has_llvm_host_kernel() const { return llvm_host_kernel_ != nullptr; }
 
   // Accessors for platform variant kernel load specifications.
   // Precondition: corresponding has_* is true.
@@ -214,6 +234,10 @@ class MultiKernelLoaderSpec {
     CHECK(has_cuda_ptx_in_memory());
     return *cuda_ptx_in_memory_;
   }
+  const LlvmHostKernel &llvm_host_kernel() const {
+    CHECK(has_llvm_host_kernel());
+    return *llvm_host_kernel_;
+  }
   // Builder-pattern-like methods for use in initializing a
   // MultiKernelLoaderSpec. Each of these should be used at most once for a
   // single MultiKernelLoaderSpec object. See file comment for example usage.
@@ -227,6 +251,10 @@ class MultiKernelLoaderSpec {
       absl::Span<const uint8_t> cubin_bytes, absl::string_view kernel_name);
   MultiKernelLoaderSpec *AddCudaPtxInMemory(absl::string_view ptx,
                                             absl::string_view kernel_name);
+  MultiKernelLoaderSpec *AddLlvmHostKernel(absl::string_view ir,
+                                           absl::string_view entrypoint,
+                                           absl::string_view kernel_name,
+                                           absl::Span<std::string> options);
 
   const KernelArgsPacking &kernel_args_packing() const {
     return kernel_args_packing_;
@@ -239,6 +267,8 @@ class MultiKernelLoaderSpec {
       cuda_cubin_in_memory_;  // Binary CUDA program in memory.
   std::shared_ptr<CudaPtxInMemory>
       cuda_ptx_in_memory_;  // PTX text that resides in memory.
+  std::shared_ptr<LlvmHostKernel>
+      llvm_host_kernel_;  // LLVM kernel for host execution.
 
   // Number of parameters that the kernel takes. (This is nicer to have in a
   // constexpr than having to determine it from the types via template
diff --git a/third_party/xla/xla/stream_executor/platform/BUILD b/third_party/xla/xla/stream_executor/platform/BUILD
index cd73a976be545b..6754de3ebe3dda 100644
--- a/third_party/xla/xla/stream_executor/platform/BUILD
+++ b/third_party/xla/xla/stream_executor/platform/BUILD
@@ -1,7 +1,7 @@
-load("//xla/stream_executor:build_defs.bzl", "stream_executor_friends")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load("@local_tsl//tsl/platform:build_config.bzl", "tf_stream_executor_deps")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla/stream_executor:build_defs.bzl", "stream_executor_friends")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/stream_executor/plugin_registry.cc b/third_party/xla/xla/stream_executor/plugin_registry.cc
index 7eb0aae78ad280..8e5f772dbf603f 100644
--- a/third_party/xla/xla/stream_executor/plugin_registry.cc
+++ b/third_party/xla/xla/stream_executor/plugin_registry.cc
@@ -80,17 +80,19 @@ absl::Status PluginRegistry::RegisterFactoryInternal(
 bool PluginRegistry::HasFactory(Platform::Id platform_id,
                                 PluginKind plugin_kind) const {
   auto iter = factories_.find(platform_id);
-  if (iter != factories_.end()) {
-    switch (plugin_kind) {
-      case PluginKind::kBlas:
-        return iter->second.blas.has_value();
-      case PluginKind::kDnn:
-        return iter->second.dnn.has_value();
-      case PluginKind::kFft:
-        return iter->second.fft.has_value();
-      default:
-        break;
-    }
+  if (iter == factories_.end()) {
+    return false;
+  }
+
+  switch (plugin_kind) {
+    case PluginKind::kBlas:
+      return iter->second.blas.has_value();
+    case PluginKind::kDnn:
+      return iter->second.dnn.has_value();
+    case PluginKind::kFft:
+      return iter->second.fft.has_value();
+    default:
+      break;
   }
 
   LOG(ERROR) << "Invalid plugin kind specified: "
diff --git a/third_party/xla/xla/stream_executor/rocm/BUILD b/third_party/xla/xla/stream_executor/rocm/BUILD
index e71a76b2462236..c2ad1f56fb9974 100644
--- a/third_party/xla/xla/stream_executor/rocm/BUILD
+++ b/third_party/xla/xla/stream_executor/rocm/BUILD
@@ -1,5 +1,6 @@
 # Description:
 #   ROCm-platform specific StreamExecutor support code.
+# buildifier: disable=out-of-order-load
 
 load(
     "//xla/stream_executor:build_defs.bzl",
@@ -228,7 +229,7 @@ cc_library(
         "//xla/stream_executor/platform",
         "//xla/stream_executor/platform:dso_loader",
         "@local_tsl//tsl/platform:env",
-        "@local_tsl//tsl/util:determinism_for_kernels",
+        "//xla/tsl/util:determinism_for_kernels",
     ]),
     alwayslink = True,
 )
@@ -347,8 +348,8 @@ cc_library(
         "@com_google_absl//absl/types:span",
         "@local_config_rocm//rocm:rocm_headers",
         "@local_tsl//tsl/platform:env",
-        "@local_tsl//tsl/util:env_var",
-        "@local_tsl//tsl/util:determinism_for_kernels",
+        "//xla/tsl/util:env_var",
+        "//xla/tsl/util:determinism_for_kernels",
     ]),
     alwayslink = True,
 )
diff --git a/third_party/xla/xla/stream_executor/rocm/hipsparse_wrapper.h b/third_party/xla/xla/stream_executor/rocm/hipsparse_wrapper.h
index a5efdfa3f03090..b4bcc7d8f3944e 100644
--- a/third_party/xla/xla/stream_executor/rocm/hipsparse_wrapper.h
+++ b/third_party/xla/xla/stream_executor/rocm/hipsparse_wrapper.h
@@ -56,8 +56,8 @@ namespace wrap {
     }                                                                          \
     static FuncPtrT LoadOrDie() {                                              \
       void* f;                                                                 \
-      auto s = tsl::Env::Default()                                             \
-          -> GetSymbolFromLibrary(GetDsoHandle(), kName, &f);                  \
+      auto s = tsl::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(),       \
+                                                         kName, &f);           \
       CHECK(s.ok()) << "could not find " << kName                              \
                     << " in miopen DSO; dlerror: " << s.message();             \
       return reinterpret_cast<FuncPtrT>(f);                                    \
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_blas.cc b/third_party/xla/xla/stream_executor/rocm/rocm_blas.cc
index eae41d7d584851..5de18b8557094c 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_blas.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_blas.cc
@@ -40,8 +40,8 @@ limitations under the License.
 #include "xla/stream_executor/rocm/rocm_platform_id.h"
 #include "xla/stream_executor/scratch_allocator.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "xla/tsl/util/determinism.h"
 #include "tsl/platform/logging.h"
-#include "tsl/util/determinism.h"
 using tsl::OpDeterminismRequired;
 
 namespace stream_executor {
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
index 90b60c0af8c9af..bee0eb58895369 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.cc
@@ -43,12 +43,12 @@ limitations under the License.
 #include "xla/stream_executor/scratch_allocator.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "xla/tsl/util/determinism.h"
+#include "xla/tsl/util/env_var.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/hash.h"
 #include "tsl/platform/logging.h"
-#include "tsl/util/determinism.h"
-#include "tsl/util/env_var.h"
 #include "rocm/rocm_config.h"
 
 #include <hip/hip_fp16.h>
diff --git a/third_party/xla/xla/stream_executor/stream_executor_pimpl.cc b/third_party/xla/xla/stream_executor/stream_executor_pimpl.cc
index e7ff3bb831df8a..12d2d0a9b33c25 100644
--- a/third_party/xla/xla/stream_executor/stream_executor_pimpl.cc
+++ b/third_party/xla/xla/stream_executor/stream_executor_pimpl.cc
@@ -54,12 +54,12 @@ limitations under the License.
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor_internal.h"
+#include "xla/tsl/util/env_var.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/numbers.h"
 #include "tsl/platform/stacktrace.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
-#include "tsl/util/env_var.h"
 
 namespace stream_executor {
 namespace {
diff --git a/third_party/xla/xla/stream_executor/tpu/BUILD b/third_party/xla/xla/stream_executor/tpu/BUILD
index 6b79aa1b395467..4f26d6c6b4fcf5 100644
--- a/third_party/xla/xla/stream_executor/tpu/BUILD
+++ b/third_party/xla/xla/stream_executor/tpu/BUILD
@@ -1,8 +1,8 @@
 # Description: StreamExecutor Interface for TPUs
 
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc b/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc
index cff8cdb449149c..24709bb78ef794 100644
--- a/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc
+++ b/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc
@@ -590,13 +590,12 @@ xla::HloModuleConfig FromC(const XLA_HloModuleConfig& c_config) {
       MakeSpan(c_config.allow_spmd_sharding_propagation_to_output));
   absl::Span<const int64_t> mesh_shape_span =
       MakeSpan(c_config.auto_spmd_partitioning_mesh_shape);
-  std::vector<int64_t> mesh_shape(mesh_shape_span.begin(),
-                                  mesh_shape_span.end());
-  config.set_auto_spmd_partitioning_mesh_shape(mesh_shape);
+  config.set_auto_spmd_partitioning_mesh_shape(
+      std::vector<int64_t>(mesh_shape_span.begin(), mesh_shape_span.end()));
   absl::Span<const int64_t> mesh_ids_span =
       MakeSpan(c_config.auto_spmd_partitioning_mesh_ids);
-  std::vector<int64_t> mesh_ids(mesh_ids_span.begin(), mesh_ids_span.end());
-  config.set_auto_spmd_partitioning_mesh_ids(mesh_ids);
+  config.set_auto_spmd_partitioning_mesh_ids(
+      std::vector<int64_t>(mesh_ids_span.begin(), mesh_ids_span.end()));
   if (c_config.has_static_device_assignment) {
     auto device_assignment = xla::DeviceAssignment::Deserialize(
         stream_executor::tpu::DeserializeProto<xla::DeviceAssignmentProto>(
diff --git a/third_party/xla/xla/stream_executor/tpu/tpu_ops_c_api.h b/third_party/xla/xla/stream_executor/tpu/tpu_ops_c_api.h
index 0db1b51f91f383..80365ebb046a22 100644
--- a/third_party/xla/xla/stream_executor/tpu/tpu_ops_c_api.h
+++ b/third_party/xla/xla/stream_executor/tpu/tpu_ops_c_api.h
@@ -635,6 +635,9 @@ typedef struct TpuEmbeddingEngine_RecvActivationsComputation_Params {
   void* priv;
 
   TpuSerializedProto tpu_embedding_config;
+  TpuSerializedProto embedding_partitions;
+  TpuSerializedProto hbm_buffers_config;
+  TpuSerializedProto tpu_topology;
   XLA_Shape* deduplication_data_shape;
   TpuSerializedProto* op_sharding;
 
@@ -652,6 +655,9 @@ typedef struct
   void* priv;
 
   TpuSerializedProto tpu_embedding_config;
+  TpuSerializedProto embedding_partitions;
+  TpuSerializedProto hbm_buffers_config;
+  TpuSerializedProto tpu_topology;
   TpuSerializedProto* op_sharding;
   // out
   TpuSerializedProto* xla_computation;
@@ -669,6 +675,9 @@ typedef struct TpuEmbeddingEngine_SendTPUEmbeddingGradientsComputation_Params {
 
   int32_t num_inputs;
   TpuSerializedProto tpu_embedding_config;
+  TpuSerializedProto embedding_partitions;
+  TpuSerializedProto hbm_buffers_config;
+  TpuSerializedProto tpu_topology;
   XLA_Shape* learning_rate_tuple_shape;
   XLA_Shape* deduplication_data_shape;
   XLA_Shape* gradient_tuple_shape;
@@ -686,6 +695,9 @@ typedef struct TpuEmbeddingEngine_DedupDataSizeComputation_Params {
   void* priv;
 
   TpuSerializedProto tpu_embedding_config;
+  TpuSerializedProto embedding_partitions;
+  TpuSerializedProto hbm_buffers_config;
+  TpuSerializedProto tpu_topology;
   // out
   int32_t* num_elements;
   TF_Status* status;
@@ -699,6 +711,9 @@ typedef struct TpuEmbeddingEngine_DedupDataTupleMaskComputation_Params {
   void* priv;
 
   TpuSerializedProto tpu_embedding_config;
+  TpuSerializedProto embedding_partitions;
+  TpuSerializedProto hbm_buffers_config;
+  TpuSerializedProto tpu_topology;
   // out
   TpuSerializedProto* xla_computation;
   TF_Status* status;
diff --git a/third_party/xla/xla/stream_executor/tpu/tpu_transfer_manager_interface.h b/third_party/xla/xla/stream_executor/tpu/tpu_transfer_manager_interface.h
index 128d4291b2a845..e3478e3a5ff78a 100644
--- a/third_party/xla/xla/stream_executor/tpu/tpu_transfer_manager_interface.h
+++ b/third_party/xla/xla/stream_executor/tpu/tpu_transfer_manager_interface.h
@@ -36,6 +36,8 @@ class TpuTransferManagerInterface : public xla::TransferManager {
       std::deque<tensorflow::tpu::NoncopyableBuffer>* buffers) = 0;
 
   static TpuTransferManagerInterface* GetRegisteredTpuTransferManager();
+
+  bool PackSubbyteTypes() const override { return true; }
 };
 
 }  // namespace xla
diff --git a/third_party/xla/xla/test_helpers.h b/third_party/xla/xla/test_helpers.h
index 4fcfb32c0a733a..11425fd51fa35e 100644
--- a/third_party/xla/xla/test_helpers.h
+++ b/third_party/xla/xla/test_helpers.h
@@ -16,14 +16,8 @@ limitations under the License.
 #ifndef XLA_TEST_HELPERS_H_
 #define XLA_TEST_HELPERS_H_
 
-#include <list>
-#include <vector>
-
-#include "absl/strings/string_view.h"
+#include "xla/status.h"
 #include "xla/statusor.h"
-#include "xla/types.h"
-#include "tsl/platform/protobuf.h"
-#include "tsl/platform/regexp.h"
 #include "tsl/platform/test.h"
 
 // This module contains a minimal subset of gmock functionality just
@@ -54,18 +48,18 @@ inline const Status& GetStatus(const StatusOr<T>& status) {
 // Macros for testing the results of functions that return Status or
 // StatusOr<T> (for any type T).
 #define EXPECT_IS_OK(expression) \
-  EXPECT_EQ(::tsl::OkStatus(),   \
+  EXPECT_EQ(::absl::OkStatus(),  \
             xla::testing::internal_status::GetStatus(expression))
 #define EXPECT_IS_NOT_OK(expression) \
-  EXPECT_NE(::tsl::OkStatus(),       \
+  EXPECT_NE(::absl::OkStatus(),      \
             xla::testing::internal_status::GetStatus(expression))
 #undef ASSERT_IS_OK
 #define ASSERT_IS_OK(expression) \
-  ASSERT_EQ(::tsl::OkStatus(),   \
+  ASSERT_EQ(::absl::OkStatus(),  \
             xla::testing::internal_status::GetStatus(expression))
 #undef ASSERT_IS_NOT_OK
 #define ASSERT_IS_NOT_OK(expression) \
-  ASSERT_NE(::tsl::OkStatus(),       \
+  ASSERT_NE(::absl::OkStatus(),      \
             xla::testing::internal_status::GetStatus(expression))
 
 #endif  // XLA_TEST_HELPERS_H_
diff --git a/third_party/xla/xla/tests/BUILD b/third_party/xla/xla/tests/BUILD
index 1f7b298b211314..6df9425de47fba 100644
--- a/third_party/xla/xla/tests/BUILD
+++ b/third_party/xla/xla/tests/BUILD
@@ -1,12 +1,6 @@
 # Description:
 #   Base testing infrastructure for XLA.
 
-load("//xla/tests:build_defs.bzl", "generate_backend_suites", "generate_backend_test_macros", "xla_test", "xla_test_library")
-load("//xla:xla.bzl", "xla_cc_binary", "xla_cc_test")
-load(
-    "//xla/stream_executor:build_defs.bzl",
-    "if_gpu_is_configured",
-)
 load(
     "@local_config_rocm//rocm:build_defs.bzl",
     "if_rocm_is_configured",
@@ -22,6 +16,12 @@ load(
     "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
     "if_cuda_is_configured",
 )
+load("//xla:xla.bzl", "xla_cc_binary", "xla_cc_test")
+load(
+    "//xla/stream_executor:build_defs.bzl",
+    "if_gpu_is_configured",
+)
+load("//xla/tests:build_defs.bzl", "generate_backend_suites", "generate_backend_test_macros", "xla_test", "xla_test_library")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -1741,6 +1741,7 @@ xla_test(
         ":test_macros_header",
         ":test_utils",
         ":xla_internal_test_main",  # fixdeps: keep
+        "//xla:literal",
         "//xla:literal_util",
         "//xla:shape_util",
         "//xla:xla_data_proto_cc",
@@ -1752,6 +1753,8 @@ xla_test(
         "//xla/service:custom_call_target_registry",
         "@com_google_absl//absl/base:dynamic_annotations",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+        "@local_tsl//tsl/platform:statusor",
         "@local_tsl//tsl/platform:test",
     ],
 )
@@ -2179,29 +2182,6 @@ xla_test(
     ],
 )
 
-xla_test(
-    name = "compilation_cache_test",
-    srcs = ["compilation_cache_test.cc"],
-    deps = [
-        ":client_library_test_base",
-        ":literal_test_util",
-        ":test_macros_header",
-        ":test_utils",
-        ":xla_internal_test_main",
-        "//xla:literal",
-        "//xla:shape_util",
-        "//xla:statusor",
-        "//xla:xla_data_proto_cc",
-        "//xla:xla_proto_cc",
-        "//xla/client:global_data",
-        "//xla/client:local_client",
-        "//xla/client:xla_builder",
-        "//xla/client:xla_computation",
-        "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/platform:test",
-    ],
-)
-
 xla_test(
     name = "floor_ceil_test",
     srcs = ["floor_ceil_test.cc"],
@@ -2871,6 +2851,7 @@ xla_test(
         "//xla:shape_util",
         "//xla:test",
         "//xla:test_helpers",
+        "//xla/hlo/utils:hlo_matchers",
         "//xla/service/cpu:onednn_util",
         "@local_tsl//tsl/platform:platform_port",
     ],
@@ -2901,6 +2882,7 @@ xla_test(
         "cpu",
     ],
     copts = tsl_copts(),
+    shard_count = 4,
     deps = [
         ":hlo_test_base",
         ":test_macros_header",
@@ -2909,6 +2891,8 @@ xla_test(
         "//xla:shape_util",
         "//xla:test",
         "//xla:test_helpers",
+        "//xla/service/cpu:onednn_util",
+        "@local_tsl//tsl/platform:platform_port",
     ],
 )
 
diff --git a/third_party/xla/xla/tests/build_defs.bzl b/third_party/xla/xla/tests/build_defs.bzl
index 05ee999937a5d8..b1d56b72b1273d 100644
--- a/third_party/xla/xla/tests/build_defs.bzl
+++ b/third_party/xla/xla/tests/build_defs.bzl
@@ -1,15 +1,15 @@
 """Build rules for XLA testing."""
 
+load(
+    "@local_tsl//tsl/platform:build_config_root.bzl",
+    "tf_gpu_tests_tags",
+)
 load("//xla:xla.bzl", "xla_cc_test")
 load(
     "//xla/stream_executor:build_defs.bzl",
     "if_gpu_is_configured",
 )
 load("//xla/tests:plugin.bzl", "plugins")
-load(
-    "@local_tsl//tsl/platform:build_config_root.bzl",
-    "tf_gpu_tests_tags",
-)
 
 all_backends = ["cpu", "gpu"] + plugins.keys()
 
diff --git a/third_party/xla/xla/tests/compilation_cache_test.cc b/third_party/xla/xla/tests/compilation_cache_test.cc
deleted file mode 100644
index 057015d5233696..00000000000000
--- a/third_party/xla/xla/tests/compilation_cache_test.cc
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright 2017 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <initializer_list>
-#include <memory>
-#include <string>
-
-#include "absl/types/span.h"
-#include "xla/client/global_data.h"
-#include "xla/client/local_client.h"
-#include "xla/client/xla_builder.h"
-#include "xla/client/xla_computation.h"
-#include "xla/literal.h"
-#include "xla/shape_util.h"
-#include "xla/statusor.h"
-#include "xla/tests/client_library_test_base.h"
-#include "xla/tests/literal_test_util.h"
-#include "xla/tests/test_macros.h"
-#include "xla/tests/test_utils.h"
-#include "xla/xla.pb.h"
-#include "xla/xla_data.pb.h"
-#include "tsl/platform/test.h"
-
-namespace xla {
-namespace {
-
-class CompilationCacheTest : public ClientLibraryTestBase {
- public:
-  void ExecuteComputationR0F32(const XlaComputation& computation,
-                               absl::Span<GlobalData* const> arguments,
-                               float expected_result, bool expect_cache_hit) {
-    ExecutionProfile execution_profile;
-    Literal result =
-        client_
-            ->ExecuteAndTransfer(computation, arguments,
-                                 /*execution_options=*/&execution_options_,
-                                 &execution_profile)
-            .value();
-    EXPECT_TRUE(LiteralTestUtil::Near(
-        LiteralUtil::CreateR0<float>(expected_result), result, error_spec_));
-    EXPECT_EQ(expect_cache_hit, execution_profile.compilation_cache_hit());
-  }
-
-  void ExecuteComputationR2F32(
-      const XlaComputation& computation,
-      absl::Span<GlobalData* const> arguments,
-      std::initializer_list<std::initializer_list<float>> expected_result,
-      bool expect_cache_hit) {
-    ExecutionProfile execution_profile;
-    auto data_handle = client_
-                           ->Execute(computation, arguments,
-                                     &execution_options_, &execution_profile)
-                           .value();
-    Literal result = client_->Transfer(*data_handle).value();
-    EXPECT_TRUE(LiteralTestUtil::Near(
-        LiteralUtil::CreateR2<float>(expected_result), result, error_spec_));
-    EXPECT_EQ(expect_cache_hit, execution_profile.compilation_cache_hit());
-  }
-
-  ErrorSpec error_spec_{0.0001};
-};
-
-// TODO(b/74197823): Disabled because there is no cache in the new design.
-XLA_TEST_F(CompilationCacheTest, DISABLED_ComputationCalledMultipleTimes) {
-  XlaBuilder builder(TestName());
-  Neg(ConstantR0<float>(&builder, 42.0));
-  XlaComputation computation = builder.Build().value();
-
-  ExecuteComputationR0F32(computation, {}, -42.0, /*expect_cache_hit=*/false);
-  ExecuteComputationR0F32(computation, {}, -42.0, /*expect_cache_hit=*/true);
-  ExecuteComputationR0F32(computation, {}, -42.0, /*expect_cache_hit=*/true);
-}
-
-// TODO(b/74197823): Disabled because there is no cache in the new design.
-XLA_TEST_F(CompilationCacheTest,
-           DISABLED_ComputationCalledWithDifferentParameters) {
-  std::unique_ptr<GlobalData> data_42 =
-      client_->TransferToServer(LiteralUtil::CreateR0<float>(42.0f)).value();
-  std::unique_ptr<GlobalData> data_123 =
-      client_->TransferToServer(LiteralUtil::CreateR0<float>(123.0f)).value();
-  std::unique_ptr<GlobalData> data_456 =
-      client_->TransferToServer(LiteralUtil::CreateR0<float>(456.0f)).value();
-
-  XlaBuilder builder(TestName());
-  Neg(Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {}), "param"));
-  XlaComputation computation = builder.Build().value();
-
-  ExecuteComputationR0F32(computation, {data_42.get()}, -42.0,
-                          /*expect_cache_hit=*/false);
-  ExecuteComputationR0F32(computation, {data_123.get()}, -123.0,
-                          /*expect_cache_hit=*/true);
-  ExecuteComputationR0F32(computation, {data_456.get()}, -456.0,
-                          /*expect_cache_hit=*/true);
-  ExecuteComputationR0F32(computation, {data_42.get()}, -42.0,
-                          /*expect_cache_hit=*/true);
-}
-
-// TODO(b/74197823): Disabled because there is no cache in the new design.
-XLA_TEST_F(CompilationCacheTest, DISABLED_MultipleComputations) {
-  XlaBuilder builder_neg(TestName() + "_neg");
-  Neg(ConstantR0<float>(&builder_neg, 42.0));
-  XlaComputation computation_neg = builder_neg.Build().value();
-
-  XlaBuilder builder_exp(TestName() + "_exp");
-  Exp(ConstantR0<float>(&builder_exp, 1.0));
-  XlaComputation computation_exp = builder_exp.Build().value();
-
-  XlaBuilder builder_add(TestName() + "_add");
-  Add(ConstantR0<float>(&builder_add, 2.0),
-      ConstantR0<float>(&builder_add, 3.0));
-  XlaComputation computation_add = builder_add.Build().value();
-
-  ExecuteComputationR0F32(computation_neg, {}, -42.0,
-                          /*expect_cache_hit=*/false);
-  ExecuteComputationR0F32(computation_exp, {}, 2.7182817,
-                          /*expect_cache_hit=*/false);
-  ExecuteComputationR0F32(computation_add, {}, 5.0,
-                          /*expect_cache_hit=*/false);
-  ExecuteComputationR0F32(computation_neg, {}, -42.0,
-                          /*expect_cache_hit=*/true);
-}
-
-// TODO(b/74197823): Disabled because there is no cache in the new design.
-XLA_TEST_F(CompilationCacheTest, DISABLED_DifferentParameterLayouts) {
-  // Create two GlobalData arrays with the same shape but different
-  // layouts. Use these arrays as parameters to a simple computation. If the
-  // layout of the array changes then computation should be recompiled (cache
-  // miss).
-  auto rowmaj_array = LiteralUtil::CreateR2WithLayout(
-      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({1, 0}));
-  auto rowmaj_handle = client_->TransferToServer(rowmaj_array).value();
-
-  auto colmaj_array = LiteralUtil::CreateR2WithLayout(
-      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({0, 1}));
-  auto colmaj_handle = client_->TransferToServer(colmaj_array).value();
-
-  XlaBuilder builder(TestName());
-  Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {2, 2}), "param0");
-  XlaComputation computation = builder.Build().value();
-
-  ExecuteComputationR2F32(computation, {colmaj_handle.get()},
-                          {{1.0f, 2.0f}, {3.0f, 4.0f}},
-                          /*expect_cache_hit=*/false);
-  ExecuteComputationR2F32(computation, {colmaj_handle.get()},
-                          {{1.0f, 2.0f}, {3.0f, 4.0f}},
-                          /*expect_cache_hit=*/true);
-  ExecuteComputationR2F32(computation, {rowmaj_handle.get()},
-                          {{1.0f, 2.0f}, {3.0f, 4.0f}},
-                          /*expect_cache_hit=*/false);
-  ExecuteComputationR2F32(computation, {rowmaj_handle.get()},
-                          {{1.0f, 2.0f}, {3.0f, 4.0f}},
-                          /*expect_cache_hit=*/true);
-  ExecuteComputationR2F32(computation, {colmaj_handle.get()},
-                          {{1.0f, 2.0f}, {3.0f, 4.0f}},
-                          /*expect_cache_hit=*/true);
-}
-
-}  // namespace
-}  // namespace xla
diff --git a/third_party/xla/xla/tests/custom_call_test.cc b/third_party/xla/xla/tests/custom_call_test.cc
index 071a3ee85558d8..eaa80424bbd50b 100644
--- a/third_party/xla/xla/tests/custom_call_test.cc
+++ b/third_party/xla/xla/tests/custom_call_test.cc
@@ -19,12 +19,15 @@ limitations under the License.
 
 #include "absl/base/dynamic_annotations.h"
 #include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "xla/client/lib/constants.h"
 #include "xla/client/xla_builder.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/layout_util.h"
+#include "xla/literal.h"
 #include "xla/literal_util.h"
 #include "xla/service/custom_call_status.h"
 #include "xla/service/custom_call_target_registry.h"
@@ -35,6 +38,7 @@ limitations under the License.
 #include "xla/tests/test_macros.h"
 #include "xla/tests/test_utils.h"
 #include "xla/xla_data.pb.h"
+#include "tsl/platform/statusor.h"
 #include "tsl/platform/test.h"
 
 namespace {
@@ -101,67 +105,69 @@ namespace {
 using ::testing::HasSubstr;
 
 class CustomCallTest : public HloTestBase {
+ public:
+  CustomCallTest()
+      : HloTestBase(),
+        module_(CreateNewVerifiedModule()),
+        builder_(TestName()) {}
+
  protected:
+  // Call this function when builder_ is complete (i.e. when all instructions
+  // have been added). Note that module_ is empty after calling this function.
+  auto BuildAndExecute(absl::Span<Literal* const> arguments) {
+    module_->AddEntryComputation(builder_.Build());
+    return Execute(std::move(module_), arguments);
+  }
+
   Shape r0f32_ = ShapeUtil::MakeShape(F32, {});
   Shape r2f32_ = ShapeUtil::MakeShape(F32, {2, 2});
+
+  std::unique_ptr<HloModule> module_;
+  HloComputation::Builder builder_;
 };
 
 XLA_TEST_F(CustomCallTest, CustomCallR0F32Add2) {
-  auto module = CreateNewVerifiedModule();
-  auto builder = HloComputation::Builder(TestName());
-
-  auto constant = builder.AddInstruction(
+  auto constant = builder_.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
-  builder.AddInstruction(
+  builder_.AddInstruction(
       HloInstruction::CreateCustomCall(r0f32_, {constant}, "R0F32Add2"));
 
-  module->AddEntryComputation(builder.Build());
-
-  Literal result = ExecuteAndTransfer(std::move(module), {});
+  TF_ASSERT_OK_AND_ASSIGN(auto result, BuildAndExecute({}));
   LiteralTestUtil::ExpectR0Near<float>(44.0f, result, error_spec_);
 }
 
 XLA_TEST_F(CustomCallTest, CustomCallR2F32Reduce) {
-  auto module = CreateNewVerifiedModule();
-  auto builder = HloComputation::Builder(TestName());
-
   Array2D<float> array(2, 2);
   array(0, 0) = 1.0f;
   array(0, 1) = 2.0f;
   array(1, 0) = 3.0f;
   array(1, 1) = 4.0f;
 
-  auto constant = builder.AddInstruction(
+  auto constant = builder_.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR2FromArray2D(array)));
-  builder.AddInstruction(
+  builder_.AddInstruction(
       HloInstruction::CreateCustomCall(r0f32_, {constant}, "R2F32ReduceSum"));
 
-  module->AddEntryComputation(builder.Build());
-
-  Literal result = ExecuteAndTransfer(std::move(module), {});
+  TF_ASSERT_OK_AND_ASSIGN(auto result, BuildAndExecute({}));
   LiteralTestUtil::ExpectR0Near<float>(10.0f, result, error_spec_);
 }
 
 XLA_TEST_F(CustomCallTest, UsedInOtherComputations) {
-  auto module = CreateNewVerifiedModule();
-  auto b = HloComputation::Builder(TestName());
-
-  auto input = b.AddInstruction(
+  auto input = builder_.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR2FromArray2D(
           Array2D<float>{{1.0f, 2.0f}, {3.0f, 4.0f}})));
-  auto incremented = b.AddInstruction(HloInstruction::CreateCustomCall(
+  auto incremented = builder_.AddInstruction(HloInstruction::CreateCustomCall(
       ShapeUtil::MakeShape(F32, {1, 2, 2}), {input}, "Add1ToValues"));
-  auto incremented_again = b.AddInstruction(HloInstruction::CreateCustomCall(
-      ShapeUtil::MakeShape(F32, {1, 2, 2}), {incremented}, "Add1ToValues"));
+  auto incremented_again =
+      builder_.AddInstruction(HloInstruction::CreateCustomCall(
+          ShapeUtil::MakeShape(F32, {1, 2, 2}), {incremented}, "Add1ToValues"));
 
   // Concatenate the values along first dim.
-  b.AddInstruction(
+  builder_.AddInstruction(
       HloInstruction::CreateConcatenate(ShapeUtil::MakeShape(F32, {2, 2, 2}),
                                         {incremented, incremented_again}, 0));
 
-  module->AddEntryComputation(b.Build());
-
-  Literal result = ExecuteAndTransfer(std::move(module), {});
+  TF_ASSERT_OK_AND_ASSIGN(auto result, BuildAndExecute({}));
   LiteralTestUtil::ExpectR3EqualArray3D<float>(
       Array3D<float>{{{2, 3}, {4, 5}}, {{3, 4}, {5, 6}}}, result);
 }
@@ -173,24 +179,22 @@ XLA_TEST_F(CustomCallTest, InputAndOutputLayoutDiffer) {
     GTEST_SKIP() << "Appears to test an XLA current implementation detail";
   }
 
-  auto module = CreateNewVerifiedModule();
-  auto b = HloComputation::Builder(TestName());
-
   auto input =
-      b.AddInstruction(HloInstruction::CreateParameter(0, r2f32_, "p"));
-  b.AddInstruction(
+      builder_.AddInstruction(HloInstruction::CreateParameter(0, r2f32_, "p"));
+  builder_.AddInstruction(
       HloInstruction::CreateCustomCall(r2f32_, {input}, "Add1ToValues"));
 
-  module->AddEntryComputation(b.Build());
-  ForceParameterLayout(module.get(), 0, LayoutUtil::MakeLayout({1, 0}));
-  ForceResultLayout(module.get(), LayoutUtil::MakeLayout({0, 1}));
+  module_->AddEntryComputation(builder_.Build());
+  ForceParameterLayout(module_.get(), 0, LayoutUtil::MakeLayout({1, 0}));
+  ForceResultLayout(module_.get(), LayoutUtil::MakeLayout({0, 1}));
 
   Literal argument = LiteralUtil::CreateR2<float>({{1.f, 2.f}, {3.f, 4.f}});
 
   // Note, the expected result is transposed! This is because the input and
   // output layouts of the custom call differ and the called function just
   // blindly adds one to each element.
-  Literal result = ExecuteAndTransfer(std::move(module), {&argument});
+  TF_ASSERT_OK_AND_ASSIGN(auto result,
+                          Execute(std::move(module_), {&argument}));
   LiteralTestUtil::ExpectR2Equal<float>({{2.f, 4.f}, {3.f, 5.f}}, result);
 }
 
@@ -198,26 +202,24 @@ XLA_TEST_F(CustomCallTest, LayoutConstrained) {
   // The argument and result of the computation are set to different layouts,
   // but the custom call is layout constrained to a fixed operand and result
   // layout, so the correct result should be produced.
-  auto module = CreateNewVerifiedModule();
-  auto b = HloComputation::Builder(TestName());
-
   auto input =
-      b.AddInstruction(HloInstruction::CreateParameter(0, r2f32_, "p"));
+      builder_.AddInstruction(HloInstruction::CreateParameter(0, r2f32_, "p"));
 
   const Shape& r2f32_dim0_major =
       ShapeUtil::MakeShapeWithDenseLayout(F32, {2, 2}, {1, 0});
-  auto custom_call = b.AddInstruction(HloInstruction::CreateCustomCall(
+  auto custom_call = builder_.AddInstruction(HloInstruction::CreateCustomCall(
       r2f32_dim0_major, {input}, "Add1ToValues", {r2f32_dim0_major}));
-  b.AddInstruction(
+  builder_.AddInstruction(
       custom_call->CloneWithNewOperands(r2f32_dim0_major, {custom_call}));
 
-  module->AddEntryComputation(b.Build());
-  ForceParameterLayout(module.get(), 0, LayoutUtil::MakeLayout({1, 0}));
-  ForceResultLayout(module.get(), LayoutUtil::MakeLayout({0, 1}));
+  module_->AddEntryComputation(builder_.Build());
+  ForceParameterLayout(module_.get(), 0, LayoutUtil::MakeLayout({1, 0}));
+  ForceResultLayout(module_.get(), LayoutUtil::MakeLayout({0, 1}));
 
   Literal argument = LiteralUtil::CreateR2<float>({{1.f, 2.f}, {3.f, 4.f}});
 
-  Literal result = ExecuteAndTransfer(std::move(module), {&argument});
+  TF_ASSERT_OK_AND_ASSIGN(auto result,
+                          Execute(std::move(module_), {&argument}));
   LiteralTestUtil::ExpectR2Equal<float>({{3.f, 4.f}, {5.f, 6.f}}, result);
 }
 
@@ -237,63 +239,49 @@ XLA_TEST_F(CustomCallTest, TupleOutput) {
   Literal arg1 = LiteralUtil::CreateR0<float>(42.f);
 
   Literal expected = LiteralUtil::MakeTuple({&arg1, &arg0});
-  Literal result = ExecuteAndTransfer(std::move(module), {&arg0, &arg1});
+  TF_ASSERT_OK_AND_ASSIGN(auto result,
+                          Execute(std::move(module), {&arg0, &arg1}));
   EXPECT_EQ(result, expected);
 }
 
 XLA_TEST_F(CustomCallTest, ReportsSuccess) {
-  auto module = CreateNewVerifiedModule();
-  auto builder = HloComputation::Builder(TestName());
-
-  auto constant = builder.AddInstruction(
+  auto constant = builder_.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
-  builder.AddInstruction(HloInstruction::CreateCustomCall(
+  builder_.AddInstruction(HloInstruction::CreateCustomCall(
       r0f32_, {constant}, "R0F32Add2Succeed",
       /*opaque=*/"", CustomCallApiVersion::API_VERSION_STATUS_RETURNING));
 
-  module->AddEntryComputation(builder.Build());
-
-  Literal result = ExecuteAndTransfer(std::move(module), {});
+  TF_ASSERT_OK_AND_ASSIGN(auto result, BuildAndExecute({}));
   LiteralTestUtil::ExpectR0Near<float>(44.0f, result, error_spec_);
 }
 
 XLA_TEST_F(CustomCallTest, ReportsFailure) {
-  auto module = CreateNewVerifiedModule();
-  auto builder = HloComputation::Builder(TestName());
-
-  auto constant = builder.AddInstruction(
+  auto constant = builder_.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
-  builder.AddInstruction(HloInstruction::CreateCustomCall(
+  builder_.AddInstruction(HloInstruction::CreateCustomCall(
       ShapeUtil::MakeShape(F32, {}), {constant}, "CustomCallFail",
       /*opaque=*/"", CustomCallApiVersion::API_VERSION_STATUS_RETURNING));
 
-  module->AddEntryComputation(builder.Build());
-
-  auto status = Execute(std::move(module), {}).status();
+  auto status = BuildAndExecute({}).status();
   EXPECT_EQ(status.code(), absl::StatusCode::kInternal);
   EXPECT_THAT(status.message(), ::testing::HasSubstr("Failed: 42.0"));
 }
 
 XLA_TEST_F(CustomCallTest, ReportsFirstFailure) {
-  auto module = CreateNewVerifiedModule();
-  auto builder = HloComputation::Builder(TestName());
-
-  auto constant_1 = builder.AddInstruction(
+  auto constant_1 = builder_.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
-  auto constant_2 = builder.AddInstruction(
+  auto constant_2 = builder_.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0f)));
-  auto res_1 = builder.AddInstruction(HloInstruction::CreateCustomCall(
+  auto res_1 = builder_.AddInstruction(HloInstruction::CreateCustomCall(
       ShapeUtil::MakeShape(F32, {}), {constant_1}, "CustomCallFail",
       /*opaque=*/"", CustomCallApiVersion::API_VERSION_STATUS_RETURNING));
-  auto res_2 = builder.AddInstruction(HloInstruction::CreateCustomCall(
+  auto res_2 = builder_.AddInstruction(HloInstruction::CreateCustomCall(
       ShapeUtil::MakeShape(F32, {}), {constant_2}, "CustomCallFail",
       /*opaque=*/"", CustomCallApiVersion::API_VERSION_STATUS_RETURNING));
-  builder.AddInstruction(HloInstruction::CreateBinary(
+  builder_.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeShape(F32, {}), HloOpcode::kAdd, res_1, res_2));
 
-  module->AddEntryComputation(builder.Build());
-
-  auto status = Execute(std::move(module), {}).status();
+  auto status = BuildAndExecute({}).status();
   EXPECT_EQ(status.code(), absl::StatusCode::kInternal);
   EXPECT_THAT(status.message(), ::testing::HasSubstr("Failed: 1.0"));
 }
diff --git a/third_party/xla/xla/tests/exhaustive/BUILD b/third_party/xla/xla/tests/exhaustive/BUILD
index 1b3ee7e074d852..e995732bee51f5 100644
--- a/third_party/xla/xla/tests/exhaustive/BUILD
+++ b/third_party/xla/xla/tests/exhaustive/BUILD
@@ -1,8 +1,8 @@
 # Description:
 #   Computationally expensive, exhaustive tests for XLA
 
-load("//xla/tests:build_defs.bzl", "xla_test")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla/tests:build_defs.bzl", "xla_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/tests/filecheck.cc b/third_party/xla/xla/tests/filecheck.cc
index 2031ec3d2b26ef..5ef61bf653008d 100644
--- a/third_party/xla/xla/tests/filecheck.cc
+++ b/third_party/xla/xla/tests/filecheck.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "xla/tests/filecheck.h"
 
 #include <cstdlib>
+#include <string>
 
 #include "xla/types.h"
 #include "xla/util.h"
@@ -44,10 +45,12 @@ absl::StatusOr<bool> RunFileCheck(const std::string& input,
 absl::StatusOr<bool> RunFileCheckWithPatternFile(
     const std::string& input, const std::string& pattern_file) {
   // Invoke FileCheck to check whether input matches `pattern`.
+  std::string binary_name = "FileCheck";
+  tsl::io::AppendDotExeIfWindows(binary_name);
   std::string file_check_path = tsl::GetDataDependencyFilepath(
       tsl::testing::kIsOpenSource
-          ? tsl::io::JoinPath("external", "llvm-project", "llvm", "FileCheck")
-          : tsl::io::JoinPath("llvm", "llvm-project", "llvm", "FileCheck"));
+          ? tsl::io::JoinPath("external", "llvm-project", "llvm", binary_name)
+          : tsl::io::JoinPath("llvm", "llvm-project", "llvm", binary_name));
 
   tsl::SubProcess file_check_process;
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/third_party/xla/xla/tests/hlo_test_base.cc b/third_party/xla/xla/tests/hlo_test_base.cc
index f645043a846c43..06c75fec1fc0ea 100644
--- a/third_party/xla/xla/tests/hlo_test_base.cc
+++ b/third_party/xla/xla/tests/hlo_test_base.cc
@@ -98,6 +98,7 @@ HloTestBase::HloTestBase(se::Platform* test_platform,
       verifier_layout_sensitive_(verifier_layout_sensitive),
       allow_mixed_precision_in_hlo_verifier_(
           allow_mixed_precision_in_hlo_verifier),
+      instruction_can_change_layout_func_(instruction_can_change_layout_func),
       test_platform_(test_platform) {
   hlo_verifier_ = std::make_unique<HloVerifier>(
       /*layout_sensitive=*/verifier_layout_sensitive,
@@ -128,19 +129,16 @@ std::unique_ptr<VerifiedHloModule> HloTestBase::CreateNewVerifiedModule(
   return std::make_unique<VerifiedHloModule>(
       name, GetModuleConfigForTest(replica_count), verifier_layout_sensitive_,
       allow_mixed_precision_in_hlo_verifier_,
-      backend().compiler()->ShapeSizeBytesFunction());
+      backend().compiler()->ShapeSizeBytesFunction(),
+      instruction_can_change_layout_func_);
 }
 
 absl::StatusOr<std::unique_ptr<VerifiedHloModule>>
 HloTestBase::ParseAndReturnVerifiedModule(absl::string_view hlo_text,
                                           int64_t replica_count,
                                           int64_t num_partitions) {
-  TF_ASSIGN_OR_RETURN(
-      auto module,
-      ParseAndReturnVerifiedModule(
-          hlo_text, GetModuleConfigForTest(replica_count, num_partitions)));
-  UpdateEntryComputationLayout(module.get());
-  return module;
+  return ParseAndReturnVerifiedModule(
+      hlo_text, GetModuleConfigForTest(replica_count, num_partitions));
 }
 
 absl::StatusOr<std::unique_ptr<VerifiedHloModule>>
@@ -149,7 +147,8 @@ HloTestBase::ParseAndReturnVerifiedModule(absl::string_view hlo_text,
   auto module = std::make_unique<VerifiedHloModule>(
       TestName(), config, verifier_layout_sensitive_,
       allow_mixed_precision_in_hlo_verifier_,
-      backend().compiler()->ShapeSizeBytesFunction());
+      backend().compiler()->ShapeSizeBytesFunction(),
+      instruction_can_change_layout_func_);
   TF_RETURN_IF_ERROR(module->ParseHloStringAndVerifyModule(hlo_text));
   UpdateEntryComputationLayout(module.get());
   return std::move(module);
diff --git a/third_party/xla/xla/tests/hlo_test_base.h b/third_party/xla/xla/tests/hlo_test_base.h
index 53acb8e69dc4aa..f8f91a8df51b96 100644
--- a/third_party/xla/xla/tests/hlo_test_base.h
+++ b/third_party/xla/xla/tests/hlo_test_base.h
@@ -413,6 +413,7 @@ class HloTestBase : public ManifestCheckingTest {
 
   bool verifier_layout_sensitive_;
   bool allow_mixed_precision_in_hlo_verifier_;
+  HloPredicate instruction_can_change_layout_func_;
   std::unique_ptr<HloVerifier> hlo_verifier_;
 
   ErrorSpec error_spec_{0.0001};
diff --git a/third_party/xla/xla/tests/onednn_layer_norm_test.cc b/third_party/xla/xla/tests/onednn_layer_norm_test.cc
index d4e22512a3a01c..beead4f62b5b49 100644
--- a/third_party/xla/xla/tests/onednn_layer_norm_test.cc
+++ b/third_party/xla/xla/tests/onednn_layer_norm_test.cc
@@ -21,136 +21,180 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class LayerNormTest : public HloTestBase {};
-
-TEST_F(LayerNormTest, SimpleTest) {
-  const char* layer_norm_module_str = R"(
-  HloModule layer_norm.test, entry_computation_layout={(f32[4,1,256]{2,1,0}, f32[1,1,256]{2,1,0}, f32[1,1,256]{2,1,0})->f32[4,1,256]{2,1,0}}
+class LayerNormTest : public HloTestBase {
+ protected:
+  const char* onednn_layer_norm_ =
+      R"(
+  ; CHECK:     custom_call_target="__onednn$layernorm",
+  ; CHECK:       backend_config={
+  ; CHECK-DAG:     "onednn_layer_norm_config":{
+  ; CHECK-DAG:       "fused_ops":"SCALE_AND_SHIFT"
+  ; CHECK-DAG:   }
+  ; CHECK:     }
+  )";
+  std::string common_hlo_region_ =
+      R"(
 
   region_add {
     Arg_0.7555 = f32[] parameter(0)
     Arg_1.7556 = f32[] parameter(1)
     ROOT add.7557 = f32[] add(Arg_0.7555, Arg_1.7556)
   }
+)";
 
-  ENTRY main {
-    Arg_0.1 = f32[4,1,256]{2,1,0} parameter(0), sharding={replicated}
-    Arg_0.2 = f32[1,1,256]{2,1,0} parameter(1), sharding={replicated}
-    Arg_0.3 = f32[1,1,256]{2,1,0} parameter(2), sharding={replicated}
-    reshape.9744 = f32[1,4,1,256]{3,2,1,0} reshape(Arg_0.1)
-    multiply.9743 = f32[4,1,256]{2,1,0} multiply(Arg_0.1, Arg_0.1)
-    reshape.9745 = f32[1,4,1,256]{3,2,1,0} reshape(multiply.9743)
-    concatenate.9746 = f32[2,4,1,256]{3,2,1,0} concatenate(reshape.9744, reshape.9745), dimensions={0}
-    constant.9731 = f32[] constant(0)
-    reduce.9747 = f32[2,4,1]{2,1,0} reduce(concatenate.9746, constant.9731), dimensions={3}, to_apply=region_add
-    constant.9729 = f32[] constant(256)
-    broadcast.9730 = f32[2,4,1]{2,1,0} broadcast(constant.9729), dimensions={}
-    divide.9748 = f32[2,4,1]{2,1,0} divide(reduce.9747, broadcast.9730)
-    slice.9749 = f32[1,4,1]{2,1,0} slice(divide.9748), slice={[0:1], [0:4], [0:1]}
-    reshape.9756 = f32[4,1,1]{2,1,0} reshape(slice.9749)
-    broadcast.9758 = f32[4,1,1]{2,1,0} broadcast(reshape.9756), dimensions={0,1,2}
-    reshape.9759 = f32[4,1]{1,0} reshape(broadcast.9758)
-    broadcast.9760 = f32[4,1,256]{2,1,0} broadcast(reshape.9759), dimensions={0,1}
-    subtract.9761 = f32[4,1,256]{2,1,0} subtract(Arg_0.1, broadcast.9760)
-    slice.9751 = f32[1,4,1]{2,1,0} slice(divide.9748), slice={[1:2], [0:4], [0:1]}
-    reshape.9752 = f32[4,1]{1,0} reshape(slice.9751)
-    reshape.9750 = f32[4,1]{1,0} reshape(slice.9749)
-    multiply.9753 = f32[4,1]{1,0} multiply(reshape.9750, reshape.9750)
-    subtract.9754 = f32[4,1]{1,0} subtract(reshape.9752, multiply.9753)
-    constant.9727 = f32[] constant(0)
-    broadcast.9728 = f32[4,1]{1,0} broadcast(constant.9727), dimensions={}
-    maximum.9755 = f32[4,1]{1,0} maximum(subtract.9754, broadcast.9728)
-    reshape.9757 = f32[4,1,1]{2,1,0} reshape(maximum.9755)
-    constant.9725 = f32[] constant(1e-05)
-    broadcast.9726 = f32[4,1,1]{2,1,0} broadcast(constant.9725), dimensions={}
-    add.9762 = f32[4,1,1]{2,1,0} add(reshape.9757, broadcast.9726)
-    rsqrt.9763 = f32[4,1,1]{2,1,0} rsqrt(add.9762)
-    broadcast.9764 = f32[4,1,1]{2,1,0} broadcast(rsqrt.9763), dimensions={0,1,2}
-    reshape.9765 = f32[4,1]{1,0} reshape(broadcast.9764)
-    broadcast.9766 = f32[4,1,256]{2,1,0} broadcast(reshape.9765), dimensions={0,1}
-    broadcast.9767 = f32[1,1,256]{2,1,0} broadcast(Arg_0.2), dimensions={0,1,2}
-    reshape.9768 = f32[1,256]{1,0} reshape(broadcast.9767)
-    broadcast.9769 = f32[4,1,256]{2,1,0} broadcast(reshape.9768), dimensions={1,2}
-    multiply.9770 = f32[4,1,256]{2,1,0} multiply(broadcast.9766, broadcast.9769)
-    multiply.9771 = f32[4,1,256]{2,1,0} multiply(subtract.9761, multiply.9770)
-    broadcast.9772 = f32[1,1,256]{2,1,0} broadcast(Arg_0.3), dimensions={0,1,2}
-    reshape.9773 = f32[1,256]{1,0} reshape(broadcast.9772)
-    broadcast.9774 = f32[4,1,256]{2,1,0} broadcast(reshape.9773), dimensions={1,2}
-    ROOT add.9775 = f32[4,1,256]{2,1,0} add(multiply.9771, broadcast.9774)
-  }  
+  std::string common_hlo_entry_computation_block_ =
+      R"(
+    Arg_0.2 = f32[768]{0} parameter(1), sharding={replicated}
+    Arg_0.3 = f32[768]{0} parameter(2), sharding={replicated}
+
+    convert.290 = f32[84,197,768]{2,1,0} convert(Arg_0.1)
+    constant.291 = f32[] constant(0)
+    convert.292 = f32[] convert(constant.291)
+    reduce.297 = f32[84,197]{1,0} reduce(convert.290, convert.292), dimensions={2}, to_apply=region_add
+    constant.298 = s32[] constant(768)
+    convert.299 = f32[] convert(constant.298)
+    broadcast.300 = f32[84,197]{1,0} broadcast(convert.299), dimensions={}
+    divide.301 = f32[84,197]{1,0} divide(reduce.297, broadcast.300)
+    convert.302 = f32[84,197]{1,0} convert(divide.301)
+    reshape.303 = f32[84,197,1]{2,1,0} reshape(convert.302)
+    reshape.304 = f32[84,197]{1,0} reshape(reshape.303)
+    broadcast.305 = f32[84,197,768]{2,1,0} broadcast(reshape.304), dimensions={0,1}
+    subtract.306 = f32[84,197,768]{2,1,0} subtract(Arg_0.1, broadcast.305)
+    multiply.307 = f32[84,197,768]{2,1,0} multiply(subtract.306, subtract.306)
+    convert.308 = f32[84,197,768]{2,1,0} convert(multiply.307)
+    constant.309 = f32[] constant(0)
+    convert.310 = f32[] convert(constant.309)
+    reduce.315 = f32[84,197]{1,0} reduce(convert.308, convert.310), dimensions={2}, to_apply=region_add
+    constant.316 = s32[] constant(768)
+    convert.317 = f32[] convert(constant.316)
+    broadcast.318 = f32[84,197]{1,0} broadcast(convert.317), dimensions={}
+    divide.319 = f32[84,197]{1,0} divide(reduce.315, broadcast.318)
+    convert.320 = f32[84,197]{1,0} convert(divide.319)
+    reshape.321 = f32[84,197,1]{2,1,0} reshape(convert.320)
+    constant.322 = f32[] constant(1e-12)
+    broadcast.323 = f32[84,197,1]{2,1,0} broadcast(constant.322), dimensions={}
+    add.324 = f32[84,197,1]{2,1,0} add(reshape.321, broadcast.323)
+    rsqrt.325 = f32[84,197,1]{2,1,0} rsqrt(add.324)
+    reshape.328 = f32[84,197]{1,0} reshape(rsqrt.325)
+    broadcast.329 = f32[84,197,768]{2,1,0} broadcast(reshape.328), dimensions={0,1}
+    broadcast.327 = f32[84,197,768]{2,1,0} broadcast(Arg_0.2), dimensions={2}
+    multiply.330 = f32[84,197,768]{2,1,0} multiply(broadcast.329, broadcast.327)
+    multiply.331 = f32[84,197,768]{2,1,0} multiply(Arg_0.1, multiply.330)
+    broadcast.336 = f32[84,197,768]{2,1,0} broadcast(Arg_0.3), dimensions={2}
+    reshape.332 = f32[84,197]{1,0} reshape(reshape.303)
+    broadcast.333 = f32[84,197,768]{2,1,0} broadcast(reshape.332), dimensions={0,1}
+    multiply.334 = f32[84,197,768]{2,1,0} multiply(multiply.330, broadcast.333)
+    subtract.337 = f32[84,197,768]{2,1,0} subtract(broadcast.336, multiply.334)
 )";
+};
+
+TEST_F(LayerNormTest, LayerNormTest0_FP32) {
+  std::string layer_norm_module_str =
+      R"(HloModule layer_norm.test, entry_computation_layout={(f32[84,197,768]{2,1,0}, f32[768]{0}, f32[768]{0})->f32[84,197,768]{2,1,0}})" +
+      common_hlo_region_ + R"(
+  ENTRY main {
+    Arg_0.1 = f32[84,197,768]{2,1,0} parameter(0), sharding={replicated}
+        
+  )" + common_hlo_entry_computation_block_ +
+      R"(
+    ROOT add.338 = f32[84,197,768]{2,1,0} add(multiply.331, subtract.337)
+  }
+  )";
 
   EXPECT_TRUE(RunAndCompare(layer_norm_module_str, ErrorSpec{1e-4, 1e-4}));
-  MatchOptimizedHlo(layer_norm_module_str,
-                    R"(
-  ; CHECK:     custom_call_target="__onednn$layernorm",
-  ; CHECK:       backend_config={
-  ; CHECK-DAG:     "onednn_layer_norm_config":{
-  ; CHECK-DAG:       "fused_ops":"SCALE_AND_SHIFT"
-  ; CHECK-DAG:   }
-  ; CHECK:     }
-  )");
+  MatchOptimizedHlo(layer_norm_module_str, onednn_layer_norm_);
 }
 
-TEST_F(LayerNormTest, SimpleTestBF16) {
-  const char* layer_norm_module_str = R"(
-  HloModule layer_norm_bf16.test, entry_computation_layout={(f32[768]{0}, f32[768]{0}, bf16[16,128,768]{2,1,0})->bf16[16,128,768]{2,1,0}}, allow_spmd_sharding_propagation_to_output={true}
+TEST_F(LayerNormTest, LayerNormTest0_BF16) {
+  std::string layer_norm_module_str =
+      R"(HloModule layer_norm.test, entry_computation_layout={(bf16[84,197,768]{2,1,0}, f32[768]{0}, f32[768]{0})->bf16[84,197,768]{2,1,0}})" +
+      common_hlo_region_ + R"(
+  ENTRY main {
+    Arg_0.1.0 = bf16[84,197,768]{2,1,0} parameter(0), sharding={replicated}
+    Arg_0.1 = f32[84,197,768]{2,1,0} convert(Arg_0.1.0)
+  )" + common_hlo_entry_computation_block_ +
+      R"(
+    add.338 = f32[84,197,768]{2,1,0} add(multiply.331, subtract.337)
+    ROOT convert.339 = bf16[84,197,768]{2,1,0} convert(add.338)
+  }
+  )";
 
-  region_0.16 {
-    Arg_0.17 = f32[] parameter(0)
-    Arg_1.18 = f32[] parameter(1)
-    ROOT add.19 = f32[] add(Arg_0.17, Arg_1.18)
+  EXPECT_TRUE(RunAndCompare(layer_norm_module_str, ErrorSpec{1e-2, 1e-2}));
+  MatchOptimizedHlo(layer_norm_module_str, onednn_layer_norm_);
+}
+
+TEST_F(LayerNormTest, LayerNormTest0_F16) {
+  std::string layer_norm_module_str =
+      R"(HloModule layer_norm.test, entry_computation_layout={(f16[84,197,768]{2,1,0}, f32[768]{0}, f32[768]{0})->f16[84,197,768]{2,1,0}})" +
+      common_hlo_region_ + R"(
+  ENTRY main {
+    Arg_0.1.0 = f16[84,197,768]{2,1,0} parameter(0), sharding={replicated}
+    Arg_0.1 = f32[84,197,768]{2,1,0} convert(Arg_0.1.0)
+  )" + common_hlo_entry_computation_block_ +
+      R"(
+    add.338 = f32[84,197,768]{2,1,0} add(multiply.331, subtract.337)
+    ROOT convert.339 = f16[84,197,768]{2,1,0} convert(add.338)
   }
+  )";
 
-  ENTRY main.53 {
-    Arg_2.3 = bf16[16,128,768]{2,1,0} parameter(2), sharding={replicated}
-    convert.31 = f32[16,128,768]{2,1,0} convert(Arg_2.3)
-    convert.11 = f32[16,128,768]{2,1,0} convert(Arg_2.3)
-    reshape.13 = f32[1,16,128,768]{3,2,1,0} reshape(convert.11)
-    multiply.12 = f32[16,128,768]{2,1,0} multiply(convert.11, convert.11)
-    reshape.14 = f32[1,16,128,768]{3,2,1,0} reshape(multiply.12)
-    concatenate.15 = f32[2,16,128,768]{3,2,1,0} concatenate(reshape.13, reshape.14), dimensions={0}
-    constant.10 = f32[] constant(0)
-    reduce.20 = f32[2,16,128]{2,1,0} reduce(concatenate.15, constant.10), dimensions={3}, to_apply=region_0.16
-    constant.8 = f32[] constant(768)
-    broadcast.9 = f32[2,16,128]{2,1,0} broadcast(constant.8), dimensions={}
-    divide.21 = f32[2,16,128]{2,1,0} divide(reduce.20, broadcast.9)
-    slice.22 = f32[1,16,128]{2,1,0} slice(divide.21), slice={[0:1], [0:16], [0:128]}
-    reshape.29 = f32[16,128,1]{2,1,0} reshape(slice.22)
-    broadcast.32 = f32[16,128,1]{2,1,0} broadcast(reshape.29), dimensions={0,1,2}
-    reshape.33 = f32[16,128]{1,0} reshape(broadcast.32)
-    broadcast.34 = f32[16,128,768]{2,1,0} broadcast(reshape.33), dimensions={0,1}
-    subtract.35 = f32[16,128,768]{2,1,0} subtract(convert.31, broadcast.34)
-    slice.24 = f32[1,16,128]{2,1,0} slice(divide.21), slice={[1:2], [0:16], [0:128]}
-    reshape.25 = f32[16,128]{1,0} reshape(slice.24)
-    reshape.23 = f32[16,128]{1,0} reshape(slice.22)
-    multiply.26 = f32[16,128]{1,0} multiply(reshape.23, reshape.23)
-    subtract.27 = f32[16,128]{1,0} subtract(reshape.25, multiply.26)
-    constant.6 = f32[] constant(0)
-    broadcast.7 = f32[16,128]{1,0} broadcast(constant.6), dimensions={}
-    maximum.28 = f32[16,128]{1,0} maximum(subtract.27, broadcast.7)
-    reshape.30 = f32[16,128,1]{2,1,0} reshape(maximum.28)
-    constant.4 = f32[] constant(1e-06)
-    broadcast.5 = f32[16,128,1]{2,1,0} broadcast(constant.4), dimensions={}
-    add.36 = f32[16,128,1]{2,1,0} add(reshape.30, broadcast.5)
-    rsqrt.37 = f32[16,128,1]{2,1,0} rsqrt(add.36)
-    broadcast.39 = f32[16,128,1]{2,1,0} broadcast(rsqrt.37), dimensions={0,1,2}
-    reshape.40 = f32[16,128]{1,0} reshape(broadcast.39)
-    broadcast.41 = f32[16,128,768]{2,1,0} broadcast(reshape.40), dimensions={0,1}
-    Arg_1.2 = f32[768]{0} parameter(1), sharding={replicated}
-    reshape.38 = f32[1,1,768]{2,1,0} reshape(Arg_1.2)
-    broadcast.42 = f32[1,1,768]{2,1,0} broadcast(reshape.38), dimensions={0,1,2}
-    reshape.43 = f32[768]{0} reshape(broadcast.42)
-    broadcast.44 = f32[16,128,768]{2,1,0} broadcast(reshape.43), dimensions={2}
-    multiply.45 = f32[16,128,768]{2,1,0} multiply(broadcast.41, broadcast.44)
-    multiply.46 = f32[16,128,768]{2,1,0} multiply(subtract.35, multiply.45)
-    Arg_0.1 = f32[768]{0} parameter(0), sharding={replicated}
-    reshape.47 = f32[1,1,768]{2,1,0} reshape(Arg_0.1)
-    broadcast.48 = f32[1,1,768]{2,1,0} broadcast(reshape.47), dimensions={0,1,2}
-    reshape.49 = f32[768]{0} reshape(broadcast.48)
-    broadcast.50 = f32[16,128,768]{2,1,0} broadcast(reshape.49), dimensions={2}
-    add.51 = f32[16,128,768]{2,1,0} add(multiply.46, broadcast.50)
-    ROOT convert.52 = bf16[16,128,768]{2,1,0} convert(add.51)
+  EXPECT_TRUE(RunAndCompare(layer_norm_module_str, ErrorSpec{1e-2, 1e-2}));
+  MatchOptimizedHlo(layer_norm_module_str, onednn_layer_norm_);
+}
+
+// Test case encountered in models like TFViTForImageClassification in
+// HuggingFace
+// (https://huggingface.co/docs/transformers/model_doc/vit#transformers.TFViTForImageClassification)
+TEST_F(LayerNormTest, LayerNormTest1_BF16) {
+  const char* layer_norm_module_str = R"(
+  HloModule layer_norm.test
+  region_add {
+    Arg_0.7555 = f32[] parameter(0)
+    Arg_1.7556 = f32[] parameter(1)
+    ROOT add.7557 = f32[] add(Arg_0.7555, Arg_1.7556)
+  }
+  ENTRY main {
+    Arg_0.1 = bf16[160,197,768] parameter(0), sharding={replicated}
+    Arg_0.2 = bf16[768] parameter(1), sharding={replicated}
+    Arg_0.3 = bf16[768] parameter(2), sharding={replicated}
+    convert.80 = f32[160,197,768] convert(Arg_0.1)
+    constant.81 = f32[] constant(0)
+    convert.82 = f32[] convert(constant.81)
+    reduce.87 = f32[160,197] reduce(convert.80, convert.82), dimensions={2}, to_apply=region_add
+    constant.88 = s32[] constant(768)
+    convert.89 = f32[] convert(constant.88)
+    broadcast.90 = f32[160,197] broadcast(convert.89), dimensions={}
+    divide.91 = f32[160,197] divide(reduce.87, broadcast.90)
+    convert.92 = bf16[160,197] convert(divide.91)
+    reshape.93 = bf16[160,197,1] reshape(convert.92)
+    reshape.94 = bf16[160,197] reshape(reshape.93)
+    broadcast.95 = bf16[160,197,768] broadcast(reshape.94), dimensions={0,1}
+    subtract.96 = bf16[160,197,768] subtract(Arg_0.1, broadcast.95)
+    multiply.97 = bf16[160,197,768] multiply(subtract.96, subtract.96)
+    convert.98 = f32[160,197,768] convert(multiply.97)
+    constant.99 = f32[] constant(0)
+    convert.100 = f32[] convert(constant.99)
+    reduce.105 = f32[160,197] reduce(convert.98, convert.100), dimensions={2}, to_apply=region_add
+    constant.106 = s32[] constant(768)
+    convert.107 = f32[] convert(constant.106)
+    broadcast.108 = f32[160,197] broadcast(convert.107), dimensions={}
+    divide.109 = f32[160,197] divide(reduce.105, broadcast.108)
+    convert.110 = bf16[160,197] convert(divide.109)
+    reshape.111 = bf16[160,197,1] reshape(convert.110)
+    constant.112 = bf16[] constant(1.002e-12)
+    broadcast.113 = bf16[160,197,1] broadcast(constant.112), dimensions={}
+    add.114 = bf16[160,197,1] add(reshape.111, broadcast.113)
+    rsqrt.115 = bf16[160,197,1] rsqrt(add.114)
+    reshape.118 = bf16[160,197] reshape(rsqrt.115)
+    broadcast.119 = bf16[160,197,768] broadcast(reshape.118), dimensions={0,1}
+    broadcast.117 = bf16[160,197,768] broadcast(Arg_0.2), dimensions={2}
+    multiply.120 = bf16[160,197,768] multiply(broadcast.119, broadcast.117)
+    multiply.121 = bf16[160,197,768] multiply(Arg_0.1, multiply.120)
+    broadcast.126 = bf16[160,197,768] broadcast(Arg_0.3), dimensions={2}
+    reshape.122 = bf16[160,197] reshape(reshape.93)
+    broadcast.123 = bf16[160,197,768] broadcast(reshape.122), dimensions={0,1}
+    multiply.124 = bf16[160,197,768] multiply(multiply.120, broadcast.123)
+    subtract.127 = bf16[160,197,768] subtract(broadcast.126, multiply.124)
+    ROOT add.128 = bf16[160,197,768] add(multiply.121, subtract.127)
   }
 )";
 
diff --git a/third_party/xla/xla/tests/onednn_matmul_test.cc b/third_party/xla/xla/tests/onednn_matmul_test.cc
index 8d6a307b029a90..64befeab768c25 100644
--- a/third_party/xla/xla/tests/onednn_matmul_test.cc
+++ b/third_party/xla/xla/tests/onednn_matmul_test.cc
@@ -17,15 +17,20 @@ limitations under the License.
 
 #include <utility>
 
+#include "xla/hlo/utils/hlo_matchers.h"
 #include "xla/literal.h"
+#include "xla/service/cpu/onednn_matmul_rewriter.h"
 #include "xla/service/cpu/onednn_util.h"
 #include "xla/shape_util.h"
 #include "xla/test.h"
 #include "xla/test_helpers.h"
+#include "xla/tests/filecheck.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/tests/test_macros.h"
 #include "tsl/platform/cpu_info.h"
 
+namespace op = xla::testing::opcode_matchers;
+
 namespace xla {
 namespace cpu {
 
@@ -62,12 +67,12 @@ class MatmulTest : public HloTestBase {
 
 TEST_F(MatmulTest, SimpleTestF32) {
   const char* matmul_module_str = R"(
-  HloModule matmul.test.f32, entry_computation_layout={(f32[32,8,128,64]{3,2,1,0},f32[32,8,64,128]{3,2,1,0})->f32[32,8,128,128]{3,2,1,0}}
+  HloModule matmul.test.f32
 
   ENTRY matmul.test.f32 {
-    arg.0 = f32[32,8,128,64]{3,2,1,0} parameter(0), parameter_replication={false}
-    arg.1 = f32[32,8,64,128]{3,2,1,0} parameter(1), parameter_replication={false}
-    ROOT onednn.matmul.0 = f32[32,8,128,128]{3,2,1,0} dot(arg.0, arg.1), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
+    arg.0 = f32[32,8,128,64] parameter(0), parameter_replication={false}
+    arg.1 = f32[32,8,64,128] parameter(1), parameter_replication={false}
+    ROOT onednn.matmul.0 = f32[32,8,128,128] dot(arg.0, arg.1), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
@@ -82,12 +87,30 @@ TEST_F(MatmulTest, SimpleTestBF16) {
   }
 
   const char* matmul_module_str = R"(
-  HloModule matmul.test.bf16, entry_computation_layout={(bf16[32,8,128,64]{3,2,1,0},bf16[32,8,64,128]{3,2,1,0})->bf16[32,8,128,128]{3,2,1,0}}
+  HloModule matmul.test.bf16
 
   ENTRY matmul.test.bf16 {
-    arg.0 = bf16[32,8,128,64]{3,2,1,0} parameter(0), parameter_replication={false}
-    arg.1 = bf16[32,8,64,128]{3,2,1,0} parameter(1), parameter_replication={false}
-    ROOT onednn.matmul.0 = bf16[32,8,128,128]{3,2,1,0} dot(arg.0, arg.1), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
+    arg.0 = bf16[32,8,128,64] parameter(0), parameter_replication={false}
+    arg.1 = bf16[32,8,64,128] parameter(1), parameter_replication={false}
+    ROOT onednn.matmul.0 = bf16[32,8,128,128] dot(arg.0, arg.1), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
+  })";
+
+  EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-2, 1e-4}));
+  MatchOptimizedHlo(matmul_module_str, matmul_rewrite_str_);
+}
+
+TEST_F(MatmulTest, SimpleTestF16) {
+  if (!IsSupportedType(PrimitiveType::F16)) {
+    GTEST_SKIP() << "CPU does not support F16.";
+  }
+
+  const char* matmul_module_str = R"(
+  HloModule matmul.test.f16
+
+  ENTRY matmul.test.f16 {
+    arg.0 = f16[32,8,128,64] parameter(0), parameter_replication={false}
+    arg.1 = f16[32,8,64,128] parameter(1), parameter_replication={false}
+    ROOT onednn.matmul.0 = f16[32,8,128,128] dot(arg.0, arg.1), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-2, 1e-4}));
@@ -96,12 +119,12 @@ TEST_F(MatmulTest, SimpleTestBF16) {
 
 TEST_F(MatmulTest, SimpleTestF32TransposeB) {
   const char* matmul_module_str = R"(
-  HloModule matmul.test.1, entry_computation_layout={(f32[32,8,128,64]{3,1,2,0},f32[32,8,128,64]{3,1,2,0})->f32[32,8,128,128]{3,2,1,0}}
+  HloModule matmul.test.1
 
   ENTRY matmul.test.1 {
     arg.0 = f32[32,8,128,64]{3,1,2,0} parameter(0), parameter_replication={false}
     arg.1 = f32[32,8,128,64]{3,1,2,0} parameter(1), parameter_replication={false}
-    ROOT onednn.matmul.0 = f32[32,8,128,128]{3,2,1,0} dot(arg.0, arg.1), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={3}
+    ROOT onednn.matmul.0 = f32[32,8,128,128] dot(arg.0, arg.1), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={3}
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
@@ -110,21 +133,21 @@ TEST_F(MatmulTest, SimpleTestF32TransposeB) {
 
 TEST_F(MatmulTest, SimpleTestF32WithBiasAddFusion1) {
   const char* matmul_module_str = R"(
-  HloModule matmul.biasadd.test.f32, entry_computation_layout={(f32[32,32,40,30]{3,2,1,0})->f32[32,32,40,40]{3,2,1,0}}
-  
+  HloModule matmul.biasadd.test.f32
+
   ENTRY matmul.biasadd.test.f32 {
-    arg0.1 = f32[32,32,40,30]{3,2,1,0} parameter(0), parameter_replication={false}
-    reshape.2 = f32[32,32,40,30]{3,2,1,0} reshape(arg0.1)
+    arg0.1 = f32[32,32,40,30] parameter(0), parameter_replication={false}
+    reshape.2 = f32[32,32,40,30] reshape(arg0.1)
     constant.3 = f32[] constant(1)
-    broadcast.4 = f32[32,32,30,40]{3,2,1,0} broadcast(constant.3), dimensions={}
-    dot.7 = f32[32,32,40,40]{3,2,1,0} dot(reshape.2, broadcast.4), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
+    broadcast.4 = f32[32,32,30,40] broadcast(constant.3), dimensions={}
+    dot.7 = f32[32,32,40,40] dot(reshape.2, broadcast.4), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
     constant.5 = f32[] constant(15)
-    broadcast.6 = f32[40]{0} broadcast(constant.5), dimensions={}
-    broadcast.9 = f32[32,32,40,40]{3,2,1,0} broadcast(broadcast.6), dimensions={3}
-    add.10 = f32[32,32,40,40]{3,2,1,0} add(dot.7, broadcast.9)
-    reshape.11 = f32[32,32,40,40]{3,2,1,0} reshape(add.10)
-    tuple.12 = (f32[32,32,40,40]{3,2,1,0}) tuple(reshape.11)
-    ROOT get-tuple-element.13 = f32[32,32,40,40]{3,2,1,0} get-tuple-element(tuple.12), index=0
+    broadcast.6 = f32[40] broadcast(constant.5), dimensions={}
+    broadcast.9 = f32[32,32,40,40] broadcast(broadcast.6), dimensions={3}
+    add.10 = f32[32,32,40,40] add(dot.7, broadcast.9)
+    reshape.11 = f32[32,32,40,40] reshape(add.10)
+    tuple.12 = (f32[32,32,40,40]) tuple(reshape.11)
+    ROOT get-tuple-element.13 = f32[32,32,40,40] get-tuple-element(tuple.12), index=0
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
@@ -133,21 +156,21 @@ TEST_F(MatmulTest, SimpleTestF32WithBiasAddFusion1) {
 
 TEST_F(MatmulTest, SimpleTestF32WithBiasAddFusion2) {
   const char* matmul_module_str = R"(
-  HloModule matmul.biasadd.test.f32, entry_computation_layout={(f32[400,300]{1,0})->f32[400,1,400]{2,1,0}}
+  HloModule matmul.biasadd.test.f32
   
   ENTRY matmul.biasadd.test.f32 {
-    arg0.1 = f32[400,300]{1,0} parameter(0), parameter_replication={false}
-    reshape.2 = f32[400,300]{1,0} reshape(arg0.1)
+    arg0.1 = f32[400,300] parameter(0), parameter_replication={false}
+    reshape.2 = f32[400,300] reshape(arg0.1)
     constant.3 = f32[] constant(1)
-    broadcast.4 = f32[300,400]{1,0} broadcast(constant.3), dimensions={}
-    dot.7 = f32[400,400]{1,0} dot(reshape.2, broadcast.4), lhs_batch_dims={}, lhs_contracting_dims={1}, rhs_batch_dims={}, rhs_contracting_dims={0}
-    reshape.1 = f32[400,1,400]{2,1,0} reshape(dot.7)
+    broadcast.4 = f32[300,400] broadcast(constant.3), dimensions={}
+    dot.7 = f32[400,400] dot(reshape.2, broadcast.4), lhs_batch_dims={}, lhs_contracting_dims={1}, rhs_batch_dims={}, rhs_contracting_dims={0}
+    reshape.1 = f32[400,1,400] reshape(dot.7)
     constant.5 = f32[] constant(15)
-    broadcast.6 = f32[400]{0} broadcast(constant.5), dimensions={}
-    broadcast.9 = f32[400,1,400]{2,1,0} broadcast(broadcast.6), dimensions={2}
-    add.10 = f32[400,1,400]{2,1,0} add(reshape.1, broadcast.9)
-    tuple.12 = (f32[400,1,400]{2,1,0}) tuple(add.10)
-    ROOT get-tuple-element.13 = f32[400,1,400]{2,1,0} get-tuple-element(tuple.12), index=0
+    broadcast.6 = f32[400] broadcast(constant.5), dimensions={}
+    broadcast.9 = f32[400,1,400] broadcast(broadcast.6), dimensions={2}
+    add.10 = f32[400,1,400] add(reshape.1, broadcast.9)
+    tuple.12 = (f32[400,1,400]) tuple(add.10)
+    ROOT get-tuple-element.13 = f32[400,1,400] get-tuple-element(tuple.12), index=0
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
@@ -156,17 +179,17 @@ TEST_F(MatmulTest, SimpleTestF32WithBiasAddFusion2) {
 
 TEST_F(MatmulTest, SimpleTestF32WithBiasAsParameter1) {
   const char* matmul_module_str = R"(
-  HloModule matmul.biasadd.test.f32, entry_computation_layout={(f32[32,32,40,30]{3,2,1,0}, f32[32,32,30,40]{3,2,1,0}, f32[32,32,40,40]{3,2,1,0})->f32[32,32,40,40]{3,2,1,0}}
-  
+  HloModule matmul.biasadd.test.f32
+
   ENTRY matmul.biasadd.test.f32 {
-    arg0.1 = f32[32,32,40,30]{3,2,1,0} parameter(0), parameter_replication={false}
-    arg0.2 = f32[32,32,30,40]{3,2,1,0} parameter(1), parameter_replication={false}
-    arg0.3 = f32[32,32,40,40]{3,2,1,0} parameter(2), parameter_replication={false}
-    dot.7 = f32[32,32,40,40]{3,2,1,0} dot(arg0.1, arg0.2), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
-    add.10 = f32[32,32,40,40]{3,2,1,0} add(dot.7, arg0.3)
-    reshape.11 = f32[32,32,40,40]{3,2,1,0} reshape(add.10)
-    tuple.12 = (f32[32,32,40,40]{3,2,1,0}) tuple(reshape.11)
-    ROOT get-tuple-element.13 = f32[32,32,40,40]{3,2,1,0} get-tuple-element(tuple.12), index=0
+    arg0.1 = f32[32,32,40,30] parameter(0), parameter_replication={false}
+    arg0.2 = f32[32,32,30,40] parameter(1), parameter_replication={false}
+    arg0.3 = f32[32,32,40,40] parameter(2), parameter_replication={false}
+    dot.7 = f32[32,32,40,40] dot(arg0.1, arg0.2), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
+    add.10 = f32[32,32,40,40] add(dot.7, arg0.3)
+    reshape.11 = f32[32,32,40,40] reshape(add.10)
+    tuple.12 = (f32[32,32,40,40]) tuple(reshape.11)
+    ROOT get-tuple-element.13 = f32[32,32,40,40] get-tuple-element(tuple.12), index=0
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
@@ -175,18 +198,18 @@ TEST_F(MatmulTest, SimpleTestF32WithBiasAsParameter1) {
 
 TEST_F(MatmulTest, SimpleTestF32WithBiasAsParameter2) {
   const char* matmul_module_str = R"(
-  HloModule matmul.biasadd.test.f32, entry_computation_layout={(f32[32,32,40,30]{3,2,1,0}, f32[32,32,30,40]{3,2,1,0}, f32[40]{0})->f32[32,32,40,40]{3,2,1,0}}
-  
+  HloModule matmul.biasadd.test.f32
+
   ENTRY matmul.biasadd.test.f32 {
-    arg0.1 = f32[32,32,40,30]{3,2,1,0} parameter(0), parameter_replication={false}
-    arg0.2 = f32[32,32,30,40]{3,2,1,0} parameter(1), parameter_replication={false}
+    arg0.1 = f32[32,32,40,30] parameter(0), parameter_replication={false}
+    arg0.2 = f32[32,32,30,40] parameter(1), parameter_replication={false}
     arg0.3 = f32[40]{0} parameter(2), parameter_replication={false}
-    dot.7 = f32[32,32,40,40]{3,2,1,0} dot(arg0.1, arg0.2), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
-    broad.1 = f32[32,32,40,40]{3,2,1,0} broadcast(arg0.3), dimensions={3}
-    add.10 = f32[32,32,40,40]{3,2,1,0} add(dot.7, broad.1)
-    reshape.11 = f32[32,32,40,40]{3,2,1,0} reshape(add.10)
-    tuple.12 = (f32[32,32,40,40]{3,2,1,0}) tuple(reshape.11)
-    ROOT get-tuple-element.13 = f32[32,32,40,40]{3,2,1,0} get-tuple-element(tuple.12), index=0
+    dot.7 = f32[32,32,40,40] dot(arg0.1, arg0.2), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
+    broad.1 = f32[32,32,40,40] broadcast(arg0.3), dimensions={3}
+    add.10 = f32[32,32,40,40] add(dot.7, broad.1)
+    reshape.11 = f32[32,32,40,40] reshape(add.10)
+    tuple.12 = (f32[32,32,40,40]) tuple(reshape.11)
+    ROOT get-tuple-element.13 = f32[32,32,40,40] get-tuple-element(tuple.12), index=0
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
@@ -195,18 +218,18 @@ TEST_F(MatmulTest, SimpleTestF32WithBiasAsParameter2) {
 
 TEST_F(MatmulTest, SimpleTestF32WithBiasAsParameter2D) {
   const char* matmul_module_str = R"(
-  HloModule matmul.biasadd.test.f32, entry_computation_layout={(f32[2,2,400,30]{3,2,1,0}, f32[2,2,30,400]{3,2,1,0}, f32[2,400]{1,0})->f32[2,2,400,400]{3,2,1,0}}
-  
+  HloModule matmul.biasadd.test.f32
+
   ENTRY matmul.biasadd.test.f32 {
-    arg0.1 = f32[2,2,400,30]{3,2,1,0} parameter(0), parameter_replication={false}
-    arg0.2 = f32[2,2,30,400]{3,2,1,0} parameter(1), parameter_replication={false}
-    arg0.3 = f32[2,400]{1,0} parameter(2), parameter_replication={false}
-    dot.7 = f32[2,2,400,400]{3,2,1,0} dot(arg0.1, arg0.2), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
-    broad.1 = f32[2,2,400,400]{3,2,1,0} broadcast(arg0.3), dimensions={0,3}
-    add.10 = f32[2,2,400,400]{3,2,1,0} add(dot.7, broad.1)
-    reshape.11 = f32[2,2,400,400]{3,2,1,0} reshape(add.10)
-    tuple.12 = (f32[2,2,400,400]{3,2,1,0}) tuple(reshape.11)
-    ROOT get-tuple-element.13 = f32[2,2,400,400]{3,2,1,0} get-tuple-element(tuple.12), index=0
+    arg0.1 = f32[2,2,400,30] parameter(0), parameter_replication={false}
+    arg0.2 = f32[2,2,30,400] parameter(1), parameter_replication={false}
+    arg0.3 = f32[2,400] parameter(2), parameter_replication={false}
+    dot.7 = f32[2,2,400,400] dot(arg0.1, arg0.2), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
+    broad.1 = f32[2,2,400,400] broadcast(arg0.3), dimensions={0,3}
+    add.10 = f32[2,2,400,400] add(dot.7, broad.1)
+    reshape.11 = f32[2,2,400,400] reshape(add.10)
+    tuple.12 = (f32[2,2,400,400]) tuple(reshape.11)
+    ROOT get-tuple-element.13 = f32[2,2,400,400] get-tuple-element(tuple.12), index=0
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
@@ -215,18 +238,18 @@ TEST_F(MatmulTest, SimpleTestF32WithBiasAsParameter2D) {
 
 TEST_F(MatmulTest, SimpleTestF32WithBiasAsParameter2D1B) {
   const char* matmul_module_str = R"(
-  HloModule matmul.biasadd.test.f32, entry_computation_layout={(f32[1,2,400,30]{3,2,1,0}, f32[1,2,30,400]{3,2,1,0}, f32[1,400]{1,0})->f32[1,2,400,400]{3,2,1,0}}
-  
+  HloModule matmul.biasadd.test.f32
+
   ENTRY matmul.biasadd.test.f32 {
-    arg0.1 = f32[1,2,400,30]{3,2,1,0} parameter(0), parameter_replication={false}
-    arg0.2 = f32[1,2,30,400]{3,2,1,0} parameter(1), parameter_replication={false}
-    arg0.3 = f32[1,400]{1,0} parameter(2), parameter_replication={false}
-    dot.7 = f32[1,2,400,400]{3,2,1,0} dot(arg0.1, arg0.2), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
-    broad.1 = f32[1,2,400,400]{3,2,1,0} broadcast(arg0.3), dimensions={0,3}
-    add.10 = f32[1,2,400,400]{3,2,1,0} add(dot.7, broad.1)
-    reshape.11 = f32[1,2,400,400]{3,2,1,0} reshape(add.10)
-    tuple.12 = (f32[1,2,400,400]{3,2,1,0}) tuple(reshape.11)
-    ROOT get-tuple-element.13 = f32[1,2,400,400]{3,2,1,0} get-tuple-element(tuple.12), index=0
+    arg0.1 = f32[1,2,400,30] parameter(0), parameter_replication={false}
+    arg0.2 = f32[1,2,30,400] parameter(1), parameter_replication={false}
+    arg0.3 = f32[1,400] parameter(2), parameter_replication={false}
+    dot.7 = f32[1,2,400,400] dot(arg0.1, arg0.2), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
+    broad.1 = f32[1,2,400,400] broadcast(arg0.3), dimensions={0,3}
+    add.10 = f32[1,2,400,400] add(dot.7, broad.1)
+    reshape.11 = f32[1,2,400,400] reshape(add.10)
+    tuple.12 = (f32[1,2,400,400]) tuple(reshape.11)
+    ROOT get-tuple-element.13 = f32[1,2,400,400] get-tuple-element(tuple.12), index=0
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
@@ -235,18 +258,18 @@ TEST_F(MatmulTest, SimpleTestF32WithBiasAsParameter2D1B) {
 
 TEST_F(MatmulTest, SimpleTestF32WithBiasAsParameter3) {
   const char* matmul_module_str = R"(
-  HloModule matmul.biasadd.test.f32, entry_computation_layout={(f32[16,128,768]{2,1,0}, f32[768,768]{1,0}, f32[768]{0})->f32[16,128,768]{2,1,0}}
-  
+  HloModule matmul.biasadd.test.f32
+
   ENTRY matmul.biasadd.test.f32 {
-    arg0.1 = f32[16,128,768]{2,1,0} parameter(0), sharding={replicated}
-    arg0.2 = f32[768,768]{1,0} parameter(1), sharding={replicated}
-    dot.84 = f32[16,128,768]{2,1,0} dot(arg0.1, arg0.2), lhs_contracting_dims={2}, rhs_contracting_dims={0}
+    arg0.1 = f32[16,128,768] parameter(0), sharding={replicated}
+    arg0.2 = f32[768,768] parameter(1), sharding={replicated}
+    dot.84 = f32[16,128,768] dot(arg0.1, arg0.2), lhs_contracting_dims={2}, rhs_contracting_dims={0}
     arg0.3 = f32[768]{0} parameter(2), sharding={replicated}
-    reshape.85 = f32[1,1,768]{2,1,0} reshape(arg0.3)
-    broadcast.86 = f32[1,1,768]{2,1,0} broadcast(reshape.85), dimensions={0,1,2}
+    reshape.85 = f32[1,1,768] reshape(arg0.3)
+    broadcast.86 = f32[1,1,768] broadcast(reshape.85), dimensions={0,1,2}
     reshape.87 = f32[768]{0} reshape(broadcast.86)
-    broadcast.88 = f32[16,128,768]{2,1,0} broadcast(reshape.87), dimensions={2}
-    ROOT add.89 = f32[16,128,768]{2,1,0} add(dot.84, broadcast.88)
+    broadcast.88 = f32[16,128,768] broadcast(reshape.87), dimensions={2}
+    ROOT add.89 = f32[16,128,768] add(dot.84, broadcast.88)
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
@@ -255,19 +278,19 @@ TEST_F(MatmulTest, SimpleTestF32WithBiasAsParameter3) {
 
 TEST_F(MatmulTest, SimpleTestF32TransposeBWithBiasAddFusion) {
   const char* matmul_module_str = R"(
-  HloModule matmul.test.1, entry_computation_layout={(f32[32,8,4,16]{3,1,2,0},f32[32,8,4,16]{3,1,2,0})->f32[32,8,4,4]{3,2,1,0}}
-  
+  HloModule matmul.test.1
+
   ENTRY matmul.test.1 {
     arg.0 = f32[32,8,4,16]{3,1,2,0} parameter(0), parameter_replication={false}
-    arg.1 = f32[32,8,4,16]{3,1,2,0} parameter(1), parameter_replication={false}
-    dot.7 = f32[32,8,4,4]{3,2,1,0} dot(arg.0, arg.1), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={3}
+    arg.1 = f32[32,8,16,16]{3,1,2,0} parameter(1), parameter_replication={false}
+    dot.7 = f32[32,8,4,16]{3,2,1,0} dot(arg.0, arg.1), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={3}
     constant.5 = f32[] constant(15)
-    broadcast.6 = f32[4]{0} broadcast(constant.5), dimensions={}
-    broadcast.9 = f32[32,8,4,4]{3,2,1,0} broadcast(broadcast.6), dimensions={3}
-    add.10 = f32[32,8,4,4]{3,2,1,0} add(dot.7, broadcast.9)
-    reshape.11 = f32[32,8,4,4]{3,2,1,0} reshape(add.10)
-    tuple.12 = (f32[32,8,4,4]{3,2,1,0}) tuple(reshape.11)
-    ROOT get-tuple-element.13 = f32[32,8,4,4]{3,2,1,0} get-tuple-element(tuple.12), index=0
+    broadcast.6 = f32[16]{0} broadcast(constant.5), dimensions={}
+    broadcast.9 = f32[32,8,4,16]{3,2,1,0} broadcast(broadcast.6), dimensions={3}
+    add.10 = f32[32,8,4,16]{3,2,1,0} add(dot.7, broadcast.9)
+    reshape.11 = f32[32,8,4,16]{3,2,1,0} reshape(add.10)
+    tuple.12 = (f32[32,8,4,16]{3,2,1,0}) tuple(reshape.11)
+    ROOT get-tuple-element.13 = f32[32,8,4,16]{3,2,1,0} get-tuple-element(tuple.12), index=0
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
@@ -276,19 +299,19 @@ TEST_F(MatmulTest, SimpleTestF32TransposeBWithBiasAddFusion) {
 
 TEST_F(MatmulTest, F32BiasAddFusionNonCompatibleBias) {
   const char* matmul_module_str = R"(
-  HloModule matmul.test.f32, entry_computation_layout={(f32[12288,2]{1,0},f32[2,1024]{1,0})->f32[32,384,1024]{2,1,0}}
+  HloModule matmul.test.f32
 
   ENTRY matmul.test.1 {
-    arg.0 = f32[12288,2]{1,0} parameter(0), parameter_replication={false}
-    arg.1 = f32[2,1024]{1,0} parameter(1), parameter_replication={false}
-    dot.0 = f32[12288,1024]{1,0} dot(arg.0, arg.1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-    reshape.0 = f32[32,384,1024]{2,1,0} reshape(dot.0)
-    constant.0 = f32[1,384,1024]{2,1,0} constant(15)
-    reshape.1 = f32[384,1024]{1,0} reshape(constant.0)
-    broadcast.0 = f32[32,384,1024]{2,1,0} broadcast(reshape.1), dimensions={1,2}
-    add.0 = f32[32,384,1024]{2,1,0} add(reshape.0, broadcast.0)
-    tuple.0 = (f32[32,384,1024]{2,1,0}) tuple(add.0)
-    ROOT get-tuple-element.0 = f32[32,384,1024]{2,1,0} get-tuple-element(tuple.0), index=0
+    arg.0 = f32[12288,2] parameter(0), parameter_replication={false}
+    arg.1 = f32[2,1024] parameter(1), parameter_replication={false}
+    dot.0 = f32[12288,1024] dot(arg.0, arg.1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    reshape.0 = f32[32,384,1024] reshape(dot.0)
+    constant.0 = f32[1,384,1024] constant(15)
+    reshape.1 = f32[384,1024] reshape(constant.0)
+    broadcast.0 = f32[32,384,1024] broadcast(reshape.1), dimensions={1,2}
+    add.0 = f32[32,384,1024] add(reshape.0, broadcast.0)
+    tuple.0 = (f32[32,384,1024]) tuple(add.0)
+    ROOT get-tuple-element.0 = f32[32,384,1024] get-tuple-element(tuple.0), index=0
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
@@ -297,29 +320,29 @@ TEST_F(MatmulTest, F32BiasAddFusionNonCompatibleBias) {
 
 TEST_F(MatmulTest, ApproxGELUTestF32) {
   const char* matmul_module_str = R"(
-  HloModule matmul.test.f32, entry_computation_layout={(f32[32,32,4,16]{3,2,1,0},f32[32,32,16,32]{3,2,1,0})->f32[32,32,4,32]{3,2,1,0}}
+  HloModule matmul.test.f32
 
   ENTRY matmul.test.f32 {
-    arg.0 = f32[32,32,4,16]{3,2,1,0} parameter(0), parameter_replication={false}
-    arg.1 = f32[32,32,16,32]{3,2,1,0} parameter(1), parameter_replication={false}
-    onednn.matmul.0 = f32[32,32,4,32]{3,2,1,0} dot(arg.0, arg.1), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
-    mul.0 = f32[32,32,4,32]{3,2,1,0} multiply(onednn.matmul.0, onednn.matmul.0)
-    mul.1 = f32[32,32,4,32]{3,2,1,0} multiply(onednn.matmul.0, mul.0)
+    arg.0 = f32[32,32,4,16] parameter(0), parameter_replication={false}
+    arg.1 = f32[32,32,16,32] parameter(1), parameter_replication={false}
+    onednn.matmul.0 = f32[32,32,4,32] dot(arg.0, arg.1), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
+    mul.0 = f32[32,32,4,32] multiply(onednn.matmul.0, onednn.matmul.0)
+    mul.1 = f32[32,32,4,32] multiply(onednn.matmul.0, mul.0)
     const.0 = f32[] constant(0.044715)
-    bcast.0 = f32[32,32,4,32]{3,2,1,0} broadcast(const.0), dimensions={}
-    mul.2 = f32[32,32,4,32]{3,2,1,0} multiply(mul.1, bcast.0)
-    add.0 = f32[32,32,4,32]{3,2,1,0} add(onednn.matmul.0, mul.2)
+    bcast.0 = f32[32,32,4,32] broadcast(const.0), dimensions={}
+    mul.2 = f32[32,32,4,32] multiply(mul.1, bcast.0)
+    add.0 = f32[32,32,4,32] add(onednn.matmul.0, mul.2)
     const.1 = f32[] constant(0.797884583)
-    bcast.1 = f32[32,32,4,32]{3,2,1,0} broadcast(const.1), dimensions={}
-    mul.3 = f32[32,32,4,32]{3,2,1,0} multiply(add.0, bcast.1)
-    tanh = f32[32,32,4,32]{3,2,1,0} tanh(mul.3)
+    bcast.1 = f32[32,32,4,32] broadcast(const.1), dimensions={}
+    mul.3 = f32[32,32,4,32] multiply(add.0, bcast.1)
+    tanh = f32[32,32,4,32] tanh(mul.3)
     const.2 = f32[] constant(1)
-    bcast.2 = f32[32,32,4,32]{3,2,1,0} broadcast(const.2), dimensions={}
-    add.2 = f32[32,32,4,32]{3,2,1,0} add(tanh, bcast.2)
+    bcast.2 = f32[32,32,4,32] broadcast(const.2), dimensions={}
+    add.2 = f32[32,32,4,32] add(tanh, bcast.2)
     const.3 = f32[] constant(0.5)
-    bcast.3 = f32[32,32,4,32]{3,2,1,0} broadcast(const.3), dimensions={}
-    mul.4 = f32[32,32,4,32]{3,2,1,0} multiply(add.2, bcast.3)
-    ROOT out = f32[32,32,4,32]{3,2,1,0} multiply(onednn.matmul.0, mul.4)
+    bcast.3 = f32[32,32,4,32] broadcast(const.3), dimensions={}
+    mul.4 = f32[32,32,4,32] multiply(add.2, bcast.3)
+    ROOT out = f32[32,32,4,32] multiply(onednn.matmul.0, mul.4)
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
@@ -339,35 +362,35 @@ TEST_F(MatmulTest, ApproxGELUTestF32) {
 // batch=32; seq_len=32; hidden_size=64; intermediate_size=256
 TEST_F(MatmulTest, BiasAndApproxGELUTestF32) {
   const char* matmul_module_str = R"(
-  HloModule matmul.test.f32, entry_computation_layout={(f32[32,32,64]{2,1,0}, f32[64,256]{1,0}, f32[256]{0})->f32[32,32,256]{2,1,0}}
+  HloModule matmul.test.f32
 
   ENTRY matmul.test.f32 {
-  Arg_5.6 = f32[32,32,64]{2,1,0} parameter(0), sharding={replicated}
-  Arg_7.8 = f32[64,256]{1,0} parameter(1), sharding={replicated}
-  dot.232 = f32[32,32,256]{2,1,0} dot(Arg_5.6, Arg_7.8), lhs_contracting_dims={2}, rhs_contracting_dims={0}
-  Arg_6.7 = f32[256]{0} parameter(2), sharding={replicated}
-  reshape.233 = f32[1,1,256]{2,1,0} reshape(Arg_6.7)
-  broadcast.234 = f32[1,1,256]{2,1,0} broadcast(reshape.233), dimensions={0,1,2}
-  reshape.235 = f32[256]{0} reshape(broadcast.234)
-  broadcast.236 = f32[32,32,256]{2,1,0} broadcast(reshape.235), dimensions={2}
-  add.237 = f32[32,32,256]{2,1,0} add(dot.232, broadcast.236)
-  multiply.238 = f32[32,32,256]{2,1,0} multiply(add.237, add.237)
-  multiply.239 = f32[32,32,256]{2,1,0} multiply(add.237, multiply.238)
+  Arg_5.6 = f32[32,32,64] parameter(0), sharding={replicated}
+  Arg_7.8 = f32[64,256] parameter(1), sharding={replicated}
+  dot.232 = f32[32,32,256] dot(Arg_5.6, Arg_7.8), lhs_contracting_dims={2}, rhs_contracting_dims={0}
+  Arg_6.7 = f32[256] parameter(2), sharding={replicated}
+  reshape.233 = f32[1,1,256] reshape(Arg_6.7)
+  broadcast.234 = f32[1,1,256] broadcast(reshape.233), dimensions={0,1,2}
+  reshape.235 = f32[256] reshape(broadcast.234)
+  broadcast.236 = f32[32,32,256] broadcast(reshape.235), dimensions={2}
+  add.237 = f32[32,32,256] add(dot.232, broadcast.236)
+  multiply.238 = f32[32,32,256] multiply(add.237, add.237)
+  multiply.239 = f32[32,32,256] multiply(add.237, multiply.238)
   constant.20 = f32[] constant(0.044715)
-  broadcast.21 = f32[32,32,256]{2,1,0} broadcast(constant.20), dimensions={}
-  multiply.240 = f32[32,32,256]{2,1,0} multiply(multiply.239, broadcast.21)
-  add.241 = f32[32,32,256]{2,1,0} add(add.237, multiply.240)
+  broadcast.21 = f32[32,32,256] broadcast(constant.20), dimensions={}
+  multiply.240 = f32[32,32,256] multiply(multiply.239, broadcast.21)
+  add.241 = f32[32,32,256] add(add.237, multiply.240)
   constant.18 = f32[] constant(0.797884583)
-  broadcast.19 = f32[32,32,256]{2,1,0} broadcast(constant.18), dimensions={}
-  multiply.242 = f32[32,32,256]{2,1,0} multiply(add.241, broadcast.19)
-  tanh.243 = f32[32,32,256]{2,1,0} tanh(multiply.242)
+  broadcast.19 = f32[32,32,256] broadcast(constant.18), dimensions={}
+  multiply.242 = f32[32,32,256] multiply(add.241, broadcast.19)
+  tanh.243 = f32[32,32,256] tanh(multiply.242)
   constant.16 = f32[] constant(1)
-  broadcast.17 = f32[32,32,256]{2,1,0} broadcast(constant.16), dimensions={}
-  add.244 = f32[32,32,256]{2,1,0} add(tanh.243, broadcast.17)
+  broadcast.17 = f32[32,32,256] broadcast(constant.16), dimensions={}
+  add.244 = f32[32,32,256] add(tanh.243, broadcast.17)
   constant.14 = f32[] constant(0.5)
-  broadcast.15 = f32[32,32,256]{2,1,0} broadcast(constant.14), dimensions={}
-  multiply.245 = f32[32,32,256]{2,1,0} multiply(add.244, broadcast.15)
-  ROOT out = f32[32,32,256]{2,1,0} multiply(add.237, multiply.245)
+  broadcast.15 = f32[32,32,256] broadcast(constant.14), dimensions={}
+  multiply.245 = f32[32,32,256] multiply(add.244, broadcast.15)
+  ROOT out = f32[32,32,256] multiply(add.237, multiply.245)
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
@@ -385,20 +408,20 @@ TEST_F(MatmulTest, BiasAndApproxGELUTestF32) {
 
 TEST_F(MatmulTest, ReLUTestF32) {
   const char* matmul_module_str = R"(
-  HloModule matmul.test.f32, entry_computation_layout={(f32[32,32,4,16]{3,2,1,0},f32[32,32,16,32]{3,2,1,0})->f32[32,32,4,32]{3,2,1,0}}
+  HloModule matmul.test.f32
 
   relu.1 {
-    Arg_0.3 = f32[32,32,4,32]{3,2,1,0} parameter(0)
+    Arg_0.3 = f32[32,32,4,32] parameter(0)
     constant.4 = f32[] constant(0)
-    broadcast.5 = f32[32,32,4,32]{3,2,1,0} broadcast(constant.4), dimensions={}
-    ROOT maximum.6 = f32[32,32,4,32]{3,2,1,0} maximum(Arg_0.3, broadcast.5)
+    broadcast.5 = f32[32,32,4,32] broadcast(constant.4), dimensions={}
+    ROOT maximum.6 = f32[32,32,4,32] maximum(Arg_0.3, broadcast.5)
   }
 
   ENTRY matmul.test.f32 {
-    arg.0 = f32[32,32,4,16]{3,2,1,0} parameter(0), parameter_replication={false}
-    arg.1 = f32[32,32,16,32]{3,2,1,0} parameter(1), parameter_replication={false}
-    onednn.matmul.0 = f32[32,32,4,32]{3,2,1,0} dot(arg.0, arg.1), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
-    ROOT call.7 = f32[32,32,4,32]{3,2,1,0} call(onednn.matmul.0), to_apply=relu.1
+    arg.0 = f32[32,32,4,16] parameter(0), parameter_replication={false}
+    arg.1 = f32[32,32,16,32] parameter(1), parameter_replication={false}
+    onednn.matmul.0 = f32[32,32,4,32] dot(arg.0, arg.1), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
+    ROOT call.7 = f32[32,32,4,32] call(onednn.matmul.0), to_apply=relu.1
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
@@ -415,21 +438,26 @@ TEST_F(MatmulTest, ReLUTestF32) {
 }
 
 TEST_F(MatmulTest, SimpleBiasTestBF16_PARAM_F32) {
+  if (!IsSupportedType(PrimitiveType::BF16)) {
+    GTEST_SKIP() << "CPU does not support BF16.";
+  }
+
   const char* matmul_module_str = R"(
-  HloModule jit_apply, entry_computation_layout={(f32[3072]{0}, f32[768,3072]{1,0}, f32[16,128,768]{2,1,0})->bf16[16,128,3072]{2,1,0}}, allow_spmd_sharding_propagation_to_output={true}
+  HloModule jit_apply
+
   ENTRY matmul.test.bf16 {
-    Arg_2.3 = f32[16,128,768]{2,1,0} parameter(2), sharding={replicated}
-    convert.4 = bf16[16,128,768]{2,1,0} convert(Arg_2.3)
-    Arg_1.2 = f32[768,3072]{1,0} parameter(1), sharding={replicated}
-    convert.5 = bf16[768,3072]{1,0} convert(Arg_1.2)
-    dot.7 = bf16[16,128,3072]{2,1,0} dot(convert.4, convert.5), lhs_contracting_dims={2}, rhs_contracting_dims={0}
-    Arg_0.1 = f32[3072]{0} parameter(0), sharding={replicated}
-    convert.6 = bf16[3072]{0} convert(Arg_0.1)
-    reshape.8 = bf16[1,1,3072]{2,1,0} reshape(convert.6)
-    broadcast.9 = bf16[1,1,3072]{2,1,0} broadcast(reshape.8), dimensions={0,1,2}
-    reshape.10 = bf16[3072]{0} reshape(broadcast.9)
-    broadcast.11 = bf16[16,128,3072]{2,1,0} broadcast(reshape.10), dimensions={2}
-    ROOT add.12 = bf16[16,128,3072]{2,1,0} add(dot.7, broadcast.11)
+    Arg_2.3 = f32[16,128,768] parameter(2), sharding={replicated}
+    convert.4 = bf16[16,128,768] convert(Arg_2.3)
+    Arg_1.2 = f32[768,3072] parameter(1), sharding={replicated}
+    convert.5 = bf16[768,3072] convert(Arg_1.2)
+    dot.7 = bf16[16,128,3072] dot(convert.4, convert.5), lhs_contracting_dims={2}, rhs_contracting_dims={0}
+    Arg_0.1 = f32[3072] parameter(0), sharding={replicated}
+    convert.6 = bf16[3072] convert(Arg_0.1)
+    reshape.8 = bf16[1,1,3072] reshape(convert.6)
+    broadcast.9 = bf16[1,1,3072] broadcast(reshape.8), dimensions={0,1,2}
+    reshape.10 = bf16[3072] reshape(broadcast.9)
+    broadcast.11 = bf16[16,128,3072] broadcast(reshape.10), dimensions={2}
+    ROOT add.12 = bf16[16,128,3072] add(dot.7, broadcast.11)
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-2, 1e-2}));
@@ -437,19 +465,24 @@ TEST_F(MatmulTest, SimpleBiasTestBF16_PARAM_F32) {
 }
 
 TEST_F(MatmulTest, SimpleBiasTestBF16_PARAM_BF16) {
+  if (!IsSupportedType(PrimitiveType::BF16)) {
+    GTEST_SKIP() << "CPU does not support BF16.";
+  }
+
   const char* matmul_module_str = R"(
-  HloModule jit_apply, entry_computation_layout={(bf16[3072]{0}, bf16[768,3072]{1,0}, f32[16,128,768]{2,1,0})->bf16[16,128,3072]{2,1,0}}, allow_spmd_sharding_propagation_to_output={true}
+  HloModule jit_apply
+
   ENTRY matmul.test.bf16 {
-    Arg_2.3 = f32[16,128,768]{2,1,0} parameter(2), sharding={replicated}
-    convert.4 = bf16[16,128,768]{2,1,0} convert(Arg_2.3)
-    Arg_1.2 = bf16[768,3072]{1,0} parameter(1), sharding={replicated}
-    dot.5 = bf16[16,128,3072]{2,1,0} dot(convert.4, Arg_1.2), lhs_contracting_dims={2}, rhs_contracting_dims={0}
-    Arg_0.1 = bf16[3072]{0} parameter(0), sharding={replicated}
-    reshape.6 = bf16[1,1,3072]{2,1,0} reshape(Arg_0.1)
-    broadcast.7 = bf16[1,1,3072]{2,1,0} broadcast(reshape.6), dimensions={0,1,2}
-    reshape.8 = bf16[3072]{0} reshape(broadcast.7)
-    broadcast.9 = bf16[16,128,3072]{2,1,0} broadcast(reshape.8), dimensions={2}
-    ROOT add.10 = bf16[16,128,3072]{2,1,0} add(dot.5, broadcast.9)
+    Arg_2.3 = f32[16,128,768] parameter(2), sharding={replicated}
+    convert.4 = bf16[16,128,768] convert(Arg_2.3)
+    Arg_1.2 = bf16[768,3072] parameter(1), sharding={replicated}
+    dot.5 = bf16[16,128,3072] dot(convert.4, Arg_1.2), lhs_contracting_dims={2}, rhs_contracting_dims={0}
+    Arg_0.1 = bf16[3072] parameter(0), sharding={replicated}
+    reshape.6 = bf16[1,1,3072] reshape(Arg_0.1)
+    broadcast.7 = bf16[1,1,3072] broadcast(reshape.6), dimensions={0,1,2}
+    reshape.8 = bf16[3072] reshape(broadcast.7)
+    broadcast.9 = bf16[16,128,3072] broadcast(reshape.8), dimensions={2}
+    ROOT add.10 = bf16[16,128,3072] add(dot.5, broadcast.9)
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-2, 1e-2}));
@@ -458,14 +491,15 @@ TEST_F(MatmulTest, SimpleBiasTestBF16_PARAM_BF16) {
 
 TEST_F(MatmulTest, DivisionByConstantWithEltwiseLinearF32) {
   const char* matmul_module_str = R"(
-  HloModule matmul.divide.test.1, entry_computation_layout={(f32[16,128,768]{2,1,0}, f32[768,12,64]{2,1,0})->f32[16,128,12,64]{3,2,1,0}}
+  HloModule matmul.divide.test.1
+
   ENTRY matmul.divide.test.f32 {
-    Arg_4.5 = f32[16,128,768]{2,1,0} parameter(0), sharding={replicated}
-    Arg_2.3 = f32[768,12,64]{2,1,0} parameter(1), sharding={replicated}
-    onednn.matmul.0 = f32[16,128,12,64]{3,2,1,0} dot(Arg_4.5, Arg_2.3), lhs_contracting_dims={2}, rhs_contracting_dims={0}
+    Arg_4.5 = f32[16,128,768] parameter(0), sharding={replicated}
+    Arg_2.3 = f32[768,12,64] parameter(1), sharding={replicated}
+    onednn.matmul.0 = f32[16,128,12,64] dot(Arg_4.5, Arg_2.3), lhs_contracting_dims={2}, rhs_contracting_dims={0}
     constant.8 = f32[] constant(8)
-    broadcast.9 = f32[16,128,12,64]{3,2,1,0} broadcast(constant.8), dimensions={}
-    ROOT divide.16 = f32[16,128,12,64]{3,2,1,0} divide(onednn.matmul.0, broadcast.9)
+    broadcast.9 = f32[16,128,12,64] broadcast(constant.8), dimensions={}
+    ROOT divide.16 = f32[16,128,12,64] divide(onednn.matmul.0, broadcast.9)
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec(1e-4, 1e-4)));
@@ -481,14 +515,65 @@ TEST_F(MatmulTest, DivisionByConstantWithEltwiseLinearF32) {
   )");
 }
 
+TEST_F(MatmulTest, SimpleBiasTestF16_PARAM_F32) {
+  if (!IsSupportedType(PrimitiveType::F16)) {
+    GTEST_SKIP() << "CPU does not support F16.";
+  }
+
+  const char* matmul_module_str = R"(
+  HloModule jit_apply
+
+  ENTRY matmul.test.f16 {
+    Arg_2.3 = f32[16,128,768] parameter(2), sharding={replicated}
+    convert.4 = f16[16,128,768] convert(Arg_2.3)
+    Arg_1.2 = f32[768,3072] parameter(1), sharding={replicated}
+    convert.5 = f16[768,3072] convert(Arg_1.2)
+    dot.7 = f16[16,128,3072] dot(convert.4, convert.5), lhs_contracting_dims={2}, rhs_contracting_dims={0}
+    Arg_0.1 = f32[3072] parameter(0), sharding={replicated}
+    convert.6 = f16[3072] convert(Arg_0.1)
+    reshape.8 = f16[1,1,3072] reshape(convert.6)
+    broadcast.9 = f16[1,1,3072] broadcast(reshape.8), dimensions={0,1,2}
+    reshape.10 = f16[3072] reshape(broadcast.9)
+    broadcast.11 = f16[16,128,3072] broadcast(reshape.10), dimensions={2}
+    ROOT add.12 = f16[16,128,3072] add(dot.7, broadcast.11)
+  })";
+
+  EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-2, 1e-2}));
+  MatchOptimizedHlo(matmul_module_str, fused_matmul_bias_);
+}
+
+TEST_F(MatmulTest, SimpleBiasTestF16_PARAM_F16) {
+  if (!IsSupportedType(PrimitiveType::F16)) {
+    GTEST_SKIP() << "CPU does not support F16.";
+  }
+  const char* matmul_module_str = R"(
+  HloModule jit_apply
+
+  ENTRY matmul.test.f16 {
+    Arg_2.3 = f32[16,128,768] parameter(2), sharding={replicated}
+    convert.4 = f16[16,128,768] convert(Arg_2.3)
+    Arg_1.2 = f16[768,3072] parameter(1), sharding={replicated}
+    dot.5 = f16[16,128,3072] dot(convert.4, Arg_1.2), lhs_contracting_dims={2}, rhs_contracting_dims={0}
+    Arg_0.1 = f16[3072] parameter(0), sharding={replicated}
+    reshape.6 = f16[1,1,3072] reshape(Arg_0.1)
+    broadcast.7 = f16[1,1,3072] broadcast(reshape.6), dimensions={0,1,2}
+    reshape.8 = f16[3072] reshape(broadcast.7)
+    broadcast.9 = f16[16,128,3072] broadcast(reshape.8), dimensions={2}
+    ROOT add.10 = f16[16,128,3072] add(dot.5, broadcast.9)
+  })";
+
+  EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-2, 1e-2}));
+  MatchOptimizedHlo(matmul_module_str, fused_matmul_bias_);
+}
+
 TEST_F(MatmulTest, TestF32NonConstantWeights) {
   const char* matmul_module_str = R"(
-  HloModule matmul.test.f32, entry_computation_layout={(f32[64,256,16]{2,1,0},f32[16,32]{1,0})->f32[64,256,32]{2,1,0}}
+  HloModule matmul.test.f32
 
   ENTRY matmul.test.f32 {
-    arg.0 = f32[64,256,16]{2,1,0} parameter(0), parameter_replication={false}
-    arg.1 = f32[16,32]{1,0} parameter(1), parameter_replication={false}
-    ROOT onednn.matmul.0 = f32[64,256,32]{2,1,0} dot(arg.0, arg.1), lhs_contracting_dims={2}, rhs_contracting_dims={0}
+    arg.0 = f32[64,256,16] parameter(0), parameter_replication={false}
+    arg.1 = f32[16,32] parameter(1), parameter_replication={false}
+    ROOT onednn.matmul.0 = f32[64,256,32] dot(arg.0, arg.1), lhs_contracting_dims={2}, rhs_contracting_dims={0}
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
@@ -502,13 +587,13 @@ TEST_F(MatmulTest, TestF32NonConstantWeights) {
 
 TEST_F(MatmulTest, TestF32ConstantWeights) {
   const char* matmul_module_str = R"(
-  HloModule matmul.test.f32, entry_computation_layout={(f32[64,256,16]{2,1,0})->f32[64,256,32]{2,1,0}}
+  HloModule matmul.test.f32
 
   ENTRY matmul.test.f32 {
-    arg.0 = f32[64,256,16]{2,1,0} parameter(0), parameter_replication={false}
+    arg.0 = f32[64,256,16] parameter(0), parameter_replication={false}
     constant = f32[] constant(1)
-    arg.1 = f32[16,32]{1,0} broadcast(constant), dimensions={}
-    ROOT onednn.matmul.0 = f32[64,256,32]{2,1,0} dot(arg.0, arg.1), lhs_contracting_dims={2}, rhs_contracting_dims={0}
+    arg.1 = f32[16,32] broadcast(constant), dimensions={}
+    ROOT onednn.matmul.0 = f32[64,256,32] dot(arg.0, arg.1), lhs_contracting_dims={2}, rhs_contracting_dims={0}
   })";
 
   EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
@@ -520,6 +605,93 @@ TEST_F(MatmulTest, TestF32ConstantWeights) {
   )");
 }
 
+TEST_F(MatmulTest, SimpleTestBF16Gemv1) {
+  if (!IsSupportedType(PrimitiveType::BF16)) {
+    GTEST_SKIP() << "CPU does not support BF16.";
+  }
+
+  const char* matmul_module_str = R"(
+  HloModule matmul.test.bf16
+
+  ENTRY matmul.test.bf16 {
+    arg.0 = bf16[1000,10000] parameter(0)
+    arg.1 = bf16[10000] parameter(1)
+    ROOT onednn.matmul.0 = bf16[1000] dot(arg.0, arg.1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  })";
+
+  EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{2e-2, 1e-4}));
+  MatchOptimizedHlo(matmul_module_str, matmul_rewrite_str_);
+}
+
+TEST_F(MatmulTest, SimpleTestBF16Gemv2) {
+  if (!IsSupportedType(PrimitiveType::BF16)) {
+    GTEST_SKIP() << "CPU does not support BF16.";
+  }
+
+  const char* matmul_module_str = R"(
+  HloModule matmul.test.bf16
+  
+  ENTRY matmul.test.bf16 {
+    arg.0 = bf16[100,300,300] parameter(0)
+    arg.1 = bf16[300] parameter(1)
+    ROOT onednn.matmul.0 = bf16[100,300] dot(arg.0, arg.1), lhs_contracting_dims={2}, rhs_contracting_dims={0}
+  })";
+
+  EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{2e-2, 1e-4}));
+  MatchOptimizedHlo(matmul_module_str, matmul_rewrite_str_);
+}
+
+TEST_F(MatmulTest, TestTransposeBNoRewriteF32) {
+  const char* matmul_module_str = R"(
+  HloModule matmul.test.f32
+
+  ENTRY matmul.test.f32 {
+    arg.0 = f32[384,1024]{1,0} parameter(0), parameter_replication={false}
+    arg.1 = f32[2,1024]{1,0} parameter(1), parameter_replication={false}
+    ROOT dot.2 = f32[384,2]{1,0} dot(arg.0, arg.1), lhs_contracting_dims={1}, rhs_contracting_dims={1}
+  })";
+
+  EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
+  MatchOptimizedHlo(matmul_module_str,
+                    R"(
+  ; CHECK:     %matmul.test.f32
+  ; CHECK-NOT: custom_call_target="__onednn$matmul",
+  ; CHECK:     f32[384,2]{1,0} dot(%arg.0, %arg.1), lhs_contracting_dims={1}, rhs_contracting_dims={1}
+  )");
+}
+
+TEST_F(MatmulTest, SimpleTestF32WithMulAndAddFusion) {
+  const char* matmul_module_str = R"(
+  ENTRY matmul.mul.add.test.f32 {
+    arg0.1 = f32[32,32,40,30] parameter(0), parameter_replication={false}
+    arg0.2 = f32[32,32,30,40] parameter(1), parameter_replication={false}
+    dot.7 = f32[32,32,40,40] dot(arg0.1, arg0.2), lhs_batch_dims={0,1}, lhs_contracting_dims={3}, rhs_batch_dims={0,1}, rhs_contracting_dims={2}
+    const.0 = f32[] constant(0.044715)
+    bcast.0 = f32[32,32,40,40] broadcast(const.0), dimensions={}
+    mul.0 = f32[32,32,40,40] multiply(dot.7,bcast.0)
+    const.1 = f32[] constant(0.65)
+    bcast.1 = f32[32,32,40,40] broadcast(const.1), dimensions={}
+    add.0 = f32[32,32,40,40] add(mul.0, bcast.1)
+    const.2 = f32[] constant(0.65)
+    bcast.2 = f32[32,32,40,40] broadcast(const.2), dimensions={}
+    add.1 = f32[32,32,40,40] add(bcast.2, bcast.1)
+    tuple.12 = (f32[32,32,40,40]) tuple(add.0)
+    ROOT get-tuple-element.13 = f32[32,32,40,40] get-tuple-element(tuple.12), index=0
+  })";
+
+  EXPECT_TRUE(RunAndCompare(matmul_module_str, ErrorSpec{1e-4, 1e-4}));
+  MatchOptimizedHlo(matmul_module_str,
+                    R"(
+    ; CHECK:     custom_call_target="__onednn$matmul",
+    ; CHECK:       backend_config={
+    ; CHECK-DAG:     "outer_dimension_partitions":[],
+    ; CHECK-DAG:     "onednn_matmul_config":{
+    ; CHECK-DAG:       "fused_ops":["LINEAR","BINARY_ADD"]
+    ; CHECK-DAG:   }
+    ; CHECK:     }
+    )");
+}
+
 }  // namespace cpu
 }  // namespace xla
 
diff --git a/third_party/xla/xla/tests/onednn_softmax_test.cc b/third_party/xla/xla/tests/onednn_softmax_test.cc
index 8506337f194a3a..4af19eafa732d0 100644
--- a/third_party/xla/xla/tests/onednn_softmax_test.cc
+++ b/third_party/xla/xla/tests/onednn_softmax_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <utility>
 
 #include "xla/literal.h"
+#include "xla/service/cpu/onednn_util.h"
 #include "xla/shape_util.h"
 #include "xla/test.h"
 #include "xla/test_helpers.h"
@@ -101,6 +102,10 @@ TEST_F(OneDnnSoftmaxTest, SoftmaxFP32) {
 }
 
 TEST_F(OneDnnSoftmaxTest, SoftmaxBF16) {
+  if (!IsSupportedType(PrimitiveType::BF16)) {
+    GTEST_SKIP() << "CPU does not support BF16.";
+  }
+
   const std::string hlo_string = R"(
         HloModule jit_softmax, entry_computation_layout={(bf16[1,128,30522]{2,1,0})->bf16[1,128,30522]{2,1,0}}, allow_spmd_sharding_propagation_to_output={true}
         region_0.4 {
@@ -136,6 +141,47 @@ TEST_F(OneDnnSoftmaxTest, SoftmaxBF16) {
   EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{1e-4, 1e-4}));
 }
 
+TEST_F(OneDnnSoftmaxTest, SoftmaxF32toBF16) {
+  if (!IsSupportedType(PrimitiveType::BF16)) {
+    GTEST_SKIP() << "CPU does not support BF16.";
+  }
+
+  const std::string hlo_string = R"(
+        HloModule jit_softmax, entry_computation_layout={(f32[16,128,30522]{2,1,0})->bf16[16,128,30522]{2,1,0}}, allow_spmd_sharding_propagation_to_output={true}
+        region_0.4 {
+            Arg_0.5 = f32[] parameter(0)
+            Arg_1.6 = f32[] parameter(1)
+            ROOT maximum.7 = f32[] maximum(Arg_0.5, Arg_1.6)
+        }
+        region_1.15 {
+            Arg_0.16 = f32[] parameter(0)
+            Arg_1.17 = f32[] parameter(1)
+            ROOT add.18 = f32[] add(Arg_0.16, Arg_1.17)
+        }
+        ENTRY main.25 {
+            Arg_0.1 = f32[16,128,30522]{2,1,0} parameter(0), sharding={replicated}
+            constant.3 = f32[] constant(-inf)
+            reduce.8 = f32[16,128]{1,0} reduce(Arg_0.1, constant.3), dimensions={2}, to_apply=region_0.4
+            reshape.9 = f32[16,128,1]{2,1,0} reshape(reduce.8)
+            broadcast.10 = f32[16,128,1]{2,1,0} broadcast(reshape.9), dimensions={0,1,2}
+            reshape.11 = f32[16,128]{1,0} reshape(broadcast.10)
+            broadcast.12 = f32[16,128,30522]{2,1,0} broadcast(reshape.11), dimensions={0,1}
+            subtract.13 = f32[16,128,30522]{2,1,0} subtract(Arg_0.1, broadcast.12)
+            exponential.14 = f32[16,128,30522]{2,1,0} exponential(subtract.13)
+            constant.2 = f32[] constant(0)
+            reduce.19 = f32[16,128]{1,0} reduce(exponential.14, constant.2), dimensions={2}, to_apply=region_1.15
+            reshape.20 = f32[16,128,1]{2,1,0} reshape(reduce.19)
+            broadcast.21 = f32[16,128,1]{2,1,0} broadcast(reshape.20), dimensions={0,1,2}
+            reshape.22 = f32[16,128]{1,0} reshape(broadcast.21)
+            broadcast.23 = f32[16,128,30522]{2,1,0} broadcast(reshape.22), dimensions={0,1}
+            divide.24 = f32[16,128,30522]{2,1,0} divide(exponential.14, broadcast.23)
+            ROOT convert.1 = bf16[16,128,30522]{2,1,0} convert(divide.24)
+        }
+    )";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{1e-4, 1e-4}));
+}
+
 }  // namespace cpu
 }  // namespace xla
 
diff --git a/third_party/xla/xla/tests/verified_hlo_module.h b/third_party/xla/xla/tests/verified_hlo_module.h
index 8ff640803f55e3..52fce003b9ccf1 100644
--- a/third_party/xla/xla/tests/verified_hlo_module.h
+++ b/third_party/xla/xla/tests/verified_hlo_module.h
@@ -35,11 +35,12 @@ class VerifiedHloModule : public HloModule {
   VerifiedHloModule(const std::string& name, const HloModuleConfig& config,
                     bool verifier_layout_sensitive,
                     bool allow_mixed_precision_in_hlo_verifier,
-                    std::function<int64_t(const Shape&)> shape_size_function)
+                    std::function<int64_t(const Shape&)> shape_size_function,
+                    HloPredicate instruction_can_change_layout_func = {})
       : HloModule(name, config),
-        verifier_(
-            verifier_layout_sensitive, allow_mixed_precision_in_hlo_verifier,
-            /*instruction_can_change_layout_func=*/{}, shape_size_function) {}
+        verifier_(verifier_layout_sensitive,
+                  allow_mixed_precision_in_hlo_verifier,
+                  instruction_can_change_layout_func, shape_size_function) {}
 
   ~VerifiedHloModule() override { VerifyOrAddFailure("in destructor"); }
 
diff --git a/third_party/xla/xla/tools/BUILD b/third_party/xla/xla/tools/BUILD
index 34f6e352027f7c..c5d428907741c1 100644
--- a/third_party/xla/xla/tools/BUILD
+++ b/third_party/xla/xla/tools/BUILD
@@ -1,15 +1,7 @@
 # Tools and utilities that aid in XLA development and usage.
 
-load("//xla/tests:build_defs.bzl", "xla_test")
 load("@bazel_skylib//rules:build_test.bzl", "build_test")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
-load(
-    "//xla:xla.bzl",
-    "xla_cc_binary",
-    "xla_cc_test",
-    "xla_py_proto_library",
-)
-load("//xla/stream_executor:build_defs.bzl", "if_gpu_is_configured")
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
 load(
     "@local_tsl//tsl:tsl.bzl",
@@ -26,6 +18,14 @@ load(
     "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
     "if_cuda_is_configured",
 )
+load(
+    "//xla:xla.bzl",
+    "xla_cc_binary",
+    "xla_cc_test",
+    "xla_py_proto_library",
+)
+load("//xla/stream_executor:build_defs.bzl", "if_gpu_is_configured")
+load("//xla/tests:build_defs.bzl", "xla_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -55,6 +55,7 @@ xla_cc_binary(
     srcs = ["hex_floats_to_packed_literal.cc"],
     deps = [
         "//xla:types",
+        "//xla/tsl/util:command_line_flags",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/lib/io:buffered_inputstream",
@@ -63,7 +64,6 @@ xla_cc_binary(
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:platform_port",
         "@local_tsl//tsl/platform:status",
-        "@local_tsl//tsl/util:command_line_flags",
     ],
 )
 
@@ -228,11 +228,11 @@ xla_cc_binary(
         "//xla:statusor",
         "//xla:util",
         "//xla/service:hlo_proto_cc",
+        "//xla/tsl/util:command_line_flags",
         "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:platform_port",
         "@local_tsl//tsl/platform:status",
-        "@local_tsl//tsl/util:command_line_flags",
     ],
 )
 
@@ -297,13 +297,13 @@ cc_library(
         "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/service:hlo_pass_pipeline",
+        "//xla/tsl/util:command_line_flags",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:path",
         "@local_tsl//tsl/platform:platform_port",
-        "@local_tsl//tsl/util:command_line_flags",
     ],
 )
 
@@ -323,7 +323,7 @@ cc_library(
         "//xla/service:sharding_propagation",
         "//xla/service:triangular_solve_expander",
         "//xla/service/spmd:stateful_rng_spmd_partitioner",
-        "@local_tsl//tsl/util:command_line_flags",
+        "//xla/tsl/util:command_line_flags",
     ],
 )
 
@@ -397,6 +397,7 @@ xla_cc_binary(
         "//xla/service:hlo_runner",
         "//xla/service:local_service",
         "//xla/service:platform_util",
+        "//xla/tsl/util:command_line_flags",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
@@ -405,7 +406,6 @@ xla_cc_binary(
         "@local_tsl//tsl/platform:platform_port",
         "@local_tsl//tsl/platform:subprocess",
         "@local_tsl//tsl/protobuf:error_codes_proto_impl_cc",
-        "@local_tsl//tsl/util:command_line_flags",
     ] + if_cuda_or_rocm([
         "//xla/service:gpu_plugin",
     ]) + if_cuda([
@@ -508,6 +508,7 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status:statusor",
         "@local_tsl//tsl/platform:statusor",
     ],
@@ -561,13 +562,13 @@ xla_cc_binary(
         "//xla/service:hlo_runner",
         "//xla/service:interpreter_plugin",
         "//xla/service:platform_util",
+        "//xla/tsl/util:command_line_flags",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:path",
         "@local_tsl//tsl/platform:platform_port",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:test",
-        "@local_tsl//tsl/util:command_line_flags",
     ] + if_cuda_or_rocm([
         "//xla/service:gpu_plugin",
     ]) + if_cuda([
@@ -652,6 +653,29 @@ xla_cc_binary(
     ],
 )
 
+xla_cc_binary(
+    name = "extract_collective_operations",
+    srcs = ["extract_collective_operations.cc"],
+    deps = [
+        ":hlo_decomposer_lib",
+        ":hlo_module_loader",
+        "//xla:debug_options_flags",
+        "//xla:status",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:hlo_proto_cc",
+        "//xla/tsl/util:command_line_flags",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
+        "@local_tsl//tsl/platform:env",
+        "@local_tsl//tsl/platform:path",
+        "@local_tsl//tsl/platform:platform_port",
+        "@local_tsl//tsl/platform:status",
+        "@local_tsl//tsl/platform:statusor",
+    ],
+)
+
 tsl_gpu_library(
     name = "xla_compile_lib",
     srcs = ["xla_compile_lib.cc"],
diff --git a/third_party/xla/xla/tools/extract_collective_operations.cc b/third_party/xla/xla/tools/extract_collective_operations.cc
new file mode 100644
index 00000000000000..1a15fa8fbc1fe6
--- /dev/null
+++ b/third_party/xla/xla/tools/extract_collective_operations.cc
@@ -0,0 +1,96 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
+#include "xla/debug_options_flags.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/service/hlo.pb.h"
+#include "xla/status.h"
+#include "xla/tools/hlo_decomposer.h"
+#include "xla/tools/hlo_module_loader.h"
+#include "xla/tsl/util/command_line_flags.h"
+#include "tsl/platform/env.h"
+#include "tsl/platform/init_main.h"
+#include "tsl/platform/path.h"
+#include "tsl/platform/status.h"
+#include "tsl/platform/statusor.h"
+
+namespace {
+const char* const kUsage = R"(
+This tool extracts collective operations from HLO module and saves them together
+to the separate module.
+
+Usage:
+bazel run extract_collective_operations -- --input=path/to/hlo_module
+  --output=path/to/hlo_module
+)";
+}  // namespace
+
+namespace xla {
+Status ExtractCollectiveOperations(const std::string& input,
+                                   const std::string& output) {
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModule> test_module,
+      LoadModuleFromFile(input, std::string(tsl::io::Extension(input)),
+                         hlo_module_loader_details::Config(), nullptr));
+
+  std::vector<xla::HloInstruction*> collective_instructions;
+  for (const auto& op : test_module->computations()) {
+    for (const auto& instr : op->instructions()) {
+      if (absl::StartsWith(instr->name(), "all-")) {
+        collective_instructions.push_back(instr);
+      }
+    }
+  }
+
+  if (collective_instructions.empty()) {
+    return absl::InternalError("No collective instructions found.");
+  }
+  auto collectives_module =
+      ExtractInstructionIntoNewModule(collective_instructions);
+
+  QCHECK_OK(tsl::WriteStringToFile(tsl::Env::Default(), output,
+                                   collectives_module->ToString()))
+      << "Can't open or write output module at " << output;
+  return absl::OkStatus();
+}
+}  // namespace xla
+
+int main(int argc, char** argv) {
+  std::string input;
+  std::string output;
+  std::vector<tsl::Flag> flag_list = {
+      tsl::Flag("input", &input, "input file"),
+      tsl::Flag("output", &output, "output file")};
+  xla::AppendDebugOptionsFlags(&flag_list);
+  const std::string kUsageString =
+      absl::StrCat(kUsage, "\n\n", tsl::Flags::Usage(argv[0], flag_list));
+  bool parse_ok = tsl::Flags::Parse(&argc, argv, flag_list);
+  tsl::port::InitMain(kUsageString.c_str(), &argc, &argv);
+  if (!parse_ok) {
+    LOG(QFATAL) << kUsageString;
+  }
+  TF_CHECK_OK(xla::ExtractCollectiveOperations(input, output));
+  return 0;
+}
diff --git a/third_party/xla/xla/tools/hex_floats_to_packed_literal.cc b/third_party/xla/xla/tools/hex_floats_to_packed_literal.cc
index a845dabe02b3a1..c4d591ba34928a 100644
--- a/third_party/xla/xla/tools/hex_floats_to_packed_literal.cc
+++ b/third_party/xla/xla/tools/hex_floats_to_packed_literal.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "absl/base/casts.h"
 #include "absl/strings/string_view.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "xla/types.h"
 #include "tsl/lib/io/buffered_inputstream.h"
 #include "tsl/lib/io/random_inputstream.h"
@@ -27,7 +28,6 @@ limitations under the License.
 #include "tsl/platform/init_main.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/status.h"
-#include "tsl/util/command_line_flags.h"
 
 using std::string;
 
diff --git a/third_party/xla/xla/tools/hlo_bisect/BUILD b/third_party/xla/xla/tools/hlo_bisect/BUILD
index f0dce33a38533a..732384eca9dada 100644
--- a/third_party/xla/xla/tools/hlo_bisect/BUILD
+++ b/third_party/xla/xla/tools/hlo_bisect/BUILD
@@ -32,8 +32,8 @@ xla_cc_binary(
         "//xla/service:cpu_plugin",
         "//xla/service:gpu_plugin",
         "//xla/service:interpreter_plugin",
+        "//xla/tsl/util:command_line_flags",
         "@local_tsl//tsl/platform:platform_port",
-        "@local_tsl//tsl/util:command_line_flags",
     ] + if_cuda(["//xla/stream_executor/cuda:cublas_plugin"]),
 )
 
diff --git a/third_party/xla/xla/tools/hlo_bisect/hlo_bisect.cc b/third_party/xla/xla/tools/hlo_bisect/hlo_bisect.cc
index fda3fb25968ac2..73b018323f34c3 100644
--- a/third_party/xla/xla/tools/hlo_bisect/hlo_bisect.cc
+++ b/third_party/xla/xla/tools/hlo_bisect/hlo_bisect.cc
@@ -20,8 +20,8 @@ limitations under the License.
 #include <vector>
 
 #include "xla/tools/hlo_bisect/hlo_bisect_utils.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "tsl/platform/init_main.h"
-#include "tsl/util/command_line_flags.h"
 
 const char* const kUsage = R"(
 Given an HloModule that manifests an XLA bug, either crashes the compiler or
diff --git a/third_party/xla/xla/tools/hlo_decomposer.cc b/third_party/xla/xla/tools/hlo_decomposer.cc
index a5a7c5d23e7b89..005355d671fe3c 100644
--- a/third_party/xla/xla/tools/hlo_decomposer.cc
+++ b/third_party/xla/xla/tools/hlo_decomposer.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
+#include "absl/log/check.h"
 #include "xla/hlo/ir/hlo_clone_context.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
@@ -116,6 +117,41 @@ absl::StatusOr<std::vector<std::unique_ptr<HloModule>>> DecomposeHloModule(
   return modules;
 }
 
+std::unique_ptr<HloModule> ExtractInstructionIntoNewModule(
+    const std::vector<HloInstruction*>& instructions) {
+  CHECK(!instructions.empty());
+  HloInstruction& first_instruction = *instructions[0];
+  auto new_hlo_module = std::make_unique<HloModule>(
+      first_instruction.GetModule()->name() + "_collective_ops",
+      HloModuleConfig{},
+      std::make_unique<CompilationEnvironments>(
+          first_instruction.GetModule()->comp_envs()));
+  int parameter_number = 0;
+  HloComputation::Builder builder("entry_computation");
+  HloCloneContext clone_context(new_hlo_module.get());
+  std::vector<HloInstruction*> new_instructions;
+  for (auto* hlo : instructions) {
+    std::vector<HloInstruction*> new_operands;
+    for (const HloInstruction* operand : hlo->operands()) {
+      std::unique_ptr<HloInstruction> new_parameter =
+          HloInstruction::CreateParameter(parameter_number, operand->shape(),
+                                          operand->name());
+      ++parameter_number;
+      new_operands.push_back(builder.AddInstruction(std::move(new_parameter)));
+    }
+    std::unique_ptr<HloInstruction> new_instruction =
+        hlo->CloneWithNewOperands(hlo->shape(), new_operands, &clone_context);
+    new_instructions.push_back(
+        builder.AddInstruction(std::move(new_instruction)));
+  }
+
+  std::unique_ptr<HloInstruction> tuple_instruction =
+      HloInstruction::CreateTuple(new_instructions);
+  builder.AddInstruction(std::move(tuple_instruction));
+  new_hlo_module->AddEntryComputationWithLayouts(builder.Build());
+  return new_hlo_module;
+}
+
 std::unique_ptr<HloModule> ExtractInstructionIntoNewModule(
     const HloInstruction& hlo) {
   auto new_hlo_module = std::make_unique<HloModule>(
diff --git a/third_party/xla/xla/tools/hlo_decomposer.h b/third_party/xla/xla/tools/hlo_decomposer.h
index 9c449066906e05..d12b4d82216d1e 100644
--- a/third_party/xla/xla/tools/hlo_decomposer.h
+++ b/third_party/xla/xla/tools/hlo_decomposer.h
@@ -38,6 +38,13 @@ absl::StatusOr<std::vector<std::unique_ptr<HloModule>>> DecomposeHloModule(
 std::unique_ptr<HloModule> ExtractInstructionIntoNewModule(
     const HloInstruction& hlo);
 
+// Extracts HLO instructions into a new HLO module replacing all operands
+// with parameter instructions even if the result of one instruction is used
+// as a parameter to another. Combines results of all operations into the
+// tuple and adds this tuple as a root instruction of the new module.
+std::unique_ptr<HloModule> ExtractInstructionIntoNewModule(
+    const std::vector<HloInstruction*>& instructions);
+
 // Extracts producer and consumer HLO instruction into a new HLO module
 // replacing its operands with parameter instructions.
 std::unique_ptr<HloModule> ExtractProducerConsumerIntoNewModule(
diff --git a/third_party/xla/xla/tools/hlo_expand.cc b/third_party/xla/xla/tools/hlo_expand.cc
index 70aed16de04941..cd568564339d27 100644
--- a/third_party/xla/xla/tools/hlo_expand.cc
+++ b/third_party/xla/xla/tools/hlo_expand.cc
@@ -27,8 +27,8 @@ limitations under the License.
 #include "xla/service/sharding_propagation.h"
 #include "xla/service/spmd/stateful_rng_spmd_partitioner.h"
 #include "xla/service/triangular_solve_expander.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "xla/xla_data.pb.h"
-#include "tsl/util/command_line_flags.h"
 
 namespace xla {
 
diff --git a/third_party/xla/xla/tools/hlo_expand.h b/third_party/xla/xla/tools/hlo_expand.h
index a80f05b5e789a6..5c1818e91a81da 100644
--- a/third_party/xla/xla/tools/hlo_expand.h
+++ b/third_party/xla/xla/tools/hlo_expand.h
@@ -20,7 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "xla/service/hlo_pass_pipeline.h"
-#include "tsl/util/command_line_flags.h"
+#include "xla/tsl/util/command_line_flags.h"
 
 namespace xla {
 
diff --git a/third_party/xla/xla/tools/hlo_expand_main.cc b/third_party/xla/xla/tools/hlo_expand_main.cc
index 4e34c9f6c37fed..60a83c3b55837c 100644
--- a/third_party/xla/xla/tools/hlo_expand_main.cc
+++ b/third_party/xla/xla/tools/hlo_expand_main.cc
@@ -25,11 +25,11 @@ limitations under the License.
 #include "xla/service/hlo_pass_pipeline.h"
 #include "xla/tools/hlo_expand.h"
 #include "xla/tools/hlo_module_loader.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "xla/xla.pb.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/init_main.h"
 #include "tsl/platform/path.h"
-#include "tsl/util/command_line_flags.h"
 
 namespace {
 
diff --git a/third_party/xla/xla/tools/hlo_opt/BUILD b/third_party/xla/xla/tools/hlo_opt/BUILD
index 3bd009b55481d8..1487bc0a484d84 100644
--- a/third_party/xla/xla/tools/hlo_opt/BUILD
+++ b/third_party/xla/xla/tools/hlo_opt/BUILD
@@ -1,8 +1,3 @@
-load("//xla:lit.bzl", "enforce_glob", "lit_test_suite")
-load(
-    "//xla/stream_executor:build_defs.bzl",
-    "if_gpu_is_configured",
-)
 load(
     "@local_config_rocm//rocm:build_defs.bzl",
     "if_rocm_is_configured",
@@ -18,6 +13,11 @@ load(
     "@local_tsl//tsl/platform/default:cuda_build_defs.bzl",
     "if_cuda_is_configured",
 )
+load("//xla:lit.bzl", "enforce_glob", "lit_test_suite")
+load(
+    "//xla/stream_executor:build_defs.bzl",
+    "if_gpu_is_configured",
+)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -117,6 +117,7 @@ cc_library(
         "//xla/service:platform_util",
         "//xla/tools:hlo_module_loader",
         "//xla/tools:run_hlo_module_lib",
+        "//xla/tsl/util:command_line_flags",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
@@ -128,7 +129,6 @@ cc_library(
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
         "@local_tsl//tsl/platform:test",
-        "@local_tsl//tsl/util:command_line_flags",
     ] + if_gpu_is_configured([
         ":gpu_opt",
     ]) + if_cuda_is_configured([
diff --git a/third_party/xla/xla/tools/hlo_opt/opt_main.cc b/third_party/xla/xla/tools/hlo_opt/opt_main.cc
index 86bc5b0f281c84..ef24a0a4da2405 100644
--- a/third_party/xla/xla/tools/hlo_opt/opt_main.cc
+++ b/third_party/xla/xla/tools/hlo_opt/opt_main.cc
@@ -41,6 +41,7 @@ limitations under the License.
 #include "xla/tools/hlo_module_loader.h"
 #include "xla/tools/hlo_opt/opt_lib.h"
 #include "xla/tools/run_hlo_module.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/errors.h"
 #include "tsl/platform/init_main.h"
@@ -48,7 +49,6 @@ limitations under the License.
 #include "tsl/platform/path.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
-#include "tsl/util/command_line_flags.h"
 
 namespace {
 const char* const kUsage = R"(
@@ -209,7 +209,8 @@ int main(int argc, char** argv) {
                 "\t\t\t * llvm : LLVM IR\n"
                 "\t\t\t * ptx : PTX dump\n"
                 "\t\t\t * buffer-assignment: Buffer Assignment\n"
-                "\t\t\t * hlo-backend: HLO after backend passes\n"),
+                "\t\t\t * hlo-backend: HLO after backend passes\n"
+                "\t\t\t * html: HTML dump\n"),
       tsl::Flag("list-stages", &opts.list_stages,
                 "Print all supported stages for a given platform and exit"),
       tsl::Flag("split-input-file", &opts.split_input_file,
diff --git a/third_party/xla/xla/tools/hlo_proto_to_json.cc b/third_party/xla/xla/tools/hlo_proto_to_json.cc
index ef16a966edcd7f..cdadd1e3ce1d79 100644
--- a/third_party/xla/xla/tools/hlo_proto_to_json.cc
+++ b/third_party/xla/xla/tools/hlo_proto_to_json.cc
@@ -30,12 +30,12 @@ limitations under the License.
 
 #include "xla/service/hlo.pb.h"
 #include "xla/statusor.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "xla/util.h"
 #include "tsl/platform/env.h"
 #include "tsl/platform/init_main.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/status.h"
-#include "tsl/util/command_line_flags.h"
 
 using std::string;
 
diff --git a/third_party/xla/xla/tools/interactive_graphviz.cc b/third_party/xla/xla/tools/interactive_graphviz.cc
index 6bd32a96427f8d..10d68162d54627 100644
--- a/third_party/xla/xla/tools/interactive_graphviz.cc
+++ b/third_party/xla/xla/tools/interactive_graphviz.cc
@@ -46,12 +46,12 @@ limitations under the License.
 #include "xla/service/local_service.h"
 #include "xla/service/platform_util.h"
 #include "xla/tools/hlo_extractor.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "tsl/platform/init_main.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/path.h"
 #include "tsl/platform/subprocess.h"
 #include "tsl/protobuf/error_codes.pb.h"
-#include "tsl/util/command_line_flags.h"
 #if defined(PLATFORM_GOOGLE)
 #include "util/readline/readline.h"
 #endif
diff --git a/third_party/xla/xla/tools/multihost_hlo_runner/BUILD b/third_party/xla/xla/tools/multihost_hlo_runner/BUILD
index aecf96c53d43e3..1a2ac425ed166c 100644
--- a/third_party/xla/xla/tools/multihost_hlo_runner/BUILD
+++ b/third_party/xla/xla/tools/multihost_hlo_runner/BUILD
@@ -1,10 +1,10 @@
-load("//xla/tests:build_defs.bzl", "xla_test")
 load("@bazel_skylib//rules:build_test.bzl", "build_test")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
-load("//xla:xla.bzl", "xla_cc_binary")
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load("@local_tsl//tsl:tsl.bzl", "if_cuda_or_rocm")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_binary")
+load("//xla/tests:build_defs.bzl", "xla_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -40,13 +40,13 @@ xla_cc_binary(
         "//xla:statusor",
         "//xla/pjrt:pjrt_client",
         "//xla/service:cpu_plugin",
+        "//xla/tsl/util:command_line_flags",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
         "@local_tsl//tsl/platform:logging",
         "@local_tsl//tsl/platform:platform_port",
         "@local_tsl//tsl/platform:status",
         "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/util:command_line_flags",
     ] + if_cuda_or_rocm([
         "//xla/service:gpu_plugin",
     ]) + if_cuda([
diff --git a/third_party/xla/xla/tools/multihost_hlo_runner/hlo_runner_main.cc b/third_party/xla/xla/tools/multihost_hlo_runner/hlo_runner_main.cc
index 6a005380ad62c4..b0666ff491fcba 100644
--- a/third_party/xla/xla/tools/multihost_hlo_runner/hlo_runner_main.cc
+++ b/third_party/xla/xla/tools/multihost_hlo_runner/hlo_runner_main.cc
@@ -28,11 +28,11 @@ limitations under the License.
 #include "xla/statusor.h"
 #include "xla/tools/multihost_hlo_runner/functional_hlo_runner.h"
 #include "xla/tools/multihost_hlo_runner/hlo_runner_flags.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "tsl/platform/init_main.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/statusor.h"
-#include "tsl/util/command_line_flags.h"
 
 namespace {
 const char* const kUsage = R"(
diff --git a/third_party/xla/xla/tools/run_hlo_module_main.cc b/third_party/xla/xla/tools/run_hlo_module_main.cc
index 6f4005a2da5d9c..c19130df2c6561 100644
--- a/third_party/xla/xla/tools/run_hlo_module_main.cc
+++ b/third_party/xla/xla/tools/run_hlo_module_main.cc
@@ -26,11 +26,11 @@ limitations under the License.
 #include "xla/service/hlo_runner.h"
 #include "xla/service/platform_util.h"
 #include "xla/tools/run_hlo_module.h"
+#include "xla/tsl/util/command_line_flags.h"
 #include "tsl/platform/init_main.h"
 #include "tsl/platform/logging.h"
 #include "tsl/platform/status.h"
 #include "tsl/platform/test.h"
-#include "tsl/util/command_line_flags.h"
 
 namespace {
 const char* const kUsage = R"(
diff --git a/third_party/xla/xla/translate/BUILD b/third_party/xla/xla/translate/BUILD
index aad084173418de..21f67f91401763 100644
--- a/third_party/xla/xla/translate/BUILD
+++ b/third_party/xla/xla/translate/BUILD
@@ -1,6 +1,6 @@
 load("@bazel_skylib//rules:build_test.bzl", "build_test")
-load("//xla:xla.bzl", "xla_cc_binary")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
+load("//xla:xla.bzl", "xla_cc_binary")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/translate/hlo_to_mhlo/BUILD b/third_party/xla/xla/translate/hlo_to_mhlo/BUILD
index 2466eaf6915dc1..ad5ad86b6b3518 100644
--- a/third_party/xla/xla/translate/hlo_to_mhlo/BUILD
+++ b/third_party/xla/xla/translate/hlo_to_mhlo/BUILD
@@ -1,6 +1,6 @@
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -94,6 +94,7 @@ cc_library(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:QuantOps",
+        "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:SparseTensorDialect",
         "@llvm-project//mlir:Support",
         "@local_tsl//tsl/platform:errors",
diff --git a/third_party/xla/xla/translate/hlo_to_mhlo/hlo_function_importer.cc b/third_party/xla/xla/translate/hlo_to_mhlo/hlo_function_importer.cc
index 62c88294fa449c..0e234bcdb7c60d 100644
--- a/third_party/xla/xla/translate/hlo_to_mhlo/hlo_function_importer.cc
+++ b/third_party/xla/xla/translate/hlo_to_mhlo/hlo_function_importer.cc
@@ -575,7 +575,12 @@ absl::StatusOr<Value> HloFunctionImporter::ImportInstructionsImpl(
         auto new_operation,
         ImportInstructionWithLayout(instruction, operands, builder));
     if (new_operation) {
-      instruction_value_map_[instruction] = new_operation->getResult(0);
+      unsigned int idx =
+          (instruction->opcode() == HloOpcode::kRngBitGenerator &&
+           instruction->shape().IsArray())
+              ? 1
+              : 0;
+      instruction_value_map_[instruction] = new_operation->getResult(idx);
     }
   }
 
@@ -1643,18 +1648,44 @@ absl::StatusOr<mlir::Operation*> HloFunctionImporter::ImportInstructionImpl(
       }
     }
     case HloOpcode::kRngBitGenerator: {
+      // HloRngBitGeneratorInstruction can have two kinds of shapes, (1)
+      // tuple(output_state, output_data), and (2) output_data.
+      // mhlo::RngBitGeneratorOp has only one shape, (output_state,
+      // output_data).
       auto rng_op = Cast<HloRngBitGeneratorInstruction>(instruction);
 
+      auto algorithm_attr = mlir::mhlo::RngAlgorithmAttr::get(
+          builder_->getContext(),
+          *mlir::mhlo::symbolizeRngAlgorithm(rng_op->algorithm()));
+      attributes.push_back(
+          builder_->getNamedAttr("rng_algorithm", algorithm_attr));
+
       // Flatten the return type if they are tuple-typed.
       llvm::SmallVector<Type> flattened_ret_types;
       FlattenTupleType(result_type, flattened_ret_types);
+      if (rng_op->shape().IsArray()) {
+        TF_ASSIGN_OR_RETURN(auto state_type,
+                            ConvertShapeToType<RankedTensorType>(
+                                rng_op->operand(0)->shape(), *builder_));
+        flattened_ret_types.insert(flattened_ret_types.begin(), state_type);
+
+        if (instruction->has_sharding()) {
+          Shape tuple_shape = ShapeUtil::MakeTupleShape(
+              {rng_op->operand(0)->shape(), instruction->shape()});
+          HloSharding tuple_sharding = HloSharding::Tuple(
+              tuple_shape, {HloSharding::Replicate(), instruction->sharding()});
+          CHECK_EQ(attributes.front().getName().str(), kShardingAttr);
+          attributes.front() = builder_->getNamedAttr(
+              kShardingAttr, ConvertSharding(tuple_sharding, builder_));
+        }
+      }
+      CHECK_EQ(flattened_ret_types.size(), 2);
 
-      auto algorithm_attr = mlir::mhlo::RngAlgorithmAttr::get(
-          builder_->getContext(),
-          *mlir::mhlo::symbolizeRngAlgorithm(rng_op->algorithm()));
       auto op = func_builder->create<mlir::mhlo::RngBitGeneratorOp>(
-          loc, flattened_ret_types, algorithm_attr, operands[0]);
-
+          loc, flattened_ret_types, operands[0], attributes);
+      if (rng_op->shape().IsArray()) {
+        return op.getOperation();
+      }
       return CreateTupleFromOpResults(func_builder, loc, op.getOperation(),
                                       result_type);
     }
diff --git a/third_party/xla/xla/translate/hlo_to_mhlo/tests/import.hlotxt b/third_party/xla/xla/translate/hlo_to_mhlo/tests/import.hlotxt
index dc9de36c5a7134..d8995bd5bdd70a 100644
--- a/third_party/xla/xla/translate/hlo_to_mhlo/tests/import.hlotxt
+++ b/third_party/xla/xla/translate/hlo_to_mhlo/tests/import.hlotxt
@@ -1723,14 +1723,29 @@ add {
   ROOT %not.2 = u16[4] not(u16[4] %Arg_0.1)
 }
 
-// CHECK-LABEL:  func private @rngbitgen
-// CHECK-SAME:    (%[[ARG0:.*]]: tensor<3xui64>)
-%rngbitgen (Arg_0.1: u64[3]) -> (u64[3], u32[2,2]) {
+// CHECK-LABEL:  func private @rngbitgen_tuple_shape
+// CHECK-SAME:    (%[[ARG0:.*]]: tensor<3xui64>) -> (tuple<tensor<3xui64>, tensor<2x2xui32>> {mhlo.sharding = "{{\{}}{maximal device=0}, {maximal device=1}}"})
+%rngbitgen_tuple_shape (Arg_0.1: u64[3]) -> (u64[3], u32[2,2]) {
   %Arg_0.1 = u64[3] parameter(0)
-  // CHECK: %[[RNG0:.+]], %[[RNG1:.+]] = "mhlo.rng_bit_generator"(%[[ARG0]]) {rng_algorithm = #mhlo.rng_algorithm<PHILOX>} : (tensor<3xui64>) -> (tensor<3xui64>, tensor<2x2xui32>)
+  // CHECK: %[[RNG0:.+]], %[[RNG1:.+]] = "mhlo.rng_bit_generator"(%[[ARG0]]) 
+  // CHECK-SAME: mhlo.sharding = "{{\{}}{maximal device=0}, {maximal device=1}}"
+  // CHECK-SAME: rng_algorithm = #mhlo.rng_algorithm<PHILOX>
+  // CHECK-SAME: (tensor<3xui64>) -> (tensor<3xui64>, tensor<2x2xui32>)
   // CHECK: %[[TUPLE:.+]] = mhlo.tuple %[[RNG0]], %[[RNG1]] {xla_shape = "(u64[3]{0}, u32[2,2]{1,0})"} : tuple<tensor<3xui64>, tensor<2x2xui32>>
   // CHECK: return %[[TUPLE]]
-  ROOT %rng-bit-generator.2 = (u64[3], u32[2,2]) rng-bit-generator(u64[3] %Arg_0.1), algorithm=rng_philox
+  ROOT %rng-bit-generator.2 = (u64[3], u32[2,2]) rng-bit-generator(u64[3] %Arg_0.1), algorithm=rng_philox, sharding={{maximal device=0}, {maximal device=1}}
+}
+
+// CHECK-LABEL:  func private @rngbitgen_array_shape
+// CHECK-SAME:    (%[[ARG0:.*]]: tensor<3xui64>) -> (tensor<2x2xui32> {mhlo.sharding = "{maximal device=0}"})
+%rngbitgen_array_shape (Arg_0.1: u64[3]) -> u32[2,2] {
+  %Arg_0.1 = u64[3] parameter(0)
+  // CHECK: %[[RNG0:.+]], %[[RNG1:.+]] = "mhlo.rng_bit_generator"(%[[ARG0]])
+  // CHECK-SAME: mhlo.sharding = "{{\{}}{replicated}, {maximal device=0}}"
+  // CHECK-SAME: rng_algorithm = #mhlo.rng_algorithm<DEFAULT>
+  // CHECK-SAME: (tensor<3xui64>) -> (tensor<3xui64>, tensor<2x2xui32>)
+  // CHECK: return %[[RNG1]]
+  ROOT %rng-bit-generator.2 = u32[2,2] rng-bit-generator(u64[3] %Arg_0.1), algorithm=rng_default, sharding={maximal device=0}
 }
 
 // CHECK-LABEL:  func private @cbrt
diff --git a/third_party/xla/xla/translate/mhlo_to_hlo/BUILD b/third_party/xla/xla/translate/mhlo_to_hlo/BUILD
index d3bb42f5ce17f4..cb94fd2ce960fa 100644
--- a/third_party/xla/xla/translate/mhlo_to_hlo/BUILD
+++ b/third_party/xla/xla/translate/mhlo_to_hlo/BUILD
@@ -1,9 +1,9 @@
 load("@bazel_skylib//rules:build_test.bzl", "build_test")
 load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library")
-load("//xla:xla.bzl", "xla_cc_test")
 load("@local_tsl//tsl:tsl.bzl", "internal_visibility")
 load("@local_tsl//tsl:tsl.default.bzl", "get_compatible_with_portable")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_binary", "cc_library")
+load("//xla:xla.bzl", "xla_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/translate/mhlo_to_hlo/attribute_exporter.cc b/third_party/xla/xla/translate/mhlo_to_hlo/attribute_exporter.cc
index b3d8b93ddfa748..21dfb491ec6e9b 100644
--- a/third_party/xla/xla/translate/mhlo_to_hlo/attribute_exporter.cc
+++ b/third_party/xla/xla/translate/mhlo_to_hlo/attribute_exporter.cc
@@ -58,7 +58,7 @@ ConvolutionDimensionNumbers ConvertConvDimensionNumbers(
 
 // Convert replica group from MLIR encoding to HLO.
 // See HloFunctionImporter::ConvertReplicaGroups for the MLIR encoding.
-StatusOr<std::vector<ReplicaGroup>> ConvertReplicaGroups(
+absl::StatusOr<std::vector<ReplicaGroup>> ConvertReplicaGroups(
     mlir::DenseIntElementsAttr input) {
   mlir::RankedTensorType type =
       input.getType().dyn_cast<mlir::RankedTensorType>();
@@ -85,7 +85,7 @@ StatusOr<std::vector<ReplicaGroup>> ConvertReplicaGroups(
 
 // Convert a (N, 2) dense attribute to a list of tuples. This is the way padding
 // and source-target pairs are defined in HLO.
-StatusOr<std::vector<std::pair<int64_t, int64_t>>> ConvertNx2Attribute(
+absl::StatusOr<std::vector<std::pair<int64_t, int64_t>>> ConvertNx2Attribute(
     std::optional<mlir::DenseIntElementsAttr> optional_attr) {
   if (!optional_attr.has_value())
     return std::vector<std::pair<int64_t, int64_t>>{};
@@ -105,7 +105,7 @@ StatusOr<std::vector<std::pair<int64_t, int64_t>>> ConvertNx2Attribute(
   return out;
 }
 
-StatusOr<FftType> ConvertFftType(llvm::StringRef type_string) {
+absl::StatusOr<FftType> ConvertFftType(llvm::StringRef type_string) {
   std::optional<mlir::mhlo::FftType> type =
       mlir::mhlo::symbolizeEnum<mlir::mhlo::FftType>(type_string);
   if (!type) return InvalidArgument("Unknown FFT type %s", type_string.str());
@@ -124,7 +124,7 @@ StatusOr<FftType> ConvertFftType(llvm::StringRef type_string) {
   }
 }
 
-StatusOr<TriangularSolveOptions::Transpose> ConvertTranspose(
+absl::StatusOr<TriangularSolveOptions::Transpose> ConvertTranspose(
     llvm::StringRef transpose_string) {
   std::optional<mlir::mhlo::Transpose> transpose =
       mlir::mhlo::symbolizeTranspose(transpose_string);
@@ -145,7 +145,7 @@ StatusOr<TriangularSolveOptions::Transpose> ConvertTranspose(
   }
 }
 
-StatusOr<xla::CustomCallSchedule> ConvertCustomCallSchedule(
+absl::StatusOr<xla::CustomCallSchedule> ConvertCustomCallSchedule(
     mlir::mhlo::CustomCallSchedule schedule) {
   switch (schedule) {
     case mlir::mhlo::CustomCallSchedule::NONE:
@@ -160,7 +160,7 @@ StatusOr<xla::CustomCallSchedule> ConvertCustomCallSchedule(
   }
 }
 
-StatusOr<xla::CustomCallApiVersion> ConvertCustomCallApiVersion(
+absl::StatusOr<xla::CustomCallApiVersion> ConvertCustomCallApiVersion(
     mlir::mhlo::CustomCallApiVersion api_version) {
   switch (api_version) {
     case mlir::mhlo::CustomCallApiVersion::API_VERSION_UNSPECIFIED:
@@ -179,7 +179,8 @@ StatusOr<xla::CustomCallApiVersion> ConvertCustomCallApiVersion(
   }
 }
 
-StatusOr<std::vector<std::pair<ShapeIndex, std::pair<int64_t, ShapeIndex>>>>
+absl::StatusOr<
+    std::vector<std::pair<ShapeIndex, std::pair<int64_t, ShapeIndex>>>>
 ConvertOutputOperandAliasing(mlir::ArrayAttr aliasArrayAttr) {
   std::vector<std::pair<ShapeIndex, std::pair<int64_t, ShapeIndex>>> aliasInfo;
   for (auto attr : aliasArrayAttr.getValue()) {
@@ -196,7 +197,8 @@ ConvertOutputOperandAliasing(mlir::ArrayAttr aliasArrayAttr) {
 std::optional<xla::OpSharding> ConvertSharding(llvm::StringRef sharding) {
   xla::OpSharding sharding_proto;
   if (sharding_proto.ParseFromString(sharding.str())) return sharding_proto;
-  StatusOr<xla::HloSharding> sharding_cpp = xla::ParseSharding(sharding.str());
+  absl::StatusOr<xla::HloSharding> sharding_cpp =
+      xla::ParseSharding(sharding.str());
   if (sharding_cpp.ok()) return sharding_cpp->ToProto();
   return std::nullopt;
 }
@@ -248,7 +250,7 @@ DotDimensionNumbers ConvertDotDimensionNumbers(
   return output;
 }
 
-StatusOr<std::vector<int64_t>> ConvertMlirArrayAttrToInt64Array(
+absl::StatusOr<std::vector<int64_t>> ConvertMlirArrayAttrToInt64Array(
     const mlir::ArrayAttr& array) {
   int rank = array.size();
   std::vector<int64_t> converted_array(rank);
diff --git a/third_party/xla/xla/translate/mhlo_to_hlo/attribute_exporter.h b/third_party/xla/xla/translate/mhlo_to_hlo/attribute_exporter.h
index 7cc8734ad81116..96aa716b588628 100644
--- a/third_party/xla/xla/translate/mhlo_to_hlo/attribute_exporter.h
+++ b/third_party/xla/xla/translate/mhlo_to_hlo/attribute_exporter.h
@@ -33,25 +33,26 @@ namespace xla {
 ConvolutionDimensionNumbers ConvertConvDimensionNumbers(
     mlir::mhlo::ConvDimensionNumbersAttr input);
 
-StatusOr<std::vector<ReplicaGroup>> ConvertReplicaGroups(
+absl::StatusOr<std::vector<ReplicaGroup>> ConvertReplicaGroups(
     mlir::DenseIntElementsAttr input);
 
 // Convert a (N, 2) dense attribute to a list of tuples. This is the way padding
 // and source-target pairs are defined in HLO.
-StatusOr<std::vector<std::pair<int64_t, int64_t>>> ConvertNx2Attribute(
+absl::StatusOr<std::vector<std::pair<int64_t, int64_t>>> ConvertNx2Attribute(
     std::optional<mlir::DenseIntElementsAttr> optional_attr);
 
-StatusOr<FftType> ConvertFftType(llvm::StringRef type_string);
-StatusOr<TriangularSolveOptions::Transpose> ConvertTranspose(
+absl::StatusOr<FftType> ConvertFftType(llvm::StringRef type_string);
+absl::StatusOr<TriangularSolveOptions::Transpose> ConvertTranspose(
     llvm::StringRef transpose_string);
 
-StatusOr<xla::CustomCallSchedule> ConvertCustomCallSchedule(
+absl::StatusOr<xla::CustomCallSchedule> ConvertCustomCallSchedule(
     mlir::mhlo::CustomCallSchedule schedule);
 
-StatusOr<xla::CustomCallApiVersion> ConvertCustomCallApiVersion(
+absl::StatusOr<xla::CustomCallApiVersion> ConvertCustomCallApiVersion(
     mlir::mhlo::CustomCallApiVersion api_version);
 
-StatusOr<std::vector<std::pair<ShapeIndex, std::pair<int64_t, ShapeIndex>>>>
+absl::StatusOr<
+    std::vector<std::pair<ShapeIndex, std::pair<int64_t, ShapeIndex>>>>
 ConvertOutputOperandAliasing(mlir::ArrayAttr aliasArrayAttr);
 
 // Returns an OpSharding that represents the result of parsing the given string:
@@ -67,7 +68,7 @@ DotDimensionNumbers ConvertDotDimensionNumbers(
     absl::Span<const int64_t> rhs_batch,
     absl::Span<const int64_t> rhs_contract);
 
-StatusOr<std::vector<int64_t>> ConvertMlirArrayAttrToInt64Array(
+absl::StatusOr<std::vector<int64_t>> ConvertMlirArrayAttrToInt64Array(
     const mlir::ArrayAttr& array);
 }  // namespace xla
 #endif  // XLA_TRANSLATE_MHLO_TO_HLO_ATTRIBUTE_EXPORTER_H_
diff --git a/third_party/xla/xla/translate/mhlo_to_hlo/layout_util.cc b/third_party/xla/xla/translate/mhlo_to_hlo/layout_util.cc
index f04de43383760e..5d8a128f7b231f 100644
--- a/third_party/xla/xla/translate/mhlo_to_hlo/layout_util.cc
+++ b/third_party/xla/xla/translate/mhlo_to_hlo/layout_util.cc
@@ -63,7 +63,7 @@ xla::Status RewriteLayoutWithShardedShape(
 
 // There is a shape_representation_fn or sharding for an output, this function
 // uses a reshape to fix the layout.
-xla::StatusOr<xla::XlaOp> ReshapeWithCorrectRepresentationAndSharding(
+absl::StatusOr<xla::XlaOp> ReshapeWithCorrectRepresentationAndSharding(
     xla::XlaBuilder* builder, xla::XlaOp original, xla::Shape original_shape,
     const LayoutPreferenceFn& layout_preference_fn,
     const ShapeRepresentationFn& shape_representation_fn,
@@ -105,6 +105,7 @@ xla::StatusOr<xla::XlaOp> ReshapeWithCorrectRepresentationAndSharding(
       to_shape.set_dynamic_dimension(i, original_shape.is_dynamic_dimension(i));
     }
   }
+  xla::XlaScopedShardingAssignment scoped_sharding(builder, sharding);
   return xla::Reshape(to_shape, original);
 }
 
diff --git a/third_party/xla/xla/translate/mhlo_to_hlo/layout_util.h b/third_party/xla/xla/translate/mhlo_to_hlo/layout_util.h
index 5db4ea2f6ff26c..2e72c6d36fca68 100644
--- a/third_party/xla/xla/translate/mhlo_to_hlo/layout_util.h
+++ b/third_party/xla/xla/translate/mhlo_to_hlo/layout_util.h
@@ -52,11 +52,11 @@ enum class XlaLayoutPreference {
 // The following defines the layout preference of an xla tensor.
 // The return value of LayoutPreferenceFn can be used in
 // ShapeRepresentationFn.
-typedef std::function<xla::StatusOr<XlaLayoutPreference>(
+typedef std::function<absl::StatusOr<XlaLayoutPreference>(
     const xla::Shape& shape)>
     LayoutPreferenceFn;
 
-typedef std::function<xla::StatusOr<xla::Shape>(
+typedef std::function<absl::StatusOr<xla::Shape>(
     const xla::Shape& shape, bool fast_mem,
     XlaLayoutPreference layout_preference)>
     ShapeRepresentationFn;
@@ -73,7 +73,7 @@ xla::Status RewriteLayoutWithShardedShape(
 
 // Adds reshapes to fix the layout of an output, if a shape_representation_fn or
 // sharding is present.
-xla::StatusOr<xla::XlaOp> ReshapeWithCorrectRepresentationAndSharding(
+absl::StatusOr<xla::XlaOp> ReshapeWithCorrectRepresentationAndSharding(
     xla::XlaBuilder* builder, xla::XlaOp original, xla::Shape original_shape,
     const LayoutPreferenceFn& layout_preference_fn,
     const ShapeRepresentationFn& shape_representation_fn,
diff --git a/third_party/xla/xla/translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc b/third_party/xla/xla/translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc
index 5df73a64c517ab..8911c7864ac83e 100644
--- a/third_party/xla/xla/translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc
+++ b/third_party/xla/xla/translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "xla/translate/mhlo_to_hlo/mlir_hlo_to_hlo.h"
 
 #include <algorithm>
+#include <cassert>
 #include <cstddef>
 #include <iterator>
 #include <memory>
@@ -184,8 +185,8 @@ xla::Array<T> ArrayFromDenseElementsAttr(mlir::DenseElementsAttr dense_attr) {
   return array;
 }
 
-StatusOr<xla::Literal> CreateArrayLiteralFromAttr(mlir::ElementsAttr attr,
-                                                  xla::Layout layout) {
+absl::StatusOr<xla::Literal> CreateArrayLiteralFromAttr(mlir::ElementsAttr attr,
+                                                        xla::Layout layout) {
   auto dense_attr = attr.dyn_cast<mlir::DenseElementsAttr>();
   if (!dense_attr)
     return tsl::errors::Unimplemented("Only dense elements attr are supported");
@@ -193,7 +194,7 @@ StatusOr<xla::Literal> CreateArrayLiteralFromAttr(mlir::ElementsAttr attr,
   xla::Shape shape = xla::TypeToShape(dense_attr.getType());
 
   return xla::primitive_util::PrimitiveTypeSwitch<StatusOr<xla::Literal>>(
-      [&](auto primitive_type_constant) -> StatusOr<xla::Literal> {
+      [&](auto primitive_type_constant) -> absl::StatusOr<xla::Literal> {
         if constexpr (xla::primitive_util::IsArrayType(
                           primitive_type_constant)) {
           using cpp_type =
@@ -832,6 +833,25 @@ bool SimplyReturnedOp(mlir::Operation* op) {
   return false;
 }
 
+void BuildGetTupleElementsForTupleResults(mlir::Operation* op, xla::XlaOp tuple,
+                                          OpLoweringContext ctx) {
+  const std::optional<xla::OpSharding>& tuple_sharding =
+      ctx.builder->sharding();
+  if (tuple_sharding.has_value()) {
+    assert(op->getNumResults() == tuple_sharding->tuple_shardings_size());
+    for (auto [index, result] : llvm::enumerate(op->getResults())) {
+      xla::XlaScopedShardingAssignment scoped_sharding(
+          ctx.builder, tuple_sharding->tuple_shardings(index));
+      (*ctx.values)[result] = xla::GetTupleElement(tuple, index);
+    }
+  } else {
+    xla::XlaScopedShardingAssignment scoped_sharding(ctx.builder, std::nullopt);
+    for (auto [index, result] : llvm::enumerate(op->getResults())) {
+      (*ctx.values)[result] = xla::GetTupleElement(tuple, index);
+    }
+  }
+}
+
 }  // namespace
 
 namespace mlir {
@@ -995,9 +1015,7 @@ LogicalResult ExportXlaOp(AllGatherOp op, OpLoweringContext ctx) {
         Convert_replica_groups(op.getReplicaGroups()),
         Convert_channel_handle(op.getChannelHandle()), layout,
         Convert_use_global_device_ids(op.getUseGlobalDeviceIds()));
-    for (auto [index, result] : llvm::enumerate(op.getResults())) {
-      value_map[result] = xla::GetTupleElement(tuple, index);
-    }
+    BuildGetTupleElementsForTupleResults(op, tuple, ctx);
   } else {
     value_map[op->getResults()[0]] = xla::AllGather(
         operands[0], all_gather_dim, shard_count,
@@ -1030,9 +1048,7 @@ LogicalResult ExportXlaOp(AllReduceOp op, OpLoweringContext ctx) {
         operands, computation, Convert_replica_groups(op.getReplicaGroups()),
         Convert_channel_handle(op.getChannelHandle()), shape_with_layout,
         Convert_use_global_device_ids(op.getUseGlobalDeviceIds()));
-    for (auto [index, result] : llvm::enumerate(op.getResults())) {
-      value_map[result] = xla::GetTupleElement(tuple, index);
-    }
+    BuildGetTupleElementsForTupleResults(op, tuple, ctx);
   } else {
     value_map[op->getResults()[0]] = xla::AllReduce(
         operands[0], computation, Convert_replica_groups(op.getReplicaGroups()),
@@ -1061,9 +1077,7 @@ LogicalResult ExportXlaOp(AllToAllOp op, OpLoweringContext ctx) {
     auto tuple = xla::AllToAllTuple(
         operands, Convert_replica_groups(op.getReplicaGroups()), layout,
         Convert_channel_handle(op.getChannelHandle()));
-    for (auto [index, result] : llvm::enumerate(op.getResults())) {
-      value_map[result] = xla::GetTupleElement(tuple, index);
-    }
+    BuildGetTupleElementsForTupleResults(op, tuple, ctx);
   } else {
     // ArrayAllToAll always has exactly one operand (checked in the verifier).
     value_map[op->getResults()[0]] = xla::AllToAll(
@@ -1361,11 +1375,7 @@ LogicalResult ExportXlaOp(AsyncDoneOp op, OpLoweringContext ctx) {
     if (op.getNumResults() == 1) {
       value_map[op.getResult(0)] = xla_recv;
     } else {
-      xla::XlaScopedShardingAssignment scoped_sharding(ctx.builder,
-                                                       std::nullopt);
-      for (const auto& item : llvm::enumerate(op.getResults())) {
-        value_map[item.value()] = xla::GetTupleElement(xla_recv, item.index());
-      }
+      BuildGetTupleElementsForTupleResults(op, xla_recv, ctx);
     }
     return success();
   }
@@ -1384,9 +1394,7 @@ LogicalResult ExportXlaOp(AsyncDoneOp op, OpLoweringContext ctx) {
   if (op.getNumResults() == 1) {
     value_map[op.getResult(0)] = exportedOp;
   } else {
-    for (const auto& item : llvm::enumerate(op.getResults())) {
-      value_map[item.value()] = xla::GetTupleElement(exportedOp, item.index());
-    }
+    BuildGetTupleElementsForTupleResults(op, exportedOp, ctx);
   }
   return success();
 }
@@ -1600,9 +1608,7 @@ LogicalResult ExportXlaOp(IfOp op, OpLoweringContext ctx) {
   if (op.getNumResults() == 1) {
     value_map[op.getResult(0)] = ifop;
   } else {
-    for (const auto& item : llvm::enumerate(op.getResults())) {
-      value_map[item.value()] = xla::GetTupleElement(ifop, item.index());
-    }
+    BuildGetTupleElementsForTupleResults(op, ifop, ctx);
   }
 
   return success();
@@ -1657,9 +1663,7 @@ LogicalResult ExportXlaOp(CaseOp op, OpLoweringContext ctx) {
   if (op.getNumResults() == 1) {
     value_map[op.getResult(0)] = caseop;
   } else {
-    for (const auto& item : llvm::enumerate(op.getResults())) {
-      value_map[item.value()] = xla::GetTupleElement(caseop, item.index());
-    }
+    BuildGetTupleElementsForTupleResults(op, caseop, ctx);
   }
   return success();
 }
@@ -2013,9 +2017,7 @@ LogicalResult ExportXlaOp(CustomCallOp op, OpLoweringContext ctx) {
                               reduction_dim, comparator, recall_target,
                               aggregate_to_topk, reduction_input_size_override);
     }
-    for (const auto& item : llvm::enumerate(op.getResults())) {
-      value_map[item.value()] = xla::GetTupleElement(cc_op, item.index());
-    }
+    BuildGetTupleElementsForTupleResults(op, cc_op, ctx);
     return success();
   }
 
@@ -2056,7 +2058,7 @@ LogicalResult ExportXlaOp(CustomCallOp op, OpLoweringContext ctx) {
     }
   }
 
-  StatusOr<xla::Literal> literal;
+  absl::StatusOr<xla::Literal> literal;
   const xla::Literal* literal_ptr = nullptr;
   auto literal_attr = op->getAttrOfType<DenseElementsAttr>(kLiteralAttr);
   if (literal_attr) {
@@ -2115,9 +2117,7 @@ LogicalResult ExportXlaOp(CustomCallOp op, OpLoweringContext ctx) {
   if (op->getNumResults() == 1) {
     value_map[op.getResult(0)] = custom_call;
   } else {
-    for (auto [index, result] : llvm::enumerate(op.getResults())) {
-      value_map[result] = xla::GetTupleElement(custom_call, index);
-    }
+    BuildGetTupleElementsForTupleResults(op, custom_call, ctx);
   }
 
   return success();
@@ -2312,9 +2312,7 @@ LogicalResult ExportXlaOp(ReduceOp op, OpLoweringContext ctx) {
   if (op.getNumResults() == 1) {
     value_map[op.getResult(0)] = result;
   } else {
-    for (const auto& item : llvm::enumerate(op.getResults())) {
-      value_map[item.value()] = xla::GetTupleElement(result, item.index());
-    }
+    BuildGetTupleElementsForTupleResults(op, result, ctx);
   }
   return success();
 }
@@ -2342,9 +2340,7 @@ LogicalResult ExportXlaOp(ReduceWindowOp op, OpLoweringContext ctx) {
   if (op.getNumResults() == 1) {
     value_map[op.getResult(0)] = result;
   } else {
-    for (const auto& item : llvm::enumerate(op.getResults())) {
-      value_map[item.value()] = xla::GetTupleElement(result, item.index());
-    }
+    BuildGetTupleElementsForTupleResults(op, result, ctx);
   }
   return success();
 }
@@ -2374,9 +2370,7 @@ LogicalResult ExportXlaOp(RngBitGeneratorOp op, OpLoweringContext ctx) {
       static_cast<xla::RandomAlgorithm>(op.getRngAlgorithm()),
       Unwrap(xla_arg_1), xla::TypeToShape(results[1].getType()));
 
-  for (const auto& item : llvm::enumerate(results))
-    value_map[item.value()] = xla::GetTupleElement(xla_result, item.index());
-
+  BuildGetTupleElementsForTupleResults(op, xla_result, ctx);
   return mlir::success();
 }
 
@@ -2391,7 +2385,6 @@ LogicalResult ExportXlaOp(XlaRngGetAndUpdateStateOp op, OpLoweringContext ctx) {
 
 LogicalResult ExportXlaOp(BatchNormGradOp op, OpLoweringContext ctx) {
   auto& value_map = *ctx.values;
-  auto results = op.getResults();
 
   xla::XlaOp operand, scale, mean, variance, grad_output;
   if (failed(GetXlaOp(op.getOperand(), value_map, &operand, op)))
@@ -2407,15 +2400,13 @@ LogicalResult ExportXlaOp(BatchNormGradOp op, OpLoweringContext ctx) {
       xla::BatchNormGrad(operand, scale, mean, variance, grad_output,
                          ConvertAPFloat(op.getEpsilon()), op.getFeatureIndex());
 
-  for (const auto& item : llvm::enumerate(results))
-    value_map[item.value()] = xla::GetTupleElement(xla_result, item.index());
+  BuildGetTupleElementsForTupleResults(op, xla_result, ctx);
 
   return mlir::success();
 }
 
 LogicalResult ExportXlaOp(BatchNormTrainingOp op, OpLoweringContext ctx) {
   auto& value_map = *ctx.values;
-  auto results = op.getResults();
 
   xla::XlaOp operand, scale, offset;
   if (failed(GetXlaOp(op.getOperand(), value_map, &operand, op)))
@@ -2428,8 +2419,7 @@ LogicalResult ExportXlaOp(BatchNormTrainingOp op, OpLoweringContext ctx) {
                                            ConvertAPFloat(op.getEpsilon()),
                                            op.getFeatureIndex());
 
-  for (const auto& item : llvm::enumerate(results))
-    value_map[item.value()] = xla::GetTupleElement(xla_result, item.index());
+  BuildGetTupleElementsForTupleResults(op, xla_result, ctx);
 
   return mlir::success();
 }
@@ -2478,9 +2468,7 @@ LogicalResult ExportXlaOp(ScatterOp op, OpLoweringContext ctx) {
   }
 
   // mhlo.ScatterOp supports multiple returns, untuple all the results of XLA's.
-  for (const auto& it : llvm::enumerate(op.getResults())) {
-    value_map[it.value()] = xla::GetTupleElement(scatter_op, it.index());
-  }
+  BuildGetTupleElementsForTupleResults(op, scatter_op, ctx);
 
   return success();
 }
@@ -2607,9 +2595,7 @@ LogicalResult ExportXlaOp(SortOp op, OpLoweringContext ctx) {
   }
 
   // MLIR's sort supports multiple returns, untuple all the results of XLA's.
-  for (const auto& it : llvm::enumerate(op.getResults())) {
-    value_map[it.value()] = xla::GetTupleElement(sorted, it.index());
-  }
+  BuildGetTupleElementsForTupleResults(op, sorted, ctx);
   return success();
 }
 
@@ -2672,9 +2658,7 @@ LogicalResult ExportXlaOp(WhileOp op, OpLoweringContext ctx) {
   }
 
   // mhlo.WhileOp supports multiple returns, untuple all the results of XLA's.
-  for (const auto& it : llvm::enumerate(op.getResults())) {
-    value_map[it.value()] = xla::GetTupleElement(whileop, it.index());
-  }
+  BuildGetTupleElementsForTupleResults(op, whileop, ctx);
 
   return success();
 }
@@ -2693,10 +2677,7 @@ LogicalResult ExportXlaOp(OptimizationBarrierOp op, OpLoweringContext ctx) {
         xla::OptimizationBarrier(operands[0]);
   } else {
     auto result = xla::OptimizationBarrier(Tuple(ctx.builder, operands));
-
-    for (const auto& it : llvm::enumerate(op.getResults())) {
-      value_map[it.value()] = xla::GetTupleElement(result, it.index());
-    }
+    BuildGetTupleElementsForTupleResults(op, result, ctx);
   }
 
   return success();
@@ -2729,9 +2710,7 @@ LogicalResult ExportXlaOp(FusionOp op, OpLoweringContext ctx) {
   if (op.getNumResults() == 1) {
     values[op.getResult(0)] = fusion;
   } else {
-    for (const auto& item : llvm::enumerate(op.getResults())) {
-      values[item.value()] = xla::GetTupleElement(fusion, item.index());
-    }
+    BuildGetTupleElementsForTupleResults(op, fusion, ctx);
   }
   return success();
 }
@@ -2790,9 +2769,7 @@ LogicalResult ExportXlaOp(TopKOp op, OpLoweringContext ctx) {
   auto topk = xla::TopK(operand, op.getK(), op.getLargest());
 
   // Untuple the two results of XLA's topk.
-  for (const auto& [index, value] : llvm::enumerate(op.getResults())) {
-    value_map[value] = xla::GetTupleElement(topk, index);
-  }
+  BuildGetTupleElementsForTupleResults(op, topk, ctx);
   return success();
 }
 
@@ -3164,7 +3141,7 @@ LogicalResult ConvertToHloModule::Lower(
         if (!is_entry_function || !has_ret_shardings) continue;
 
         xla::Shape return_shape = xla::TypeToShape(ret.get().getType());
-        StatusOr<xla::XlaOp> reshape =
+        absl::StatusOr<xla::XlaOp> reshape =
             ReshapeWithCorrectRepresentationAndSharding(
                 builder, returns[index], return_shape,
                 options_.layout_preference_fn, options_.shape_representation_fn,
diff --git a/third_party/xla/xla/translate/mhlo_to_hlo/tests/BUILD b/third_party/xla/xla/translate/mhlo_to_hlo/tests/BUILD
index 46548c0f647d91..140d5bd9093576 100644
--- a/third_party/xla/xla/translate/mhlo_to_hlo/tests/BUILD
+++ b/third_party/xla/xla/translate/mhlo_to_hlo/tests/BUILD
@@ -1,5 +1,5 @@
-load("//xla:lit.bzl", "enforce_glob", "lit_test_suite")
 load("@local_tsl//tsl:tsl.default.bzl", "filegroup")
+load("//xla:lit.bzl", "enforce_glob", "lit_test_suite")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
diff --git a/third_party/xla/xla/translate/mhlo_to_hlo/tests/export.mlir b/third_party/xla/xla/translate/mhlo_to_hlo/tests/export.mlir
index e44e994a5f23a8..0a49b55a59c879 100644
--- a/third_party/xla/xla/translate/mhlo_to_hlo/tests/export.mlir
+++ b/third_party/xla/xla/translate/mhlo_to_hlo/tests/export.mlir
@@ -2173,10 +2173,8 @@ func.func @main(%token: !mhlo.token) -> (tensor<3x4xi32>, !mhlo.token) {
 // CHECK-SAME:  sharding={
 // CHECK-SAME:    {maximal device=0}, {maximal device=0}
 // CHECK-SAME:  }
-// CHECK:  [[TUPLE0:%.*]] = s32[3,4] get-tuple-element((s32[3,4], token[]) [[RECV_DONE]]), index=0
-// CHECK-NOT: sharding=
-// CHECK:  [[TUPLE1:%.*]] = token[] get-tuple-element((s32[3,4], token[]) [[RECV_DONE]]), index=1
-// CHECK-NOT: sharding=
+// CHECK:  [[TUPLE0:%.*]] = s32[3,4] get-tuple-element((s32[3,4], token[]) [[RECV_DONE]]), index=0, sharding={maximal device=0}
+// CHECK:  [[TUPLE1:%.*]] = token[] get-tuple-element((s32[3,4], token[]) [[RECV_DONE]]), index=1, sharding={maximal device=0}
 // CHECK:  ROOT {{%.*}} = (s32[3,4], token[]) tuple(s32[3,4] [[TUPLE0]], token[] [[TUPLE1]])
 
 // -----
@@ -2855,9 +2853,12 @@ func.func @main(%arg: tensor<3x4xf32>) -> tensor<3x4x1xf32> {
 func.func @main(%arg0: tensor<4x4xf32>, %arg1: tensor<3x4xf32>) -> (tensor<4x4xf32>, tensor<3x4xf32>) {
 // CHECK: %[[ARG0:.*]] = f32[4,4] parameter(0)
 // CHECK: %[[ARG1:.*]] = f32[3,4] parameter(1)
-// CHECK: %[[ARGS:.*]] = (f32[4,4], f32[3,4]) tuple(f32[4,4] %[[ARG0]], f32[3,4] %[[ARG1]])
-// CHECK: %[[RESULT:.*]] = (f32[4,4], f32[3,4]) opt-barrier((f32[4,4], f32[3,4]) %[[ARGS]])
-  %0, %1 = "mhlo.optimization_barrier"(%arg0, %arg1) : (tensor<4x4xf32>, tensor<3x4xf32>) -> (tensor<4x4xf32>, tensor<3x4xf32>)
+// CHECK: %[[ARGS:.*]] = (f32[4,4], f32[3,4]) tuple(f32[4,4] %[[ARG0]], f32[3,4] %[[ARG1]]), sharding={{\{}}{replicated}, {devices=[1,2]<=[2]}}
+// CHECK: %[[OPT:.*]] = (f32[4,4], f32[3,4]) opt-barrier((f32[4,4], f32[3,4]) %[[ARGS]]), sharding={{\{}}{replicated}, {devices=[1,2]<=[2]}}
+// CHECK: %[[GTE0:.*]] = f32[4,4] get-tuple-element((f32[4,4], f32[3,4]) %[[OPT]]), index=0, sharding={replicated}
+// CHECK: %[[GTE1:.*]] = f32[3,4] get-tuple-element((f32[4,4], f32[3,4]) %[[OPT]]), index=1, sharding={devices=[1,2]<=[2]}
+// CHECK: ROOT %[[ROOT:.*]] = (f32[4,4], f32[3,4]) tuple(f32[4,4] %[[GTE0]], f32[3,4] %[[GTE1]])
+  %0, %1 = "mhlo.optimization_barrier"(%arg0, %arg1) {mhlo.sharding = "{{replicated}, {devices=[1,2]<=[2]}}"} : (tensor<4x4xf32>, tensor<3x4xf32>) -> (tensor<4x4xf32>, tensor<3x4xf32>)
   func.return %0, %1 : tensor<4x4xf32>, tensor<3x4xf32>
 }
 
diff --git a/third_party/xla/xla/translate/mhlo_to_hlo/tests/sharding.mlir b/third_party/xla/xla/translate/mhlo_to_hlo/tests/sharding.mlir
index db19b7b24f8b6b..a4a2650d61b69b 100644
--- a/third_party/xla/xla/translate/mhlo_to_hlo/tests/sharding.mlir
+++ b/third_party/xla/xla/translate/mhlo_to_hlo/tests/sharding.mlir
@@ -31,8 +31,9 @@ func.func @main(%arg0: tensor<5x8x128xf32> {mhlo.sharding = "\08\03\1A\03\01\02\
 // CHECK-LABEL: ENTRY %main.{{.*}} ({{[^,]*}}: f32[4,4]) -> (f32[4,4], f32[4,4])
 func.func @main(%arg0: tensor<4x4xf32>) -> (tensor<4x4xf32> {mhlo.sharding = "\08\03\1A\03\02\01\02\22\04\00\01\02\03B\01\00"}, tensor<4x4xf32>) {
   // CHECK-NEXT: %Arg_0.1 = f32[4,4] parameter(0)
-  // CHECK-NEXT: [[RESHAPE_0:%.*]] = f32[4,4] reshape(f32[4,4] %Arg_0.1)
+  // CHECK-NEXT: [[RESHAPE_0:%.*]] = f32[4,4] reshape(f32[4,4] %Arg_0.1), sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate}
   // CHECK-NEXT: [[RESHAPE_1:%.*]] = f32[4,4] reshape(f32[4,4] %Arg_0.1)
+  // CHECK-NOT:  sharding
   // CHECK-NEXT: ROOT {{%.*}} = (f32[4,4], f32[4,4]) tuple(f32[4,4] [[RESHAPE_0]], f32[4,4] [[RESHAPE_1]])
   // CHECK-SAME: sharding={{\{}}{devices=[2,1,2]0,1,2,3 last_tile_dim_replicate}, {replicated}}
   return %arg0, %arg0 : tensor<4x4xf32>, tensor<4x4xf32>
@@ -49,3 +50,20 @@ func.func @main() -> (tensor<12x24x36xf32>) {
   %1 = mhlo.add %0, %0 : tensor<12x24x36xf32>
   return %1 : tensor<12x24x36xf32>
 }
+
+// -----
+
+// CHECK-LABEL: ENTRY %main.{{.*}} (Arg_0.1: u64[2]) -> (u64[2], u32[512,4])
+func.func @main(%arg0: tensor<2xui64>) -> (tensor<2xui64> {mhlo.sharding = "{devices=[2,16]<=[32] last_tile_dim_replicate}"}, tensor<512x4xui32> {mhlo.sharding = "{devices=[4,8]<=[32]}"}) {
+  // CHECK-NEXT: %Arg_0.1 = u64[2] parameter(0)
+  // CHECK-NEXT: %rng-bit-generator.2 = (u64[2], u32[512,4]) rng-bit-generator(u64[2] %Arg_0.1), algorithm=rng_default, sharding={{\{}}{replicated}, {devices=[8,4]<=[32]}}
+  // CHECK-NEXT: %get-tuple-element.3 = u64[2] get-tuple-element((u64[2], u32[512,4]) %rng-bit-generator.2), index=0, sharding={replicated}
+  // CHECK-NEXT: %add.5 = u64[2] add(u64[2] %get-tuple-element.3, u64[2] %get-tuple-element.3)
+  // CHECK-NEXT: %reshape.6 = u64[2] reshape(u64[2] %add.5)
+  // CHECK-NEXT: %get-tuple-element.4 = u32[512,4] get-tuple-element((u64[2], u32[512,4]) %rng-bit-generator.2), index=1, sharding={devices=[8,4]<=[32]}
+  // CHECK-NEXT: %reshape.7 = u32[512,4] reshape(u32[512,4] %get-tuple-element.4)
+  // CHECK-NEXT: ROOT %tuple.8 = (u64[2], u32[512,4]) tuple(u64[2] %reshape.6, u32[512,4] %reshape.7), sharding={{\{}}{devices=[2,16]<=[32] last_tile_dim_replicate}, {devices=[4,8]<=[32]}}
+  %output_state, %output = "mhlo.rng_bit_generator"(%arg0) {mhlo.sharding = "{{replicated}, {devices=[8,4]<=[32]}}", rng_algorithm = #mhlo.rng_algorithm<DEFAULT>} : (tensor<2xui64>) -> (tensor<2xui64>, tensor<512x4xui32>)
+  %0 = mhlo.add %output_state, %output_state : tensor<2xui64>
+  return %0, %output : tensor<2xui64>, tensor<512x4xui32>
+}
diff --git a/third_party/xla/xla/translate/mhlo_to_hlo/translate.cc b/third_party/xla/xla/translate/mhlo_to_hlo/translate.cc
index b78aaba15bb75a..409bab6c722cef 100644
--- a/third_party/xla/xla/translate/mhlo_to_hlo/translate.cc
+++ b/third_party/xla/xla/translate/mhlo_to_hlo/translate.cc
@@ -43,7 +43,7 @@ mlir::LogicalResult MlirHloToHloTranslateFunction(mlir::ModuleOp module,
   return mlir::success();
 }
 
-StatusOr<std::unique_ptr<HloModule>> HloModuleFromProto(
+absl::StatusOr<std::unique_ptr<HloModule>> HloModuleFromProto(
     const HloProto& hlo_proto) {
   const HloModuleProto& module_proto = hlo_proto.hlo_module();
   TF_ASSIGN_OR_RETURN(const HloModuleConfig module_config,
diff --git a/third_party/xla/xla/translate/mhlo_to_hlo/type_to_shape_test.cc b/third_party/xla/xla/translate/mhlo_to_hlo/type_to_shape_test.cc
index 6739edc952a82d..36ef41a55ac78f 100644
--- a/third_party/xla/xla/translate/mhlo_to_hlo/type_to_shape_test.cc
+++ b/third_party/xla/xla/translate/mhlo_to_hlo/type_to_shape_test.cc
@@ -154,7 +154,7 @@ TEST(TypeToShapeTest, ConvertMemRefToShape) {
   MLIRContext context;
   mlir::Builder builder(&context);
 
-  StatusOr<mlir::Type> mlir_type =
+  absl::StatusOr<mlir::Type> mlir_type =
       ConvertShapeToType<MemRefType>(shape, builder);
   ASSERT_TRUE(mlir_type.ok());
   mlir::Type type = std::move(mlir_type).value();
@@ -171,7 +171,7 @@ TEST(TypeToShapeTest, ConvertMemRefToShape2) {
   MLIRContext context;
   mlir::Builder builder(&context);
 
-  StatusOr<mlir::Type> mlir_type =
+  absl::StatusOr<mlir::Type> mlir_type =
       ConvertShapeToType<MemRefType>(shape, builder);
   ASSERT_TRUE(mlir_type.ok());
   mlir::Type type = std::move(mlir_type).value();
diff --git a/third_party/xla/xla/tsl/c/tsl_status.cc b/third_party/xla/xla/tsl/c/tsl_status.cc
index 4b90c2a2b81660..d6f86a71aa7aea 100644
--- a/third_party/xla/xla/tsl/c/tsl_status.cc
+++ b/third_party/xla/xla/tsl/c/tsl_status.cc
@@ -31,7 +31,7 @@ void TSL_DeleteStatus(TSL_Status* s) { delete s; }
 
 void TSL_SetStatus(TSL_Status* s, TSL_Code code, const char* msg) {
   if (code == TSL_OK) {
-    s->status = ::tsl::OkStatus();
+    s->status = absl::OkStatus();
     return;
   }
   s->status =
diff --git a/third_party/xla/xla/tsl/c/tsl_status_internal.h b/third_party/xla/xla/tsl/c/tsl_status_internal.h
index 43ea9e743331e2..132adc62dac66f 100644
--- a/third_party/xla/xla/tsl/c/tsl_status_internal.h
+++ b/third_party/xla/xla/tsl/c/tsl_status_internal.h
@@ -22,7 +22,7 @@ limitations under the License.
 // and should not be depended on.
 
 struct TSL_Status {
-  tsl::Status status;
+  absl::Status status;
 };
 
 #endif  // XLA_TSL_C_TSL_STATUS_INTERNAL_H_
diff --git a/third_party/xla/xla/tsl/cuda/stub.bzl b/third_party/xla/xla/tsl/cuda/stub.bzl
index d5e644dc13c97c..1aaa52746d69b8 100644
--- a/third_party/xla/xla/tsl/cuda/stub.bzl
+++ b/third_party/xla/xla/tsl/cuda/stub.bzl
@@ -21,6 +21,7 @@ def cuda_stub(name, srcs):
         cmd = select({
             "@local_tsl//tsl:linux_aarch64": "$(location //third_party/implib_so:make_stub) $< --outdir $(RULEDIR) --target aarch64",
             "@local_tsl//tsl:linux_x86_64": "$(location //third_party/implib_so:make_stub) $< --outdir $(RULEDIR) --target x86_64",
+            "@local_tsl//tsl:linux_ppc64le": "$(location //third_party/implib_so:make_stub) $< --outdir $(RULEDIR) --target powerpc64le",
             "//conditions:default": "NOT_IMPLEMENTED_FOR_THIS_PLATFORM_OR_ARCHITECTURE",
         }),
     )
diff --git a/third_party/xla/third_party/tsl/tsl/python/lib/core/BUILD b/third_party/xla/xla/tsl/python/lib/core/BUILD
similarity index 100%
rename from third_party/xla/third_party/tsl/tsl/python/lib/core/BUILD
rename to third_party/xla/xla/tsl/python/lib/core/BUILD
diff --git a/third_party/xla/third_party/tsl/tsl/python/lib/core/ml_dtypes.cc b/third_party/xla/xla/tsl/python/lib/core/ml_dtypes.cc
similarity index 94%
rename from third_party/xla/third_party/tsl/tsl/python/lib/core/ml_dtypes.cc
rename to third_party/xla/xla/tsl/python/lib/core/ml_dtypes.cc
index 662a29f84f9387..d138c00bf9e6d5 100644
--- a/third_party/xla/third_party/tsl/tsl/python/lib/core/ml_dtypes.cc
+++ b/third_party/xla/xla/tsl/python/lib/core/ml_dtypes.cc
@@ -12,18 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tsl/python/lib/core/ml_dtypes.h"
+#include "xla/tsl/python/lib/core/ml_dtypes.h"
 
 #include <atomic>
 #include <exception>
 
+// Must be included first to ensure `NPY_NO_DEPRECATED_API` is defined.
+// clang-format off
+#include "xla/tsl/python/lib/core/numpy.h"  // IWYU pragma: keep
+// clang-format on
 #include "numpy/ndarraytypes.h"
 #include "absl/base/attributes.h"
 #include "absl/base/call_once.h"
 #include "pybind11/gil.h"  // from @pybind11
 #include "pybind11/numpy.h"  // from @pybind11
 #include "pybind11/pybind11.h"  // from @pybind11
-#include "tsl/python/lib/core/numpy.h"  // IWYU pragma: keep
 
 namespace tsl {
 namespace ml_dtypes {
diff --git a/third_party/xla/third_party/tsl/tsl/python/lib/core/ml_dtypes.h b/third_party/xla/xla/tsl/python/lib/core/ml_dtypes.h
similarity index 90%
rename from third_party/xla/third_party/tsl/tsl/python/lib/core/ml_dtypes.h
rename to third_party/xla/xla/tsl/python/lib/core/ml_dtypes.h
index f2b93ebee41a21..bf9eab2200a76b 100644
--- a/third_party/xla/third_party/tsl/tsl/python/lib/core/ml_dtypes.h
+++ b/third_party/xla/xla/tsl/python/lib/core/ml_dtypes.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_TSL_PYTHON_LIB_CORE_ML_DTYPES_H_
-#define TENSORFLOW_TSL_PYTHON_LIB_CORE_ML_DTYPES_H_
+#ifndef XLA_TSL_PYTHON_LIB_CORE_ML_DTYPES_H_
+#define XLA_TSL_PYTHON_LIB_CORE_ML_DTYPES_H_
 
 // Registers all custom types from the python ml_dtypes package.
 //   https://github.com/jax-ml/ml_dtypes
@@ -47,4 +47,4 @@ inline int GetBfloat16TypeNum() { return GetNumpyDtypes().bfloat16; }
 }  // namespace ml_dtypes
 }  // namespace tsl
 
-#endif  // TENSORFLOW_TSL_PYTHON_LIB_CORE_ML_DTYPES_H_
+#endif  // XLA_TSL_PYTHON_LIB_CORE_ML_DTYPES_H_
diff --git a/third_party/xla/third_party/tsl/tsl/python/lib/core/numpy.cc b/third_party/xla/xla/tsl/python/lib/core/numpy.cc
similarity index 95%
rename from third_party/xla/third_party/tsl/tsl/python/lib/core/numpy.cc
rename to third_party/xla/xla/tsl/python/lib/core/numpy.cc
index 3013a1a7c68d46..3f54df1281c2d5 100644
--- a/third_party/xla/third_party/tsl/tsl/python/lib/core/numpy.cc
+++ b/third_party/xla/xla/tsl/python/lib/core/numpy.cc
@@ -17,7 +17,7 @@ limitations under the License.
 // ImportNumpy function to populate it.
 #define XLA_IMPORT_NUMPY
 
-#include "tsl/python/lib/core/numpy.h"
+#include "xla/tsl/python/lib/core/numpy.h"
 
 namespace tsl {
 
diff --git a/third_party/xla/third_party/tsl/tsl/python/lib/core/numpy.h b/third_party/xla/xla/tsl/python/lib/core/numpy.h
similarity index 91%
rename from third_party/xla/third_party/tsl/tsl/python/lib/core/numpy.h
rename to third_party/xla/xla/tsl/python/lib/core/numpy.h
index ee4b920d0ebf5c..6a5a6a6486ccf7 100644
--- a/third_party/xla/third_party/tsl/tsl/python/lib/core/numpy.h
+++ b/third_party/xla/xla/tsl/python/lib/core/numpy.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_TSL_PYTHON_LIB_CORE_NUMPY_H_
-#define TENSORFLOW_TSL_PYTHON_LIB_CORE_NUMPY_H_
+#ifndef XLA_TSL_PYTHON_LIB_CORE_NUMPY_H_
+#define XLA_TSL_PYTHON_LIB_CORE_NUMPY_H_
 
 #ifdef PyArray_Type
 #error "Numpy cannot be included before numpy.h."
@@ -50,4 +50,4 @@ void ImportNumpy();
 
 }  // namespace tsl
 
-#endif  // TENSORFLOW_TSL_PYTHON_LIB_CORE_NUMPY_H_
+#endif  // XLA_TSL_PYTHON_LIB_CORE_NUMPY_H_
diff --git a/third_party/xla/third_party/tsl/tsl/util/BUILD b/third_party/xla/xla/tsl/util/BUILD
similarity index 77%
rename from third_party/xla/third_party/tsl/tsl/util/BUILD
rename to third_party/xla/xla/tsl/util/BUILD
index cb70feac45c080..c96aa52c81f1f7 100644
--- a/third_party/xla/third_party/tsl/tsl/util/BUILD
+++ b/third_party/xla/xla/tsl/util/BUILD
@@ -5,19 +5,19 @@
 #   to other TF components outside of TSL.
 
 load(
-    "//tsl:tsl.bzl",
+    "@local_tsl//tsl:tsl.bzl",
     "check_deps",
     "internal_visibility",
     "tsl_copts",
 )
-load("//tsl:tsl.default.bzl", "filegroup", "get_compatible_with_portable")
+load("@local_tsl//tsl:tsl.default.bzl", "filegroup", "get_compatible_with_portable")
 load(
-    "//tsl/platform:build_config_root.bzl",
-    "if_static",
+    "@local_tsl//tsl/platform:build_config.bzl",
+    "tsl_cc_test",
 )
 load(
-    "//tsl/platform:build_config.bzl",
-    "tsl_cc_test",
+    "@local_tsl//tsl/platform:build_config_root.bzl",
+    "if_static",
 )
 load(
     "@local_tsl//tsl/platform:rules_cc.bzl",
@@ -127,9 +127,9 @@ cc_library(
     srcs = ["byte_swap_array.cc"],
     hdrs = ["byte_swap_array.h"],
     deps = [
-        "//tsl/platform:byte_order",
-        "//tsl/platform:errors",
-        "//tsl/platform:status",
+        "@local_tsl//tsl/platform:byte_order",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:status",
     ],
 )
 
@@ -151,8 +151,8 @@ cc_library(
     visibility = internal_visibility(["//tensorflow:__subpackages__"]),
     deps = [
         ":env_var",
-        "//tsl/platform:mutex",
         "@com_google_absl//absl/strings",
+        "@local_tsl//tsl/platform:mutex",
     ],
     alwayslink = 1,
 )
@@ -200,14 +200,14 @@ cc_library(
     srcs = ["env_var.cc"],
     hdrs = ["env_var.h"],
     deps = [
-        "//tsl/platform:errors",
-        "//tsl/platform:logging",
-        "//tsl/platform:numbers",
-        "//tsl/platform:status",
-        "//tsl/platform:str_util",
-        "//tsl/platform:strcat",
-        "//tsl/platform:stringpiece",
-        "//tsl/platform:types",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:logging",
+        "@local_tsl//tsl/platform:numbers",
+        "@local_tsl//tsl/platform:status",
+        "@local_tsl//tsl/platform:str_util",
+        "@local_tsl//tsl/platform:strcat",
+        "@local_tsl//tsl/platform:stringpiece",
+        "@local_tsl//tsl/platform:types",
     ],
 )
 
@@ -217,17 +217,17 @@ cc_library(
     hdrs = ["reporter.h"],
     visibility = internal_visibility([
         "//tensorflow/core:__subpackages__",
-        "//tsl:__subpackages__",
+        "@local_tsl//tsl:__subpackages__",
     ]),
     deps = [
-        "//tsl/platform:env",
-        "//tsl/platform:env_impl",
-        "//tsl/platform:errors",
-        "//tsl/platform:macros",
-        "//tsl/platform:mutex",
-        "//tsl/platform:str_util",
-        "//tsl/platform:types",
-        "//tsl/protobuf:test_log_proto_cc",
+        "@local_tsl//tsl/platform:env",
+        "@local_tsl//tsl/platform:env_impl",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:macros",
+        "@local_tsl//tsl/platform:mutex",
+        "@local_tsl//tsl/platform:str_util",
+        "@local_tsl//tsl/platform:types",
+        "@local_tsl//tsl/protobuf:test_log_proto_cc",
     ],
 )
 
@@ -242,7 +242,7 @@ cc_library(
     ],
     copts = tsl_copts(),
     visibility = internal_visibility([
-        "//tsl:internal",
+        "@local_tsl//tsl:internal",
     ]),
 )
 
@@ -251,8 +251,8 @@ tsl_cc_test(
     srcs = ["stats_calculator_test.cc"],
     deps = [
         ":stats_calculator_portable",
-        "//tsl/platform:test",
-        "//tsl/platform:test_main",
+        "@local_tsl//tsl/platform:test",
+        "@local_tsl//tsl/platform:test_main",
     ],
 )
 
@@ -261,9 +261,9 @@ cc_library(
     srcs = ["device_name_utils.cc"],
     hdrs = ["device_name_utils.h"],
     deps = [
-        "//tsl/platform:errors",
-        "//tsl/platform:status",
-        "//tsl/platform:stringpiece",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:status",
+        "@local_tsl//tsl/platform:stringpiece",
     ],
 )
 
@@ -273,12 +273,12 @@ tsl_cc_test(
     srcs = ["device_name_utils_test.cc"],
     deps = [
         ":device_name_utils",
-        "//tsl/lib/core:status_test_util",
-        "//tsl/platform:errors",
-        "//tsl/platform:strcat",
-        "//tsl/platform:test",
-        "//tsl/platform:test_benchmark",
-        "//tsl/platform:test_main",
+        "@local_tsl//tsl/lib/core:status_test_util",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:strcat",
+        "@local_tsl//tsl/platform:test",
+        "@local_tsl//tsl/platform:test_benchmark",
+        "@local_tsl//tsl/platform:test_main",
     ],
 )
 
@@ -287,12 +287,12 @@ cc_library(
     srcs = ["command_line_flags.cc"],
     hdrs = ["command_line_flags.h"],
     deps = [
-        "//tsl/platform:logging",
-        "//tsl/platform:str_util",
-        "//tsl/platform:stringpiece",
-        "//tsl/platform:stringprintf",
-        "//tsl/platform:types",
         "@com_google_absl//absl/strings",
+        "@local_tsl//tsl/platform:logging",
+        "@local_tsl//tsl/platform:str_util",
+        "@local_tsl//tsl/platform:stringpiece",
+        "@local_tsl//tsl/platform:stringprintf",
+        "@local_tsl//tsl/platform:types",
     ],
 )
 
@@ -311,7 +311,7 @@ filegroup(
         "onednn_threadpool.h",
     ],
     visibility = internal_visibility([
-        "@local_xla//xla:__subpackages__",
+        "//xla:__subpackages__",
         "//tensorflow/core:__pkg__",
         "//tensorflow/core/framework:__pkg__",
         "//tensorflow/core/util:__pkg__",
diff --git a/third_party/xla/third_party/tsl/tsl/util/byte_swap_array.cc b/third_party/xla/xla/tsl/util/byte_swap_array.cc
similarity index 97%
rename from third_party/xla/third_party/tsl/tsl/util/byte_swap_array.cc
rename to third_party/xla/xla/tsl/util/byte_swap_array.cc
index e77e4bab8defc0..3b21798f0caf41 100644
--- a/third_party/xla/third_party/tsl/tsl/util/byte_swap_array.cc
+++ b/third_party/xla/xla/tsl/util/byte_swap_array.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tsl/util/byte_swap_array.h"
+#include "xla/tsl/util/byte_swap_array.h"
 
 #include "tsl/platform/errors.h"
 
diff --git a/third_party/xla/third_party/tsl/tsl/util/byte_swap_array.h b/third_party/xla/xla/tsl/util/byte_swap_array.h
similarity index 96%
rename from third_party/xla/third_party/tsl/tsl/util/byte_swap_array.h
rename to third_party/xla/xla/tsl/util/byte_swap_array.h
index ad7e34efcd51f7..88c87afd2696e7 100644
--- a/third_party/xla/third_party/tsl/tsl/util/byte_swap_array.h
+++ b/third_party/xla/xla/tsl/util/byte_swap_array.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_TSL_UTIL_BYTE_SWAP_ARRAY_H_
-#define TENSORFLOW_TSL_UTIL_BYTE_SWAP_ARRAY_H_
+#ifndef XLA_TSL_UTIL_BYTE_SWAP_ARRAY_H_
+#define XLA_TSL_UTIL_BYTE_SWAP_ARRAY_H_
 
 #include "tsl/platform/byte_order.h"
 #include "tsl/platform/errors.h"
@@ -101,4 +101,4 @@ Status ByteSwapArray(char *array, size_t bytes_per_elem, int array_len);
 
 }  // namespace tsl
 
-#endif  // TENSORFLOW_TSL_UTIL_BYTE_SWAP_ARRAY_H_
+#endif  // XLA_TSL_UTIL_BYTE_SWAP_ARRAY_H_
diff --git a/third_party/xla/third_party/tsl/tsl/util/command_line_flags.cc b/third_party/xla/xla/tsl/util/command_line_flags.cc
similarity index 99%
rename from third_party/xla/third_party/tsl/tsl/util/command_line_flags.cc
rename to third_party/xla/xla/tsl/util/command_line_flags.cc
index 5e316e9ae9fc6a..f5a97a50eb1980 100644
--- a/third_party/xla/third_party/tsl/tsl/util/command_line_flags.cc
+++ b/third_party/xla/xla/tsl/util/command_line_flags.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tsl/util/command_line_flags.h"
+#include "xla/tsl/util/command_line_flags.h"
 
 #include <algorithm>
 #include <cinttypes>
diff --git a/third_party/xla/third_party/tsl/tsl/util/command_line_flags.h b/third_party/xla/xla/tsl/util/command_line_flags.h
similarity index 97%
rename from third_party/xla/third_party/tsl/tsl/util/command_line_flags.h
rename to third_party/xla/xla/tsl/util/command_line_flags.h
index 2710de5753cd01..d4b3efd662a94d 100644
--- a/third_party/xla/third_party/tsl/tsl/util/command_line_flags.h
+++ b/third_party/xla/xla/tsl/util/command_line_flags.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_TSL_UTIL_COMMAND_LINE_FLAGS_H_
-#define TENSORFLOW_TSL_UTIL_COMMAND_LINE_FLAGS_H_
+#ifndef XLA_TSL_UTIL_COMMAND_LINE_FLAGS_H_
+#define XLA_TSL_UTIL_COMMAND_LINE_FLAGS_H_
 
 #include <functional>
 #include <string>
@@ -145,4 +145,4 @@ class Flags {
 
 }  // namespace tsl
 
-#endif  // TENSORFLOW_TSL_UTIL_COMMAND_LINE_FLAGS_H_
+#endif  // XLA_TSL_UTIL_COMMAND_LINE_FLAGS_H_
diff --git a/third_party/xla/third_party/tsl/tsl/util/determinism.cc b/third_party/xla/xla/tsl/util/determinism.cc
similarity index 96%
rename from third_party/xla/third_party/tsl/tsl/util/determinism.cc
rename to third_party/xla/xla/tsl/util/determinism.cc
index b9a5abd9af40d1..6089cc96458dc1 100644
--- a/third_party/xla/third_party/tsl/tsl/util/determinism.cc
+++ b/third_party/xla/xla/tsl/util/determinism.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tsl/util/determinism.h"
+#include "xla/tsl/util/determinism.h"
 
 #include "absl/strings/string_view.h"
+#include "xla/tsl/util/env_var.h"
 #include "tsl/platform/mutex.h"
-#include "tsl/util/env_var.h"
 
 namespace tsl {
 
diff --git a/third_party/xla/third_party/tsl/tsl/util/determinism.h b/third_party/xla/xla/tsl/util/determinism.h
similarity index 86%
rename from third_party/xla/third_party/tsl/tsl/util/determinism.h
rename to third_party/xla/xla/tsl/util/determinism.h
index fff5b195845a39..2f1861ed60a23b 100644
--- a/third_party/xla/third_party/tsl/tsl/util/determinism.h
+++ b/third_party/xla/xla/tsl/util/determinism.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_TSL_UTIL_DETERMINISM_H_
-#define TENSORFLOW_TSL_UTIL_DETERMINISM_H_
+#ifndef XLA_TSL_UTIL_DETERMINISM_H_
+#define XLA_TSL_UTIL_DETERMINISM_H_
 
 namespace tsl {
 
@@ -24,4 +24,4 @@ void EnableOpDeterminism(bool enabled);
 
 }  // namespace tsl
 
-#endif  // TENSORFLOW_TSL_UTIL_DETERMINISM_H_
+#endif  // XLA_TSL_UTIL_DETERMINISM_H_
diff --git a/third_party/xla/third_party/tsl/tsl/util/determinism_test_util.h b/third_party/xla/xla/tsl/util/determinism_test_util.h
similarity index 84%
rename from third_party/xla/third_party/tsl/tsl/util/determinism_test_util.h
rename to third_party/xla/xla/tsl/util/determinism_test_util.h
index e458dc9cdacc50..34b4552bb62d6a 100644
--- a/third_party/xla/third_party/tsl/tsl/util/determinism_test_util.h
+++ b/third_party/xla/xla/tsl/util/determinism_test_util.h
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_TSL_UTIL_DETERMINISM_TEST_UTIL_H_
-#define TENSORFLOW_TSL_UTIL_DETERMINISM_TEST_UTIL_H_
+#ifndef XLA_TSL_UTIL_DETERMINISM_TEST_UTIL_H_
+#define XLA_TSL_UTIL_DETERMINISM_TEST_UTIL_H_
 
-#include "tsl/util/determinism.h"
+#include "xla/tsl/util/determinism.h"
 
 namespace tsl {
 namespace test {
@@ -35,4 +35,4 @@ class DeterministicOpsScope {
 }  // namespace test
 }  // namespace tsl
 
-#endif  // TENSORFLOW_TSL_UTIL_DETERMINISM_TEST_UTIL_H_
+#endif  // XLA_TSL_UTIL_DETERMINISM_TEST_UTIL_H_
diff --git a/third_party/xla/third_party/tsl/tsl/util/device_name_utils.cc b/third_party/xla/xla/tsl/util/device_name_utils.cc
similarity index 99%
rename from third_party/xla/third_party/tsl/tsl/util/device_name_utils.cc
rename to third_party/xla/xla/tsl/util/device_name_utils.cc
index 0920532c62eddb..180e3336666bca 100644
--- a/third_party/xla/third_party/tsl/tsl/util/device_name_utils.cc
+++ b/third_party/xla/xla/tsl/util/device_name_utils.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tsl/util/device_name_utils.h"
+#include "xla/tsl/util/device_name_utils.h"
 
 #include <algorithm>
 
diff --git a/third_party/xla/third_party/tsl/tsl/util/device_name_utils.h b/third_party/xla/xla/tsl/util/device_name_utils.h
similarity index 98%
rename from third_party/xla/third_party/tsl/tsl/util/device_name_utils.h
rename to third_party/xla/xla/tsl/util/device_name_utils.h
index 162af1c55b4b47..82b5fa3b1aec2e 100644
--- a/third_party/xla/third_party/tsl/tsl/util/device_name_utils.h
+++ b/third_party/xla/xla/tsl/util/device_name_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_TSL_UTIL_DEVICE_NAME_UTILS_H_
-#define TENSORFLOW_TSL_UTIL_DEVICE_NAME_UTILS_H_
+#ifndef XLA_TSL_UTIL_DEVICE_NAME_UTILS_H_
+#define XLA_TSL_UTIL_DEVICE_NAME_UTILS_H_
 
 #include <string>
 
@@ -291,4 +291,4 @@ std::ostream& operator<<(std::ostream& os,
 
 }  // namespace tsl
 
-#endif  // TENSORFLOW_TSL_UTIL_DEVICE_NAME_UTILS_H_
+#endif  // XLA_TSL_UTIL_DEVICE_NAME_UTILS_H_
diff --git a/third_party/xla/third_party/tsl/tsl/util/device_name_utils_test.cc b/third_party/xla/xla/tsl/util/device_name_utils_test.cc
similarity index 99%
rename from third_party/xla/third_party/tsl/tsl/util/device_name_utils_test.cc
rename to third_party/xla/xla/tsl/util/device_name_utils_test.cc
index dce1fc5807604f..03aa5fca5899b9 100644
--- a/third_party/xla/third_party/tsl/tsl/util/device_name_utils_test.cc
+++ b/third_party/xla/xla/tsl/util/device_name_utils_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tsl/util/device_name_utils.h"
+#include "xla/tsl/util/device_name_utils.h"
 
 #include <vector>
 
diff --git a/third_party/xla/third_party/tsl/tsl/util/env_var.cc b/third_party/xla/xla/tsl/util/env_var.cc
similarity index 98%
rename from third_party/xla/third_party/tsl/tsl/util/env_var.cc
rename to third_party/xla/xla/tsl/util/env_var.cc
index e7d818445c7def..564617aa082889 100644
--- a/third_party/xla/third_party/tsl/tsl/util/env_var.cc
+++ b/third_party/xla/xla/tsl/util/env_var.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tsl/util/env_var.h"
+#include "xla/tsl/util/env_var.h"
 
 #include <stdlib.h>
 
diff --git a/third_party/xla/third_party/tsl/tsl/util/env_var.h b/third_party/xla/xla/tsl/util/env_var.h
similarity index 95%
rename from third_party/xla/third_party/tsl/tsl/util/env_var.h
rename to third_party/xla/xla/tsl/util/env_var.h
index 9c6925c57f643b..69c0bff2a1658c 100644
--- a/third_party/xla/third_party/tsl/tsl/util/env_var.h
+++ b/third_party/xla/xla/tsl/util/env_var.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_TSL_UTIL_ENV_VAR_H_
-#define TENSORFLOW_TSL_UTIL_ENV_VAR_H_
+#ifndef XLA_TSL_UTIL_ENV_VAR_H_
+#define XLA_TSL_UTIL_ENV_VAR_H_
 
 #include "tsl/platform/status.h"
 #include "tsl/platform/stringpiece.h"
@@ -53,4 +53,4 @@ Status ReadStringsFromEnvVar(StringPiece env_var_name, StringPiece default_val,
 
 }  // namespace tsl
 
-#endif  // TENSORFLOW_TSL_UTIL_ENV_VAR_H_
+#endif  // XLA_TSL_UTIL_ENV_VAR_H_
diff --git a/third_party/xla/third_party/tsl/tsl/util/onednn_threadpool.h b/third_party/xla/xla/tsl/util/onednn_threadpool.h
similarity index 97%
rename from third_party/xla/third_party/tsl/tsl/util/onednn_threadpool.h
rename to third_party/xla/xla/tsl/util/onednn_threadpool.h
index 7d8a093ae89fa6..0c81806352f863 100644
--- a/third_party/xla/third_party/tsl/tsl/util/onednn_threadpool.h
+++ b/third_party/xla/xla/tsl/util/onednn_threadpool.h
@@ -14,8 +14,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_TSL_UTIL_ONEDNN_THREADPOOL_H_
-#define TENSORFLOW_TSL_UTIL_ONEDNN_THREADPOOL_H_
+#ifndef XLA_TSL_UTIL_ONEDNN_THREADPOOL_H_
+#define XLA_TSL_UTIL_ONEDNN_THREADPOOL_H_
 #ifdef INTEL_MKL
 
 #include <list>
@@ -190,4 +190,4 @@ class OneDnnThreadPool {
 }  // namespace tsl
 
 #endif  // INTEL_MKL
-#endif  // TENSORFLOW_TSL_UTIL_ONEDNN_THREADPOOL_H_
+#endif  // XLA_TSL_UTIL_ONEDNN_THREADPOOL_H_
diff --git a/third_party/xla/third_party/tsl/tsl/util/proto/BUILD b/third_party/xla/xla/tsl/util/proto/BUILD
similarity index 100%
rename from third_party/xla/third_party/tsl/tsl/util/proto/BUILD
rename to third_party/xla/xla/tsl/util/proto/BUILD
diff --git a/third_party/xla/third_party/tsl/tsl/util/proto/proto_utils.h b/third_party/xla/xla/tsl/util/proto/proto_utils.h
similarity index 90%
rename from third_party/xla/third_party/tsl/tsl/util/proto/proto_utils.h
rename to third_party/xla/xla/tsl/util/proto/proto_utils.h
index 9a1dee8eed5224..2762f4df0e8af1 100644
--- a/third_party/xla/third_party/tsl/tsl/util/proto/proto_utils.h
+++ b/third_party/xla/xla/tsl/util/proto/proto_utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_TSL_UTIL_PROTO_PROTO_UTILS_H_
-#define TENSORFLOW_TSL_UTIL_PROTO_PROTO_UTILS_H_
+#ifndef XLA_TSL_UTIL_PROTO_PROTO_UTILS_H_
+#define XLA_TSL_UTIL_PROTO_PROTO_UTILS_H_
 
 #include "google/protobuf/duration.pb.h"
 #include "absl/time/time.h"
@@ -39,4 +39,4 @@ inline absl::Duration FromDurationProto(google::protobuf::Duration proto) {
 }  // namespace proto_utils
 }  // namespace tsl
 
-#endif  // TENSORFLOW_TSL_UTIL_PROTO_PROTO_UTILS_H_
+#endif  // XLA_TSL_UTIL_PROTO_PROTO_UTILS_H_
diff --git a/third_party/xla/third_party/tsl/tsl/util/reporter.cc b/third_party/xla/xla/tsl/util/reporter.cc
similarity index 98%
rename from third_party/xla/third_party/tsl/tsl/util/reporter.cc
rename to third_party/xla/xla/tsl/util/reporter.cc
index 41501bc68e8ced..c8ee2f2f87c4ea 100644
--- a/third_party/xla/third_party/tsl/tsl/util/reporter.cc
+++ b/third_party/xla/xla/tsl/util/reporter.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tsl/util/reporter.h"
+#include "xla/tsl/util/reporter.h"
 
 #include "tsl/platform/errors.h"
 #include "tsl/platform/mutex.h"
diff --git a/third_party/xla/third_party/tsl/tsl/util/reporter.h b/third_party/xla/xla/tsl/util/reporter.h
similarity index 97%
rename from third_party/xla/third_party/tsl/tsl/util/reporter.h
rename to third_party/xla/xla/tsl/util/reporter.h
index d020e94fae1276..cf1e2b2c274b25 100644
--- a/third_party/xla/third_party/tsl/tsl/util/reporter.h
+++ b/third_party/xla/xla/tsl/util/reporter.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_TSL_UTIL_REPORTER_H_
-#define TENSORFLOW_TSL_UTIL_REPORTER_H_
+#ifndef XLA_TSL_UTIL_REPORTER_H_
+#define XLA_TSL_UTIL_REPORTER_H_
 
 #include <cstdlib>
 #include <memory>
@@ -131,4 +131,4 @@ class TestReporter {
 
 }  // namespace tsl
 
-#endif  // TENSORFLOW_TSL_UTIL_REPORTER_H_
+#endif  // XLA_TSL_UTIL_REPORTER_H_
diff --git a/third_party/xla/third_party/tsl/tsl/util/stat_summarizer_options.h b/third_party/xla/xla/tsl/util/stat_summarizer_options.h
similarity index 88%
rename from third_party/xla/third_party/tsl/tsl/util/stat_summarizer_options.h
rename to third_party/xla/xla/tsl/util/stat_summarizer_options.h
index e07de6e8d5d9d1..c3ed6ffd7e48bf 100644
--- a/third_party/xla/third_party/tsl/tsl/util/stat_summarizer_options.h
+++ b/third_party/xla/xla/tsl/util/stat_summarizer_options.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_TSL_UTIL_STAT_SUMMARIZER_OPTIONS_H_
-#define TENSORFLOW_TSL_UTIL_STAT_SUMMARIZER_OPTIONS_H_
+#ifndef XLA_TSL_UTIL_STAT_SUMMARIZER_OPTIONS_H_
+#define XLA_TSL_UTIL_STAT_SUMMARIZER_OPTIONS_H_
 namespace tsl {
 // Used to control the output of the statistics summarizer;
 struct StatSummarizerOptions {
@@ -41,4 +41,4 @@ struct StatSummarizerOptions {
 };
 }  // namespace tsl
 
-#endif  // TENSORFLOW_TSL_UTIL_STAT_SUMMARIZER_OPTIONS_H_
+#endif  // XLA_TSL_UTIL_STAT_SUMMARIZER_OPTIONS_H_
diff --git a/third_party/xla/third_party/tsl/tsl/util/stats_calculator.cc b/third_party/xla/xla/tsl/util/stats_calculator.cc
similarity index 99%
rename from third_party/xla/third_party/tsl/tsl/util/stats_calculator.cc
rename to third_party/xla/xla/tsl/util/stats_calculator.cc
index 99ab1e3e7c6bc5..cdfa46c94417c3 100644
--- a/third_party/xla/third_party/tsl/tsl/util/stats_calculator.cc
+++ b/third_party/xla/xla/tsl/util/stats_calculator.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tsl/util/stats_calculator.h"
+#include "xla/tsl/util/stats_calculator.h"
 
 #include <iomanip>
 #include <map>
diff --git a/third_party/xla/third_party/tsl/tsl/util/stats_calculator.h b/third_party/xla/xla/tsl/util/stats_calculator.h
similarity index 96%
rename from third_party/xla/third_party/tsl/tsl/util/stats_calculator.h
rename to third_party/xla/xla/tsl/util/stats_calculator.h
index 5c23f432971c23..84045fb6ceece2 100644
--- a/third_party/xla/third_party/tsl/tsl/util/stats_calculator.h
+++ b/third_party/xla/xla/tsl/util/stats_calculator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_TSL_UTIL_STATS_CALCULATOR_H_
-#define TENSORFLOW_TSL_UTIL_STATS_CALCULATOR_H_
+#ifndef XLA_TSL_UTIL_STATS_CALCULATOR_H_
+#define XLA_TSL_UTIL_STATS_CALCULATOR_H_
 
 #include <stdlib.h>
 
@@ -26,7 +26,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "tsl/util/stat_summarizer_options.h"
+#include "xla/tsl/util/stat_summarizer_options.h"
 
 namespace tsl {
 
@@ -198,4 +198,4 @@ class StatsCalculator {
 
 }  // namespace tsl
 
-#endif  // TENSORFLOW_TSL_UTIL_STATS_CALCULATOR_H_
+#endif  // XLA_TSL_UTIL_STATS_CALCULATOR_H_
diff --git a/third_party/xla/third_party/tsl/tsl/util/stats_calculator_test.cc b/third_party/xla/xla/tsl/util/stats_calculator_test.cc
similarity index 98%
rename from third_party/xla/third_party/tsl/tsl/util/stats_calculator_test.cc
rename to third_party/xla/xla/tsl/util/stats_calculator_test.cc
index 9093701e4478c9..d58186630598f0 100644
--- a/third_party/xla/third_party/tsl/tsl/util/stats_calculator_test.cc
+++ b/third_party/xla/xla/tsl/util/stats_calculator_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tsl/util/stats_calculator.h"
+#include "xla/tsl/util/stats_calculator.h"
 
 #include <cfloat>
 
diff --git a/third_party/xla/third_party/tsl/tsl/util/use_cudnn.cc b/third_party/xla/xla/tsl/util/use_cudnn.cc
similarity index 98%
rename from third_party/xla/third_party/tsl/tsl/util/use_cudnn.cc
rename to third_party/xla/xla/tsl/util/use_cudnn.cc
index 3156a319b73b3d..a3e1b4d25d2667 100644
--- a/third_party/xla/third_party/tsl/tsl/util/use_cudnn.cc
+++ b/third_party/xla/xla/tsl/util/use_cudnn.cc
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tsl/util/use_cudnn.h"
+#include "xla/tsl/util/use_cudnn.h"
 
 #include <cstdint>
 
+#include "xla/tsl/util/env_var.h"
 #include "tsl/platform/str_util.h"
 #include "tsl/platform/stringpiece.h"
-#include "tsl/util/env_var.h"
 
 #if GOOGLE_CUDA
 #include "third_party/gpus/cudnn/cudnn.h"
diff --git a/third_party/xla/third_party/tsl/tsl/util/use_cudnn.h b/third_party/xla/xla/tsl/util/use_cudnn.h
similarity index 92%
rename from third_party/xla/third_party/tsl/tsl/util/use_cudnn.h
rename to third_party/xla/xla/tsl/util/use_cudnn.h
index 738e727e4c7808..41c29b256f7be0 100644
--- a/third_party/xla/third_party/tsl/tsl/util/use_cudnn.h
+++ b/third_party/xla/xla/tsl/util/use_cudnn.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // The utility to check Cudnn dependency and set Cudnn-related flags.
 
-#ifndef TENSORFLOW_TSL_UTIL_USE_CUDNN_H_
-#define TENSORFLOW_TSL_UTIL_USE_CUDNN_H_
+#ifndef XLA_TSL_UTIL_USE_CUDNN_H_
+#define XLA_TSL_UTIL_USE_CUDNN_H_
 
 #include <cstdint>
 
@@ -40,4 +40,4 @@ bool ShouldCudnnGroupedConvolutionBeUsed(const int32_t filter_rows,
                                          const int32_t out_depth);
 }  // namespace tsl
 
-#endif  // TENSORFLOW_TSL_UTIL_USE_CUDNN_H_
+#endif  // XLA_TSL_UTIL_USE_CUDNN_H_
diff --git a/third_party/xla/xla/xla.bzl b/third_party/xla/xla/xla.bzl
index 260fa5c1731310..71b67f8e74ce62 100644
--- a/third_party/xla/xla/xla.bzl
+++ b/third_party/xla/xla/xla.bzl
@@ -65,7 +65,7 @@ _XLA_SHARED_OBJECT_SENSITIVE_DEPS = if_static(extra_deps = [], otherwise = [
     Label("//xla/stream_executor/gpu:gpu_stream"),
     Label("//xla/stream_executor/rocm:all_runtime"),
     Label("//xla/stream_executor/rocm:stream_executor_rocm"),
-    "@local_tsl//tsl/util:determinism",
+    "//xla/tsl/util:determinism",
 ])
 
 def xla_cc_binary(deps = [], copts = tsl_copts(), **kwargs):
diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto
index 69a40c9dc09e41..1c84566bebb4aa 100644
--- a/third_party/xla/xla/xla.proto
+++ b/third_party/xla/xla/xla.proto
@@ -723,8 +723,10 @@ message DebugOptions {
 
   // Let GEMM fusion autotuning probe cuDNN as a backend.
   // Current levels:
-  // 0: disabled.
-  // 1: fusions of GEMM, elementwise, transpose/reshape operations.
+  // 0: Disabled.
+  // 1: Fusions of GEMM, elementwise, transpose/reshape operations.
+  // 2: + Broadcasts.
+  // 3: + Nontrivial noncontracting dimension reshapes/transposes.
   int32 xla_gpu_cudnn_gemm_fusion_level = 285;
 
   // Next id: 286